In [None]:
from transformers import AutoTokenizer,TFAutoModelForSequenceClassification
# Load the tokenizer from the local copy of the pretrained checkpoint so the
# notebook does not need to download anything.
bert_checkpoint_dir = "../input/huggingface-bert-variants/bert-base-cased/bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint_dir)

In [None]:
import pandas as pd
# Competition CSVs, in the order: sample submission, test set, training set.
path = [
    "../input/commonlitreadabilityprize/sample_submission.csv",
    "../input/commonlitreadabilityprize/test.csv",
    "../input/commonlitreadabilityprize/train.csv",
]

# Read every file once; the unpacking order mirrors the order of `path` above.
df_ss, df_test, df_train = map(pd.read_csv, path)

In [None]:
# Drop the metadata columns; later cells only read 'id', 'excerpt' and 'target'.
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of raising KeyError.
unused_columns = ['url_legal', 'license']
df_train = df_train.drop(columns=unused_columns, errors='ignore')
df_test = df_test.drop(columns=unused_columns, errors='ignore')

In [None]:
# Model inputs: the raw excerpt text of the training and test sets (pandas Series).
X, X_test = df_train["excerpt"], df_test["excerpt"]

# Regression target as a plain NumPy array.
y = df_train["target"].to_numpy()

In [None]:
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf

# Number of cross-validation folds; later cells reuse this value.
n_splits = 5

# StratifiedKFold needs discrete classes, so cut the continuous target into
# 20 bins labelled 0..19 and store the bin id next to each row.
df_train['bin'] = pd.cut(df_train['target'], bins=20, labels=list(range(20)))

# Shuffled, seeded splitter stratified on the target bin.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
gen_skf = skf.split(df_train['id'], y=df_train['bin'])


In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np
import os
SEEDS = [1]
MAX_LEN = 512     # every excerpt is padded/truncated to this many tokens
BATCH_SIZE = 10   # batch size used for prediction


def _make_dataset(texts, labels):
    """Tokenize `texts` and pair them with `labels` in a batched tf.data.Dataset.

    Args:
        texts: iterable of strings (e.g. a pandas Series of excerpts).
        labels: array with one entry per text; for the test set this is just
            a placeholder because only the features are used by `predict`.

    Returns:
        A dataset of (features, labels) batches, where `features` is a dict
        keyed by the tokenizer's `model_input_names`.
    """
    encoded = tokenizer(list(texts), padding="max_length", truncation=True,
                        max_length=MAX_LEN, return_tensors='tf')
    features = {name: encoded[name] for name in tokenizer.model_input_names}
    return tf.data.Dataset.from_tensor_slices((features, labels)).batch(BATCH_SIZE)


def _build_model():
    """Build the regression network: BERT hidden states -> dense head -> mean.

    Returns:
        A compiled Keras model taking a dict with 'input_ids',
        'token_type_ids' and 'attention_mask' and producing one value per
        example (output indexed as [:, 0, 0] by the caller).
    """
    transformer_model = TFAutoModelForSequenceClassification.from_pretrained(
        "../input/huggingface-bert-variants/bert-base-cased/bert-base-cased",
        output_hidden_states=True)
    input_ids = tf.keras.Input(shape=(MAX_LEN, ), dtype='int32')
    token_type_ids = tf.keras.Input(shape=(MAX_LEN, ), dtype='int32')
    attention_mask = tf.keras.Input(shape=(MAX_LEN, ), dtype='int32')
    inputs = {'input_ids': input_ids,
              'token_type_ids': token_type_ids,
              'attention_mask': attention_mask}
    transformer = transformer_model([inputs])
    hidden_states = transformer[1]  # get output_hidden_states

    hidden_states_size = 2  # count of the last states
    last_states = tuple(hidden_states[i] for i in range(-hidden_states_size, 0))
    x = tf.keras.layers.concatenate(last_states)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.5)(x)
    x = tf.keras.layers.Dense(1)(x)
    # Average the Dense(1) outputs over axis 1 (presumably the token axis --
    # assumes hidden states are (batch, seq, hidden); TODO confirm), then
    # restore that axis with size 1.
    x = tf.math.reduce_mean(x, axis=1)
    output = tf.expand_dims(x, axis=1)
    model = tf.keras.models.Model(inputs=[inputs], outputs=output)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=tf.keras.losses.MeanSquaredError())
    return model


# Out-of-fold predictions on the training set and test predictions, both
# averaged over all seeds (test predictions are also averaged over folds).
cv_pred = np.zeros((len(X),))
test_pred = np.zeros((len(X_test)))

# The test inputs do not depend on seed or fold, so tokenize them only once.
test_set = _make_dataset(X_test, np.zeros((len(X_test),)))

# NOTE: this cell only runs inference. The per-fold weights are loaded from
# checkpoints; presumably they were produced by the training call this cell
# used to carry as a comment (fit on the training fold in batches of 10 for
# 8 epochs, with a ModelCheckpoint on 'val_loss', save_best_only=True,
# save_weights_only=True).
for seed in SEEDS:
    # Out-of-fold predictions of this seed only, so the scores printed below
    # stay correct when SEEDS holds more than one entry.
    seed_oof = np.zeros((len(X),))
    skf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
    gen_skf = skf.split(df_train.id, y=df_train.bin)
    for fold, (idx_train, idx_val) in enumerate(gen_skf):
        # idx_train is unused here: training is not performed in this cell.
        # The split indices are positional, hence .iloc on the Series.
        val_set = _make_dataset(X.iloc[idx_val], y[idx_val])

        # Release the previous fold's Keras state before building a new model.
        tf.keras.backend.clear_session()
        model = _build_model()

        checkpoint_filepath = f'../input/finetune-bert-model-with-psuedo-labeling/model{seed}_{fold}'
        model.load_weights(checkpoint_filepath)

        seed_oof[idx_val] = model.predict(val_set)[:, 0, 0]
        print(f"seed{seed}-fold{fold}=", mean_squared_error(y[idx_val], seed_oof[idx_val]))
        test_pred += model.predict(test_set)[:, 0, 0] / (n_splits * len(SEEDS))
    cv_pred += seed_oof / len(SEEDS)
    print(f'cv{seed}=', mean_squared_error(y, seed_oof))
        


In [None]:
# Fill the sample submission with the averaged test predictions and write it
# to the working directory.
submission = pd.read_csv("/kaggle/input/commonlitreadabilityprize/sample_submission.csv")
# Bracket assignment always writes a DataFrame column; attribute assignment
# would silently set a plain Python attribute if 'target' were missing.
submission["target"] = test_pred
submission.to_csv("submission.csv", index=False)