In [None]:
from transformers import AutoTokenizer,TFAutoModel,AutoConfig
# The tokenizer and the model configuration are both read from the same
# local checkpoint directory.
_PRETRAINED_DIR = "../input/clrp-roberta-base/clrp_roberta_base"
config = AutoConfig.from_pretrained(_PRETRAINED_DIR)
tokenizer = AutoTokenizer.from_pretrained(_PRETRAINED_DIR, max_length=256)

In [None]:
import pandas as pd
# Load the external dataset (the file name carries no .csv extension).
external_df = pd.read_csv(
    filepath_or_buffer='../input/increase-the-external-dataset-for-pseudo-labeling/filtered_aug_dataset',
)

In [None]:
# Keep only the rows whose target is not the literal string 'remove',
# renumber the surviving rows from zero, and discard the "Unnamed: 0" column.
filterd_external_df = (
    external_df
    .loc[lambda frame: frame['target'] != 'remove']
    .reset_index(drop=True)
    .drop(columns=["Unnamed: 0"])
)

In [None]:
import pandas as pd
# Competition files, listed in the order: sample submission, test, train.
path = [
    "../input/commonlitreadabilityprize/sample_submission.csv",
    "../input/commonlitreadabilityprize/test.csv",
    "../input/commonlitreadabilityprize/train.csv",
]

# Read each file and unpack the frames in the same order as `path`.
df_ss, df_test, df_train = (pd.read_csv(csv_file) for csv_file in path)

In [None]:
# Remove the 'url_legal' and 'license' columns from both frames.
df_train = df_train.drop(columns=['url_legal', 'license'])
df_test = df_test.drop(columns=['url_legal', 'license'])

In [None]:
# Excerpts stay as pandas Series; targets are pulled out as NumPy arrays.
X, y = df_train['excerpt'], df_train['target'].to_numpy()
X_external, y_external = (
    filterd_external_df['excerpt'],
    filterd_external_df['target'].to_numpy(),
)
X_test = df_test['excerpt']

In [None]:
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf

n_splits = 5

# Discretise the continuous target into 12 equal-width bins (labelled 0..11)
# so that StratifiedKFold has a categorical variable to stratify on.
df_train['bin'] = pd.cut(df_train['target'], bins=12, labels=list(range(12)))

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
gen_skf = skf.split(df_train['id'], y=df_train['bin'])


In [None]:
from keras import backend as K
def root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared difference of the inputs.

    The computation uses Keras backend ops, and the mean is taken over
    every element of ``y_pred - y_true``.
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

In [None]:
from sklearn.metrics import mean_squared_error
import keras
class eval_on_batch(keras.callbacks.Callback):
    """Validate periodically during training and checkpoint the best weights.

    After every ``eval_every``-th training batch (batch 0 excluded) the model
    predicts on ``val_set``, the RMSE against ``target`` is computed with
    ``root_mean_squared_error``, and the weights are saved to ``filepath``
    when the score is both a new best and no larger than ``max_score``.

    Args:
        val_set: Validation data passed to ``model.predict``.
        target: Ground-truth values compared with column 0 of the predictions.
        filepath: Destination handed to ``model.save_weights``.
        max_score: Largest RMSE that is still allowed to trigger a save.
        eval_every: Number of training batches between two evaluations.

    Attributes:
        best_score: Best RMSE saved so far as a Python float; stays at
            ``float('inf')`` until the first checkpoint is written.
    """
    def __init__(self,val_set,target,filepath,max_score=0.6,eval_every=10):
        # Let the Keras base class initialise its own bookkeeping attributes.
        super().__init__()
        self.val_set = val_set
        self.target = target
        self.best_score = float('inf')
        self.filepath = filepath
        self.max_score = max_score
        self.eval_every = eval_every

    def on_train_batch_end(self,batch,logs=None):
        """Evaluate on the validation set and save the weights on improvement."""
        if batch == 0 or batch % self.eval_every != 0:
            return
        # verbose=0 keeps predict() from printing a progress bar at every evaluation.
        predictions = self.model.predict(self.val_set, verbose=0)[:,0]
        # Convert the backend scalar to a float so it compares and prints cleanly.
        cv = float(root_mean_squared_error(self.target, predictions))
        if cv < self.best_score and cv <= self.max_score:
            self.model.save_weights(self.filepath)
            self.best_score = cv
            print(f"\n validation error --> {cv}")

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np
import os
# Cross-validated fine-tuning: for every seed and fold a fresh model is built,
# trained on the fold's training split, restored to its best checkpoint, and
# used to fill the out-of-fold predictions and the averaged test predictions.
SEEDS = [42]
MODEL_DIR = "../input/clrp-roberta-base/clrp_roberta_base"
MAX_LEN = 256            # every excerpt is padded/truncated to this many tokens
BATCH_SIZE = 8
EPOCHS = 3
INITIAL_LR = 2e-5
# Optional warm-up on the external dataset before each fold's fine-tuning.
# Disabled by default: the previous `fold == 10` guard could never be true.
# NOTE(review): confirm whether the warm-up is meant to run when enabled.
PRETRAIN_ON_EXTERNAL = False
EXTERNAL_EPOCHS = 4


def _encode(texts):
    """Tokenize `texts` to fixed-length TF tensors keyed by the tokenizer's model input names."""
    encoded = tokenizer(list(texts), padding="max_length", max_length=MAX_LEN,
                        truncation=True, return_tensors='tf')
    return {name: encoded[name] for name in tokenizer.model_input_names}


def _to_batches(features, labels):
    """Wrap (features, labels) in an unshuffled tf.data.Dataset batched by BATCH_SIZE."""
    return tf.data.Dataset.from_tensor_slices((features, labels)).batch(BATCH_SIZE)


def _build_model():
    """Build a fresh transformer with an attention-pooling regression head."""
    transformer_model = TFAutoModel.from_pretrained(MODEL_DIR, output_hidden_states=True, from_pt=True)
    input_ids = tf.keras.Input(shape=(MAX_LEN, ), dtype='int32')
    attention_mask = tf.keras.Input(shape=(MAX_LEN, ), dtype='int32')
    transformer = transformer_model({'input_ids': input_ids, 'attention_mask': attention_mask})
    # First model output; used below as per-token states with the sequence on axis 1.
    hidden_states = transformer[0]
    # Score each token, normalise the scores along the sequence axis, and
    # pool the token states with the resulting weights.
    W = tf.keras.layers.Dense(config.hidden_size)(hidden_states)
    att = tf.nn.tanh(W)
    V = tf.keras.layers.Dense(1)(att)
    attention_weights = tf.nn.softmax(V, axis=1)
    context_vector = tf.math.reduce_sum(attention_weights * hidden_states, axis=1)
    x = tf.keras.layers.Dropout(0.5)(context_vector)
    output = tf.keras.layers.Dense(1)(x)
    return tf.keras.models.Model(
        inputs=[{'input_ids': input_ids, 'attention_mask': attention_mask}],
        outputs=output)


def _compile(model, learning_rate):
    """Compile `model` with Adam and RMSE as both loss and metric."""
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=root_mean_squared_error,
        metrics=[root_mean_squared_error],
        steps_per_execution=1)


cv_pred = np.zeros((len(X),))
test_pred = np.zeros((len(X_test)))

# The test set does not depend on the fold, so tokenize it once up front.
# Its labels are placeholders; only the features are used by predict().
test_set = _to_batches(_encode(X_test), np.zeros((len(X_test),)))

external_set = None
if PRETRAIN_ON_EXTERNAL:
    # Targets of the external frame are converted to float here.
    external_set = _to_batches(_encode(X_external), np.array(y_external, dtype=float))

for seed in SEEDS:
    # Seed TensorFlow as well, so the seed covers more than the fold split.
    tf.random.set_seed(seed)
    skf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
    gen_skf = skf.split(df_train.id, y=df_train.bin)
    for fold, (idx_train, idx_val) in enumerate(gen_skf):
        # The splitter yields positions, so index the Series positionally.
        X_train = X.iloc[idx_train]
        X_labels = y[idx_train]
        X_val = X.iloc[idx_val]
        val_labels = y[idx_val]

        training_set = _to_batches(_encode(X_train), X_labels)
        val_set = _to_batches(_encode(X_val), val_labels)

        model = _build_model()

        if PRETRAIN_ON_EXTERNAL:
            _compile(model, INITIAL_LR)
            model.fit(external_set, validation_data=val_set, epochs=EXTERNAL_EPOCHS)

        # The decay schedule is indexed by optimizer step, so it is handed to
        # the optimizer directly. A LearningRateScheduler callback would call it
        # with the epoch index instead and leave the rate practically constant.
        steps_per_epoch = int(np.ceil(len(X_train) / BATCH_SIZE))
        RL_S = tf.keras.experimental.CosineDecayRestarts(
            initial_learning_rate=INITIAL_LR,
            first_decay_steps=steps_per_epoch * EPOCHS,
            alpha=0.1)
        _compile(model, RL_S)

        checkpoint_filepath = f'./model{seed}_{fold}'
        eval_on_batch_callback = eval_on_batch(val_set = val_set,target = y[idx_val],filepath = checkpoint_filepath)
        print(f'__________________Fold{fold}__________________')
        model.fit(training_set, validation_data=val_set, epochs=EPOCHS, verbose=1,
                  callbacks=[eval_on_batch_callback])

        # The callback saves only when its own criteria are met; if nothing was
        # written in this run, keep the current weights instead of failing or
        # silently loading a stale file left at the same path.
        if np.isfinite(float(eval_on_batch_callback.best_score)):
            model.load_weights(checkpoint_filepath)
        else:
            print(f"\n no checkpoint written for fold {fold}; keeping the final weights")

        cv_pred[idx_val] += model.predict(val_set)[:,0]/(len(SEEDS))
        # Note: the printed values are mean squared errors, not RMSE.
        print('cv = ',mean_squared_error(y[idx_val],cv_pred[idx_val]))
        test_pred += model.predict(test_set)[:,0]/(n_splits*len(SEEDS))
    print(f'OOF{seed}=',mean_squared_error(y,cv_pred))
        


In [None]:
# Fill the sample submission's target column with the averaged test
# predictions and write the result next to the notebook.
submission = pd.read_csv(
    "/kaggle/input/commonlitreadabilityprize/sample_submission.csv"
)
submission['target'] = test_pred
submission.to_csv("submission.csv", index=False)