# Models Inference and Simple ensemble

This notebook covers inference and ensembling for all the models I have tried; each model has 5 sub-models, i.e. one per fold. None of my single models reached an LB score below 0.5, except RoBERTa base, which managed to reach 0.488. Ensembling, however, helped me get to an LB score of 0.482, which is a great boost. Later, by adding more and more different models, I realised that the more models you ensemble, the better the score you get.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import mean_squared_error
from transformers import AdamWeightDecay
from transformers import AutoTokenizer, TFAutoModel,TFAutoModelForSequenceClassification, TFRobertaModel

Here are some of the great kernels that helped me during this competition:

* [ragnar](https://www.kaggle.com/ragnar123)'s kernel 1 [CommonLit Readability Roberta TF](https://www.kaggle.com/ragnar123/commonlit-readability-roberta-tf)

* [ragnar](https://www.kaggle.com/ragnar123)'s kernel 2 [CommonLit Readability Roberta TF Inference](https://www.kaggle.com/ragnar123/commonlit-readability-roberta-tf-inference)

* [Abhishek Thakur](https://www.kaggle.com/abhishek)'s [Step 1: Create Folds](https://www.kaggle.com/abhishek/step-1-create-folds)

* [Ayush Thakur](https://www.kaggle.com/ayuraj)'s [Transformer Baseline with TF/Keras and W&B](https://www.kaggle.com/ayuraj/how-to-use-tf-data-to-train-hf-transformer)

There are lots of amazing notebooks out there and I might be missing some of those too. Do check them out!!

In [None]:
import tensorflow as tf
# Choose the distribution strategy used to build the models: a TPU strategy
# when the TPU setup succeeds, otherwise TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.TPUStrategy is the stable replacement for the deprecated
    # tf.distribute.experimental.TPUStrategy alias.
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    # A ValueError from the TPU setup above is treated as "no TPU available".
    strategy = tf.distribute.get_strategy() # for CPU and single GPU
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# Training data with a per-row fold assignment; this dataset was created after
# looking at the work of Abhishek Thakur's "Create folds" notebook.
TRAIN_FOLDS_CSV = '../input/clrp-train-folded/train_folds.csv'
# Competition test set that the submission is generated for.
TEST_CSV = '../input/commonlitreadabilityprize/test.csv'

train = pd.read_csv(TRAIN_FOLDS_CSV)
test = pd.read_csv(TEST_CSV)

In [None]:
def split(num):
    """Partition the global ``train`` frame around fold ``num``.

    Rows whose ``fold`` column equals ``num`` form the validation part; every
    other row forms the training part.

    Returns:
        A 4-tuple of lists, in this order: training excerpts, validation
        excerpts, training targets, validation targets.
    """
    in_fold = train.fold == num
    fit_rows = train.loc[~in_fold]
    held_out_rows = train.loc[in_fold]

    fit_texts = list(fit_rows['excerpt'].values)
    held_out_texts = list(held_out_rows['excerpt'].values)
    fit_targets = list(fit_rows['target'].values)
    held_out_targets = list(held_out_rows['target'].values)

    return fit_texts, held_out_texts, fit_targets, held_out_targets

In [None]:
def encode(train_enc, target):
    """Turn one tokenized example into a ``(features, label)`` pair.

    Only the ``input_ids`` and ``attention_mask`` entries of ``train_enc`` are
    kept; ``target`` is cast to ``tf.float32``.
    """
    features = {key: train_enc[key] for key in ('input_ids', 'attention_mask')}
    label = tf.cast(target, tf.float32)
    return features, label

def encode_bert(train_enc, target):
    """Turn one tokenized example into a ``(features, label)`` pair.

    Same as ``encode`` but the feature dict additionally carries
    ``token_type_ids``; ``target`` is cast to ``tf.float32``.
    """
    wanted_keys = ('input_ids', 'attention_mask', 'token_type_ids')
    features = {key: train_enc[key] for key in wanted_keys}
    label = tf.cast(target, tf.float32)
    return features, label
 
def tensors(X1, Y1, X2, Y2, batch_size, enc):
    """Build the batched training and validation ``tf.data`` pipelines.

    Args:
        X1, X2: tokenizer outputs for the training / validation texts; each is
            converted with ``dict()`` before slicing.
        Y1, Y2: matching targets.
        batch_size: batch size used for both pipelines.
        enc: per-example mapping function applied to every element.

    Returns:
        ``(train_tensor, valid_tensor)``. The training pipeline repeats
        indefinitely and shuffles with a buffer of 1024 before mapping and
        batching; the validation pipeline is only mapped and batched.
    """
    autotune = tf.data.AUTOTUNE

    fit_ds = tf.data.Dataset.from_tensor_slices((dict(X1), list(Y1)))
    held_out_ds = tf.data.Dataset.from_tensor_slices((dict(X2), list(Y2)))

    fit_ds = fit_ds.repeat()
    fit_ds = fit_ds.shuffle(1024)
    fit_ds = fit_ds.map(enc, num_parallel_calls=autotune)
    fit_ds = fit_ds.batch(batch_size)
    fit_ds = fit_ds.prefetch(autotune)

    held_out_ds = held_out_ds.map(enc, num_parallel_calls=autotune)
    held_out_ds = held_out_ds.batch(batch_size)
    held_out_ds = held_out_ds.prefetch(autotune)

    return fit_ds, held_out_ds

In [None]:
def build_model(path,optimizer,token_ids):
    """Build and compile a transformer regression model under ``strategy``.

    The network feeds the inputs to the pretrained model loaded from ``path``,
    takes index 0 along the second axis of that model's first output
    (presumably the first-token hidden state) and maps it to a single value
    with a linear ``Dense(1)`` layer.

    Args:
        path: location passed to ``TFAutoModel.from_pretrained``.
        optimizer: optimizer handed to ``compile``.
        token_ids: when truthy, the model takes a third ``token_type_ids``
            input in addition to ``input_ids`` and ``attention_mask``
            (used for the BERT model only).

    Returns:
        A compiled ``tf.keras.Model`` with mean-squared-error loss and a
        root-mean-squared-error metric.
    """
    # Fixed input length; the tokenizer calls in this notebook pad/truncate
    # to the same value.
    seq_len = 256

    with strategy.scope():

        inputs = [
            tf.keras.Input(shape=(seq_len,),name='input_ids',dtype=tf.int32),
            tf.keras.Input(shape=(seq_len,),name='attention_mask',dtype=tf.int32),
        ]
        if token_ids: # I have used token_type_ids for bert model only
            inputs.append(tf.keras.Input(shape=(seq_len,),name='token_type_ids',dtype=tf.int32))

        backbone = TFAutoModel.from_pretrained(path)
        # Pass a copy so the list later given to tf.keras.Model is untouched.
        hidden_states = backbone(list(inputs))[0]
        out = hidden_states[:,0,:]
        out = tf.keras.layers.Dense(1,activation='linear')(out)

        Model = tf.keras.Model(inputs = inputs,outputs = out)
        Model.compile(optimizer = optimizer,loss = tf.keras.losses.MeanSquaredError(),metrics = [tf.keras.metrics.RootMeanSquaredError()])
        return Model

In [None]:
def _tokenize(tok, texts):
    """Strip each text and tokenize it, padded/truncated to 256 tokens."""
    return tok([text.strip() for text in texts],padding='max_length',max_length=256,truncation=True,return_attention_mask=True)


def _inference_dataset(encodings, targets, batch_size, enc):
    """Build an unshuffled, batched tf.data pipeline over (encodings, targets)."""
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings),list(targets)))
    return dataset.map(enc,num_parallel_calls=tf.data.AUTOTUNE).batch(batch_size).prefetch(tf.data.AUTOTUNE)


def prediction(model_path,weights_path,batch_size,optimizer,token_ids,enc,tok,n_folds=5):
    """Run fold-wise inference for one architecture and average the test predictions.

    For every fold a fresh model is built, the weights stored at
    ``weights_path + str(fold) + '.h5'`` are loaded, the rows of the global
    ``train`` frame belonging to that fold are predicted (out-of-fold
    predictions) and the global ``test`` frame is predicted. Per-fold RMSE and
    the overall out-of-fold RMSE are printed.

    Args:
        model_path: pretrained model location forwarded to ``build_model``.
        weights_path: prefix of the per-fold weight files.
        batch_size: batch size of the inference pipelines.
        optimizer: optimizer forwarded to ``build_model`` (needed to compile).
        token_ids: forwarded to ``build_model``.
        enc: per-example mapping function for the tf.data pipelines.
        tok: tokenizer callable applied to the stripped excerpts.
        n_folds: number of folds / weight files to average over (default 5).

    Returns:
        Array of shape ``(len(test), 1)`` holding the mean of the per-fold
        test predictions.
    """
    train_preds = np.zeros((train.shape[0],1))
    test_pred = np.zeros((test.shape[0],1))
    # The test set has no targets; zeros are used as placeholders so the same
    # (features, target) mapping function can be applied.
    test_input = _inference_dataset(_tokenize(tok,test.excerpt.values),np.zeros((test.shape[0],1)),batch_size,enc)

    for fold in range(n_folds):

        tf.keras.backend.clear_session()
        model = build_model(model_path,optimizer,token_ids)
        model.load_weights(weights_path+str(fold)+'.h5')

        # Only the held-out part is needed for inference, so the training
        # split is neither tokenized nor turned into a dataset.
        _, X2, _, Y2 = split(fold)
        valid_input = _inference_dataset(_tokenize(tok,X2),Y2,batch_size,enc)

        # Positional row numbers of this fold, so the assignment is correct
        # whatever index labels ``train`` carries.
        fold_rows = np.flatnonzero((train.fold==fold).to_numpy())
        train_preds[fold_rows] = model.predict(valid_input)
        _, rmse = model.evaluate(valid_input)
        test_pred += model.predict(test_input)/n_folds
        print('fold ',fold,' ',rmse)

    print('oof_rmse ', np.sqrt(mean_squared_error(train.target,train_preds)))

    return test_pred

In [None]:
# Fold-averaged test predictions of each architecture. ``prediction`` returns
# an (n_test, 1) array, so ``[:,0]`` keeps its only column as a 1-D vector.

# RoBERTa base
roberta_optimizer = AdamWeightDecay(learning_rate=3e-5,weight_decay_rate=0.1,epsilon=1e-6,beta_1=0.9,beta_2=0.98)
roberta_tokenizer = AutoTokenizer.from_pretrained('../input/robertabasetokenizer')
roberta_preds = prediction(
    '../input/robertabasemodel',
    '../input/robertabasemodels/modelrobertabase_',
    8,
    roberta_optimizer,
    False,
    encode,
    roberta_tokenizer,
)[:,0]

# BERT base (cased) -- the only model that also receives token_type_ids
bert_dir = '../input/huggingface-bert-variants/bert-base-cased/bert-base-cased'
bert_optimizer = AdamWeightDecay(learning_rate=3e-5,weight_decay_rate=0.1,epsilon=1e-6,beta_1=0.9,beta_2=0.999)
bert_tokenizer = AutoTokenizer.from_pretrained(bert_dir)
bert_preds = prediction(
    bert_dir,
    '../input/bertbase/modelbertbase_',
    8,
    bert_optimizer,
    True,
    encode_bert,
    bert_tokenizer,
)[:,0]

# DistilBERT base (uncased)
distilbert_dir = '../input/huggingface-bert-variants/distilbert-base-uncased/distilbert-base-uncased'
distilbert_optimizer = tf.keras.optimizers.Adam(3e-5)
distilbert_tokenizer = AutoTokenizer.from_pretrained(distilbert_dir)
distilbert_preds = prediction(
    distilbert_dir,
    '../input/distlbert-models/modeldistilbertbase_',
    24,
    distilbert_optimizer,
    False,
    encode,
    distilbert_tokenizer,
)[:,0]

# XLNet base
xlnet_dir = '../input/xlnet-base-tensorflow-20'
xlnet_optimizer = tf.keras.optimizers.Adam(3e-5)
xlnet_tokenizer = AutoTokenizer.from_pretrained(xlnet_dir)
xlnet_preds = prediction(
    xlnet_dir,
    '../input/xlnet-base/modelxlnetbase_',
    12,
    xlnet_optimizer,
    False,
    encode,
    xlnet_tokenizer,
)[:,0]

In [None]:
# Simple ensemble: unweighted mean of the four models' test predictions.
prediction_sum = roberta_preds + bert_preds + distilbert_preds + xlnet_preds
final_preds = prediction_sum / 4

# Write the submission file: one row per test id with its ensembled target.
submission = pd.DataFrame({'id': test.id, 'target': final_preds})
submission.to_csv('submission.csv', index=False)

Do upvote if you like it; any suggestions or questions are welcome in the comment section.
* Happy Kaggling!!