In [None]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import torch
import copy
import math
import gc
from tqdm import tqdm
import torch.utils.data as D
import random
import os
from transformers import AutoModelWithLMHead, AutoTokenizer,RobertaConfig, RobertaModel,AutoModelForSequenceClassification,AutoModelForMaskedLM
from torch.nn.utils.rnn import pad_sequence
from torch import nn
from torch import optim
import time
import torch.nn.functional as F
from transformers import (
    AutoModel,
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AdamW,
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    SchedulerType,
    get_scheduler,
    set_seed,
)
from sklearn.linear_model import Ridge
import time

# parameters for this notebook

In [None]:
class CFG:
    """Notebook-wide default settings, kept as class attributes.

    ``inference_wrapper`` overwrites some of these per model via ``setattr``
    (and replaces ``pad_token_id`` with the tokenizer's value).
    """
    seed = 12345
    # Averaging counts — presumably the ``avg_n`` passed to score_test; confirm at call sites.
    test_avg_n = 1
    val_avg_n = 5
    max_len = 256
    batch_size = 24
    dropout_p = 0.1
    folds = 5
    cv_shuffle = False
    pad_token_id = 1
    # Use the first GPU when present; fall back to CPU so `.to(CFG.device)` and
    # `map_location=CFG.device` still work on hosts without CUDA.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    dtype = torch.float32

In [None]:
# Competition CSVs read from the Kaggle input directory.
train_df=pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')
test_df=pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')
# Sample submission file, loaded as res_df.
res_df=pd.read_csv('/kaggle/input/commonlitreadabilityprize/sample_submission.csv')

In [None]:
class RobertaDataset(D.Dataset):
    """Map-style dataset pairing tokenizer encodings with one target per row.

    Args:
        token: indexable collection; each item must expose ``input_ids`` and
            ``attention_mask`` attributes.
        target: indexable collection of per-row targets, aligned with ``token``.

    NOTE(review): if ``token``/``target`` are pandas Series, ``[idx]`` is
    label-based, so this assumes a 0..n-1 index — confirm at call sites.
    """

    def __init__(self, token, target):
        self.token = token
        self.target = target

    def __len__(self):
        # len() rather than .shape[0]: works for Series/arrays and also for the
        # plain list that tokenize() returns.
        return len(self.token)

    def __getitem__(self, idx):
        """Return ``(input_ids tensor, attention_mask tensor, target[idx])``."""
        encoding = self.token[idx]
        return (torch.tensor(encoding.input_ids),
                torch.tensor(encoding.attention_mask),
                self.target[idx])
    
def collate(batch):
    """Collate ``(input_ids, attention_mask, target)`` samples into padded batch tensors.

    Args:
        batch: iterable of 3-tuples as produced by ``RobertaDataset.__getitem__``.

    Returns:
        ``(ids, attns, targets)`` moved to ``CFG.device``; ``targets`` is float.
    """
    ids, attns, targets = zip(*batch)
    ids = pad_sequence(ids, batch_first=True, padding_value=CFG.pad_token_id).to(CFG.device)
    # Padded positions must be masked out (0). Padding the mask with
    # pad_token_id (1 for RoBERTa) would mark padding as real tokens.
    attns = pad_sequence(attns, batch_first=True, padding_value=0).to(CFG.device)
    targets = torch.tensor(targets).float().to(CFG.device)
    return ids, attns, targets

def CV_split(m,k=5,shuffle=False,seed=7):
    """Split ``m`` sample positions into ``k`` contiguous folds.

    Each fold covers ``ceil(m/k)`` consecutive slots, so trailing folds may be
    shorter or empty. With ``shuffle`` the index order is permuted first, which
    reseeds NumPy's global RNG with ``seed``.

    Returns:
        list of ``(train_index, val_index)`` array pairs, one per fold.
    """
    order = np.arange(m)
    if shuffle:
        np.random.seed(seed)
        np.random.shuffle(order)
    fold_len = math.ceil(m / k)
    folds = []
    for fold in range(k):
        # Boolean mask selecting this fold's slots.
        in_val = np.zeros(m, dtype=bool)
        in_val[fold_len * fold:fold_len * (fold + 1)] = True
        folds.append((order[~in_val], order[in_val]))
    return folds

def rmse(y1,y2):
    """Root-mean-square error between two array-likes that support ``-`` and ``.mean()``."""
    squared_error = (y1 - y2) ** 2
    return np.sqrt(squared_error.mean())

def score_test(model,test_ldr,mode='train',avg_n=1):
    """Predict over ``test_ldr`` ``avg_n`` times and return the per-row mean.

    Args:
        model: callable as ``model(ids, attention_mask)``.
        test_ldr: iterable yielding ``(ids, attention_mask, _)`` batches.
        mode: ``'train'`` calls ``model.train()``, ``'eval'`` calls
            ``model.eval()``; any other value leaves the model's mode untouched.
        avg_n: number of full passes to average.

    Returns:
        1-D NumPy array with one averaged prediction per row.
    """
    if mode=='train':
        model.train()
    elif mode=='eval':
        model.eval()
    passes=pd.DataFrame()
    for pass_no in range(1,avg_n+1):
        batch_outputs=[]
        for input_ids, mask, _ in test_ldr:
            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                batch_outputs.append(model(input_ids,mask))
        stacked=torch.cat(batch_outputs,axis=0)
        # One column per pass, flattened to a vector.
        passes[f'pred{pass_no}']=stacked.to('cpu').numpy().reshape(-1)
    return passes.values.mean(axis=1)

def tokenize(tokenizer,texts):
    """Encode each text separately, truncated and padded to ``CFG.max_len``.

    Returns:
        list with one ``tokenizer(...)`` result per input text, in order.
    """
    return [
        tokenizer(passage, max_length=CFG.max_len, truncation=True,
                  padding='max_length', add_special_tokens=True)
        for passage in texts
    ]

# Models

* remember that different models have different mask token ids
* use different seed in dataloader for diversity

* F1
* CV 0.477 LB 0.469
* roberta base train with ITPT weights


In [None]:
class MyModel1(nn.Module):
    """F1 head: mean-pool the last hidden state, reuse the wrapped model's
    ``lm_head.dense`` projection, then map to one output per row."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.output_layer = nn.Linear(768, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        projected = self.model.lm_head.dense(pooled)
        return self.output_layer(projected)


# F1 settings; keys match attribute names on CFG.
F1_CFG = {'max_len': 256, 'seed': 12345, 'batch_size': 24, 'dropout_p': 0.1}

* F2
* CV 0.4839 LB 0.470
* roberta large
* move state dict to CPU



In [None]:
class MyModel2(nn.Module):
    """F2 head (hidden size 1024): mean-pool the last hidden state, apply the
    wrapped model's ``lm_head.dense``, then a linear layer to one output."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.output_layer = nn.Linear(1024, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        projected = self.model.lm_head.dense(pooled)
        return self.output_layer(projected)


# F2 settings; keys match attribute names on CFG.
F2_CFG = {'max_len': 256, 'seed': 7, 'batch_size': 8, 'dropout_p': 0.1}

* F3
* CV 0.487 LB 0.486
* SimCSE roberta base

In [None]:
class MyModel3(nn.Module):
    """F3 head: mean-pool the last hidden state, apply the wrapped model's
    ``lm_head.dense``, then a linear layer to one output per row."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.output_layer = nn.Linear(768, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        projected = self.model.lm_head.dense(pooled)
        return self.output_layer(projected)


# F3 settings; keys match attribute names on CFG.
F3_CFG = {'max_len': 256, 'seed': 33, 'batch_size': 24, 'dropout_p': 0.1}

* F4

* F5
* CV 0.4773 LB 0.474
* roberta base train with ITPT weights
* ITPT model from: Maunish's pre-trained model

In [None]:
class MyModel5(nn.Module):
    """F5 head: mean-pool the last hidden state, apply the wrapped model's
    ``lm_head.dense``, then a linear layer to one output per row."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.output_layer = nn.Linear(768, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        projected = self.model.lm_head.dense(pooled)
        return self.output_layer(projected)


# F5 settings; keys match attribute names on CFG.
F5_CFG = {'max_len': 256, 'seed': 12345, 'batch_size': 24, 'dropout_p': 0.1}

* F6
* CV 0.4855 LB 0.471
* almost no diversity: same seed, same model, and same pretraining

In [None]:
class MyModel6(nn.Module):
    """F6 head: mean-pool the last hidden state, apply the wrapped model's
    ``lm_head.dense``, then a linear layer to one output per row."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.output_layer = nn.Linear(768, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        projected = self.model.lm_head.dense(pooled)
        return self.output_layer(projected)


# F6 settings; keys match attribute names on CFG.
F6_CFG = {'max_len': 256, 'seed': 12345, 'batch_size': 24, 'dropout_p': 0.1}

* F7
* CV 0.4781 LB 0.473

In [None]:
class MyModel7(nn.Module):
    """F7 attention-pooling head.

    A Linear-Tanh-Linear scorer followed by a softmax over dim 1 yields one
    weight per position; the weighted sum of the last hidden state feeds a
    linear regressor with a single output.
    """

    def __init__(self,model):
        super().__init__()
        # Attribute is named `roberta` (not `model`); saved state-dict keys depend on it.
        self.roberta = model
        scorer_layers = [
            nn.Linear(768, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1),
        ]
        self.attention = nn.Sequential(*scorer_layers)
        self.regressor = nn.Sequential(nn.Linear(768, 1))

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        hidden = outputs.hidden_states[-1]
        position_weights = self.attention(hidden)
        pooled = (position_weights * hidden).sum(dim=1)
        return self.regressor(pooled)


# F7 settings; keys match attribute names on CFG.
F7_CFG = {'max_len': 256, 'seed': 666, 'batch_size': 24, 'dropout_p': 0.1}

* F8
* CV 0.4815  LB 0.472

In [None]:
class MyModel8(nn.Module):
    """F8 head: mean-pool the last hidden state and map it to one output with a single linear layer."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.output_layer = nn.Linear(768, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.output_layer(pooled)


# F8 settings; keys match attribute names on CFG.
F8_CFG = {'max_len': 256, 'seed': 54321, 'batch_size': 24, 'dropout_p': 0.1}

* F9
* CV 0.4799 LB 0.475
* roberta base
* one linear output layer
* 2 layer re-init

In [None]:
class MyModel9(nn.Module):
    """F9 head: mean-pool the last hidden state and map it to one output with a single linear layer."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.output_layer = nn.Linear(768, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.output_layer(pooled)


# F9 settings; keys match attribute names on CFG.
F9_CFG = {'max_len': 256, 'seed': 54321, 'batch_size': 24, 'dropout_p': 0.1}

* F10
* CV 0.479 LB 0.479

* roberta base
* two linear output layer
* 2 layer re-init
* seed 777 to compare with F1

In [None]:
class MyModel10(nn.Module):
    """F10 head: mean-pool the last hidden state, then two stacked linear
    layers (768 -> 768 -> 1) with no activation in between."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(768, 768)
        self.linear2 = nn.Linear(768, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear2(self.linear1(pooled))


# F10 settings; keys match attribute names on CFG.
F10_CFG = {'max_len': 256, 'seed': 12345, 'batch_size': 24, 'dropout_p': 0.1}

* F11
* CV 0.4811 LB 0.466
* roberta large
* two linear output layer
* 5 layer re init

In [None]:
class MyModel11(nn.Module):
    """F11 head: mean-pool the last hidden state, then two stacked linear
    layers (1024 -> 1024 -> 1) with no activation in between."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(1024, 1024)
        self.linear2 = nn.Linear(1024, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear2(self.linear1(pooled))


# F11 settings; keys match attribute names on CFG.
F11_CFG = {'max_len': 256, 'seed': 7, 'batch_size': 8, 'dropout_p': 0.1}

* F12
* CV 0.4773 LB 0.473
* roberta base
* seed 777 to compare with F1
* 10% warm up

In [None]:
class MyModel12(nn.Module):
    """F12 head: mean-pool the last hidden state, apply the wrapped model's
    ``lm_head.dense``, then a linear layer to one output per row."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.output_layer = nn.Linear(768, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        projected = self.model.lm_head.dense(pooled)
        return self.output_layer(projected)


# F12 settings; keys match attribute names on CFG.
F12_CFG = {'max_len': 256, 'seed': 12345, 'batch_size': 24, 'dropout_p': 0.1}

* F13
* CV 0.4816 LB 0.484
* roberta base 
* two linear output layer
* 3 layer re-init
* seed 777 to compare with F1
* lr diff don't apply to re-init layers
* it seems roberta base (not large) with re-init does poorly on the LB test fold...

In [None]:
class MyModel13(nn.Module):
    """F13 head: mean-pool the last hidden state, then two stacked linear
    layers (768 -> 768 -> 1) with no activation in between."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(768, 768)
        self.linear2 = nn.Linear(768, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear2(self.linear1(pooled))


# F13 settings; keys match attribute names on CFG.
F13_CFG = {'max_len': 256, 'seed': 12345, 'batch_size': 24, 'dropout_p': 0.1}

* F14
* CV 0.4778 LB 0.468
* roberta base 
* seed 777 to compare with F1
* 10% warm up
* batch size 32 and lr 8e-5

In [None]:
class MyModel14(nn.Module):
    """F14 head: mean-pool the last hidden state, apply the wrapped model's
    ``lm_head.dense``, then a linear layer to one output per row."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.output_layer = nn.Linear(768, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        projected = self.model.lm_head.dense(pooled)
        return self.output_layer(projected)


# F14 settings; keys match attribute names on CFG.
F14_CFG = {'max_len': 256, 'seed': 12345, 'batch_size': 24, 'dropout_p': 0.1}

* F15
* CV 0.4839 LB
* roberta large
* two linear output layer
* 5 layer re init
* lr with warm up

In [None]:
class MyModel15(nn.Module):
    """F15 head: mean-pool the last hidden state, then two stacked linear
    layers (1024 -> 1024 -> 1) with no activation in between."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(1024, 1024)
        self.linear2 = nn.Linear(1024, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear2(self.linear1(pooled))


# F15 settings; keys match attribute names on CFG.
F15_CFG = {'max_len': 256, 'seed': 7, 'batch_size': 8, 'dropout_p': 0.1}

* F16
* CV 0.4824 LB 0.468
* roberta large
* two linear output layer
* 5 layer re init
* 6 epochs (=large lr in 5epochs)

In [None]:
class MyModel16(nn.Module):
    """F16 head: mean-pool the last hidden state, then two stacked linear
    layers (1024 -> 1024 -> 1) with no activation in between."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(1024, 1024)
        self.linear2 = nn.Linear(1024, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear2(self.linear1(pooled))


# F16 settings; keys match attribute names on CFG.
F16_CFG = {'max_len': 256, 'seed': 7, 'batch_size': 8, 'dropout_p': 0.1}

* F17
* CV 0.47581798 LB 0.468
* roberta base train with ITPT weights
* bs 32 
* fix oom
* fix possible token id change
* add reinit code

In [None]:
class MyModel17(nn.Module):
    """F17 head: mean-pool the last hidden state, apply the wrapped model's
    ``lm_head.dense``, then a linear layer to one output per row."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.output_layer = nn.Linear(768, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        projected = self.model.lm_head.dense(pooled)
        return self.output_layer(projected)


# F17 settings; keys match attribute names on CFG.
F17_CFG = {'max_len': 256, 'seed': 12345, 'batch_size': 24, 'dropout_p': 0.1}

* F18
* CV 0.4874678 LB 0.488
* Simcse base 
* seed 754
* with attention head

In [None]:
class MyModel18(nn.Module):
    """F18 attention-pooling head.

    A Linear-Tanh-Linear scorer followed by a softmax over dim 1 yields one
    weight per position; the weighted sum of the last hidden state feeds a
    linear regressor with a single output.
    """

    def __init__(self,model):
        super().__init__()
        self.model = model
        scorer_layers = [
            nn.Linear(768, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1),
        ]
        self.attention = nn.Sequential(*scorer_layers)
        self.regressor = nn.Sequential(nn.Linear(768, 1))

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        hidden = outputs.hidden_states[-1]
        position_weights = self.attention(hidden)
        pooled = (position_weights * hidden).sum(dim=1)
        return self.regressor(pooled)


# F18 settings; keys match attribute names on CFG.
F18_CFG = {'max_len': 256, 'seed': 755, 'batch_size': 24, 'dropout_p': 0.1}

* F19
* CV 0.4845 LB 0.483
* Simcse base 
* seed 754
* 2 layer re init

In [None]:
class MyModel19(nn.Module):
    """F19 head: mean-pool the last hidden state, apply the wrapped model's
    ``lm_head.dense``, then a linear layer to one output per row."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.output_layer = nn.Linear(768, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        projected = self.model.lm_head.dense(pooled)
        return self.output_layer(projected)


# F19 settings; keys match attribute names on CFG.
F19_CFG = {'max_len': 256, 'seed': 755, 'batch_size': 24, 'dropout_p': 0.1}

* F20
* CV 0.4796 LB 0.464
* Simcse large
* seed 754
* two linear output layer
* 5 layer re init

In [None]:
class MyModel20(nn.Module):
    """F20 head (hidden size 1024): mean-pool the last hidden state, apply the
    wrapped model's ``lm_head.dense``, then a linear layer to one output."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.output_layer = nn.Linear(1024, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        projected = self.model.lm_head.dense(pooled)
        return self.output_layer(projected)


# F20 settings; keys match attribute names on CFG.
F20_CFG = {'max_len': 256, 'seed': 755, 'batch_size': 8, 'dropout_p': 0.1}

* F22
* CV LB
* roberta large
* gradient accumulation
* batch size 16 lr 4e-5

In [None]:
class MyModel22(nn.Module):
    """F22 head: mean-pool the last hidden state, then two stacked linear
    layers (1024 -> 1024 -> 1) with no activation in between."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(1024, 1024)
        self.linear2 = nn.Linear(1024, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear2(self.linear1(pooled))


# F22 settings; keys match attribute names on CFG.
F22_CFG = {'max_len': 256, 'seed': 7, 'batch_size': 8, 'dropout_p': 0.1}

* F23
* CV 0.4792 LB 0.465
* roberta large
* gradient accumulation
* batch size 16 lr 2.5e-5

In [None]:
class MyModel23(nn.Module):
    """F23 head: mean-pool the last hidden state, then two stacked linear
    layers (1024 -> 1024 -> 1) with no activation in between."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(1024, 1024)
        self.linear2 = nn.Linear(1024, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear2(self.linear1(pooled))


# F23 settings; keys match attribute names on CFG.
F23_CFG = {'max_len': 256, 'seed': 7, 'batch_size': 8, 'dropout_p': 0.1}

* F24
* CV 0.4815 LB 0.488
* deberta base 
* seed 5311
* bs 24

In [None]:
class MyModel24(nn.Module):
    """F24 head: mean-pool the last hidden state, then two stacked linear
    layers (768 -> 768 -> 1) with no activation in between."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(768, 768)
        self.linear2 = nn.Linear(768, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear2(self.linear1(pooled))


# F24 settings; keys match attribute names on CFG.
F24_CFG = {'max_len': 256, 'seed': 721, 'batch_size': 24, 'dropout_p': 0.1}

* F25
* CV 0.4831 LB 0.488
* deberta base 
* seed 5311
* bs 24
* re init 2 layers

In [None]:
class MyModel25(nn.Module):
    """F25 head: mean-pool the last hidden state, then two stacked linear
    layers (768 -> 768 -> 1) with no activation in between."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(768, 768)
        self.linear2 = nn.Linear(768, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear2(self.linear1(pooled))


# F25 settings; keys match attribute names on CFG.
F25_CFG = {'max_len': 256, 'seed': 721, 'batch_size': 24, 'dropout_p': 0.1}

* F26
* CV 0.4757 LB 0.465
* deberta large
* seed 5311
* re init 5 layers
* bs 4
* grad accum 4 lr 2.5

In [None]:
class MyModel26(nn.Module):
    """F26 head: mean-pool the last hidden state, then two stacked linear
    layers (1024 -> 1024 -> 1) with no activation in between."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(1024, 1024)
        self.linear2 = nn.Linear(1024, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear2(self.linear1(pooled))


# F26 settings; keys match attribute names on CFG.
F26_CFG = {'max_len': 256, 'seed': 721, 'batch_size': 8, 'dropout_p': 0.1}

* F27
* CV 0.486 LB 0.464
* simcse large
* seed 754
* re init 5 layers
* grad accum 2 lr 2.5

In [None]:
class MyModel27(nn.Module):
    """F27 head: mean-pool the last hidden state, then two stacked linear
    layers (1024 -> 1024 -> 1) with no activation in between."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(1024, 1024)
        self.linear2 = nn.Linear(1024, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear2(self.linear1(pooled))


# F27 settings; keys match attribute names on CFG.
F27_CFG = {'max_len': 256, 'seed': 755, 'batch_size': 8, 'dropout_p': 0.1}

* F28
* CV LB

* F29
* CV 0.50150967 LB
* bert base cased
* seed 2334
* bs 32 lr 8

In [None]:
class MyModel29(nn.Module):
    """F29 head: mean-pool the last hidden state, then two stacked linear
    layers (768 -> 768 -> 1) with no activation in between."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(768, 768)
        self.linear2 = nn.Linear(768, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear2(self.linear1(pooled))


# F29 settings; keys match attribute names on CFG.
F29_CFG = {'max_len': 256, 'seed': 2, 'batch_size': 24, 'dropout_p': 0.1}

* F30
* CV 0.47575  LB 0.462
* deberta large
* seed 5311
* re init 5 layers
* bs 4
* grad accum 2 lr 2

In [None]:
class MyModel30(nn.Module):
    """F30 head: mean-pool the last hidden state, then two stacked linear
    layers (1024 -> 1024 -> 1) with no activation in between."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(1024, 1024)
        self.linear2 = nn.Linear(1024, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear2(self.linear1(pooled))


# F30 settings; keys match attribute names on CFG.
F30_CFG = {'max_len': 256, 'seed': 721, 'batch_size': 8, 'dropout_p': 0.1}

* F31
* CV LB
* bert large cased
* seed 2334
* bs 32 lr 8

In [None]:
class MyModel31(nn.Module):
    """F31 head: mean-pool the last hidden state and map it to one output with a single linear layer (1024 -> 1)."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(1024, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear1(pooled)


# F31 settings; keys match attribute names on CFG.
F31_CFG = {'max_len': 256, 'seed': 7212, 'batch_size': 8, 'dropout_p': 0.1}

* F32
* CV 0.4749 LB 0.469
* deberta large
* seed 12222
* no re init
* bs 4
* grad accum 2 lr 2
* 1 linear layer

In [None]:
class MyModel32(nn.Module):
    """F32 head: mean-pool the last hidden state and map it to one output with a single linear layer (1024 -> 1)."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(1024, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear1(pooled)


# F32 settings; keys match attribute names on CFG.
F32_CFG = {'max_len': 256, 'seed': 20134, 'batch_size': 8, 'dropout_p': 0.1}

* F33
* CV 0.4835 LB 0.464
* roberta large with ITPT
* seed 555666
* re init 5 layers
* 1 linear layer

In [None]:
class MyModel33(nn.Module):
    """F33 head: mean-pool the last hidden state and map it to one output with a single linear layer (1024 -> 1)."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(1024, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear1(pooled)


# F33 settings; keys match attribute names on CFG.
F33_CFG = {'max_len': 256, 'seed': 20134, 'batch_size': 8, 'dropout_p': 0.1}

* F34
* CV LB
* bart base
* seed 9999
* one linear layer


In [None]:
class MyModel34(nn.Module):
    """F34 head: mean-pool the wrapped model's ``last_hidden_state`` output and
    map it to one output with a single linear layer (768 -> 1)."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(768, 1)

    def forward(self, text, attention_mask):
        # Reads the 'last_hidden_state' entry directly (not 'hidden_states').
        final_layer = self.model(text, attention_mask)['last_hidden_state']
        pooled = final_layer.mean(dim=1)
        return self.linear1(pooled)


# F34 settings; keys match attribute names on CFG.
F34_CFG = {'max_len': 256, 'seed': 201, 'batch_size': 24, 'dropout_p': 0.1}

* F35
* CV 0.4986 LB
* electra base
* seed 75412
* 1 linear layer

In [None]:
class MyModel35(nn.Module):
    """F35 head: mean-pool the last hidden state and map it to one output with a single linear layer (768 -> 1)."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(768, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear1(pooled)


# F35 settings; keys match attribute names on CFG.
F35_CFG = {'max_len': 256, 'seed': 75412, 'batch_size': 24, 'dropout_p': 0.1}

* F36
* CV 0.4931 LB 0.480
* electra large
* seed 75412
* 1 linear layer
* re init 5 layers

In [None]:
class MyModel36(nn.Module):
    """F36 head: mean-pool the last hidden state and map it to one output with a single linear layer (1024 -> 1)."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(1024, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear1(pooled)


# F36 settings; keys match attribute names on CFG.
F36_CFG = {'max_len': 256, 'seed': 75412, 'batch_size': 8, 'dropout_p': 0.1}

* F37
* CV LB
* xlm-roberta base
* seed 900
* 1 linear layer

In [None]:
class MyModel37(nn.Module):
    """F37 head: mean-pool the last hidden state and map it to one output with a single linear layer (768 -> 1)."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(768, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear1(pooled)


# F37 settings; keys match attribute names on CFG.
F37_CFG = {'max_len': 256, 'seed': 900, 'batch_size': 24, 'dropout_p': 0.1}

* F38
* CV 0.4782 LB 0.477
* roberta base
* seed 900
* 1 linear layer
* len 300
* bs 24

In [None]:
class MyModel38(nn.Module):
    """F38 head: mean-pool the last hidden state and map it to one output with a single linear layer (768 -> 1)."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(768, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear1(pooled)


# F38 settings; keys match attribute names on CFG. Note max_len is 300 here.
F38_CFG = {'max_len': 300, 'seed': 900, 'batch_size': 24, 'dropout_p': 0.1}

* F39
* CV LB
* xlnet-base
* seed 900
* 1 linear layer


* F40
* CV 0.4836 LB 0.463
* Simcse large unsup
* seed 754
* re init 5 layers
* bs 8 lr 2e-5
* 1 linear layer

In [None]:
class MyModel40(nn.Module):
    """F40 head: mean-pool the last hidden state and map it to one output with a single linear layer (1024 -> 1)."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(1024, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear1(pooled)


# F40 settings; keys match attribute names on CFG.
F40_CFG = {'max_len': 256, 'seed': 901, 'batch_size': 8, 'dropout_p': 0.1}

* F41
* CV 0.4796 LB 0.475
* deberta-large-mnli
* seed 5311
* re init 5 layers
* bs 4
* grad accum 2 lr 2
* 1 linear layer

In [None]:
class MyModel41(nn.Module):
    """F41 head: mean-pool the last hidden state and map it to one output with a single linear layer (1024 -> 1)."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(1024, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear1(pooled)


# F41 settings; keys match attribute names on CFG.
F41_CFG = {'max_len': 256, 'seed': 111, 'batch_size': 8, 'dropout_p': 0.1}

* F42
* CV 0.4883 LB 
* roberta large roberta-large-mnli
* seed 5551
* re init 5 layers
* bs 8 lr 2e-5
* 1 linear layer

In [None]:
class MyModel42(nn.Module):
    """F42 head: mean-pool the last hidden state and map it to one output with a single linear layer (1024 -> 1)."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(1024, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear1(pooled)


# F42 settings; keys match attribute names on CFG.
F42_CFG = {'max_len': 256, 'seed': 1112, 'batch_size': 8, 'dropout_p': 0.1}

* F43
* CV 0.4970 LB 
* roberta-large-openai-detector
* seed 5551
* re init 5 layers
* bs 8 lr 2e-5
* 1 linear layer

In [None]:
class MyModel43(nn.Module):
    """F43 head: mean-pool the last hidden state and map it to one output with a single linear layer (1024 -> 1)."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(1024, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear1(pooled)


# F43 settings; keys match attribute names on CFG.
F43_CFG = {'max_len': 256, 'seed': 1112, 'batch_size': 8, 'dropout_p': 0.1}

* F45
* roberta large -5 layer output

In [None]:
class MyModel45(nn.Module):
    """F45 head: mean-pool the fifth-from-last entry of ``hidden_states`` and
    map it to one output with a single linear layer (1024 -> 1)."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(1024, 1)

    def forward(self, text, attention_mask):
        # Deliberately index -5, not -1: this variant reads an earlier hidden state.
        chosen_layer = self.model(text, attention_mask)['hidden_states'][-5]
        pooled = chosen_layer.mean(dim=1)
        return self.linear1(pooled)


# F45 settings; keys match attribute names on CFG.
F45_CFG = {'max_len': 256, 'seed': 7, 'batch_size': 8, 'dropout_p': 0.1}

* F50
* funnel large
* bs 16
* lr 3e-5

In [None]:
class MyModel50(nn.Module):
    """F50 head: mean-pool the last hidden state and map it to one output with a single linear layer (1024 -> 1)."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear1 = nn.Linear(1024, 1)

    def forward(self, text, attention_mask):
        # attention_mask is only forwarded to the wrapped model; pooling averages every position.
        final_layer = self.model(text, attention_mask)['hidden_states'][-1]
        pooled = final_layer.mean(dim=1)
        return self.linear1(pooled)


# F50 settings; keys match attribute names on CFG.
F50_CFG = {'max_len': 256, 'seed': 711223, 'batch_size': 8, 'dropout_p': 0.1}

* F51
* electra large
* bs 16
* lr 3.5e-5
* no reinit

In [None]:
class MyModel51(nn.Module):
    """Single-linear-layer regression head on top of a transformer backbone.

    The last entry of the backbone's ``hidden_states`` is averaged over dim 1
    (presumably the token axis) and projected from 1024 features to 1 output.
    """

    def __init__(self, model):
        super().__init__()
        # Attribute names define the state-dict keys used when fold
        # checkpoints are loaded, so they must stay `model` and `linear1`.
        self.model = model
        self.linear1 = nn.Linear(1024, 1)

    def forward(self, text, attention_mask):
        backbone_out = self.model(text, attention_mask)
        # Plain mean over dim 1: every position contributes equally, the
        # attention mask is only forwarded to the backbone.
        pooled = backbone_out['hidden_states'][-1].mean(dim=1)
        return self.linear1(pooled)
    
# Per-model settings for F51; passed to inference_wrapper as model_CFG.
# NOTE(review): the F51 notes list "bs 16" while batch_size here is 8 --
# presumably the notes describe training; confirm.
F51_CFG = {
    'max_len': 256,
    'seed': 91,
    'batch_size': 8,
    'dropout_p': 0.1,
}

# Inference

In [None]:
def inference_wrapper(model_name,model_class,model_CFG,test_df,train_df):
    """Score one trained model with every fold checkpoint, writing results in place.

    Args:
        model_name: Identifier such as ``'f50'``. It selects the input folder
            ``../input/<model_name>-train`` and names the output columns; the
            integer after the first character decides which backbone loader
            is used (``<= 23`` -> ``AutoModelForMaskedLM``, else ``AutoModel``).
        model_class: Wrapper class instantiated around the loaded backbone.
        model_CFG: Dict of overrides copied onto the global ``CFG``. The
            overrides stay in effect after this call returns.
        test_df: Frame with an ``excerpt`` column, or ``None`` to skip the test
            pass. Gains a ``token`` column and ``test/<model_name>``, the mean
            of the per-fold predictions.
        train_df: Frame with an ``excerpt`` column, or ``None`` to skip the
            validation pass. Gains ``val/<model_name>`` and a ``token`` column;
            rows of each fold's validation split are filled using that fold's
            checkpoint.

    Returns:
        None. Both frames are modified in place.
    """
    # load model cfg
    for key,val in model_CFG.items():
        setattr(CFG,key,val)
    # fix random seed
    random.seed(CFG.seed)
    os.environ['PYTHONHASHSEED'] = str(CFG.seed)
    np.random.seed(CFG.seed)
    torch.manual_seed(CFG.seed)
    torch.cuda.manual_seed(CFG.seed)
    torch.cuda.manual_seed_all(CFG.seed)
    torch.backends.cudnn.deterministic = True
    # load model
    notebook=model_name+'-train'
    path='../input/'+notebook+'/model_init/'
    config = AutoConfig.from_pretrained(path, output_hidden_states=True,attention_probs_dropout_prob=CFG.dropout_p,hidden_dropout_prob=CFG.dropout_p)
    tokenizer = AutoTokenizer.from_pretrained(path,model_max_length=CFG.max_len)
    # collate() pads with CFG.pad_token_id, so it must follow the tokenizer in use
    CFG.pad_token_id=tokenizer.pad_token_id
    if int(model_name[1:])<=23:
        model =AutoModelForMaskedLM.from_pretrained(path,config=config)
    else:
        model =AutoModel.from_pretrained(path,config=config)
    model=model_class(model)
    # load state dicts
    # Keep the fold checkpoints in host memory: mapping all of them to
    # CFG.device would hold CFG.folds full copies of the weights on the GPU.
    # load_state_dict copies the values onto whatever device the model is on.
    state_dicts=[]
    for i in range(CFG.folds):
        checkpoint=torch.load('../input/'+notebook+f'/fold_{i+1}_model',map_location='cpu')
        state_dicts.append(checkpoint['model_state_dict'])

    def _predict_fold(fold,dataloader,avg_n):
        # Swap in one fold's weights and score the given loader.
        model.load_state_dict(state_dicts[fold])
        model.to(CFG.device)
        # mode='train' is forwarded to score_test unchanged -- presumably it
        # keeps dropout active so avg_n passes can be averaged; confirm there.
        return score_test(model,dataloader,mode='train',avg_n=avg_n)

    # test df
    if test_df is not None:
        test_df['token'] = tokenize(tokenizer,test_df.excerpt)
        test_dataset = RobertaDataset(test_df.token, test_df.index)
        # shuffle = False !! (predictions are written back by row order)
        test_dataloader = D.DataLoader(test_dataset, batch_size=CFG.batch_size,
                                     shuffle=False, collate_fn = collate,num_workers=0)
        # float initial value so the column never needs an int -> float upcast
        test_df['test/'+model_name]=0.0
        for i in range(CFG.folds):
            preds=_predict_fold(i,test_dataloader,CFG.test_avg_n)
            test_df['test/'+model_name]+=preds/CFG.folds
    # train
    if train_df is not None:
        # float initial value: float predictions are assigned into slices of
        # this column below, which an integer column would have to upcast for
        train_df['val/'+model_name]=0.0
        train_df['token'] = tokenize(tokenizer,train_df.excerpt)
        train_dataset = RobertaDataset(train_df.token, train_df.index)
        split_indices=CV_split(len(train_df),k=CFG.folds,shuffle=CFG.cv_shuffle,seed=7)
        for i in range(CFG.folds):
            train_index,val_index=split_indices[i]
            val_dataset = D.Subset(train_dataset, val_index)
            val_dataloader = D.DataLoader(val_dataset, batch_size=CFG.batch_size,
                                          shuffle=False, collate_fn = collate,num_workers=0)
            preds=_predict_fold(i,val_dataloader,CFG.val_avg_n)
            # val_index is used as a row label here, which matches positions
            # only while train_df has a default RangeIndex
            train_df.loc[val_index,'val/'+model_name]=preds

In [None]:
# Replace the raw training frame with the one saved by the earlier inference
# notebook, so the columns it already holds are kept alongside the new ones.
train_df = pd.read_csv('../input/f-inference-1/ensemble_train.csv')

In [None]:
# (model name, wrapper class, per-model config) for every model scored here.
inference_tasks = [
    ('f50', MyModel50, F50_CFG),
    ('f51', MyModel51, F51_CFG),
]

# test_df=None: only the validation predictions on train_df are produced.
for model_name, model_class, model_CFG in tqdm(inference_tasks):
    inference_wrapper(
        model_name,
        model_class,
        model_CFG,
        test_df=None,
        train_df=train_df,
    )

# Validation & Ensemble

In [None]:
# Column names that inference_wrapper writes for each model in inference_tasks.
val_cols, test_cols = [], []
for model_name, _, _ in inference_tasks:
    val_cols += [f'val/{model_name}']
    test_cols += [f'test/{model_name}']

In [None]:
# Blend the per-model validation predictions with a ridge regression that is
# itself cross-validated: fit on the training split of each fold, predict the
# held-out split, and average coefficients / intercept across folds.
# Float initial value: float predictions are assigned into slices of this
# column below, which an integer column would have to upcast for.
train_df['val_oof_pred']=0.0
#test_df['final_test_pred']=0
model_weight=0
bias=0
for i,(train_index,val_index) in enumerate(CV_split(len(train_df),k=CFG.folds,shuffle=False,seed=7)):
    # `normalize` was removed from Ridge in scikit-learn 1.2 and raises a
    # TypeError there; False was its default, so omitting it is equivalent.
    model=Ridge(alpha=0.01,fit_intercept=True)
    model.fit(train_df.loc[train_index,val_cols],train_df.loc[train_index,'target'])
    val_preds=model.predict(train_df.loc[val_index,val_cols])
    #test_preds=model.predict(test_df[test_cols])
    train_df.loc[val_index,'val_oof_pred']=val_preds
    #test_df['final_test_pred']+=test_preds/CFG.folds
    model_weight+=model.coef_/CFG.folds
    bias+=model.intercept_/CFG.folds
print('ensemble cv score is:',rmse(train_df['target'],train_df['val_oof_pred']))
print('################################################################')
for i in range(len(val_cols)):
    print(val_cols[i],' weight is:',np.round(model_weight[i],3))
print('ensemble bias is:',bias)
print('################################################################')
print(train_df[val_cols].corr())
# NOTE(review): train_df still carries the 'token' column added by
# inference_wrapper, so it is written to the CSV too -- confirm that is wanted.
train_df.to_csv('ensemble_train.csv',index=False)