In [None]:
%%time
# Install textstat and Pyphen from local source directories under ../input.
# NOTE(review): textstat appears to depend on Pyphen; if pip cannot reach a
# package index, Pyphen may need to be installed first — confirm.
!pip install ../input/textstats/textstat-master
!pip install ../input/pyphen/Pyphen-master

In [None]:
%%writefile ensemble.py

import numpy as np 
import pandas as pd 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt


import torch
import transformers

import random
import os
import sys

from tqdm import tqdm

class CLRPDataset():
    """Index-addressable dataset that tokenizes one excerpt per lookup.

    Args:
        df: DataFrame with an ``excerpt`` column and, optionally, a ``target``
            column.
        max_len: Value passed to the tokenizer as ``max_length`` (with
            ``padding='max_length'`` and ``truncation=True``).
        tokenizer: Callable returning a mapping with ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` entries.

    Each item is ``(token_ids, attn_masks, token_type_ids)``; when the frame
    has a ``target`` column a float tensor holding the target is appended.
    """

    def __init__(self,df,max_len, tokenizer):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.excerpt = df['excerpt'].values
        # Targets are optional so the same class serves labelled and unlabelled frames.
        self.target = df['target'].values if "target" in df.columns else None

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self,index):
        encoded = self.tokenizer(
            self.excerpt[index],
            return_tensors='pt',
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids = True,
            truncation=True,
        )

        # The tokenizer output carries a leading axis of size one; take row 0 of each field.
        features = tuple(
            encoded[field][0]
            for field in ('input_ids', 'attention_mask', 'token_type_ids')
        )

        if self.target is None:
            return features

        label = torch.tensor(self.target[index]).float()
        return features + (label,)
    
class BertRegreesion(torch.nn.Module):
    """Pretrained encoder followed by dropout and a one-output linear layer.

    Args:
        dropout: Dropout probability applied before the linear head.
        bert_model: Model identifier. It selects the head input width and,
            in ``forward``, which encoder output is fed to the head.
        model_path: Path passed to ``transformers.AutoModel.from_pretrained``.
        freeze_bert: When true, encoder parameters get ``requires_grad=False``.

    Raises:
        ValueError: If ``bert_model`` is not one of the known identifiers and
            the loaded encoder exposes no ``config.hidden_size``.
    """

    # Encoder output widths for the identifiers this script was written for.
    _KNOWN_HIDDEN_SIZES = {
        "roberta-base": 768,
        "roberta-large": 1024,
        "microsoft/deberta-large": 1024,
    }

    def __init__(self, dropout, bert_model, model_path,  freeze_bert=False):
        super(BertRegreesion, self).__init__()

        self.bert_layer = transformers.AutoModel.from_pretrained(model_path)

        # Known identifiers keep their fixed widths; anything else falls back to
        # the loaded encoder's own config instead of leaving hidden_size unbound.
        hidden_size = self._KNOWN_HIDDEN_SIZES.get(bert_model)
        if hidden_size is None:
            encoder_config = getattr(self.bert_layer, "config", None)
            hidden_size = getattr(encoder_config, "hidden_size", None)
        if hidden_size is None:
            raise ValueError(
                f"Cannot determine the encoder hidden size for bert_model={bert_model!r}"
            )

        # Freeze bert layers and only train the regression layer weights
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        # Regression layer
        self.dropout = torch.nn.Dropout(p=dropout)
        self.head = torch.nn.Linear(hidden_size, 1)
        self.bert_model = bert_model

    def forward(self, input_ids, attn_masks, token_type_ids):
        """Return one regression value per input row, shaped ``(batch, 1)``."""
        if self.bert_model == "microsoft/deberta-large":
            # Use the first-position vector of the encoder's first output.
            sequence_output = self.bert_layer(input_ids, attn_masks, token_type_ids)[0]
            output = sequence_output[:, 0, :]
        else:
            # With return_dict=False the encoder returns a 2-tuple here; the
            # second element is what gets fed to the head.
            _, output = self.bert_layer(input_ids, attn_masks, token_type_ids,  return_dict=False)

        return self.head(self.dropout(output))
    
# Inference settings read by get_preds below.
BATCH_SIZE = 4  # batch_size passed to the test DataLoader
FOLDS = 5  # number of per-fold checkpoints loaded and averaged
NUM_WORKERS = 4  # num_workers passed to the test DataLoader
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # fall back to CPU when PyTorch sees no GPU

class Config:
    """Settings container; this script reads only ``dropout`` and ``max_len``."""
    epochs = 20  # not read anywhere in this script
    eval_per_epoch = 8 #4 -- not read anywhere in this script
    dropout = 0.4  # dropout probability passed to BertRegreesion
    max_len = 256  # tokenizer max_length passed to get_preds
    bert_model = "roberta-base"  # not read here; the model name comes from the command line
    warm_up_steps = 300  # not read anywhere in this script

def get_preds (bert_model, instances_path, max_len, df_test):
    """Average the test predictions of the FOLDS checkpoints of one model.

    Args:
        bert_model: Model identifier; the encoder/tokenizer are loaded from
            ``/kaggle/input/transformers/<bert_model>-hf``.
        instances_path: Directory name under ``/kaggle/input`` holding the
            checkpoints ``<bert_model with '/' replaced by '_'>_<fold>.pt``.
        max_len: Tokenizer ``max_length`` for every excerpt.
        df_test: DataFrame with an ``excerpt`` column and no ``target`` column
            (each batch is unpacked into exactly three tensors).

    Returns:
        1-D NumPy array with one fold-averaged prediction per row of ``df_test``.
    """
    print(instances_path)
    model_path = "/kaggle/input/transformers/" + bert_model + "-hf"
    print(f"model_path:{model_path}")

    instances_path = "/kaggle/input/" + instances_path
    instance_name = bert_model.replace("/","_")

    device = DEVICE
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)

    test_data = CLRPDataset(df_test,max_len, tokenizer=tokenizer)
    test_loader = torch.utils.data.DataLoader(test_data,
                                          batch_size=BATCH_SIZE,
                                          shuffle=False,
                                          num_workers=NUM_WORKERS)

    fold_sum = np.zeros((len(df_test),))
    for fold in range(FOLDS):
        model = BertRegreesion (dropout = Config.dropout, bert_model=bert_model, model_path= model_path, freeze_bert=False)

        filename = f"{instances_path}/{instance_name}_{fold}.pt"
        # NOTE(review): torch.load unpickles the file; only load checkpoints from trusted sources.
        model.load_state_dict(torch.load(filename, map_location=torch.device(device)))
        model.to(device)
        model.eval()

        fold_preds = []
        with torch.no_grad():
            for token_ids, attn_masks, token_type_ids in tqdm(test_loader):
                # Call the module itself (not .forward) so registered hooks run.
                output = model(token_ids.to(device),
                               attn_masks.to(device),
                               token_type_ids.to(device))
                # Convert explicitly to NumPy and drop the trailing size-1 axis.
                fold_preds.append(output.detach().cpu().numpy()[:, 0])

        # An empty test frame yields no batches; np.concatenate([]) would raise.
        if fold_preds:
            fold_sum += np.concatenate(fold_preds)
        del model

    return fold_sum/FOLDS

def main():
    """Command-line entry point: ``<script> <bert_model> <instances_dataset>``.

    Reads the competition test CSV, computes fold-averaged predictions with
    ``get_preds`` and pickles them as a single-column DataFrame named
    ``<instances_dataset>.pkl`` in the current directory.

    Raises:
        SystemExit: If fewer than two command-line arguments are given.
    """
    args = sys.argv[1:]
    if len(args) < 2:
        raise SystemExit(f"usage: python {sys.argv[0]} <bert_model> <instances_dataset>")
    bert_model, instances_path = args[0], args[1]

    # Only touch the CUDA device when one is in use; the unconditional reset
    # crashed on CPU-only hosts even though DEVICE falls back to "cpu".
    # (Presumably the reset is meant to start from a clean GPU context — confirm.)
    if DEVICE == "cuda":
        from numba import cuda
        cuda.get_current_device().reset()

    df_test = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")

    preds = get_preds (bert_model, instances_path, max_len = Config.max_len, df_test = df_test)
    df_test [instances_path] = preds

    df_test[[instances_path]].to_pickle(instances_path+".pkl")

if __name__ == "__main__":
    main()
            

In [None]:
%%writefile ensemble_atthead.py

import numpy as np 
import pandas as pd 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt


import torch
import transformers

import random
import os
import sys

from tqdm import tqdm

class CLRPDataset():
    """Index-addressable dataset that tokenizes one excerpt per lookup.

    Args:
        df: DataFrame with an ``excerpt`` column and, optionally, a ``target``
            column.
        max_len: Value passed to the tokenizer as ``max_length`` (with
            ``padding='max_length'`` and ``truncation=True``).
        tokenizer: Callable returning a mapping with ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` entries.

    Each item is ``(token_ids, attn_masks, token_type_ids)``; when the frame
    has a ``target`` column a float tensor holding the target is appended.
    """

    def __init__(self,df,max_len, tokenizer):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.excerpt = df['excerpt'].values
        # Targets are optional so the same class serves labelled and unlabelled frames.
        self.target = df['target'].values if "target" in df.columns else None

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self,index):
        encoded = self.tokenizer(
            self.excerpt[index],
            return_tensors='pt',
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids = True,
            truncation=True,
        )

        # The tokenizer output carries a leading axis of size one; take row 0 of each field.
        features = tuple(
            encoded[field][0]
            for field in ('input_ids', 'attention_mask', 'token_type_ids')
        )

        if self.target is None:
            return features

        label = torch.tensor(self.target[index]).float()
        return features + (label,)
    
class AttentionHead(torch.nn.Module):
    """Learned weighted sum of ``features`` along axis 1.

    A score per position is computed as ``V(tanh(W(features)))``, normalized
    with a softmax over axis 1, and used to weight ``features`` before summing
    over that axis. In this file the input is the encoder's first output,
    presumably shaped (batch, seq_len, in_features) — confirm.

    Args:
        in_features: Size of the last axis of the input.
        hidden_dim: Width of the intermediate projection ``W``.
        num_targets: Accepted for signature compatibility; not used.
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.out_features = hidden_dim
        # W is created before V so parameter order and random init order are unchanged.
        self.W = torch.nn.Linear(in_features, hidden_dim)
        self.V = torch.nn.Linear(hidden_dim, 1)

    def forward(self, features):
        scores = self.V(torch.tanh(self.W(features)))
        weights = torch.softmax(scores, dim=1)
        return torch.sum(weights * features, dim=1)


class BertRegreesion(torch.nn.Module):
    """Pretrained encoder, attention pooling, dropout and a one-output linear layer.

    Args:
        dropout: Dropout probability applied after the attention head.
        bert_model: Model identifier. It selects the layer width and, in
            ``forward``, how the encoder is called.
        model_path: Path passed to ``transformers.AutoModel.from_pretrained``
            (loaded with ``output_hidden_states=True``).
        freeze_bert: When true, encoder parameters get ``requires_grad=False``.

    Raises:
        ValueError: If ``bert_model`` is not one of the known identifiers and
            the loaded encoder exposes no ``config.hidden_size``.
    """

    # Encoder output widths for the identifiers this script was written for.
    _KNOWN_HIDDEN_SIZES = {
        "roberta-base": 768,
        "roberta-large": 1024,
        "microsoft/deberta-large": 1024,
    }

    def __init__(self, dropout, bert_model, model_path,  freeze_bert=False):
        super(BertRegreesion, self).__init__()

        self.bert_layer = transformers.AutoModel.from_pretrained(model_path, output_hidden_states=True)

        # Known identifiers keep their fixed widths; anything else falls back to
        # the loaded encoder's own config instead of leaving hidden_size unbound.
        hidden_size = self._KNOWN_HIDDEN_SIZES.get(bert_model)
        if hidden_size is None:
            encoder_config = getattr(self.bert_layer, "config", None)
            hidden_size = getattr(encoder_config, "hidden_size", None)
        if hidden_size is None:
            raise ValueError(
                f"Cannot determine the encoder hidden size for bert_model={bert_model!r}"
            )

        # Freeze bert layers and only train the regression layer weights
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        self.head = AttentionHead(hidden_size,hidden_size,1)

        # Regression layer
        self.dropout = torch.nn.Dropout(p=dropout)
        self.linear = torch.nn.Linear(hidden_size, 1)

        self.bert_model = bert_model

    def forward(self, input_ids, attn_masks, token_type_ids):
        """Return one regression value per input row, shaped ``(batch, 1)``."""
        if self.bert_model == "microsoft/deberta-large":
            encoder_out = self.bert_layer(input_ids, attn_masks, token_type_ids)
        else:
            encoder_out = self.bert_layer(input_ids, attn_masks, token_type_ids,  return_dict=False)

        # Both call styles expose the per-token representations at index 0.
        pooled = self.head(encoder_out[0])
        return self.linear(self.dropout(pooled))
    
# Inference settings read by get_preds below.
BATCH_SIZE = 4  # batch_size passed to the test DataLoader
FOLDS = 5  # number of per-fold checkpoints loaded and averaged
NUM_WORKERS = 4  # num_workers passed to the test DataLoader
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # fall back to CPU when PyTorch sees no GPU

class Config:
    """Settings container; this script reads only ``dropout`` and ``max_len``."""
    epochs = 20  # not read anywhere in this script
    eval_per_epoch = 8 #4 -- not read anywhere in this script
    dropout = 0.4  # dropout probability passed to BertRegreesion
    max_len = 256  # tokenizer max_length passed to get_preds
    bert_model = "roberta-base"  # not read here; the model name comes from the command line
    warm_up_steps = 300  # not read anywhere in this script

def get_preds (bert_model, instances_path, max_len, df_test):
    """Average the test predictions of the FOLDS checkpoints of one model.

    Args:
        bert_model: Model identifier; the encoder/tokenizer are loaded from
            ``/kaggle/input/transformers/<bert_model>-hf``.
        instances_path: Directory name under ``/kaggle/input`` holding the
            checkpoints ``<bert_model with '/' replaced by '_'>_<fold>.pt``.
        max_len: Tokenizer ``max_length`` for every excerpt.
        df_test: DataFrame with an ``excerpt`` column and no ``target`` column
            (each batch is unpacked into exactly three tensors).

    Returns:
        1-D NumPy array with one fold-averaged prediction per row of ``df_test``.
    """
    print(instances_path)
    model_path = "/kaggle/input/transformers/" + bert_model + "-hf"
    print(f"model_path:{model_path}")

    instances_path = "/kaggle/input/" + instances_path
    instance_name = bert_model.replace("/","_")

    device = DEVICE
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)

    test_data = CLRPDataset(df_test,max_len, tokenizer=tokenizer)
    test_loader = torch.utils.data.DataLoader(test_data,
                                          batch_size=BATCH_SIZE,
                                          shuffle=False,
                                          num_workers=NUM_WORKERS)

    fold_sum = np.zeros((len(df_test),))
    for fold in range(FOLDS):
        model = BertRegreesion (dropout = Config.dropout, bert_model=bert_model, model_path= model_path, freeze_bert=False)

        filename = f"{instances_path}/{instance_name}_{fold}.pt"
        # NOTE(review): torch.load unpickles the file; only load checkpoints from trusted sources.
        model.load_state_dict(torch.load(filename, map_location=torch.device(device)))
        model.to(device)
        model.eval()

        fold_preds = []
        with torch.no_grad():
            for token_ids, attn_masks, token_type_ids in tqdm(test_loader):
                # Call the module itself (not .forward) so registered hooks run.
                output = model(token_ids.to(device),
                               attn_masks.to(device),
                               token_type_ids.to(device))
                # Convert explicitly to NumPy and drop the trailing size-1 axis.
                fold_preds.append(output.detach().cpu().numpy()[:, 0])

        # An empty test frame yields no batches; np.concatenate([]) would raise.
        if fold_preds:
            fold_sum += np.concatenate(fold_preds)
        del model

    return fold_sum/FOLDS

def main():
    """Command-line entry point: ``<script> <bert_model> <instances_dataset>``.

    Reads the competition test CSV, computes fold-averaged predictions with
    ``get_preds`` and pickles them as a single-column DataFrame named
    ``<instances_dataset>.pkl`` in the current directory.

    Raises:
        SystemExit: If fewer than two command-line arguments are given.
    """
    args = sys.argv[1:]
    if len(args) < 2:
        raise SystemExit(f"usage: python {sys.argv[0]} <bert_model> <instances_dataset>")
    bert_model, instances_path = args[0], args[1]

    # Only touch the CUDA device when one is in use; the unconditional reset
    # crashed on CPU-only hosts even though DEVICE falls back to "cpu".
    # (Presumably the reset is meant to start from a clean GPU context — confirm.)
    if DEVICE == "cuda":
        from numba import cuda
        cuda.get_current_device().reset()

    df_test = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")

    preds = get_preds (bert_model, instances_path, max_len = Config.max_len, df_test = df_test)
    df_test [instances_path] = preds

    df_test[[instances_path]].to_pickle(instances_path+".pkl")

if __name__ == "__main__":
    main()
            

In [None]:
%%time
# Run test inference once per stacked model. Arguments are
# <bert_model> <checkpoint dataset name>; each run writes
# "<checkpoint dataset name>.pkl" to the working directory.
# Separate processes are used for each model, so nothing from one run stays
# loaded in the next.
!python ensemble_atthead.py "roberta-large" "clrp-roberta-large-2h-atthead-se"

!python ensemble_atthead.py "microsoft/deberta-large" "clrp-deberta-large-ppln4-atthead"
!python ensemble.py "roberta-large" "clrp-roberta-large-2f-se"

!python ensemble.py "microsoft/deberta-large" "clrp-deberta-large-2-se"
!python ensemble.py "microsoft/deberta-large" "clrp-deberta-large-4-se"


In [None]:
# Sanity check: list the prediction pickles written by the inference scripts.
!ls -lart *pkl

In [None]:
import numpy as np 
import pandas as pd 
import sklearn.linear_model
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

import textstat
import spacy.lang.en 


FOLDS = 5  # number of stacking folds; compared against the `kfold` column of the OOF frame

def read_oof (name, return_all = True):
    """Load ``/kaggle/input/<name>/oof.csv`` and name its prediction column ``name``.

    The prediction column is ``pred_x`` when present, otherwise ``pred``.

    Args:
        name: Dataset directory name; also the new prediction column name.
        return_all: When true return every column, otherwise only
            ``["id", name]``.

    Returns:
        The renamed DataFrame (all columns or the two-column subset).
    """
    frame = pd.read_csv(f"/kaggle/input/{name}/oof.csv")

    source_col = "pred_x" if "pred_x" in frame.columns else "pred"
    frame = frame.rename(columns={source_col: name})

    return frame if return_all else frame[["id", name]]


# Checkpoint dataset names of the five stacked models; each name doubles as
# the prediction column name in the OOF and test frames.
name_1 = "clrp-roberta-large-2f-se"
name_2 = "clrp-deberta-large-2-se"
name_3 = "clrp-deberta-large-4-se"
name_4 = "clrp-deberta-large-ppln4-atthead"
name_5 = "clrp-roberta-large-2h-atthead-se"

# The first model contributes every column of its OOF file; the others
# contribute only their prediction column, joined on "id".
oof = read_oof (name_1, return_all =True)
print(f"oof shape:{oof.shape}")
for other_name in (name_2, name_3, name_4, name_5):
    tmp = read_oof (other_name, return_all =False)
    oof = oof.merge(tmp, on ="id")

# Precomputed textstat features for the same ids.
oof_textstats = pd.read_csv("/kaggle/input/clrp-textstats/textstats.csv")
oof = oof.merge(oof_textstats, on="id")

# Squared terms used as extra stacking features.
oof = oof.assign(
    t1=oof["syllable_count"]**2,
    t2=oof["coleman_liau_index"]**2,
)

print(f"oof shape:{oof.shape}")

textstats_feats = [
    'syllable_count',
    'gunning_fog',
    'automated_readability_index',
    'coleman_liau_index',
    'text_standard',
    "dale_chall_readability_score",
    "t1",
    "t2",
]



Feature descriptions (see the textstat documentation: https://pypi.org/project/textstat/):

* **syllable_count**: Returns the number of syllables present in the given text.
* **gunning_fog**: Returns the FOG index of the given text. This is a grade formula in that a score of 9.3 means that a ninth grader would be able to read the document.
* **automated_readability_index**: Returns the ARI (Automated Readability Index) which outputs a number that approximates the grade level needed to comprehend the text.
* **coleman_liau_index**: Returns the grade level of the text using the Coleman-Liau Formula. This is a grade formula in that a score of 9.3 means that a ninth grader would be able to read the document.
* **text_standard**: Based upon all the above tests, returns the estimated school grade level required to understand the text.
* **dale_chall_readability_score**: Unlike the other tests, it uses a lookup table of the 3000 most commonly used English words, and returns the grade level using the New Dale-Chall Formula.
* **t1, t2**: Squared features: `t1 = syllable_count^2` and `t2 = coleman_liau_index^2`.



In [None]:
# Training ids and targets joined with the precomputed textstat features.
train = (
    pd.read_csv("../input/commonlitreadabilityprize/train.csv",usecols=['id','target'])
    .merge(oof_textstats, on=['id'])
)

In [None]:
# Pairwise correlation between the readability indices and the target.
corr_columns = [
    'gunning_fog',
    'automated_readability_index',
    'coleman_liau_index',
    'text_standard',
    "dale_chall_readability_score",
    'target',
]
train[corr_columns].corr()

In [None]:
%%time
df_test = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")

# Attach each model's test predictions, read from the pickles written by the
# inference scripts (one "<name>.pkl" per model).
for model_name in (name_1, name_2, name_3, name_4, name_5):
    pred = pd.read_pickle(model_name + ".pkl")
    df_test[model_name] = pred.values

# Recompute for the test excerpts the same textstat features that are read
# precomputed for the OOF rows.
textstat_extractors = {
    "syllable_count": lambda text: textstat.syllable_count(text),
    "gunning_fog": lambda text: textstat.gunning_fog(text),
    "automated_readability_index": lambda text: textstat.automated_readability_index(text),
    "coleman_liau_index": lambda text: textstat.coleman_liau_index(text),
    "text_standard": lambda text: textstat.text_standard(text, float_output=True),
    "dale_chall_readability_score": lambda text: textstat.dale_chall_readability_score(text),
}
for feature_name, extractor in textstat_extractors.items():
    df_test[feature_name] = df_test["excerpt"].map(extractor)

# Squared terms, matching the t1/t2 columns of the OOF frame.
df_test["t1"] = df_test["syllable_count"]**2
df_test["t2"] = df_test["coleman_liau_index"]**2

# Accumulator for the stacked predictions, filled by the stacking loop.
df_test["target"] = 0



In [None]:
# Stacking inputs: the five model prediction columns followed by the textstat features.
cols = [name_1, name_2, name_3, name_4, name_5, *textstats_feats]

In [None]:
# Standardize the stacking inputs with statistics fitted on the OOF rows and
# apply the same transform to the test rows.
# NOTE(review): the scaling is written back in place, so re-running this cell
# re-fits the scaler on already standardized columns.
scaler = StandardScaler().fit(oof[cols])

oof[cols] = scaler.transform(oof[cols])
df_test[cols] = scaler.transform(df_test[cols])

oof_pred = []
oof_target = []

X_test = df_test[cols].values
# Accumulate test predictions in a fresh array so that re-running this cell
# does not add on top of a previous run's sum.
test_pred_sum = np.zeros(len(df_test))

for fold in range(FOLDS):
    # Hold out rows whose kfold equals this fold; fit the ridge on the rest.
    df_val = oof.query("kfold == @fold")
    X_val = df_val[cols].values
    y_val = df_val["target"].values

    df_train = oof.query("kfold != @fold")
    X_train = df_train[cols].values
    y_train = df_train["target"].values

    model = sklearn.linear_model.Ridge(alpha=5.0)
    model.fit (X_train, y_train)
    p_val = model.predict (X_val)

    test_pred_sum += model.predict (X_test)

    oof_pred.append(p_val)
    oof_target.append (y_val)
    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated and removed
    # in recent scikit-learn releases.
    score = np.sqrt(mean_squared_error (y_val, p_val))
    print(f"fold:{fold}  ens: {score:.5f}"  )

# Sum over folds; the submission cell divides by FOLDS.
df_test["target"] = test_pred_sum

oof_pred = np.concatenate (oof_pred)
oof_target = np.concatenate (oof_target)
ens_score = np.sqrt(mean_squared_error (oof_target, oof_pred))
print(f"oof  ens: {ens_score:.5f}"  )

In [None]:
# df_test["target"] holds the sum of the per-fold predictions; average it into
# a separate frame instead of dividing in place, so re-running this cell
# cannot divide by FOLDS a second time.
submission = df_test[["id","target"]].assign(target=df_test["target"] / FOLDS)

submission.to_csv("submission.csv", index=False)

submission