In [None]:
import numpy as np
import pandas as pd

import torch

import transformers

In [None]:
# Read train.csv and test.csv from the input directory into DataFrames.
train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
#from transformers import AutoTokenizer
#from transformers import AutoModel

# Locations of the pretrained model weights and of the matching tokenizer files.
path_model = '../input/pretrained-transformers/roberta_large_model'
path_tokenizer = '../input/pretrained-transformers/roberta_large_tokenizer'

# Alternative checkpoint (disabled):
#path_model = '../input/clrp-roberta-base/clrp_roberta_base'
#path_tokenizer = '../input/clrp-roberta-base/clrp_roberta_base'

# Alternative checkpoint (disabled):
#path_model = '../input/pretrained-transformers/roberta_base_model'
#path_tokenizer = '../input/pretrained-transformers/roberta_base_tokenizer'


# Build the RoBERTa tokenizer from the files under `path_tokenizer`.
tokenizer = transformers.RobertaTokenizer.from_pretrained(path_tokenizer)

In [None]:
# Display the tokenizer's repr as the cell output.
tokenizer

In [None]:
from pprint import pprint

# Sanity check: tokenize the excerpt at index label 0 and print the resulting encoding.
print(tokenizer(train['excerpt'][0]))

In [None]:
from transformers import AutoModel


# Load the bare pretrained backbone from `path_model`.
# NOTE(review): `model` is rebound to RegressionModel() in a later cell, so this
# load looks redundant (it only costs time and memory) — confirm and consider removing.
model = AutoModel.from_pretrained(path_model, num_labels=1)

In [None]:
#model

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
class TextData(Dataset):
    """Map-style dataset that tokenizes excerpts and pads them to a fixed length.

    Parameters
    ----------
    text : indexable sequence of str
        Excerpts; must support ``text[i]`` for ``i`` in ``range(len(text))``.
    labels : indexable sequence of float, or None
        Regression targets aligned with ``text``. Ignored when ``is_train`` is False.
    is_train : bool, default True
        When True every sample also carries a ``'label'`` tensor.
    max_len : int, default 250
        Sequences are truncated to, and padded up to, this many tokens.
    text_tokenizer : callable, optional
        Tokenizer to use. Defaults to the notebook-level ``tokenizer``.

    Each sample is a dict with ``'input_ids'`` and ``'attention_mask'``
    (``torch.long``, length ``max_len``) and, in training mode, ``'label'``
    (``torch.double`` scalar).
    """

    def __init__(self, text, labels, is_train=True, max_len=250, text_tokenizer=None):
        self.text = text
        self.labels = labels if is_train else None
        self.is_train = is_train
        # Keep a reference on the instance so __getitem__ never depends on a global.
        self.tokenizer = text_tokenizer if text_tokenizer is not None else tokenizer
        self.max_len = max_len
        # Pad with the tokenizer's own pad id; id 0 is only a fallback for
        # tokenizers that do not define one.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        self.pad_token_id = 0 if pad_id is None else pad_id

    def __len__(self):
        """Return the number of excerpts."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize, truncate and right-pad the excerpt at position ``item``."""
        # Line breaks become spaces so the words on either side are not fused
        # into a single token sequence.
        excerpt = self.text[item].replace('\n', ' ')
        encoded = self.tokenizer(
            excerpt,
            max_length=self.max_len,
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
        )

        n_pad = self.max_len - len(encoded['input_ids'])

        sample = {
            'input_ids': torch.tensor(
                encoded['input_ids'] + [self.pad_token_id] * n_pad, dtype=torch.long
            ),
            # Padded positions are masked out with zeros.
            'attention_mask': torch.tensor(
                encoded['attention_mask'] + [0] * n_pad, dtype=torch.long
            ),
        }
        if self.is_train:
            sample['label'] = torch.tensor(self.labels[item], dtype=torch.double)
        return sample

In [None]:
# Positional hold-out split: the first N_TRAIN rows train the model, the rest validate it.
# `.iloc` is used because its slices are end-exclusive; `.loc` slices include both
# endpoints, which would place the boundary row in both sets.
N_TRAIN = 2000
train_dataset = TextData(train['excerpt'].iloc[:N_TRAIN].values, train['target'].iloc[:N_TRAIN].values)
valid_dataset = TextData(train['excerpt'].iloc[N_TRAIN:].values, train['target'].iloc[N_TRAIN:].values)

In [None]:
# Training batches are reshuffled every epoch; validation keeps the dataset order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=16, shuffle=False)

In [None]:
# Pull one batch from the training loader to inspect its contents.
next(iter(train_dataloader))

In [None]:
#next(iter(valid_dataloader))

In [None]:
#from transformers import AutoModel

#AutoModel.from_pretrained('bert-base-cased', output_hidden_states=False);

In [None]:
from transformers import AutoConfig
import torch.nn as nn

class RegressionModel(torch.nn.Module):
    """Pretrained transformer backbone with a linear regression head.

    The head maps the final hidden state of the first token to one scalar per
    sequence. Layer widths follow ``config.hidden_size`` so the class works
    with any checkpoint size found at ``path_model``.
    """

    def __init__(self):
        super(RegressionModel, self).__init__()
        config = AutoConfig.from_pretrained(path_model)
        # Expose all hidden states, switch hidden dropout off and use a
        # smaller layer-norm epsilon than the checkpoint default.
        config.update({"output_hidden_states": True,
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})

        self.bert = AutoModel.from_pretrained(path_model, config=config)

        hidden_size = config.hidden_size

        # Attention-pooling head. It is defined but NOT used by forward().
        # The second linear layer consumes the 512 features produced by the first.
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )

        self.regressor = nn.Sequential(
            nn.Linear(hidden_size, 1)
        )

    def forward(self, input_ids, attention_mask, label=None):
        """Run the backbone and the regression head.

        Parameters
        ----------
        input_ids, attention_mask : torch.Tensor
            Passed through to the backbone as keyword arguments.
        label : torch.Tensor, optional
            Regression targets. When given, the RMSE loss is computed.

        Returns
        -------
        tuple
            ``(logits, *outputs[1:])`` without a label, where ``logits`` is the
            raw head output; ``(loss, logits, *outputs[1:])`` with a label,
            where ``logits`` is flattened and cast to the label's dtype and
            ``loss`` is the root of the mean squared error.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Regress from the first token's final hidden state.
        logits = self.regressor(outputs.last_hidden_state[:, 0])

        loss = None
        if label is not None:
            loss_fn = torch.nn.MSELoss()
            logits = logits.view(-1).to(label.dtype)
            loss = torch.sqrt(loss_fn(logits, label.view(-1)))

        output = (logits,) + outputs[1:]
        return ((loss,) + output) if loss is not None else output

In [None]:
# Instantiate the regression model; this rebinds the notebook-level name `model`.
model = RegressionModel()

In [None]:
# Freeze the embeddings and the lowest encoder layers so that only the upper
# layers of the backbone (and the regression head) receive gradient updates.
# N_FROZEN_LAYERS = 12 freezes encoder layers 0..11; raise it (e.g. to 16) to
# freeze more of the stack.
N_FROZEN_LAYERS = 12

frozen_modules = [model.bert.embeddings, *model.bert.encoder.layer[:N_FROZEN_LAYERS]]

for module in frozen_modules:
    for param in module.parameters():
        param.requires_grad = False

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

model.to(device);

In [None]:
from transformers import AdamW

# Two parameter groups: a small learning rate for the pretrained backbone and a
# larger one for the newly created regression head.
backbone_group = {'params': model.bert.parameters(), 'lr': 1e-5}
head_group = {'params': model.regressor.parameters(), 'lr': 1e-3}

optimizer = torch.optim.AdamW([backbone_group, head_group])

In [None]:
from transformers import get_scheduler

num_epochs = 3
# Total number of optimizer steps: one per training batch, for every epoch.
num_training_steps = len(train_dataloader) * num_epochs

# Linear schedule with 10 warm-up steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=10,
    num_training_steps=num_training_steps,
)

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):

    # ---- training pass ----
    model.train()
    train_losses = []

    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        loss = model(**batch)[0]  # first element is the RMSE loss when a label is passed
        loss.backward()
        train_losses.append(loss.item())
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Mean of the per-batch RMSE losses: a running training signal, not a
    # dataset-level RMSE.
    print(f"train RMSE: {np.array(train_losses).mean()}")

    # ---- validation pass ----
    model.eval()
    squared_error_sum = 0.0
    n_valid = 0

    with torch.no_grad():
        for batch in valid_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            # outputs[1] holds the flattened predictions when a label is passed.
            errors = outputs[1] - batch["label"].view(-1)
            squared_error_sum += errors.pow(2).sum().item()
            n_valid += errors.numel()

    # RMSE over the whole validation set. Averaging per-batch RMSEs instead
    # would over-weight a short final batch and is not the true RMSE.
    valid_rmse = (squared_error_sum / n_valid) ** 0.5 if n_valid else float("nan")
    print(f"valid RMSE: {valid_rmse}")

train RMSE: 0.8249859021885411
valid RMSE: 0.5797919076732253
train RMSE: 0.5543886480447727
valid RMSE: 0.6473528378036738
train RMSE: 0.4553731330876249
valid RMSE: 0.573552249816162
train RMSE: 0.39186169261522136
valid RMSE: 0.5580758739581811
train RMSE: 0.32023625471194966
valid RMSE: 0.5731837957088508
train RMSE: 0.26528761870212597
valid RMSE: 0.5293163088017251

train RMSE: 0.8144331196217753
valid RMSE: 0.6743270671867236

train RMSE: 0.5551262840222608
valid RMSE: 0.569206174302016

train RMSE: 0.47716535342445876
valid RMSE: 0.5417254085514731

Large model without 11 layers, 4 epochs:

train RMSE: 0.8796342564623829
valid RMSE: 0.6481774119879732

train RMSE: 0.5957295629519985
valid RMSE: 0.594007074209979

train RMSE: 0.5168589246785117
valid RMSE: 0.5644268208866577

train RMSE: 0.4681257860342965
valid RMSE: 0.6382700901139396

train RMSE: 0.70108438117762
valid RMSE: 0.591509985725852

train RMSE: 0.4599159102574394
valid RMSE: 0.5314388177489617

train RMSE: 0.3041276327755984
valid RMSE: 0.508451227175764

train RMSE: 0.2022837107640159
valid RMSE: 0.5050663417667759

In [None]:
def get_preds(dataloader, model, device=None):
    """Run inference over ``dataloader`` and return the predictions as a flat list.

    Parameters
    ----------
    dataloader : iterable of dict
        Each batch must provide ``"input_ids"`` and ``"attention_mask"`` tensors.
    model : torch.nn.Module
        Called as ``model(input_ids, attention_mask)``; the first element of
        its output is taken as the predictions.
    device : str or torch.device, optional
        Device the batches are moved to. Defaults to the device of the model's
        first parameter (CPU if the model has no parameters).

    Returns
    -------
    list of float
        One prediction per sample, in dataloader order.

    Notes
    -----
    The model is put in eval mode for the duration of the call, so dropout and
    similar layers are inactive, and its previous train/eval mode is restored
    afterwards.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = model.training
    model.eval()

    preds = []
    try:
        with torch.no_grad():
            for batch in dataloader:
                ids = batch["input_ids"].to(device)
                mask = batch["attention_mask"].to(device)

                # Drop the trailing singleton dimension of the head output.
                batch_preds = model(ids, mask)[0].squeeze(-1)
                preds.extend(batch_preds.float().detach().to('cpu').tolist())
    finally:
        model.train(was_training)
    return preds

In [None]:
# Build an unlabeled dataset from the test excerpts and batch it for inference.
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
test_dataset = TextData(text=test_df['excerpt'], labels=None, is_train=False, max_len=250)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, num_workers=8)

In [None]:
# Run inference over the test loader; get_preds returns a flat Python list of predictions.
all_preds = get_preds(test_loader, model)
# `predicts` is a second name bound to the same list (no copy is made).
predicts = all_preds

In [None]:
# Display the raw predictions as the cell output.
predicts

In [None]:
# Average of all test predictions: the flat list becomes a one-column frame,
# `.T` turns it into a single row, and mean(axis=1) averages across that row.
# NOTE(review): the result is only displayed, never stored — presumably a leftover check.
pd.DataFrame(all_preds).T.mean(axis=1)

In [None]:
# Wrap the predictions in a DataFrame; this rebinds `predicts`.
predicts = pd.DataFrame(predicts)

In [None]:
# Start from the sample submission file and overwrite its 'target' column with the predictions.
submit = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')
submit['target'] = predicts

In [None]:
# Write the submission file to the working directory, without the index column.
submit.to_csv("submission.csv",index = False)

In [None]:
#torch.cuda.empty_cache()

In [None]:
# Display the final submission frame as the cell output.
submit