In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)



import pandas as pd

import gc
from tqdm import tqdm

import transformers
from transformers import AdamW
from transformers import RobertaTokenizer, RobertaModel
from transformers import get_cosine_schedule_with_warmup


import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import KFold

In [None]:
# Run the garbage collector first so that tensors held only by unreachable
# Python objects are released, then ask PyTorch to hand cached CUDA blocks
# back to the driver. (The bare name `gc.collect` without parentheses only
# references the function and never runs it.)
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Prefer the GPU whenever PyTorch reports one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print("# Using device: ", device)

In [None]:
# Read train.csv from the commonlitreadabilityprize input directory and
# preview its first rows. Later cells read the `excerpt` and `target` columns.
train_df_tmp = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
train_df_tmp.head()

In [None]:
def text_converter(input_txt):
    """Normalise one excerpt before tokenisation.

    Newline characters become single spaces and semicolons become commas;
    every other character is returned unchanged.

    Args:
        input_txt (str): Raw excerpt text.

    Returns:
        str: The normalised text.
    """
    # Earlier revisions also chained replace("\'re", "'re"), replace("\'t", "'t"),
    # replace("\'s", "'s"), replace("\'v", "'v") and replace("\'", "'"). In a
    # regular Python string literal "\'" is exactly "'", so each of those calls
    # replaced a substring with itself and has been dropped.
    # NOTE(review): if the intent was to strip a literal backslash in front of
    # an apostrophe, that needs "\\'" as the search string - confirm against
    # the data before adding it, since it would change the model's inputs.
    return input_txt.replace("\n", " ").replace(";", ",")
    

In [None]:
# Normalise every training excerpt in place; the function is passed directly
# because wrapping it in a lambda adds nothing.
train_df_tmp['excerpt'] = train_df_tmp['excerpt'].apply(text_converter)

In [None]:
class MyDataset(Dataset):
    """Map-style dataset that tokenises excerpts and pads them to a fixed length.

    Each item is a dict holding ``input_ids`` and ``attention_mask`` (both
    ``torch.long`` tensors of length ``max_len``) and, when ``target_flag`` is
    set, a scalar ``float32`` ``target`` tensor.

    Args:
        dataframe: DataFrame with an ``excerpt`` column, plus a ``target``
            column when ``target_flag`` is set.
        tokenizer: Callable that accepts the text together with
            ``is_split_into_words``, ``max_length`` and ``truncation`` keyword
            arguments and returns a mapping with ``input_ids`` and
            ``attention_mask`` lists. Its ``pad_token_id`` attribute, when
            present and not ``None``, is used for padding.
        max_len: Length every sequence is truncated/padded to. ``None``
            selects ``DEFAULT_MAX_LEN``.
        target_flag: Whether items should include the regression target.
    """

    # Sequence length used when the caller passes max_len=None.
    DEFAULT_MAX_LEN = 512

    def __init__(self, dataframe, tokenizer, max_len=250, target_flag=True):
        super().__init__()
        self.df = dataframe
        self.tokenizer = tokenizer
        self.target_flag = target_flag
        self.max_len = self.DEFAULT_MAX_LEN if max_len is None else max_len

    def __getitem__(self, idx):
        """Return the encoded excerpt (and optionally the target) at position ``idx``."""
        row = self.df.iloc[idx]

        token_dict = self.padding_transform(row.excerpt)

        if self.target_flag:
            token_dict['target'] = torch.tensor(row.target, dtype=torch.float32)

        return token_dict

    def padding_transform(self, input_text):
        """Tokenise ``input_text`` and right-pad ids and mask to ``max_len``.

        Returns:
            dict: ``{'input_ids': LongTensor, 'attention_mask': LongTensor}``,
            each of length ``max_len``.
        """
        # NOTE(review): the excerpt is passed as one plain string while
        # is_split_into_words=True is set. The flag is kept so tokenisation
        # matches what existing checkpoints were trained on - confirm intent.
        encoded = self.tokenizer(input_text,
                                 is_split_into_words=True,
                                 max_length=self.max_len,
                                 truncation=True)
        input_ids = list(encoded['input_ids'])
        attention_mask = list(encoded['attention_mask'])

        # Pad with the tokenizer's own pad id rather than a hard-coded 0, which
        # is not the pad token for every vocabulary. Fall back to 0 only when
        # the tokenizer does not define one.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is None:
            pad_id = 0

        padding_len = self.max_len - len(input_ids)
        input_ids = input_ids + [pad_id] * padding_len
        # Padded positions get mask 0 so the model does not attend to them.
        attention_mask = attention_mask + [0] * padding_len

        return {'input_ids': torch.tensor(input_ids, dtype=torch.long),
                'attention_mask': torch.tensor(attention_mask, dtype=torch.long)}

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

In [None]:
class RobModel(nn.Module):
    """RoBERTa encoder followed by a small two-layer regression head.

    Args:
        hidden_layer (int): Width of the intermediate ``features`` layer.
        model_path (str): Directory or model id handed to
            ``RobertaModel.from_pretrained``. Defaults to the local
            ``../input/roberta-l`` directory used throughout this notebook.
    """

    def __init__(self, hidden_layer=256, model_path='../input/roberta-l'):
        super().__init__()
        self.model = RobertaModel.from_pretrained(model_path)
        # Take the head's input width from the loaded encoder's config instead
        # of hard-coding 1024, so the head still fits if a checkpoint with a
        # different hidden size is loaded.
        encoder_width = self.model.config.hidden_size
        self.features = nn.Linear(encoder_width, hidden_layer)
        self.regressor = nn.Linear(hidden_layer, 1)

    def forward(self, i_ids, a_mask):
        """Return one regression value per input sequence.

        Args:
            i_ids: Token ids, forwarded to the encoder as ``input_ids``.
            a_mask: Attention mask, forwarded as ``attention_mask``.

        Returns:
            Output of the final linear layer (last dimension of size 1).
        """
        output = self.model(input_ids=i_ids, attention_mask=a_mask)
        # Keep only the hidden state at sequence position 0 of each example.
        output = output.last_hidden_state[:, 0]
        output = F.gelu(self.features(output))
        output = self.regressor(output)

        return output
        
        
        

In [None]:
def train_mode(criterion, optimizer, data_loader, scheduler=None):
    """Run one optimisation pass over ``data_loader``.

    Uses the notebook-level globals ``model`` and ``device``.

    Args:
        criterion: Loss callable taking ``(prediction, target)``.
        optimizer: Optimizer stepped once per batch.
        data_loader: Sized iterable of batches; each batch is a dict with
            ``input_ids``, ``attention_mask`` and ``target`` tensors.
        scheduler: Optional learning-rate scheduler, stepped once per batch
            right after the optimizer.

    Returns:
        tuple: ``(mean_cost, mean_rmse)``. ``mean_cost`` is a 0-d tensor with
        the unweighted mean of the per-batch losses; ``mean_rmse`` is its
        square root as a 0-d NumPy array.

    Raises:
        ValueError: If ``data_loader`` contains no batches.
    """
    num_batches = len(data_loader)
    if num_batches == 0:
        raise ValueError("data_loader is empty")

    total_cost = 0.0

    model.train()
    # Wrap the loader itself (not enumerate(...)) so tqdm can read its length
    # and show a proper progress total.
    for batch in tqdm(data_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        target = batch['target'].to(device)

        optimizer.zero_grad()
        predict = model(input_ids, attention_mask).view(-1)

        cost = criterion(predict, target)
        cost.backward()
        optimizer.step()

        if scheduler is not None:
            scheduler.step()

        # Detach before accumulating: summing the attached loss would keep a
        # reference to every batch's autograd graph until the epoch ends.
        total_cost += cost.detach()

    mean_cost = total_cost / num_batches
    mean_rmse = torch.sqrt(mean_cost).to("cpu").numpy()
    return mean_cost, mean_rmse
    

In [None]:
def eval_mode(criterion, data_loader):
    """Score the global ``model`` on ``data_loader`` without updating weights.

    Uses the notebook-level globals ``model`` and ``device``.

    Args:
        criterion: Loss callable taking ``(prediction, target)``.
        data_loader: Sized iterable of batches; each batch is a dict with
            ``input_ids``, ``attention_mask`` and ``target`` tensors.

    Returns:
        tuple: ``(mean_cost, mean_rmse)``. ``mean_cost`` is a 0-d tensor with
        the unweighted mean of the per-batch losses; ``mean_rmse`` is its
        square root as a 0-d NumPy array.

    Raises:
        ValueError: If ``data_loader`` contains no batches.
    """
    num_batches = len(data_loader)
    if num_batches == 0:
        raise ValueError("data_loader is empty")

    total_cost = 0.0

    model.eval()
    with torch.no_grad():
        # Wrap the loader itself (not enumerate(...)) so tqdm can read its
        # length and show a proper progress total.
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            target = batch['target'].to(device)

            predict = model(input_ids, attention_mask).view(-1)
            total_cost += criterion(predict, target)

        mean_cost = total_cost / num_batches
        mean_rmse = torch.sqrt(mean_cost).to("cpu").numpy()
        return mean_cost, mean_rmse

In [None]:
def create_optimizer_old(model):
    named_parameters = list(model.named_parameters())    
    
    #roberta large layers
    robert_l_params = named_parameters[:389]
    attention_params = named_parameters[391:393]
    regressor_params = named_parameters[393:]

#     robert_b_params = named_parameters[:197]
#     attention_params = named_parameters[199:201]
#     regressor_params = named_parameters[201:]

    attention_group = [params for (name, params) in attention_params]
    regressor_group = [params for (name, params) in regressor_params]

    parameters = []
    parameters.append({"params": attention_group})
    parameters.append({"params": regressor_group})

    for layer_num, (name, params) in enumerate(robert_l_params):
        weight_decay = 0.0 if "bias" in name else 0.01

        lr = 2e-5

        if layer_num >= 69:        
            lr = 5e-5

        if layer_num >= 133:
            lr = 1e-4

        parameters.append({"params": params,
                           "weight_decay": weight_decay,
                           "lr": lr})

    return AdamW(parameters)    

In [None]:
def create_optimizer(model):
    """Build an AdamW optimizer with position-dependent encoder learning rates.

    Parameters are selected by name rather than by fixed list indices, so the
    grouping does not depend on how many tensors the encoder happens to have:

    * ``features.*``  -> group 0 (optimizer default settings)
    * ``regressor.*`` -> group 1 (optimizer default settings)
    * ``model.embeddings.*`` / ``model.encoder.*`` -> one group per tensor,
      in ``named_parameters()`` order, with weight decay 0.01 (0.0 when the
      name contains ``"bias"``) and a learning rate that grows with position.

    Any other parameter (e.g. ``model.pooler.*``) is left out of the optimizer.

    Args:
        model: Module exposing the attribute layout described above
            (``RobModel`` in this notebook).

    Returns:
        torch.optim.AdamW: The configured optimizer.
    """
    named_parameters = list(model.named_parameters())

    backbone_prefixes = ("model.embeddings.", "model.encoder.")
    backbone_params = [(name, params) for name, params in named_parameters
                       if name.startswith(backbone_prefixes)]
    features_group = [params for name, params in named_parameters
                      if name.startswith("features.")]
    regressor_group = [params for name, params in named_parameters
                       if name.startswith("regressor.")]

    parameters = [{"params": features_group}, {"params": regressor_group}]

    base_lr = 2e-5
    # NOTE(review): 69 and 133 are positions in the backbone parameter list;
    # presumably they mark the start of particular encoder layers - confirm.
    for position, (name, params) in enumerate(backbone_params):
        weight_decay = 0.0 if "bias" in name else 0.01

        lr = base_lr
        if position >= 69:
            lr = base_lr * 2.5
        if position >= 133:
            lr = base_lr * 5

        parameters.append({"params": params,
                           "weight_decay": weight_decay,
                           "lr": lr})

    return optim.AdamW(parameters)

In [None]:
# Load the RoBERTa tokenizer from the local ../input/roberta-l directory.
tokenizer = RobertaTokenizer.from_pretrained('../input/roberta-l')  # alternative source: 'roberta-base'

In [None]:
# Instantiate the regression model with its default arguments.
model = RobModel()


In [None]:

# Simple positional hold-out split: the first 2000 rows train, the rest validate.
train_dataset = MyDataset(train_df_tmp.iloc[:2000], tokenizer)
valid_dataset = MyDataset(train_df_tmp.iloc[2000:], tokenizer)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=12,
    shuffle=True,
    num_workers=8,
)
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=12,
    shuffle=True,
    num_workers=8,
)
    

In [None]:
# Freeze the embedding block and the first encoder layers so that only the
# upper encoder layers and the regression head receive gradient updates.
N_FROZEN_ENCODER_LAYERS = 13  # encoder layers 0..12 inclusive

frozen_modules = [model.model.embeddings]
# Index each layer explicitly so a too-shallow encoder raises IndexError
# instead of silently freezing fewer layers than requested.
frozen_modules += [model.model.encoder.layer[layer_idx]
                   for layer_idx in range(N_FROZEN_ENCODER_LAYERS)]

for module in frozen_modules:
    for param in module.parameters():
        param.requires_grad = False

In [None]:
# Display the module's repr to inspect its structure.
model

In [None]:
# Restore previously saved weights. map_location='cpu' lets a checkpoint that
# was written from a GPU session load on a CPU-only machine as well; the model
# is moved to `device` later, in the training cell.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(
    torch.load('../input/roberta-l/ro_best_ever_model.pt', map_location='cpu')
)

In [None]:
# ---- Training configuration -------------------------------------------------
num_epochs = 7
SEED = 42
NUM_FOLDS = 5
BATCH_SIZE = 12

model.to(device)
criterion = nn.MSELoss().to(device)

# The head trains with a larger learning rate than the encoder.
optimizer = optim.Adam([
        {'params': model.features.parameters(), 'lr': 1e-4},
        {'params': model.regressor.parameters(), 'lr': 1e-4},
        {'params': model.model.encoder.parameters(), 'lr': 2e-5},
    ], betas=(0.6, 0.7))

# ---- Per-fold data loaders --------------------------------------------------
# With a fixed random_state the split is identical on every call, so the
# loaders are built once and reused in every epoch.
kfold = KFold(n_splits=NUM_FOLDS, random_state=SEED, shuffle=True)
fold_loaders = []
for train_idx, val_idx in kfold.split(train_df_tmp):
    # KFold yields positional indices, hence .iloc rather than .loc.
    fold_train_ds = MyDataset(train_df_tmp.iloc[train_idx], tokenizer)
    fold_valid_ds = MyDataset(train_df_tmp.iloc[val_idx], tokenizer)
    fold_loaders.append((
        DataLoader(fold_train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=8),
        # Validation order does not need shuffling; a fixed order keeps the
        # batch-averaged metric reproducible.
        DataLoader(fold_valid_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=8),
    ))

# ---- Scheduler --------------------------------------------------------------
# The scheduler is stepped once per training batch, and every epoch trains on
# all folds, so the schedule length must cover all of those steps. Sizing it
# from a single unrelated loader makes the cosine schedule run past its end.
steps_per_epoch = sum(len(fold_train_loader) for fold_train_loader, _ in fold_loaders)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_training_steps=num_epochs * steps_per_epoch,
    num_warmup_steps=20)

# ---- Train / validate -------------------------------------------------------
# NOTE(review): one model and one optimizer are carried through every fold, so
# rows validated in one fold are trained on in the others. The validation cost
# below is therefore not an out-of-fold estimate.
best_cost = np.inf
for epoch in range(num_epochs):

    for fold, (train_dataloader, valid_dataloader) in enumerate(fold_loaders):
        print(f"\nFold {fold + 1}/{NUM_FOLDS}")

        train_cost, train_rmse = train_mode(criterion, optimizer, train_dataloader, scheduler=scheduler)
        print("train cost: ", train_cost.item(), " train rmse: ", train_rmse)

        valid_cost, valid_rmse = eval_mode(criterion, valid_dataloader)
        print("valid cost: ", valid_cost.item(), " valid rmse: ", valid_rmse)

        # Keep the weights with the lowest validation cost seen so far.
        if valid_cost < best_cost:
            best_cost = valid_cost
            torch.save(model.state_dict(), "ro_best_ever_model.pt")

In [None]:
def predict(model, data_loader):
    """Run ``model`` over ``data_loader`` and collect its outputs as floats.

    Uses the notebook-level global ``device``. The model is switched to eval
    mode and gradients are disabled for the whole pass.

    Args:
        model: Callable module taking ``(input_ids, attention_mask)``.
        data_loader: Iterable of batches; each batch is a dict with
            ``input_ids`` and ``attention_mask`` tensors.

    Returns:
        list[float]: Flattened model outputs, in loader order.
    """
    result = []

    model.eval()
    with torch.no_grad():
        # Wrap the loader itself so tqdm can show a progress total.
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            batch_output = model(input_ids, attention_mask)
            result.extend(batch_output.view(-1).float().to("cpu").tolist())

    return result

In [None]:
test_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
# Apply the same text normalisation the training excerpts went through, so the
# model sees identically pre-processed input at inference time.
test_df['excerpt'] = test_df['excerpt'].apply(text_converter)


In [None]:
# Inference data: same sequence length as training, no target column, and the
# default (unshuffled) order so predictions line up with the rows of test_df.
test_ds = MyDataset(test_df, tokenizer, max_len=250, target_flag=False)
test_dataloader = DataLoader(
    test_ds,
    batch_size=8,
    num_workers=8,
)

In [None]:
# Run inference on the test loader; `labels` is a flat list of floats.
labels = predict(model, test_dataloader)

In [None]:
# Display the raw predictions for a quick sanity check.
labels

In [None]:
submission_df = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")
# Assign through item access: attribute-style assignment only updates a column
# that already exists and would otherwise set a plain Python attribute.
# NOTE(review): this relies on sample_submission.csv listing rows in the same
# order as test.csv - confirm, or build the frame from the ids in test_df.
submission_df['target'] = labels

In [None]:
# Display the submission frame with the predicted targets filled in.
submission_df

In [None]:
# NOTE(review): repeats the display from the previous cell; safe to delete.
submission_df

In [None]:
# Write the predictions to submission.csv in the working directory, without
# the DataFrame index column.
submission_df.to_csv("submission.csv", index=False)