In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import gc
import random
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn



import transformers
from transformers import BertTokenizer, BertModel
from transformers import AdamW
from transformers import get_cosine_schedule_with_warmup

In [None]:
# Run a garbage-collection pass, then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"# Using device: {device}")

In [None]:
# Read the competition train/test CSVs; the cell displays the training frame's (rows, columns).
df_train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
df_test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
df_train.shape

In [None]:
# Count missing values per column of the training frame.
df_train.isnull().sum()

In [None]:
# Number of rows in the training frame.
len(df_train)

In [None]:
class Metrics():
    """Streaming RMSE accumulator.

    Call ``update`` once per batch (or per sample) and ``comp_rmse`` at the end.
    The sum of squared errors is divided by the number of *samples* seen, so
    batched and per-sample updates give the same result.
    """

    def __init__(self):
        self.sse = 0.0          # running sum of squared errors
        self.num_samples = 0    # number of individual predictions accumulated

    @staticmethod
    def _to_flat_array(values):
        """Convert a tensor / array / sequence to a flat float64 NumPy array."""
        if isinstance(values, torch.Tensor):
            # NumPy cannot read CUDA or graph-attached tensors directly.
            values = values.detach().cpu().numpy()
        return np.asarray(values, dtype=np.float64).reshape(-1)

    def update(self, target, predict):
        """Accumulate squared error for one batch.

        Args:
            target: ground-truth values (torch tensor, ndarray or sequence).
            predict: predictions with the same number of elements; any shape,
                e.g. ``(batch, 1)``, is flattened before comparison.

        Raises:
            ValueError: if the two inputs do not hold the same number of elements.
        """
        target = self._to_flat_array(target)
        predict = self._to_flat_array(predict)
        if target.size != predict.size:
            raise ValueError(
                f"target has {target.size} elements but predict has {predict.size}"
            )
        self.sse += float(np.sum(np.square(target - predict)))
        # Count samples, not calls: otherwise the result is sqrt(SSE / n_batches).
        self.num_samples += target.size

    def comp_rmse(self):
        """Return the RMSE over everything seen so far (NaN if nothing was added)."""
        if self.num_samples == 0:
            return float("nan")
        rmse = np.sqrt(self.sse / self.num_samples)
        return rmse
    
    
def flatten(array):
    """Collect the first element of every row of ``array``.

    ``array`` must support ``len()`` and integer indexing, and each row must
    itself be indexable. Returns a NumPy array (not a list) with one entry per row.
    """
    return np.array([array[idx][0] for idx in range(len(array))])

In [None]:

class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes the ``excerpt`` column on access.

    Each item is a dict with ``input_ids`` and ``a_mask`` (the tokenizer's
    attention mask), both taken from row 0 of the tokenizer output. Unless
    ``submit`` is set, the row's ``target`` is included as a float32 tensor.

    Args:
        df: frame with an ``excerpt`` column (and ``target`` when not submitting).
        tokenizer: callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)``.
        max_len: value forwarded as ``max_length``.
        submit: when it compares equal to ``False`` the target is returned too.
    """

    def __init__(self, df, tokenizer, max_len, submit=False):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.submit = submit

    def __len__(self):
        return len(self.df["excerpt"])

    def __getitem__(self, item):
        record = self.df.iloc[item]
        tokens = self.tokenizer(
            record["excerpt"],
            return_tensors='pt',
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )

        # [0] drops the leading batch dimension added by return_tensors='pt'.
        sample = {
            "input_ids": tokens["input_ids"][0],
            "a_mask": tokens['attention_mask'][0],
        }
        # Equality (not truthiness) is kept on purpose so e.g. submit=None
        # still behaves like the submit branch.
        if self.submit == False:
            sample['target'] = torch.tensor(record["target"]).float()
        return sample

In [None]:
# Load the BERT tokenizer from a local directory (presumably an offline copy of the
# hub model named in the trailing comment — confirm).
tokenizer = BertTokenizer.from_pretrained('../input/bert-uncased')#("bert-base-uncased")

In [None]:
# Tokenize one excerpt as a sanity check. truncation=True mirrors SentimentDataset:
# without it max_length only pads up, and longer excerpts are not cut to 250 tokens.
result = tokenizer(df_train.iloc[0].excerpt, return_tensors='pt', max_length=250,
                   padding='max_length', truncation=True)
result

In [None]:
# Scratch dataset over the full training frame, used to inspect samples.
dataset_tmp = SentimentDataset(df=df_train, tokenizer=tokenizer, max_len=250)
len(dataset_tmp)

In [None]:
def create_optimizer(model):
    """Build an AdamW optimizer with position-dependent learning rates.

    Parameters are selected purely by their position in
    ``model.named_parameters()``:

    * indices ``0..196`` each get their own group with weight decay 0.0 when
      the name contains ``"bias"`` (0.01 otherwise) and a learning rate of
      2e-5 (index < 69), 5e-5 (69 <= index < 133) or 1e-4 (index >= 133);
    * indices ``199..200`` and ``201..`` form two groups that use AdamW's
      default hyper-parameters;
    * indices ``197..198`` are not handed to the optimizer at all.

    NOTE(review): these boundaries presumably match a BERT-base backbone
    (embeddings + encoder first, pooler at 197-198) followed by the head
    layers — verify against ``model.named_parameters()`` before reusing
    with another architecture.
    """
    all_named = list(model.named_parameters())

    # The two default-hyper-parameter groups come first so group order is unchanged.
    param_groups = [
        {"params": [tensor for _, tensor in all_named[199:201]]},
        {"params": [tensor for _, tensor in all_named[201:]]},
    ]

    for position, (name, tensor) in enumerate(all_named[:197]):
        if position >= 133:
            learning_rate = 1e-4
        elif position >= 69:
            learning_rate = 5e-5
        else:
            learning_rate = 2e-5

        param_groups.append({
            "params": tensor,
            "weight_decay": 0.0 if "bias" in name else 0.01,
            "lr": learning_rate,
        })

    return AdamW(param_groups)

In [None]:
# Loader over the scratch dataset, 8 samples per batch (no shuffling requested).
data_loader = DataLoader(dataset_tmp, batch_size=8)

In [None]:
# for batch in dataset_tmp:
#     print(batch)
#     break
# len(dataset_tmp)

In [None]:
# Peek at the first sample. Each iter() call starts over at index 0, so the
# repeated call returned the same item and has been dropped.
next(iter(dataset_tmp))

In [None]:
class SentimentModel(nn.Module):
    """BERT encoder followed by a small regression head.

    The head is Linear(768 -> 256), GELU, dropout, Linear(256 -> 1), applied
    to the first sequence position of the encoder's ``last_hidden_state``
    (presumably the [CLS] token — confirm for the tokenizer in use).

    Args:
        dropout_p: dropout probability used inside the head.
    """

    def __init__(self, dropout_p=0.3):
        super().__init__()
        # Registration order matters: create_optimizer slices
        # named_parameters() by position. Weights come from the local copy
        # of the checkpoint (hub id: "bert-base-uncased").
        self.model = BertModel.from_pretrained("../input/bert-uncased")
        self.features = nn.Linear(768, 256)
        self.dropout = nn.Dropout(dropout_p)
        self.regressor = nn.Linear(256, 1)

    def forward(self, input_ids, a_mask):
        """Return one value per input sequence, with a trailing dimension of size 1."""
        encoded = self.model(input_ids=input_ids, attention_mask=a_mask)
        first_token = encoded.last_hidden_state[:, 0]
        hidden = self.dropout(F.gelu(self.features(first_token)))
        return self.regressor(hidden)

In [None]:
def train_model(criterion, optimizer, data_loader, epoch, device=device, scheduler=None):
    """Run one training pass over ``data_loader`` using the global ``model``.

    Args:
        criterion: loss called as ``criterion(predict, target)`` on flat tensors.
        optimizer: optimizer stepped once per batch.
        data_loader: yields dicts with ``input_ids``, ``a_mask`` and ``target``.
        epoch: accepted for API compatibility; not used.
        device: device the batch tensors are moved to.
        scheduler: optional LR scheduler, stepped once per batch after the optimizer.

    Returns:
        ``(mean_loss, mean_rmse)``: the mean of the per-batch losses as a
        tensor, and its square root as a NumPy value (an RMSE only when
        ``criterion`` is a mean-squared error).
    """
    running_loss = 0

    model.train()
    # Wrapping the loader itself (not enumerate) lets tqdm show a total.
    for batch in tqdm(data_loader):
        input_ids = batch['input_ids'].to(device)
        a_mask = batch['a_mask'].to(device)
        target = batch['target'].to(device)

        optimizer.zero_grad()
        predict = model(input_ids, a_mask).view(-1)  # (batch, 1) -> (batch,)

        loss = criterion(predict, target)
        loss.backward()
        optimizer.step()

        if scheduler is not None:
            scheduler.step()

        # Detach before accumulating: summing the raw loss keeps every step's
        # autograd history reachable and grows memory over the epoch.
        running_loss += loss.detach()

    mean_loss = running_loss / len(data_loader)
    mean_rmse = torch.sqrt(mean_loss).cpu().detach().numpy()
    return mean_loss, mean_rmse

In [None]:
def evaluate_model(criterion, data_loader, epoch, device=device):
    """Evaluate the global ``model`` on ``data_loader`` without gradient tracking.

    Args:
        criterion: loss called as ``criterion(predict, target)`` on flat tensors.
        data_loader: yields dicts with ``input_ids``, ``a_mask`` and ``target``.
        epoch: accepted for API compatibility; not used.
        device: device the batch tensors are moved to.

    Returns:
        ``(mean_loss, mean_rmse)``: the mean of the per-batch losses as a
        tensor, and its square root as a NumPy value (an RMSE only when
        ``criterion`` is a mean-squared error).
    """
    running_loss = 0

    model.eval()
    with torch.no_grad():
        # Wrapping the loader itself (not enumerate) lets tqdm show a total.
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids'].to(device)
            a_mask = batch['a_mask'].to(device)
            target = batch['target'].to(device)

            predict = model(input_ids, a_mask).view(-1)  # (batch, 1) -> (batch,)
            running_loss += criterion(predict, target)

        mean_loss = running_loss / len(data_loader)
        mean_rmse = torch.sqrt(mean_loss).cpu().detach().numpy()
        return mean_loss, mean_rmse

In [None]:
def inference(state, data_loader, device=device):
    """Run the global ``model`` over ``data_loader`` and collect its outputs.

    Args:
        state: accepted for API compatibility; not used by this function.
        data_loader: yields dicts with ``input_ids`` and ``a_mask``.
        device: device the batch tensors are moved to.

    Returns:
        ``(results, mean_res)``: ``results`` is a list with one NumPy array of
        model outputs per batch; ``mean_res`` is the mean over all predictions
        (axis 0 of the concatenated batches), or ``None`` for an empty loader.

    NOTE(review): the previous version returned an undefined ``mean_res`` and
    never stored predictions, so it always raised NameError. The mean over all
    samples is the reading taken from the commented-out code — confirm that
    this is the intended aggregate.
    """
    results = []

    model.eval()
    # Gradients are not needed for prediction.
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            a_mask = batch['a_mask'].to(device)
            output = model(input_ids, a_mask)
            results.append(output.detach().cpu().numpy())

    # Concatenate first so a smaller final batch does not break the mean.
    mean_res = np.concatenate(results, axis=0).mean(axis=0) if results else None
    return results, mean_res

In [None]:

# Instantiate the regression model and move it to the selected device.
# model = BertModel.from_pretrained("bert-base-uncased")
model = SentimentModel().to(device)

In [None]:
# Positional hold-out split: the first 2000 rows train, the remainder validates.
train_ds = SentimentDataset(df_train[:2000], tokenizer, 250)
valid_ds = SentimentDataset(df_train[2000:], tokenizer, 250)

train_dataloader = DataLoader(train_ds, batch_size=8, shuffle=True, num_workers=8)
# Validation is not shuffled: evaluate_model averages per-batch losses, so a fixed
# batch composition keeps the score (and checkpoint selection) comparable across epochs.
valid_dataloader = DataLoader(valid_ds, batch_size=8, shuffle=False, num_workers=8)



In [None]:
# tokenizer.save_pretrained("./tokenizer/")

In [None]:
# Warm-start from a previously saved checkpoint. map_location remaps the stored tensors
# to the active device, so a checkpoint saved from a CUDA model also loads on CPU.
model.load_state_dict(
    torch.load("../input/bert-uncased/best_ever_model.pt", map_location=device)
)

In [None]:
Num_Epochs = 20

criterion = nn.MSELoss().to(device)
optimizer = create_optimizer(model)
# Cosine schedule spanning every optimizer step of the run, after 50 warm-up steps.
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_training_steps=Num_Epochs * len(train_dataloader),
    num_warmup_steps=50)
best_cost = np.inf


for epoch in range(Num_Epochs):
    # Pass the real epoch index instead of a constant 1.
    train_loss, train_rmse = train_model(criterion, optimizer, train_dataloader, epoch, scheduler=scheduler)
    print("train loss: ", train_loss.item(), " train rmse: ", train_rmse)

    valid_loss, valid_rmse = evaluate_model(criterion, valid_dataloader, epoch)
    valid_cost = valid_loss.item()
    print("valid loss: ", valid_cost, " valid rmse: ", valid_rmse)

    # Keep only the weights with the lowest validation loss so far; the best
    # value is stored as a plain float rather than a device tensor.
    if valid_cost < best_cost:
        best_cost = valid_cost
        torch.save(model.state_dict(), "best_ever_model.pt")

In [None]:
def predict(model, data_loader):
    """Return the model's predictions for every sample in ``data_loader``.

    Args:
        model: module called as ``model(input_ids, a_mask)``; put into eval mode here.
        data_loader: yields dicts with ``input_ids`` and ``a_mask``.

    Returns:
        A flat Python list of floats, one per sample, in loader order.

    Batches are moved to the module-level ``device``.
    """
    result = []

    model.eval()
    with torch.no_grad():
        # Wrapping the loader itself (not enumerate) lets tqdm show a total.
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids'].to(device)
            a_mask = batch['a_mask'].to(device)

            # (batch, 1) -> (batch,); named so it does not shadow this function.
            batch_pred = model(input_ids, a_mask).view(-1).float()
            result.extend(batch_pred.detach().to("cpu").tolist())

    return result

In [None]:
# Inference data: submit=True makes the dataset return inputs only (no target).
test_ds = SentimentDataset(df=df_test, tokenizer=tokenizer, max_len=250, submit=True)
test_dataloader = DataLoader(dataset=test_ds, batch_size=8, num_workers=8)




In [None]:
# Predict on the test set; `labels` is a flat list with one float per test row.
# NOTE(review): this uses the weights currently in memory (last epoch), not a
# reload of best_ever_model.pt — confirm that is intended.
# labels = np.zeros(len(test_dataloader))
labels = predict(model, test_dataloader)

In [None]:
# Display the predictions.
labels

In [None]:
# Start from the sample submission and overwrite its target column with the predictions.
# Assumes sample_submission.csv lists rows in the same order as test.csv — TODO confirm.
submission_df = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")
# Bracket assignment always writes a column; attribute assignment only does so
# when the column already exists.
submission_df["target"] = labels

In [None]:
# Display the final submission frame.
submission_df

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv("submission.csv", index=False)