In [None]:
import os
import pandas as pd 
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm_notebook
import numpy as np

# Directory holding the competition CSV files.
base = '/kaggle/input/commonlitreadabilityprize/'

# Read the training table and preview its first rows as the cell output.
train = pd.read_csv(f"{base}train.csv")
train.head()

In [None]:
# Histogram of len() applied to every entry of the `excerpt` column.
train["excerpt"].apply(len).hist()

In [None]:
import transformers
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import torch
from torch.utils.data import Dataset

In [None]:
class commonlit(Dataset):
    """Dataset that pairs each input text with its target value.

    Texts are tokenized lazily, one item at a time, inside ``__getitem__``.

    Args:
        inputs: indexable collection of texts.
        tokenizer: callable invoked with the text plus the keyword arguments
            ``add_special_tokens``, ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors``; it must return a mapping
            whose values support ``.squeeze(0)``.
        max_len: value forwarded to the tokenizer as ``max_length``.
        labels: indexable collection of targets, aligned with ``inputs``.
    """

    def __init__(self, inputs, tokenizer, max_len, labels):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the length of ``inputs``."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return ``(features, target)`` for the sample at position ``idx``.

        ``features`` maps every key produced by the tokenizer to its tensor
        with the leading dimension squeezed away; ``target`` is the label as
        a float tensor.
        """
        encoded = self.tokenizer(
            self.inputs[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # return_tensors="pt" is requested above; squeeze(0) drops the leading
        # size-1 dimension so the DataLoader can stack samples itself.
        features = {}
        for key in encoded:
            features[key] = encoded[key].squeeze(0)

        target = torch.tensor(self.labels[idx], dtype=torch.float)
        return features, target

In [None]:
class BERTClass(nn.Module):
    """BERT encoder followed by a two-layer linear head with one output unit.

    The head maps the encoder's second output (768 features) to 100 features
    and then to a single value per sample.
    """

    def __init__(self):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained("bert-base-uncased")
        self.fc1 = nn.Linear(768, 100)
        # NOTE: this activation is instantiated but forward() never applies it,
        # so fc1 and fc2 are composed without a non-linearity in between.
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(100, 1)

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Run the encoder and the linear head.

        Args:
            input_ids, token_type_ids, attention_mask: tensors forwarded
                unchanged to the encoder under the same keyword names.

        Returns:
            The output of ``fc2``, i.e. one value per sample along the last
            dimension.
        """
        # return_dict=False makes the encoder return a tuple; only its second
        # element is fed to the head, the first one is discarded.
        _first, second = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        hidden = self.fc1(second)
        return self.fc2(hidden)

In [None]:
class Trainer():
    """Train a regression model with Adam and MSE loss, validating after every epoch.

    Args:
        model: ``nn.Module`` called as
            ``model(input_ids=..., attention_mask=..., token_type_ids=...)``.
        train_set: dataset yielding ``(features, label)`` pairs where
            ``features`` is a dict with the keys ``'input_ids'``,
            ``'token_type_ids'`` and ``'attention_mask'``.
        test_set: dataset of the same form, used for validation.
        opts: dict with the keys ``'epochs'``, ``'lr'`` and ``'batch_size'``.

    Attributes:
        tr_loss: per-batch training losses of the most recent epoch.
        test_loss: per-batch validation losses of the most recent ``test()`` call.
    """

    def __init__(self,model,train_set,test_set,opts):
        self.model = model
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        self.epochs = opts['epochs']
        print(model)
        self.optimizer = torch.optim.Adam(model.parameters(), opts['lr'])
        self.criterion = torch.nn.MSELoss()
        self.train_loader = torch.utils.data.DataLoader(dataset=train_set,
                                                        batch_size=opts['batch_size'],
                                                        shuffle=True)
        self.test_loader = torch.utils.data.DataLoader(dataset=test_set,
                                                       batch_size=opts['batch_size'],
                                                       shuffle=False)
        # Defined up front so test() can be called before train() without failing.
        self.tr_loss = []
        self.test_loss = []

    def _forward(self, data):
        """Move one batch of tokenizer features to the device and run the model."""
        input_ids = data['input_ids'].to(self.device)
        token = data['token_type_ids'].to(self.device)
        mask = data['attention_mask'].to(self.device)
        return self.model(input_ids=input_ids,
                          attention_mask=mask,
                          token_type_ids=token)

    def _loss(self, outputs, labels):
        """MSE between predictions and targets after flattening both to 1-D.

        Passing ``(batch, 1)`` predictions together with ``(batch,)`` targets
        straight to ``MSELoss`` broadcasts them to ``(batch, batch)``, which
        scores every prediction against every target. Flattening both sides
        keeps the comparison element-wise.
        """
        return self.criterion(outputs.reshape(-1), labels.reshape(-1))

    def train(self):
        """Run ``self.epochs`` training epochs, calling ``test()`` after each one."""
        for epoch in range(self.epochs):
            # test() leaves the model in eval mode, so training mode has to be
            # restored at the start of every epoch, not just once up front.
            self.model.train()
            self.tr_loss = []
            for data, labels in tqdm_notebook(self.train_loader,
                                              total = len(self.train_loader)):
                labels = labels.to(self.device)
                self.optimizer.zero_grad()
                outputs = self._forward(data)
                loss = self._loss(outputs, labels)
                loss.backward()
                self.optimizer.step()
                self.tr_loss.append(loss.item())

            self.test(epoch) # run through the validation set

    def test(self,epoch):
        """Evaluate on ``test_loader`` and print mean train/validation loss.

        Args:
            epoch: zero-based epoch index; printed as ``epoch + 1``.
        """
        self.model.eval()    # disable train-only behaviour such as dropout
        self.test_loss = []
        self.test_accuracy = []  # never filled; kept so the attribute still exists

        # No gradients are needed for evaluation; skipping them saves memory and time.
        with torch.no_grad():
            for data, labels in self.test_loader:
                labels = labels.to(self.device)
                outputs = self._forward(data)
                loss = self._loss(outputs, labels)
                self.test_loss.append(loss.item())

        # Guard the means so an empty loss list reports nan instead of warning.
        train_mean = np.mean(self.tr_loss) if self.tr_loss else float('nan')
        test_mean = np.mean(self.test_loss) if self.test_loss else float('nan')
        print('epoch: {}, train loss: {}, test loss: {}'.format(
              epoch+1, train_mean, test_mean))

In [None]:
# Texts and regression targets as plain Python lists.
inputs = train.loc[:, "excerpt"].values.tolist()
labels = train.loc[:, "target"].values.tolist()

MAX_LEN = 185      # value passed to the dataset as max_len (tokenizer max_length)
TRAIN_SIZE = 2000  # samples used for training; the remainder is held out
SPLIT_SEED = 42    # fixes the random split so reruns compare like with like

tokenizer = transformers.BertTokenizer.from_pretrained(
    "bert-base-uncased", do_lower_case=True
)
dataset = commonlit(inputs=inputs, tokenizer=tokenizer, max_len=MAX_LEN, labels=labels)
display(len(dataset))

# Derive the held-out size from the dataset itself: random_split requires the
# lengths to sum to len(dataset), so a hard-coded pair breaks as soon as the
# number of rows changes.
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [TRAIN_SIZE, len(dataset) - TRAIN_SIZE],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [None]:
# Settings read by Trainer: learning rate, number of epochs and batch size.
opts = {'lr': 2e-4, 'epochs': 30, 'batch_size': 64}

model = BERTClass()

# Trainer builds the optimizer and the train/validation loaders from these.
CommonTrainer = Trainer(
    model=model,
    train_set=train_dataset,
    test_set=test_dataset,
    opts=opts,
)

In [None]:
# Run the training loop; Trainer.train() calls test() on the held-out split after every epoch.
CommonTrainer.train()