## **CommonLit Readability Prize**

In this competition, you'll build algorithms to rate the complexity of reading passages for grade 3-12 classroom use.

The dataset includes readers from a wide variety of age groups and a large collection of texts taken from various domains. Winning models will be sure to incorporate text cohesion and semantics.

Instead of using the regular BERT implementation, let's try out DistilBERT.

DistilBERT is a transformers model, smaller and faster than BERT, which was pretrained on the same corpus in a self-supervised fashion, using the BERT base model as a teacher. 

![BERT](https://blog.rasa.com/content/images/2019/09/pruning_bert.png)

In [None]:
import os
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import seaborn as sns 
from sklearn import metrics
import torch
import torch.nn as nn 
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import BertForSequenceClassification
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import warnings
warnings.simplefilter('ignore')

In [None]:
# ---- configuration ----
# Input CSV locations.
TRAIN_PATH = '../input/commonlitreadabilityprize/train.csv'
TEST_PATH = '../input/commonlitreadabilityprize/test.csv'

# Columns selected from the training CSV before splitting.
COLUMNS = ['excerpt', 'target']

# Optimisation settings: number of epochs, learning rate and batch sizes.
EPOCHS = 10
LR = 3e-4
TRAIN_BS = 16
VALID_BS = 32

# Tokenisation settings: sequences are truncated/padded to MAX_LEN tokens and
# the tokenizer is loaded from the same directory as the model weights.
MAX_LEN = 200
BERT_MODEL = '../input/distilbertbaseuncased'
TOKENIZER = transformers.BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)

# Use the GPU when one is available; both spellings of the name are kept
# because later cells refer to `DEVICE` as well as `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE = device
DEVICE

In [None]:
# Load the training CSV into a DataFrame and display its first rows.
train = pd.read_csv(TRAIN_PATH)
train.head()

In [None]:
# Check whether the data is clean and complete: info() prints each column's
# dtype and non-null count.
train.info()
# Author's observation from that output: the required columns contain no NaN values.

In [None]:
# Print how many distinct values each column holds; this hints at what kind
# of problem we are dealing with.
for column_name, column_values in train.items():
    distinct_count = len(column_values.unique())
    print(f"{column_name}: {distinct_count}")

# The target has as many distinct values as there are rows, so it is not a
# categorical field -- this is a regression problem.

In [None]:
class CommonLitDataset(Dataset):
    """Dataset that tokenizes one excerpt per item and returns model-ready tensors.

    Args:
        excerpt: Indexable collection of excerpt texts.
        target: Indexable collection of regression targets aligned with
            ``excerpt``. Only read when ``test`` is False.
        test: When True, items carry no ``'targets'`` entry.

    Each item is a dict with the keys ``'ids'``, ``'mask'`` and
    ``'token_type_ids'`` (long tensors built from the tokenizer output) plus
    ``'targets'`` (float tensor) when ``test`` is False.
    """

    def __init__(self, excerpt, target=None, test=False):
        self.excerpt = excerpt
        self.target = target
        self.test = test
        # Tokenizer and sequence length come from the notebook-level config.
        self.tokenizer = TOKENIZER
        self.max_len = MAX_LEN

    def __len__(self):
        """Return the number of excerpts."""
        return len(self.excerpt)

    def __getitem__(self, index):
        """Tokenize the excerpt at ``index`` and return its tensors."""
        # Collapse every run of whitespace (including newlines) to one space.
        excerpt = ' '.join(str(self.excerpt[index]).split())

        # `padding='max_length'` is the supported spelling of the deprecated
        # `pad_to_max_length=True`: pad every sequence up to `max_length`.
        inputs = self.tokenizer.encode_plus(
            excerpt,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )

        item = {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
        }
        if not self.test:
            # Targets exist only for the train/validation splits.
            item['targets'] = torch.tensor(self.target[index], dtype=torch.float)
        return item

### Train and Evaluation Engine

In [None]:
class Engine:
    """Bundles the train, validation and inference loops for one model.

    Args:
        model: Module called as ``model(ids=..., mask=..., token_type_ids=...)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        train_dataloader: Iterable of batches (dicts with the keys ``'ids'``,
            ``'mask'``, ``'token_type_ids'`` and ``'targets'``) used for training.
        valid_dataloader: Iterable of batches with the same keys, used for
            validation.
        device: Device every batch tensor is moved to before the forward pass.
    """

    def __init__(self,model,optimizer,train_dataloader,
                 valid_dataloader,device):
        self.model = model
        self.optimizer = optimizer
        self.train_data = train_dataloader
        self.valid_data = valid_dataloader
        self.device = device

    def loss_fn(self,outputs, targets):
        """Return the RMSE between ``outputs`` and ``targets`` as a 0-dim tensor.

        Both tensors are flattened first. Comparing a ``(batch, 1)`` output
        with a ``(batch,)`` target directly would broadcast to
        ``(batch, batch)`` and yield a wrong loss.
        """
        return torch.sqrt(nn.MSELoss()(outputs.reshape(-1), targets.reshape(-1)))

    def train_fn(self):
        """Train for one full pass over the training data.

        Returns:
            0-dim tensor holding the mean of the per-batch RMSE losses, or a
            NaN tensor when the training iterable yields no batches.
        """
        self.model.train()
        running_loss = 0.0
        n_batches = 0
        for data in self.train_data:
            ids = data['ids'].to(self.device,dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(self.device,dtype=torch.long)
            mask = data['mask'].to(self.device,dtype=torch.long)
            targets = data['targets'].to(self.device,dtype=torch.float)

            self.optimizer.zero_grad()
            # Use the engine's own model/optimizer rather than notebook globals.
            outputs = self.model(
                    ids=ids,
                    mask=mask,
                    token_type_ids = token_type_ids
            )
            loss = self.loss_fn(outputs, targets)
            loss.backward()
            self.optimizer.step()

            running_loss += loss.item()
            n_batches += 1

        # Return only after every batch has been processed.
        if n_batches == 0:
            return torch.tensor(float('nan'))
        return torch.tensor(running_loss / n_batches)

    def eval_fn(self):
        """Evaluate the model on the validation data without gradient tracking.

        Returns:
            Tuple ``(val_loss, outputs, targets)``. ``val_loss`` is the RMSE
            over all validation samples as a 0-dim CPU tensor (NaN when there
            are no batches). ``outputs`` and ``targets`` are Python lists
            holding the per-sample model outputs and targets.
        """
        self.model.eval()
        _targets = []
        _outputs = []
        flat_outputs = []
        flat_targets = []
        with torch.no_grad():
            for data in self.valid_data:
                ids = data['ids'].to(self.device,dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(self.device,dtype=torch.long)
                mask = data['mask'].to(self.device,dtype=torch.long)
                targets = data['targets'].to(self.device,dtype=torch.float)

                outputs = self.model(
                        ids=ids,
                        mask=mask,
                        token_type_ids = token_type_ids
                        )
                targets = targets.cpu().detach()
                outputs = outputs.cpu().detach()
                flat_targets.append(targets.reshape(-1))
                flat_outputs.append(outputs.reshape(-1))
                _targets.extend(targets.numpy().tolist())
                _outputs.extend(outputs.numpy().tolist())

        if not flat_outputs:
            return torch.tensor(float('nan')), _outputs, _targets
        # Score the whole validation set at once instead of reporting only
        # the loss of whichever batch happened to come last.
        val_loss = self.loss_fn(torch.cat(flat_outputs), torch.cat(flat_targets))
        return val_loss, _outputs, _targets

    def inference_fn(self, test_dl, infer_model):
        """Run ``infer_model`` over ``test_dl`` and collect its predictions.

        Args:
            test_dl: Iterable of batches with the keys ``'ids'``, ``'mask'``
                and ``'token_type_ids'``.
            infer_model: Module called like the training model.

        Returns:
            1-D NumPy array with the flattened outputs of every batch in
            iteration order (empty when ``test_dl`` yields no batches).
        """
        infer_model.eval()
        chunks = []
        with torch.no_grad():
            for data in test_dl:
                ids = data['ids'].to(self.device,dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(self.device,dtype=torch.long)
                mask = data['mask'].to(self.device,dtype=torch.long)

                out = infer_model(
                        ids=ids,
                        mask=mask,
                        token_type_ids = token_type_ids
                        )
                # Flatten each batch so the result has the same shape whether
                # the loader yields one batch or many.
                chunks.append(out.cpu().detach().numpy().reshape(-1))

        if not chunks:
            return np.empty(0, dtype=np.float32)
        return np.concatenate(chunks)

### DistilBERT Model

In [None]:
#model
class DistilBERT(nn.Module):
    """DistilBERT encoder followed by dropout and a single-output linear head.

    The checkpoint at ``BERT_MODEL`` is a DistilBERT checkpoint, so it is
    loaded with ``DistilBertModel``. Loading it through ``BertModel`` does not
    match the checkpoint's parameter names and leaves the encoder randomly
    initialised.
    """

    def __init__(self):
        super(DistilBERT, self).__init__()
        # Attribute name kept as `bert` so existing references keep working.
        self.bert = transformers.DistilBertModel.from_pretrained(BERT_MODEL)
        self.dropout1 = nn.Dropout(0.3)
        # 768 is the hidden size of the base DistilBERT encoder.
        self.out = nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids=None):
        """Return one regression value per sequence.

        Args:
            ids: Token id tensor passed to the encoder as input ids.
            mask: Attention mask tensor.
            token_type_ids: Accepted for interface compatibility with callers
                that pass it; unused, because DistilBERT takes no segment ids.

        Returns:
            Tensor of shape ``(batch, 1)`` produced by the linear head.
        """
        # DistilBERT has no pooler output; use the hidden state of the first
        # ([CLS]) token as the sequence representation.
        hidden_states = self.bert(ids, attention_mask=mask, return_dict=False)[0]
        output = hidden_states[:, 0]
        output = self.dropout1(output)
        output = self.out(output)
        return output

### Model Training

In [None]:
#Model training
# Load the training CSV, shuffle the rows and keep only the configured columns.
data = pd.read_csv(TRAIN_PATH)
data = data.sample(frac=1).reset_index(drop=True)
data = data[COLUMNS]

# Hold-out split: the first 2500 shuffled rows train, the remainder validates.
train_df = data[:2500].sample(frac=1).reset_index(drop=True)
valid_df = data[2500:].sample(frac=1).reset_index(drop=True)
print(f"Train : {train_df.shape}\nValidation: {valid_df.shape}")

train_ds = CommonLitDataset(
    excerpt=train_df['excerpt'].values,
    target = train_df['target'].values
)
valid_ds = CommonLitDataset(
    excerpt=valid_df['excerpt'].values,
    target = valid_df['target'].values
)

train_dl = DataLoader(
    train_ds,
    batch_size = TRAIN_BS,
    shuffle = True,
    num_workers = 4
)
# Validation order does not need to be shuffled; a fixed order keeps the
# evaluation pass identical from epoch to epoch.
valid_dl = DataLoader(
    valid_ds,
    batch_size= VALID_BS,
    shuffle= False,
    num_workers= 4
)
model = DistilBERT().to(DEVICE)
optimizer = transformers.AdamW(model.parameters(),lr=LR)
engine = Engine(model=model,optimizer=optimizer,
                train_dataloader=train_dl,
                valid_dataloader=valid_dl,
                device=DEVICE
               )
# Start from +inf so the first finite validation loss always produces a
# checkpoint (a fixed sentinel of 10 would save nothing if the loss never
# dropped below it, leaving no file for the inference cell to load).
best_loss = float('inf')
for epoch in range(EPOCHS):
    train_loss = engine.train_fn()
    val_loss, outputs, targets = engine.eval_fn()
    # Keep a plain float so `best_loss` does not hold on to a tensor; the
    # formatted value used in the checkpoint name is unchanged.
    val_loss = float(val_loss)
    print(f"epoch: {epoch}, train loss: {train_loss}, val_loss: {val_loss}")
    if val_loss < best_loss:
        print(f"saving model with loss: {val_loss}")
        torch.save(model.state_dict(),f"CommonLit_{val_loss}.bin")
        best_loss = val_loss

print(f"final Report\nValidation RMSE Loss: {best_loss}")

### Inference and Submission

In [None]:
# Rebuild the network and restore the weights of the best checkpoint saved
# by the training loop.
checkpoint_file = f"CommonLit_{best_loss}.bin"
infer_model = DistilBERT()
infer_model.load_state_dict(torch.load(checkpoint_file))
infer_model.to(device)
infer_model.eval()

# Tokenize the test excerpts (no targets) and predict them in file order.
test_df = pd.read_csv(TEST_PATH)
test_dataset = CommonLitDataset(excerpt=test_df['excerpt'].values, test=True)
test_dl = DataLoader(
    test_dataset,
    batch_size=16,
    shuffle=False,
    num_workers=4,
)
output = engine.inference_fn(test_dl, infer_model)

# One prediction per test id.
flat_predictions = output.reshape(-1).tolist()
submission_df = pd.DataFrame({'id': test_df.id, 'target': flat_predictions})
submission_df.head()

In [None]:
# Write the predictions to the working directory as CSV, omitting the index
# column, then display the full frame.
submission_df.to_csv('/kaggle/working/submission.csv', index=False)
submission_df

### **If you find it useful please upvote** :) 