In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AutoModel
from transformers import AutoTokenizer
from transformers import AutoConfig

from sklearn.model_selection import KFold

In [None]:
# Run-wide settings shared by the cells below
cfg = dict(
    # Use the GPU when CUDA is available, otherwise fall back to the CPU
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    # Token length every excerpt is padded / truncated to by the tokenizer
    max_len=512,
    # Learning rate handed to the optimizer
    learning_rate=2e-5,
    # Number of training epochs run for each cross-validation split
    num_epochs=3,
)

In [None]:
# Display the device selected in the config (cuda or cpu) as this cell's output
cfg['device']

In [None]:
# Load the competition's train and test CSV files into pandas DataFrames
train_df = pd.read_csv(filepath_or_buffer='../input/commonlitreadabilityprize/train.csv')
test_df = pd.read_csv(filepath_or_buffer='../input/commonlitreadabilityprize/test.csv')

In [None]:
# Load the BERT tokenizer from the local bert-base-uncased directory;
# it is handed to CommonLitDataset further down to encode the excerpts
tokenizer = AutoTokenizer.from_pretrained('../input/huggingface-bert/bert-base-uncased')

## Dataset class

In [None]:
class CommonLitDataset(Dataset):
    """Map-style PyTorch dataset that tokenizes excerpts on access.

    Args:
        df: DataFrame with an 'excerpt' column and, unless ``test`` is True,
            a 'target' column. Rows are addressed by position, so the
            DataFrame may carry any index.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping
            with 'input_ids', 'attention_mask' and 'token_type_ids'.
        max_len: Length every encoded excerpt is padded / truncated to.
        test: When True the items carry no 'target' entry.
    """

    def __init__(self, df, tokenizer, max_len, test=False):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.test = test

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the encoded excerpt at position ``idx`` as a dict of tensors.

        The dict holds 'ids', 'mask' and 'token_type_ids', plus a
        one-element float 'target' tensor when ``test`` is False.
        """
        # Positional lookup: .loc would look idx up as an index *label* and
        # fail (or pick the wrong row) when the index is not 0..n-1.
        text = self.df['excerpt'].iloc[idx]

        # Encode the text, padding or truncating it to exactly max_len tokens
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_len,
            truncation=True
        )

        item = {
            'ids': torch.tensor(inputs['input_ids']),
            'mask': torch.tensor(inputs['attention_mask']),
            'token_type_ids': torch.tensor(inputs['token_type_ids'])
        }

        if not self.test:
            # Build the target from a plain scalar; shape (1,) so a batch of
            # targets stacks to (batch, 1).
            target = float(self.df['target'].iloc[idx])
            item['target'] = torch.tensor([target], dtype=torch.float)

        return item

## Criterion (RMSE)

In [None]:
def RMSE(y_pred, y_true):
    """Root-mean-squared error between predictions and targets.

    Returns a scalar tensor: the square root of the mean squared difference
    between ``y_pred`` and ``y_true``.
    """
    mean_squared_error = nn.functional.mse_loss(y_pred, y_true)
    return mean_squared_error.sqrt()

## Model class

In [None]:
class CommonLitModel(nn.Module):
    """Pretrained encoder followed by LayerNorm, Dropout and a 1-unit linear head.

    Args:
        name: Backbone identifier. Only 'BERT' is supported; it loads the
            local bert-base-uncased weights.

    Raises:
        ValueError: If ``name`` is not a supported backbone.
    """

    def __init__(self, name):
        super(CommonLitModel, self).__init__()

        # Fail early with a clear message: without a backbone neither
        # in_features nor the encoder output would exist further down.
        if name != 'BERT':
            raise ValueError(f"Unsupported model name {name!r}; expected 'BERT'")

        self.name = name
        self.bert = AutoModel.from_pretrained('../input/huggingface-bert/bert-base-uncased')
        # Width of the encoder's pooler output, reused as the head's input size
        self.in_features = self.bert.pooler.dense.out_features

        self.dropout = nn.Dropout()
        self.layer_norm = nn.LayerNorm(self.in_features)
        self.fc = nn.Linear(self.in_features, 1)

    def forward(self, ids, mask, token_type_ids):
        """Return the head's output for a batch of encoded excerpts.

        ``ids``, ``mask`` and ``token_type_ids`` are forwarded to the encoder
        as input ids, attention mask and token type ids respectively.
        """
        # With return_dict=False the encoder returns a tuple; the second
        # element is used (BERT's pooled output per the transformers docs).
        _, output = self.bert(ids,
                              attention_mask=mask,
                              token_type_ids=token_type_ids,
                              return_dict=False)

        output = self.layer_norm(output)
        output = self.dropout(output)
        output = self.fc(output)
        return output


In [None]:
def train_valid(model, optimizer, criterion, datasets, num_epochs=10):
    """Train and validate ``model`` on each (train, validation) dataset pair.

    For every split the model is trained for ``num_epochs`` epochs on the
    training dataset and then evaluated on the validation dataset. Progress
    is printed; nothing is returned.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)``.
        optimizer: Optimizer stepped after every training batch.
        criterion: Callable ``criterion(output, target)`` returning the loss.
        datasets: Iterable of ``(train_dataset, valid_dataset)`` pairs whose
            items are dicts with 'ids', 'mask', 'token_type_ids' and 'target'.
        num_epochs: Training epochs per split.

    NOTE(review): the same model and optimizer state carries over from one
    split to the next, so from the second split on the validation samples may
    already have been trained on in an earlier split — confirm this is
    intended before reading the validation numbers as cross-validation scores.
    """
    device = cfg['device']
    model.to(device)

    for split_idx, (train_dataset, valid_dataset) in enumerate(datasets):
        print(f'\nSPLIT {split_idx + 1}:')

        train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)
        valid_dataloader = DataLoader(dataset=valid_dataset, shuffle=False, batch_size=1)

        # Train the model
        model.train()
        for epoch in range(num_epochs):

            # Distinct loop variable so the split index is not shadowed
            for batch_idx, data in enumerate(train_dataloader):

                optimizer.zero_grad()

                X_train = {key: value.to(device) for (key, value) in data.items() if key != 'target'}
                y_train = data['target'].to(device)

                output = model(X_train['ids'],
                               X_train['mask'],
                               X_train['token_type_ids'])

                loss = criterion(output, y_train)

                if batch_idx % 140 == 0:
                    print(f'\nTRAIN RMSE: {loss}')

                loss.backward()
                optimizer.step()

        # Validate the model
        model.eval()
        squared_error_sum = 0.0
        num_targets = 0

        with torch.no_grad():
            for batch_idx, data in enumerate(valid_dataloader):

                X_valid = {key: value.to(device) for (key, value) in data.items() if key != 'target'}
                y_valid = data['target'].to(device)

                output = model(X_valid['ids'],
                               X_valid['mask'],
                               X_valid['token_type_ids'])

                loss = criterion(output, y_valid)

                if batch_idx % 20 == 0:
                    print(f'\nVALID RMSE: {loss}')

                # Accumulate squared errors so one RMSE over the whole
                # validation set can be reported; the per-batch losses above
                # cannot simply be averaged into it.
                squared_error_sum += torch.sum((output.reshape(-1) - y_valid.reshape(-1)) ** 2).item()
                num_targets += y_valid.numel()

        if num_targets > 0:
            split_rmse = (squared_error_sum / num_targets) ** 0.5
            print(f'\nVALID RMSE (whole split): {split_rmse}')

        # Release cached GPU memory once per split, not on every step
        torch.cuda.empty_cache()

In [None]:
# Split the dataframe in a 10 fold cross validation manner
kf = KFold(n_splits = 10, shuffle = True, random_state = 4)
datasets = []

# The fold variables are deliberately not called `test`: that name belongs to
# the inference function defined later, and re-running this cell would
# otherwise rebind it to a DataFrame.
for train_index, valid_index in kf.split(train_df):

    # Give each slice a fresh 0..n-1 index
    train_fold = train_df.iloc[train_index].reset_index(drop=True)
    valid_fold = train_df.iloc[valid_index].reset_index(drop=True)

    # Both datasets use test=False because validation also needs the 'target' column
    datasets.append((CommonLitDataset(train_fold, tokenizer, cfg['max_len'], test=False),
                     CommonLitDataset(valid_fold, tokenizer, cfg['max_len'], test=False)))

In [None]:
# Build the network, then the optimizer over its parameters and the loss
model = CommonLitModel('BERT')
optimizer = torch.optim.AdamW(model.parameters(), lr=cfg['learning_rate'])
criterion = RMSE

# Release cached GPU memory, then train and validate on every split
torch.cuda.empty_cache()
train_valid(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    datasets=datasets,
    num_epochs=cfg['num_epochs'],
)

## TODO
- Look into early stopping (halting training once the validation RMSE stops improving)

In [None]:
def test(model, dataloader):
    """Run inference over ``dataloader`` and return the predictions.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)``.
        dataloader: Iterable of dict batches holding 'ids', 'mask' and
            'token_type_ids' tensors. Any batch size is accepted.

    Returns:
        A flat list of Python floats, one per element of the model output,
        in the order the dataloader yields them.
    """
    model.eval()
    model.to(cfg['device'])
    output_list = []

    with torch.no_grad():
        for data in dataloader:
            batch = {key: value.to(cfg['device']) for (key, value) in data.items()}

            output = model(batch['ids'],
                           batch['mask'],
                           batch['token_type_ids'])

            # Flatten instead of .item() so batches larger than one sample
            # work too; with batch_size=1 this appends the same single float.
            output_list.extend(output.reshape(-1).tolist())

    # Release cached GPU memory once inference is finished
    torch.cuda.empty_cache()

    return output_list
            

In [None]:
# Wrap the test frame (test=True: no targets) and predict one excerpt at a time
dataset = CommonLitDataset(df=test_df, tokenizer=tokenizer, max_len=cfg['max_len'], test=True)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

# Release cached GPU memory before running inference with the trained model
torch.cuda.empty_cache()
outputs = test(model, dataloader)

In [None]:
# Assemble the submission table: one predicted target per test id
output_data = dict(id=test_df['id'], target=outputs)
output_df = pd.DataFrame(data=output_data, columns=['id', 'target'])

# Write it to the working directory with a header row and without the index column
output_df.to_csv('./submission.csv', header=True, index=False)