In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import math

from collections import Counter
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import KFold
from torch.utils.data import TensorDataset, DataLoader

from transformers import RobertaModel, RobertaTokenizer, BertTokenizer, BertModel, AdamW

In [2]:
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
train = train[['excerpt', 'target']]
train.head()

Unnamed: 0,excerpt,target
0,When the young people returned to the ballroom...,-0.340259
1,"All through dinner time, Mrs. Fayre was somewh...",-0.315372
2,"As Roger had predicted, the snow departed as q...",-0.580118
3,And outside before the palace a great garden w...,-1.054013
4,Once upon a time there were Three Bears who li...,0.247197


In [3]:
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
test = test[['id', 'excerpt']]
test.head()

Unnamed: 0,id,excerpt
0,c0f722661,My hope lay in Jack's promise that he would ke...
1,f0953f0a5,Dotty continued to go to Mrs. Gray's every nig...
2,0df072751,It was a bright and cheerful scene that greete...
3,04caf4e0c,Cell division is the process by which a parent...
4,0e63f8bea,Debugging is the process of finding and resolv...


## Preprocessing

In [4]:
train['excerpt'] = train.excerpt.apply(lambda x: re.sub(r'[\n]', ' ', x))
test['excerpt'] = test.excerpt.apply(lambda x: re.sub(r'[\n]', ' ', x))

In [5]:
# Shuffle data
train = train.sample(frac=1).reset_index()

# Train and validation split
split = 4*len(train)//5
X_train = train.excerpt.iloc[:split]
X_val = train.excerpt.iloc[split:]
y_train = train.target.iloc[:split]
y_val = train.target.iloc[split:]

X_train = X_train.to_numpy()
y_train = y_train.to_numpy()
X_val = X_val.to_numpy()
y_val = y_val.to_numpy()
X_test = test.excerpt.to_numpy()

## Pre-trained model downloads

* Bert

In [6]:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# bt = BertModel.from_pretrained('bert-base-uncased')
# torch.save(bt, "./bt")
# torch.save(tokenizer, "./tokenizer")

* RoBerta

In [7]:
# tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# bt = RobertaModel.from_pretrained('roberta-base')
# torch.save(bt, "./bt")
# torch.save(tokenizer, "./tokenizer")

## Tokenizing

In [8]:
tokenizer = torch.load("../input/roberta/tokenizer")

MAX_SEQ_LEN = 256
check_len = tokenizer(X_train.tolist())
check_len_seq = pd.Series(check_len['input_ids']).apply(len).value_counts().sort_index()
seq_trimmed = check_len_seq[check_len_seq.index > MAX_SEQ_LEN]
print("If set max length of sequences %d, %d questions will be trimmed" % \
      (MAX_SEQ_LEN, seq_trimmed.sum()))

print("If set max length of sequences %d, %.2f%% questions will be trimmed" % \
      (MAX_SEQ_LEN, seq_trimmed.sum()*100/train.shape[0]))

If set max length of sequences 256, 111 questions will be trimmed
If set max length of sequences 256, 3.92% questions will be trimmed


In [9]:
X_train_batch = tokenizer(X_train.tolist(), padding='max_length', truncation=True, max_length=MAX_SEQ_LEN, return_token_type_ids=True, return_tensors="pt")
X_val_batch = tokenizer(X_val.tolist(), padding='max_length', truncation=True, max_length=MAX_SEQ_LEN, return_token_type_ids=True, return_tensors="pt")

In [10]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MyDataset(X_train_batch, y_train)
val_dataset = MyDataset(X_val_batch, y_val)

## Modeling

In [11]:
# class MyModel(nn.Module):

#     def __init__(self):
#         super().__init__()
        
#         self.bert = torch.load("../input/modelused/bt")
#         self.fc1 = nn.Linear(MAX_SEQ_LEN*768, 512)
#         self.fc2 = nn.Linear(512, 128)
#         self.fc3 = nn.Linear(128, 1)
#         self.do = nn.Dropout(0.1)

#     def forward(self, input_ids, attention_mask,token_type_ids):
#         x = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
#         x = torch.tanh(self.fc1(x.last_hidden_state.flatten(start_dim=1, end_dim=-1)))
#         x = self.do(x)
#         x = torch.tanh(self.fc2(x))
#         x = self.do(x)
#         x = self.fc3(x)
#         return x


class MyModel(nn.Module):

    def __init__(self):
        super().__init__()
        
        self.bert = torch.load("../input/roberta/bt")
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 1)
        self.do = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask,token_type_ids):
        x = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        x = torch.tanh(self.fc1(x.pooler_output))
        x = self.do(x)
        x = torch.tanh(self.fc2(x))
        x = self.do(x)
        x = self.fc3(x)
        return x

## Training and testing

In [12]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = MyModel()
model.to(device)

loss_f =  nn.MSELoss()
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=2e-5, weight_decay=0.9)
EPOCH = 10

def training_loop(epochs, optimizer, model, loss_f, train_loader, fold_num):
    
    best_val_loss = 9999
    step = 0
    for epoch in tqdm(range(epochs)):
        
        loss_train = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).to(device)
            loss = loss_f(outputs.flatten(), labels.type(torch.float32))
            loss.backward()
            optim.step()
            loss_train += loss.item()
            
            step += 1
            
            if step % 10 == 0:
                loss_val = 0.0
                for batch in val_loader:
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    token_type_ids = batch['token_type_ids'].to(device)
                    labels = batch['labels'].to(device)
                    outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).to(device)
                    loss = loss_f(outputs.flatten(), labels.type(torch.float32))
                    loss_val += loss.item()

                # print("Epoch", epoch, "Training loss", math.sqrt(loss_train/len(train_loader)), "Validation loss", math.sqrt(loss_val/len(val_loader)))

                if loss_val < best_val_loss:
                    best_val_loss = loss_val
                    model_path = "./model_%s" % fold_num
                    torch.save(model.state_dict(), model_path)
                    print("Epoch", epoch, "Training loss", math.sqrt(loss_train/len(train_loader)), "Validation loss", math.sqrt(loss_val/len(val_loader)))

# training_loop(EPOCH, optim, model, loss_f, train_loader, 999)
    

# preds = []
# for i in test.excerpt:
#     test_token = tokenizer([i], padding='max_length', truncation=True, max_length=MAX_SEQ_LEN, return_tensors="pt")
#     input_ids = test_token['input_ids'].to(device)
#     attention_mask = test_token['attention_mask'].to(device)
#     token_type_ids = test_token['token_type_ids'].to(device)
#     test_pred = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
#     test_pred = test_pred.cpu().detach().numpy().astype("float").item()
#     preds.append(test_pred)

## K Fold

In [13]:
num_folds = 5
kf = KFold(n_splits=num_folds)

kf.get_n_splits(X = train.excerpt, y = train.target)
preds_folds = []
for f, (train_index, val_index) in enumerate(kf.split(X = train.excerpt, y = train.target)):
    print("Fold", f)
    X_train = train.excerpt.iloc[train_index].to_numpy()
    X_val = train.excerpt.iloc[val_index].to_numpy()
    y_train = train.target.iloc[train_index].to_numpy()
    y_val = train.target.iloc[val_index].to_numpy()
    
    X_train_batch = tokenizer(X_train.tolist(), padding='max_length', truncation=True, max_length=MAX_SEQ_LEN, return_token_type_ids=True, return_tensors="pt")
    X_val_batch = tokenizer(X_val.tolist(), padding='max_length', truncation=True, max_length=MAX_SEQ_LEN, return_token_type_ids=True, return_tensors="pt")
    
    train_dataset = MyDataset(X_train_batch, y_train)
    val_dataset = MyDataset(X_val_batch, y_val)
    
    model = MyModel()
    model.to(device)

    loss_f =  nn.MSELoss()
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=True)
    optim = AdamW(model.parameters(), lr=2e-5, weight_decay=0.9)
    EPOCH = 10
    
    training_loop(EPOCH, optim, model, loss_f, train_loader, f)
    
    preds = []
    for i in test.excerpt:
        test_token = tokenizer([i], padding='max_length', truncation=True, max_length=MAX_SEQ_LEN, return_token_type_ids=True, return_tensors="pt")
        input_ids = test_token['input_ids'].to(device)
        attention_mask = test_token['attention_mask'].to(device)
        token_type_ids = test_token['token_type_ids'].to(device)
        test_pred = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        test_pred = test_pred.cpu().detach().numpy().astype("float").item()
        preds.append(test_pred)
    
    preds_folds.append(preds)

Fold 0


  0%|          | 0/10 [00:00<?, ?it/s]

  import sys


Epoch 0 Training loss 0.32806010087377596 Validation loss 0.9844775827634784
Epoch 0 Training loss 0.4174013196480886 Validation loss 0.9583452412307754
Epoch 0 Training loss 0.48281468914684494 Validation loss 0.8220427161395545
Epoch 0 Training loss 0.5316041004912815 Validation loss 0.6982417517885213
Epoch 0 Training loss 0.567925270372262 Validation loss 0.633341348159734
Epoch 0 Training loss 0.6039244796447842 Validation loss 0.608620258072846
Epoch 0 Training loss 0.6284186886313922 Validation loss 0.5892119170933868
Epoch 0 Training loss 0.647614497544729 Validation loss 0.5583561953091473
Epoch 0 Training loss 0.6885569524657695 Validation loss 0.5371374697032447
Epoch 0 Training loss 0.7045536609649092 Validation loss 0.5308481224282806
Epoch 0 Training loss 0.7458886570084187 Validation loss 0.5240588888621318
Epoch 1 Training loss 0.18609479929795697 Validation loss 0.5213737203708378
Epoch 1 Training loss 0.22441066448097896 Validation loss 0.49379150032038055
Epoch 1 Tra

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 0 Training loss 0.31470784934686963 Validation loss 1.0461158223775735
Epoch 0 Training loss 0.41695304541188954 Validation loss 1.0092707010735564
Epoch 0 Training loss 0.48054067216880286 Validation loss 0.8271097974623107
Epoch 0 Training loss 0.5218382178894945 Validation loss 0.7372878116924753
Epoch 0 Training loss 0.5529640070840776 Validation loss 0.7320146635237597
Epoch 0 Training loss 0.5899100245512867 Validation loss 0.6351634508584711
Epoch 0 Training loss 0.6356143409796469 Validation loss 0.6200176746557379
Epoch 0 Training loss 0.6563954319310854 Validation loss 0.6042890530938755
Epoch 0 Training loss 0.6747455111060816 Validation loss 0.5832240172190807
Epoch 1 Training loss 0.19138775072999648 Validation loss 0.5568738500839151
Epoch 1 Training loss 0.23011728170324205 Validation loss 0.55676675077749
Epoch 1 Training loss 0.26404928857394805 Validation loss 0.5504088696732627
Epoch 1 Training loss 0.29113123200572383 Validation loss 0.5363215571868287
Epoch 1

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 0 Training loss 0.3264594337469085 Validation loss 1.0245432376785661
Epoch 0 Training loss 0.42640090379542134 Validation loss 0.9523625972552868
Epoch 0 Training loss 0.48543900218281544 Validation loss 0.7944236837596246
Epoch 0 Training loss 0.5237052369873453 Validation loss 0.716997385925533
Epoch 0 Training loss 0.5869973305467878 Validation loss 0.6970376117694086
Epoch 0 Training loss 0.6168431092939947 Validation loss 0.619887989971374
Epoch 0 Training loss 0.7024200283493575 Validation loss 0.5935533167123177
Epoch 0 Training loss 0.738443490630078 Validation loss 0.567420889254689
Epoch 0 Training loss 0.7532364807047803 Validation loss 0.5573765032432677
Epoch 1 Training loss 0.1068272303002732 Validation loss 0.5452937263049222
Epoch 1 Training loss 0.16229787897504874 Validation loss 0.5365390327241384
Epoch 1 Training loss 0.2606377458604343 Validation loss 0.5183533762476359
Epoch 1 Training loss 0.4656508100148743 Validation loss 0.5155761780728644
Epoch 2 Train

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 0 Training loss 0.2939177317177218 Validation loss 1.0756448434853467
Epoch 0 Training loss 0.3847421636374346 Validation loss 0.9688735882225715
Epoch 0 Training loss 0.46193617728697517 Validation loss 0.8490817842261913
Epoch 0 Training loss 0.5050489759292089 Validation loss 0.8152104558290713
Epoch 0 Training loss 0.5441719526586686 Validation loss 0.7968108485356659
Epoch 0 Training loss 0.571056818845203 Validation loss 0.7186561312400503
Epoch 0 Training loss 0.6269860186410265 Validation loss 0.697434856276463
Epoch 0 Training loss 0.6719335522863685 Validation loss 0.6762108398994311
Epoch 0 Training loss 0.691124732412198 Validation loss 0.6272464394859214
Epoch 0 Training loss 0.7284059475466916 Validation loss 0.6000765507153218
Epoch 0 Training loss 0.7407990398296189 Validation loss 0.5882313016885722
Epoch 1 Training loss 0.12086597765402207 Validation loss 0.5676029151922076
Epoch 1 Training loss 0.16909330889543508 Validation loss 0.5548987836910313
Epoch 1 Trai

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 0 Training loss 0.3406298677338421 Validation loss 1.0466571030716407
Epoch 0 Training loss 0.4225961241041979 Validation loss 0.9838855412099905
Epoch 0 Training loss 0.48739087961027694 Validation loss 0.8579098633869587
Epoch 0 Training loss 0.5325167617837963 Validation loss 0.7525397798963979
Epoch 0 Training loss 0.5656757032498787 Validation loss 0.6817057865941731
Epoch 0 Training loss 0.590458375538171 Validation loss 0.6286837537675187
Epoch 0 Training loss 0.6394149673280801 Validation loss 0.6033424792970823
Epoch 0 Training loss 0.6603395180783894 Validation loss 0.5718060717997201
Epoch 0 Training loss 0.6798729352561831 Validation loss 0.5524818898138948
Epoch 0 Training loss 0.6932265452786024 Validation loss 0.5442396929607164
Epoch 0 Training loss 0.7452774140699447 Validation loss 0.5371680545542128
Epoch 1 Training loss 0.1109764879099735 Validation loss 0.522811049211195
Epoch 1 Training loss 0.1647481068004914 Validation loss 0.5123148405586576
Epoch 1 Train

In [None]:
tgt = np.mean(np.array(preds_folds), axis=0)

In [None]:
ans = pd.DataFrame({'id': test.id, 'target': tgt})
ans

In [None]:
ans.to_csv('submission.csv', index=False)