In [None]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
import torchtext
import pandas as pd
import numpy as np
import nltk.data
import torch.nn.functional as F
import math

In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Build a spaCy-backed tokenizer callable through torchtext (language code 'en').
# Later cells pass it to encode_paragraph to split each excerpt into tokens.
tokenizer = torchtext.data.utils.get_tokenizer('spacy', language='en')

In [None]:
# Load the training table; later cells read its `excerpt` and `target` columns.
df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')

In [None]:
# Preview the first rows of the training table as a quick sanity check.
df.head()

In [None]:
#### Loading glove embeddings #####
# Parses the GloVe text file (one "<word> <v1> ... <vN>" record per line) into
#   word_to_idx  : word -> integer index (index 0 is reserved for '<pad>')
#   idx_to_embed : integer index -> numpy vector parsed from that line
embed_dim = 300
glove_path = '../input/glove-embeds/glove.6B.' + str(embed_dim) + 'd.txt'

word_to_idx = {'<pad>': 0}
idx_to_embed = {}
idx = 1  # real vocabulary entries start at 1 because 0 is the padding index
# Context manager guarantees the file handle is closed, even if a line fails
# to parse part-way through the file.
with open(glove_path, 'r', encoding='utf-8') as f:
    for l in f:
        values = l.split()
        if not values:
            continue  # tolerate blank lines instead of failing on values[0]
        word = values[0]
        word_to_idx[word] = idx
        embed = np.array([float(v) for v in values[1:]])
        idx_to_embed[idx] = embed
        idx += 1

In [None]:
# Reverse lookup table: vocabulary index -> word.
idx_to_word = dict(zip(word_to_idx.values(), word_to_idx.keys()))

In [None]:
# Dense (vocab_size, embed_dim) matrix whose row i holds the vector for index i.
# Start from zeros rather than random noise: every row that has a GloVe vector
# is overwritten below, so the only row keeping its initial value is '<pad>'
# (index 0). A zero pad vector is deterministic and adds nothing when
# embeddings are summed over a padded sequence.
embed_matrix = np.zeros((len(word_to_idx), embed_dim))
for idx, embed in idx_to_embed.items():
    embed_matrix[idx, :] = embed

In [None]:
def encode_paragraph(para, tokenizer, word_to_idx):
    """Convert a paragraph into a list of vocabulary indices.

    The text is lower-cased, split with ``tokenizer`` and every token found in
    ``word_to_idx`` is replaced by its index. Tokens missing from the
    vocabulary are dropped, so the result may be shorter than the token list.

    Args:
        para: raw paragraph text.
        tokenizer: callable mapping a string to an iterable of tokens.
        word_to_idx: dict mapping token -> integer index.

    Returns:
        List of integer indices, in token order.
    """
    tokens = tokenizer(para.lower())
    return [word_to_idx[tok] for tok in tokens if tok in word_to_idx]

In [None]:
# Encode every training excerpt as a LongTensor of vocabulary indices and
# collect the matching regression targets in the same order.
para_encodings, targets = [], []
for idx, row in df.iterrows():
    token_ids = encode_paragraph(row.excerpt, tokenizer, word_to_idx)
    para_encodings.append(torch.tensor(token_ids, dtype=torch.long))
    targets.append(row.target)

In [None]:
# Pair each encoded excerpt with its target: one (token_id_tensor, target)
# tuple per training example.
train_data = list(zip(para_encodings, targets))

In [None]:
from torch.utils.data import DataLoader, Dataset

# class ParaDataset(Dataset):
#     def __init__(self, data, stage):
#         self.data = data
#         self.stage = stage
        
#     def __len__(self):
#         return len(self.data)
    
#     def __getitem__(self, idx):
#         if self.stage == 'train' or self.stage == 'val':
#             word_enc, syll_enc, target = self.data[idx]
#             return word_enc, syll_enc, target
#         if self.stage == 'test':
#             word_enc, syll_enc = self.data[idx]
#             return word_enc, syll_enc

In [None]:
from torch.nn.utils.rnn import pad_sequence

# Vocabulary index of the '<pad>' token; generate_batch uses it as the fill
# value when padding sequences to a common length.
word_pad_idx = word_to_idx['<pad>']

def generate_batch(data_batch):
    """Collate function: pad the token sequences of a batch to equal length.

    Each sample is a sequence whose first element is a 1-D tensor of token
    indices and whose optional second element is the regression target.

    Args:
        data_batch: list of samples, either ``(tokens, target)`` pairs
            (train/val) or one-element ``[tokens]`` sequences (test).

    Returns:
        ``(word_batch, target_batch)`` when every sample carries a target,
        otherwise just ``word_batch``. ``word_batch`` is a
        (batch, max_len) tensor padded with ``word_pad_idx``;
        ``target_batch`` is a float32 tensor of shape (batch,).
    """
    padded_words = pad_sequence(
        [sample[0] for sample in data_batch],
        batch_first=True,
        padding_value=word_pad_idx,
    )
    # Samples without a second element (test data) yield inputs only.
    if not all(len(sample) > 1 for sample in data_batch):
        return padded_words
    batch_targets = torch.tensor([sample[1] for sample in data_batch], dtype=torch.float32)
    return padded_words, batch_targets

### Build Model

In [None]:
class Model(pl.LightningModule):
    """Bag-of-embeddings regressor.

    Token embeddings are averaged over the non-padded positions of each
    sequence and the resulting vector is passed through a four-layer MLP
    (tanh activations, dropout before every linear layer) that outputs one
    scalar per sequence. Training and validation use mean-squared error.

    Args:
        word_embed_weights: float tensor of shape (vocab_size, embed_dim) used
            to initialise the embedding table (fine-tuned, not frozen).
        embed_dim: width of the embedding vectors.
        dropout: dropout probability shared by all dropout layers.
        lr: learning rate for the Adam optimizer.
    """

    def __init__(self, word_embed_weights, embed_dim, dropout=0.2, lr=1e-3):
        super(Model, self).__init__()
        self.word_embeddings = nn.Embedding.from_pretrained(word_embed_weights, freeze=False)
        self.embed_dim = embed_dim
        self.dropout1 = nn.Dropout(p=dropout)
        self.linear1 = nn.Linear(embed_dim, 150)
        self.dropout2 = nn.Dropout(p=dropout)
        self.linear2 = nn.Linear(150, 75)
        self.dropout3 = nn.Dropout(p=dropout)
        self.linear3 = nn.Linear(75, 25)
        self.dropout4 = nn.Dropout(p=dropout)
        self.linear4 = nn.Linear(25, 1)
        self.lr = lr
        # Predictions gathered by test_step; read by the caller after testing.
        self.test_preds = []

    def forward(self, x, mask):
        """
        x is of size (bsz, seq_len)
        mask is of size (bsz, seq_len), truthy at real (non-pad) tokens

        Returns a (bsz, 1) tensor of predictions.
        """
        x = self.word_embeddings(x)  # (bsz, seq_len, embed_dim)
        # Zero the padded positions so the (trainable) pad vector does not leak
        # into the sum: the denominator below counts real tokens only.
        x = x * mask.unsqueeze(-1).to(x.dtype)
        # clamp avoids a division by zero for sequences with no real tokens
        n_non_zero = mask.sum(-1).clamp(min=1).unsqueeze(-1)
        x = x.sum(1) / n_non_zero  # take average embedding for sentence
        x = self.dropout1(x)
        # torch.tanh replaces the deprecated F.tanh (same computation)
        x = torch.tanh(self.linear1(x))
        x = self.dropout2(x)
        x = torch.tanh(self.linear2(x))
        x = self.dropout3(x)
        x = torch.tanh(self.linear3(x))
        x = self.dropout4(x)
        pred = self.linear4(x)
        return pred

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters using ``self.lr``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer

    def shared_step(self, batch, stage):
        """Run the forward pass for one batch.

        Args:
            batch: ``(x, y)`` for 'train'/'val'; just ``x`` for 'test'.
            stage: one of 'train', 'val', 'test'.

        Returns:
            The MSE loss for 'train'/'val', or the flattened predictions for
            'test'.

        Raises:
            ValueError: if ``stage`` is not one of the three known values.
        """
        if stage in ['train', 'val']:
            x, y = batch
        elif stage == 'test':
            x = batch
        else:
            raise ValueError("unknown stage: {!r}".format(stage))
        mask = x > 0  # index 0 is the padding token
        y_hat = self(x, mask)
        y_hat = y_hat.flatten()
        if stage == 'test':
            return y_hat
        return F.mse_loss(y_hat, y)

    def training_step(self, train_batch, batch_idx):
        """Compute, log ('train_loss') and return the training loss."""
        loss = self.shared_step(train_batch, 'train')
        self.log('train_loss', loss)
        return loss

    def validation_step(self, val_batch, batch_idx):
        """Compute and log the validation loss under 'val_loss'."""
        loss = self.shared_step(val_batch, 'val')
        self.log('val_loss', loss)

    def on_test_epoch_start(self):
        """Clear earlier predictions so repeated test runs do not accumulate."""
        self.test_preds = []

    def test_step(self, test_batch, batch_idx):
        """Predict one test batch and append the values to ``self.test_preds``."""
        preds = self.shared_step(test_batch, 'test')
        self.test_preds.extend(preds.detach().flatten().cpu().numpy())

In [None]:
BATCH_SIZE = 256
VAL_SIZE = 100    # number of examples held out for validation
SPLIT_SEED = 42   # fixed seed so the train/val split is reproducible

# Derive the training size from the data rather than hard-coding it, so the
# split does not fail when the number of rows in the CSV differs.
n_train = len(train_data) - VAL_SIZE
if n_train <= 0:
    raise ValueError(
        "need more than {} examples to hold out a validation set, got {}".format(
            VAL_SIZE, len(train_data)))
train_data_split, val_data_split = torch.utils.data.random_split(
    train_data,
    [n_train, VAL_SIZE],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
train_dataloader = DataLoader(train_data_split, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
val_dataloader = DataLoader(val_data_split, batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch)
# as_tensor converts the numpy matrix to float32 on the first run and is a
# no-op (no copy-construct warning) if this cell is executed again.
embed_matrix = torch.as_tensor(embed_matrix, dtype=torch.float32)

In [None]:
# Checkpoint callback handed to the Trainer below. It is configured to track
# the logged 'val_loss' metric (mode='min', i.e. lower is better), keep up to
# three checkpoints (save_top_k=3) under ./saved_models/, and embed the epoch
# number and validation loss in each file name.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    dirpath='./saved_models/',
    filename='model-{epoch:02d}-{val_loss:.2f}',
    save_top_k=3,
    mode='min'
)

In [None]:
# Instantiate the model with the GloVe-initialised embedding table.
model = Model(
    word_embed_weights=embed_matrix,
    embed_dim=embed_dim,
    dropout=0.2)
trainer = pl.Trainer(
    # Request a GPU only when one is available so the notebook also runs on
    # CPU-only machines, matching the `device` selection made at the top.
    gpus=1 if torch.cuda.is_available() else None,
    fast_dev_run=False,
    auto_lr_find=True,
    max_epochs=1500,
    progress_bar_refresh_rate=0,
    callbacks=[checkpoint_callback])
# tune() runs before fit() because auto_lr_find=True is set on the Trainer.
trainer.tune(model, train_dataloader, val_dataloader)
trainer.fit(model, train_dataloader, val_dataloader)

In [None]:
# Encode the test excerpts the same way as the training ones. Each encoding is
# wrapped in a one-element list so generate_batch can read it with d[0] and
# detect that no target is present.
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
para_encodings_test = [
    [torch.tensor(encode_paragraph(row.excerpt, tokenizer, word_to_idx), dtype=torch.long)]
    for _, row in test_df.iterrows()
]
test_data = para_encodings_test

In [None]:
# Unshuffled loader so predictions come out in the same order as test_df rows.
test_dataloader = DataLoader(test_data, batch_size=10, shuffle=False, collate_fn=generate_batch)

In [None]:
# Rebuild the model from the checkpoint path reported by checkpoint_callback
# as the best one. The constructor arguments are supplied again explicitly as
# keyword arguments.
model = Model.load_from_checkpoint(
    checkpoint_callback.best_model_path,
    word_embed_weights=embed_matrix,
    embed_dim=embed_dim, 
    dropout=0.2)

# Move the restored model to the device selected earlier in the notebook.
model.to(device)

In [None]:
# Run the test loop; Model.test_step appends each batch's predictions to
# model.test_preds, which the next cell reads.
trainer.test(model, test_dataloader)

In [None]:
# Pair each test id with its prediction (zip stops at the shorter of the two
# sequences) and write the result to ./submission.csv without the index column.
submission = pd.DataFrame(zip(list(test_df.id), model.test_preds), columns=['id', 'target'])
submission.to_csv('./submission.csv', index=False)