In [None]:
# installing sentence-bert
# Offline install: stage the package files shipped in the `sbert-stuff` input
# dataset in a local directory, then let pip resolve from that directory only
# (`--no-index` disables PyPI, `--find-links` points at the staged files).
!mkdir -p /tmp/pip/cache/
!cp ../input/sbert-stuff/sentence_transformers/sentence_transformers/sacremoses-0.0.45-py3-none-any.whl /tmp/pip/cache
# The sentence-transformers archive is stored with a `.xyz` extension in the
# dataset and is renamed to `.tar` while copying (presumably so pip accepts it
# as a source archive — TODO confirm).
!cp ../input/sbert-stuff/sentence_transformers/sentence_transformers/sentence-transformers-1.2.0.xyz /tmp/pip/cache/sentence-transformers-1.2.0.tar
# NOTE(review): the next two wheels are tagged cp36 / win_amd64 — confirm they
# match the kernel's Python version and platform, otherwise pip cannot use
# them and (with --no-index) has no other source for these packages.
!cp ../input/sbert-stuff/sentence_transformers/sentence_transformers/sentencepiece-0.1.95-cp36-cp36m-win_amd64.whl /tmp/pip/cache
!cp ../input/sbert-stuff/sentence_transformers/sentence_transformers/tokenizers-0.10.3-cp36-cp36m-win_amd64.whl /tmp/pip/cache
!cp ../input/sbert-stuff/sentence_transformers/sentence_transformers/transformers-4.6.1-py3-none-any.whl /tmp/pip/cache
!pip install --no-index --find-links /tmp/pip/cache/ sentence-transformers

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from torchtext.data.utils import get_tokenizer

In [None]:
# Prefer the first CUDA GPU when one is visible; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [None]:
# Competition data directory; both CSV files are read from it (train first, then test).
base = '/kaggle/input/commonlitreadabilityprize/'
train_df, test_df = (pd.read_csv(base + name) for name in ('train.csv', 'test.csv'))

In [None]:
import nltk.data

# Load NLTK's pre-trained English Punkt resource. The resulting object's
# `.tokenize(text)` is used by `Model.shared_step` to split an excerpt before
# it is passed to the model, which reads this module-level name directly.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Load the SBERT encoder from the local dataset copy. Use the device chosen by
# the `device` cell instead of hard-coding 'cuda', so the notebook's CPU
# fallback also applies to the encoder.
para_encoder = SentenceTransformer(
    '../input/sbert-stuff/paraphrase-mpnet-base-v2/paraphrase-mpnet-base-v2',
    device=str(device),
)

In [None]:
class Model(pl.LightningModule):
    """Readability regressor: sentence embeddings -> LSTM -> two-layer MLP head.

    One excerpt is processed at a time. The excerpt is split into sentences,
    each sentence is embedded by ``para_encoder``, the embedding sequence is
    fed through an LSTM, and the final hidden state of the last LSTM layer is
    mapped to a single scalar score.

    Args:
        para_encoder: object exposing ``encode(sentences, show_progress_bar=...,
            convert_to_tensor=True)``; here a ``SentenceTransformer``.
        embed_dim: size of one sentence embedding (LSTM input size).
        hidden_dim: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        ff_hidden: width of the hidden layer in the regression head.
        dropout: dropout between stacked LSTM layers; ignored when
            ``num_layers == 1`` because there is nothing to apply it between.
        lr: initial Adam learning rate.

    NOTE(review): if ``para_encoder`` is an ``nn.Module`` it is registered as a
    submodule by the attribute assignment, so its weights are part of
    ``self.parameters()`` and of saved checkpoints. ``encode`` is assumed to
    run without gradient tracking, so the encoder is not fine-tuned here —
    TODO confirm.
    """

    def __init__(self, para_encoder, embed_dim, hidden_dim, num_layers, ff_hidden, dropout=0.2, lr=1e-3):
        super().__init__()
        self.para_encoder = para_encoder
        self.rnn = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            # nn.LSTM only applies dropout between layers; a non-zero value
            # with a single layer has no effect and just emits a warning.
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.linear1 = nn.Linear(hidden_dim, ff_hidden)
        self.linear2 = nn.Linear(ff_hidden, 1)
        self.lr = lr

    def forward(self, para):
        """Score one excerpt.

        Args:
            para: list of sentence strings belonging to a single excerpt.

        Returns:
            Tensor of shape ``(1,)`` holding the predicted score.
        """
        para_encoding = self.para_encoder.encode(
            para, show_progress_bar=False, convert_to_tensor=True
        )  # (seq_len, embed_dim)
        # Make sure the embeddings live on the same device as the LSTM, then
        # add the batch axis expected by a sequence-first LSTM.
        para_encoding = para_encoding.to(self.device).unsqueeze(1)  # (seq_len, 1, embed_dim)
        _, (h_n, _) = self.rnn(para_encoding)  # h_n: (num_layers, 1, hidden_dim)
        # Use the last layer's final hidden state. A bare `squeeze()` would
        # keep the layer axis when num_layers > 1 and yield one prediction
        # per layer instead of a single score.
        last_hidden = h_n[-1].squeeze(0)  # (hidden_dim,)
        out = self.linear2(F.relu(self.linear1(last_hidden)))
        return out

    def configure_optimizers(self):
        """Adam optimizer whose learning rate is multiplied by 0.95 at every scheduler step."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        lmbda = lambda epoch: 0.95
        scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lmbda)
        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

    def shared_step(self, batch, batch_idx):
        """Compute the MSE loss for one batch (used by training and validation).

        Only the first excerpt of the batch is used, so the dataloader is
        expected to yield batches of exactly one excerpt. Sentence splitting
        relies on the module-level ``tokenizer``.
        """
        X_para, y = batch
        X_para = tokenizer.tokenize(X_para[0])
        y_hat = self(X_para)
        # The prediction is cast to float64 to match the dtype used for `y`.
        loss = F.mse_loss(y_hat.type(torch.float64), y)
        return loss

    def training_step(self, train_batch, batch_idx):
        """Return the training loss and log it as ``train_loss``."""
        loss = self.shared_step(train_batch, batch_idx)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, val_batch, batch_idx):
        """Log the validation loss as ``val_loss`` (the metric the checkpoint callback monitors)."""
        loss = self.shared_step(val_batch, batch_idx)
        self.log('val_loss', loss)

In [None]:
from torch.utils.data import Dataset, DataLoader, random_split

In [None]:
class ParaDataset(Dataset):
    """Map-style dataset that serves ``(excerpt, target)`` pairs from a DataFrame."""

    def __init__(self, df):
        # Keep a reference to the frame; rows are fetched lazily per item.
        self.df = df

    def __len__(self):
        # One sample per DataFrame row.
        return self.df.shape[0]

    def __getitem__(self, idx):
        # Positional lookup, so the frame's index labels are irrelevant.
        record = self.df.iloc[idx]
        return record['excerpt'], record['target']

In [None]:
dataset = ParaDataset(train_df)
# Hold out a fixed number of excerpts for validation. The training size is
# derived from the dataset length so the split stays valid (random_split
# raises when the lengths do not sum to len(dataset)) if train.csv changes.
val_size = 200
train_dataset, val_dataset = random_split(
    dataset,
    [len(dataset) - val_size, val_size],
    generator=torch.Generator().manual_seed(42),  # fixed seed -> reproducible split
)

In [None]:
# Model.shared_step only reads the first excerpt of each batch, so the batch
# size must stay at 1; it is spelled out here rather than left implicit.
# Training samples are reshuffled every epoch; validation order is fixed.
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False)

In [None]:
# Hyperparameters for the LSTM regressor.
hidden_dim, ff_hidden = 128, 64
num_layers = 1
dropout = 0.0  # since num_layers is 1
# The LSTM input size is taken from the sentence encoder.
embed_dim = para_encoder.get_sentence_embedding_dimension()
model = Model(
    para_encoder=para_encoder,
    embed_dim=embed_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    ff_hidden=ff_hidden,
    dropout=dropout,
)

In [None]:
# Keep the three checkpoints with the lowest logged `val_loss`.
checkpoint_callback = ModelCheckpoint(
    dirpath='./saved_models/',
    filename='model-{epoch:02d}-{val_loss:.2f}',
    monitor='val_loss',
    mode='min',
    save_top_k=3,
)

In [None]:
# Train on one GPU when CUDA is available and on the CPU otherwise, matching
# the fallback already used when `device` is chosen.
trainer = pl.Trainer(
    gpus=1 if torch.cuda.is_available() else 0,
    max_epochs=5,
    callbacks=[checkpoint_callback],
)
trainer.fit(model, train_dataloader, val_dataloader)

In [None]:
class ParaTestDataset(Dataset):
    """Map-style dataset that serves only the ``excerpt`` text of each row."""

    def __init__(self, df):
        # Keep a reference to the frame; rows are fetched lazily per item.
        self.df = df

    def __len__(self):
        # One sample per DataFrame row.
        return self.df.shape[0]

    def __getitem__(self, idx):
        # Positional lookup; no target is returned for test data.
        return self.df.iloc[idx]['excerpt']

In [None]:
# One excerpt per batch, in file order (these are the DataLoader defaults,
# written out explicitly).
test_dataset = ParaTestDataset(test_df)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [None]:
# Rebuild the model from the best checkpoint recorded by the callback. The
# constructor arguments are passed again as keyword arguments alongside the
# checkpoint path.
model = Model.load_from_checkpoint(
    checkpoint_callback.best_model_path,
    dropout=dropout,
    ff_hidden=ff_hidden,
    num_layers=num_layers,
    hidden_dim=hidden_dim,
    embed_dim=embed_dim,
    para_encoder=para_encoder,
)

In [None]:
# Move the restored model to the device selected earlier (`device`) before
# running inference.
model.to(device)

In [None]:
# Inference over the test set, one excerpt per batch.
model.eval()
preds = []
with torch.no_grad():
    for X_para_test in test_dataloader:
        # Split the excerpt into sentences exactly as Model.shared_step does
        # during training. Passing the raw batch would make the encoder embed
        # the whole excerpt as one "sentence", so the LSTM would see a
        # length-1 sequence it was never trained on.
        sentences = tokenizer.tokenize(X_para_test[0])
        preds.append(model(sentences).item())

In [None]:
# Pair each test id with its prediction (in test_df order) and write the
# submission file without the index column.
submission = pd.DataFrame(
    list(zip(test_df['id'], preds)),
    columns=['id', 'target'],
)
submission.to_csv('./submission.csv', index=False)