This is my first experience with NLP

Here I am experimenting with different models (RoBERTa, DistilBERT, XLNet) + different heads (LSTM, 1D-CNN, Attention, Transformer) + an FC layer.

The pipeline is very flexible: you can change any of its components.

Here I run only a single fold, but you can add a loop over the folds and train with cross-validation.

In [None]:
pip install --upgrade -q pytorch-lightning==1.3.1

In [None]:
import numpy as np
import pandas as pd
import pathlib

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

from matplotlib import pyplot as plt

import os, random, gc
import re, time, json, pickle

from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

from sklearn.model_selection import KFold

from tqdm.notebook import tqdm
import pytorch_lightning as pl

from transformers import AutoTokenizer, AutoConfig, AutoModelForTokenClassification
import joblib

from typing import Optional
from collections import defaultdict

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker processes).
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which
    # defeats the deterministic flag above, so disable it as well.
    torch.backends.cudnn.benchmark = False

In [None]:
class CONFIG:
    """Static settings shared by the dataset, the model and the training loop."""

    # --- reproducibility / cross-validation ---
    SEED = 321
    n_folds = 5

    # --- model / tokenisation ---
    max_length = 300      # every excerpt is padded or truncated to this many tokens
    num_targets = 1       # width of the final linear layer
    learning_rate = 5e-5

    # --- DataLoader keyword arguments, keyed by split name ---
    # 'trn' is the only shuffled split; the others keep the row order.
    loader_params = {
        'trn': {'batch_size': 4, 'num_workers': 0, 'shuffle': True, 'pin_memory': True},
        'val': {'batch_size': 5, 'num_workers': 0, 'shuffle': False, 'pin_memory': True},
        'tst': {'batch_size': 5, 'num_workers': 0, 'shuffle': False, 'pin_memory': True},
        'all': {'batch_size': 5, 'num_workers': 0, 'shuffle': False, 'pin_memory': True},
    }
    
# Seed all random number generators once, right after the configuration is defined.
seed_everything(CONFIG.SEED)

# Competition input directory (the same path is the default `data_path` of CommonLitModel).
root_path = pathlib.Path('../input/commonlitreadabilityprize')

# Current working directory; presumably intended for outputs — not referenced in the code shown.
save_path = pathlib.Path('.')


In [None]:
class CommonLitDataset(Dataset):
    """Map-style dataset over pre-tokenised excerpts.

    Args:
        inputs: Tensor of token ids, one row per sample.
        masks: Attention-mask tensor with exactly the same shape as ``inputs``.
        targets: Tensor with one regression target per sample; stored as float32.

    Raises:
        ValueError: If ``inputs`` and ``masks`` differ in shape, or if the
            number of targets differs from the number of input rows.
    """

    def __init__(self, inputs, masks, targets):
        # Explicit checks instead of ``assert`` so that they survive ``python -O``.
        if inputs.shape != masks.shape:
            raise ValueError(
                f'inputs and masks must have the same shape, '
                f'got {tuple(inputs.shape)} and {tuple(masks.shape)}')
        if len(targets) != len(inputs):
            raise ValueError(
                f'expected one target per input row, '
                f'got {len(targets)} targets for {len(inputs)} rows')
        self.inputs = inputs
        self.masks = masks
        self.targets = targets.type(torch.float32)

    def __len__(self):
        """Return the number of samples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return one sample as a dict with keys ``inputs``, ``masks`` and ``targets``.

        The target is indexed with a list so it keeps a leading dimension of
        size 1; collated batches of 1-D targets therefore have shape (batch, 1).
        """
        return dict(inputs=self.inputs[idx], masks=self.masks[idx], targets=self.targets[[idx]])


In [None]:
class PrintCallback(pl.callbacks.Callback):
    """
    Callback for PyTorch Lightning that stores the epoch-level metrics and prints them.

    The collected history is kept in ``self.metrics`` as
    ``{epoch: {metric_name: value}}`` and can be exported with ``to_df`` or
    drawn with ``plot``.
    """

    def __init__(self):
        super().__init__()
        self.metrics = {}

    def on_epoch_end(self, trainer, pl_module):
        """Record and print every callback metric whose name does not contain 'step'.

        NOTE(review): depending on the Lightning version this hook may fire for
        both the training and the validation epoch, printing a line for each —
        confirm against the installed version.
        """
        metrics_dict = {k: v for k, v in trainer.callback_metrics.items() if 'step' not in k}
        # Later calls for the same epoch overwrite the earlier entry.
        self.metrics[trainer.current_epoch] = metrics_dict
        msg = f'epoch: {str(trainer.current_epoch).rjust(4)}\t'
        msg += '\t'.join([f'{k}: {v:.5f}' for k, v in metrics_dict.items()])
        print(msg)

    def to_df(self):
        """Return the history as a DataFrame: one row per epoch, one column per metric."""
        return pd.DataFrame(
            {epoch: {k: v.item() for k, v in metrics.items()} for epoch, metrics in self.metrics.items()}).T

    def plot(self):
        """Plot every recorded metric against the epoch and return the figure."""
        df = self.to_df()
        fig, ax = plt.subplots(1, 1, figsize=(12, 6))
        for column in df.columns:
            # ``legend`` is a boolean switch; the label comes from the Series name.
            df[column].plot(ax=ax, legend=True)
        return fig


In [None]:
class Squeeze(nn.Module):
    """Layer form of ``Tensor.squeeze`` so it can be placed inside ``nn.Sequential``.

    Args:
        dims: Dimension to squeeze; defaults to the last one.
    """

    def __init__(self, dims=-1):
        super().__init__()
        self.dims = dims

    def forward(self, x):
        """Return ``x`` with dimension ``self.dims`` removed if it has size 1."""
        squeezed = torch.squeeze(x, self.dims)
        return squeezed
        
class AttentionHead(nn.Module):
    """Additive-attention pooling over dimension 1 of the input.

    A score is computed for every position as ``V(tanh(W(features)))``, the
    scores are soft-maxed along dimension 1, and the input features are summed
    with those weights.

    Args:
        in_features: Size of the last dimension of the input.
        hidden_dim: Width of the intermediate projection ``W``.
        num_targets: Unused; kept so all heads share the same constructor shape.

    Attributes:
        out_features: Size of the last dimension of the output. The pooled
            vector is a weighted sum of the *input* features, so this equals
            ``in_features`` (not ``hidden_dim``).
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim

        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        # The context vector has in_features columns regardless of hidden_dim.
        self.out_features = in_features

    def forward(self, features):
        """Pool ``features`` of shape (batch, positions, in_features) to (batch, in_features)."""
        att = torch.tanh(self.W(features))

        # One scalar score per position: (batch, positions, 1).
        score = self.V(att)

        attention_weights = torch.softmax(score, dim=1)

        # Broadcast the weights over the feature axis, then sum out the positions.
        context_vector = attention_weights * features
        context_vector = torch.sum(context_vector, dim=1)

        return context_vector

    

class CNNHead(nn.Module):
    """1-D convolution followed by global max pooling.

    Args:
        in_features: Number of input channels (size of the last input dimension).
        hidden_dim: Number of convolution filters; also the output width.
        kernel_size: Width of the convolution window.
        num_targets: Unused; kept for a uniform head constructor.
    """

    def __init__(self, in_features, hidden_dim, kernel_size=10, num_targets=1):
        super().__init__()
        conv = nn.Conv1d(in_features, hidden_dim, kernel_size=kernel_size)
        global_max = nn.AdaptiveMaxPool1d(1)
        drop_pooled_axis = Squeeze()
        self.head = nn.Sequential(conv, global_max, drop_pooled_axis)
        self.out_features = hidden_dim

    def forward(self, x):
        """Swap the last two axes so features become channels, then convolve and pool."""
        channels_first = x.permute(0, 2, 1)
        return self.head(channels_first)
        

class LSTMHead(nn.Module):
    """Unidirectional LSTM head that returns the last layer's final hidden state.

    Args:
        in_features: Size of the last dimension of the input.
        hidden_dim: LSTM hidden size; also the output width.
        n_layers: Number of stacked LSTM layers.
        num_targets: Unused; kept for a uniform head constructor.
    """

    def __init__(self, in_features, hidden_dim, n_layers, num_targets=1):
        super().__init__()
        # nn.LSTM only applies dropout *between* stacked layers. With a single
        # layer it has no effect and PyTorch warns, so request it only when
        # there is more than one layer.
        inter_layer_dropout = 0.2 if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(in_features,
                            hidden_dim,
                            n_layers,
                            batch_first=True,
                            bidirectional=False,
                            dropout=inter_layer_dropout)
        self.out_features = hidden_dim

    def forward(self, x):
        """Run the LSTM over batch-first ``x`` and return a (batch, hidden_dim) tensor."""
        self.lstm.flatten_parameters()
        _, (hidden, _) = self.lstm(x)
        # hidden is (n_layers, batch, hidden_dim); keep the top layer only.
        out = hidden[-1]
        return out
        
        
class TransformerHead(nn.Module):
    """Transformer-encoder head that maps every token to one scalar.

    The input is passed through ``num_layers`` encoder layers, then a shared
    linear layer reduces each token's features to a single value, giving one
    value per token position.

    Args:
        in_features: Size of the last dimension of the input (``d_model``).
        max_length: Number of token positions; stored as ``out_features``, so
            inputs are expected to be padded to exactly this length.
        num_layers: Number of stacked encoder layers.
        nhead: Number of attention heads; must divide ``in_features``.
        num_targets: Unused; kept for a uniform head constructor.
    """

    def __init__(self, in_features, max_length, num_layers=1, nhead=8, num_targets=1):
        super().__init__()

        self.transformer = nn.TransformerEncoder(encoder_layer=nn.TransformerEncoderLayer(d_model=in_features,
                                                                                          nhead=nhead),
                                                 num_layers=num_layers)
        self.row_fc = nn.Linear(in_features, 1)
        self.out_features = max_length

    def forward(self, x):
        """Encode batch-first ``x`` (batch, seq, in_features) and return (batch, seq)."""
        # The encoder layer is built with its default layout, (seq, batch, dim).
        # Feeding batch-first data unchanged would make self-attention run
        # across the samples of a batch instead of across the tokens of each
        # sample, so swap the first two axes on the way in and back on the way out.
        out = self.transformer(x.transpose(0, 1)).transpose(0, 1)
        out = self.row_fc(out).squeeze(-1)
        return out

In [None]:
class CommonLitModel(pl.LightningModule):
    """Pretrained transformer + pooling head + linear layer, trained with MSE loss.

    The module owns its data: on construction it reads ``train.csv`` and
    ``test.csv`` from ``data_path``, assigns cross-validation folds, tokenises
    every excerpt and exposes train/val/test data loaders for ``fold_id``.

    If a config, tokenizer and state dict previously written by ``save`` exist
    under ``models_path / model_name``, they are loaded instead of downloading
    the pretrained weights, and ``is_fine_tuned`` is set to True.

    Args:
        model_name: Hugging Face model identifier; also the sub-directory name
            under ``models_path``.
        fold_id: Fold used for validation; all other non-negative folds are
            used for training.
        head: Name of the head class: 'AttentionHead', 'CNNHead', 'LSTMHead'
            or 'TransformerHead'.
        models_path: Root directory for saved fine-tuned artefacts.
        data_path: Directory containing ``train.csv`` and ``test.csv``.

    Raises:
        ValueError: If ``head`` is not one of the supported names.
    """

    def __init__(self,
                 model_name: str,
                 fold_id: int,
                 head,
                 models_path: pathlib.Path = pathlib.Path('./models'),
                 data_path: pathlib.Path = pathlib.Path('../input/commonlitreadabilityprize')
                 ):
        super().__init__()
        self.model_name = model_name
        self.num_targets = CONFIG.num_targets
        self.criterion = nn.MSELoss()
        self.fold_id = fold_id
        # Stores the constructor arguments so load_from_checkpoint can rebuild the module.
        self.save_hyperparameters()
        self.use_attn_block = True

        self.model_path = pathlib.Path(models_path) / model_name
        self.state_dict_path = self.model_path / f'{head}_fold{fold_id}.pt'
        self.config_path = self.model_path / 'config.pkl'
        self.tokenizer_path = self.model_path / 'tokenizer.pkl'

        # All three artefacts are read below, so all three must be present.
        self.is_fine_tuned = (self.config_path.exists()
                              and self.tokenizer_path.exists()
                              and self.state_dict_path.exists())
        if self.is_fine_tuned:
            # NOTE(security): pickle executes arbitrary code on load; only read
            # files that were written by `save` in a trusted environment.
            with open(self.config_path, 'rb') as f:
                self.config = pickle.load(f)
            self.feature_extractor = AutoModelForTokenClassification.from_config(self.config)

            with open(self.tokenizer_path, 'rb') as f:
                self.tokenizer = pickle.load(f)

        else:
            self.feature_extractor = AutoModelForTokenClassification.from_pretrained(model_name)
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.config = AutoConfig.from_pretrained(model_name)

        # Width of the features that reach the token classifier, which is
        # replaced by an identity below.
        in_features = self.feature_extractor.classifier.in_features
        if head == 'AttentionHead':
            self.head = AttentionHead(in_features=in_features, hidden_dim=in_features, num_targets=1)
        elif head == 'CNNHead':
            self.head = CNNHead(in_features=in_features, hidden_dim=in_features//4, kernel_size=10, num_targets=1)
        elif head == 'LSTMHead':
            self.head = LSTMHead(in_features=in_features, hidden_dim=in_features//4, n_layers=1, num_targets=1)
        elif head == 'TransformerHead':
            self.head = TransformerHead(in_features=in_features, max_length=CONFIG.max_length, num_layers=1, nhead=8, num_targets=1)
        else:
            raise ValueError(
                f"Unknown head {head!r}; expected one of "
                f"'AttentionHead', 'CNNHead', 'LSTMHead', 'TransformerHead'")
        # With the classifier removed, the backbone's "logits" output presumably
        # carries the per-token features that the head consumes.
        self.feature_extractor.classifier = nn.Identity()
        self.fc = nn.Linear(self.head.out_features, self.num_targets)

        if self.is_fine_tuned:
            self.load_state_dict(torch.load(self.state_dict_path, map_location=self.device))
        # Accept plain strings as well; the path is joined with '/' in prepare_data.
        self.data_path = pathlib.Path(data_path)

        self.data = self.prepare_data()

        # Boolean row masks over self.data for each split.
        self.masks = dict(trn=(self.data.fold_id.ne(self.fold_id) & self.data.fold_id.ge(0)).values,
                          val=self.data.fold_id.eq(self.fold_id).values,
                          tst=self.data.fold_id.eq(-1).values
                          )

    @staticmethod
    def set_folds(in_data, cv_obj=KFold(n_splits=5, random_state=CONFIG.SEED, shuffle=True)):
        """Return a copy of ``in_data`` with an integer ``fold_id`` column.

        Each row receives the index of the split in which it is part of the
        validation set; rows never selected keep -1.

        Args:
            in_data: DataFrame to split. Its index may be arbitrary.
            cv_obj: Splitter exposing ``split(X)`` that yields
                ``(train_positions, validation_positions)`` pairs.
        """
        df = in_data.copy()
        # The splitter yields row *positions*, so fill a positional array
        # rather than writing through label-based ``.loc``.
        fold_ids = np.full(len(df), -1, dtype=np.int64)
        for fold_id, (_, val_positions) in enumerate(cv_obj.split(np.arange(len(df)))):
            fold_ids[val_positions] = fold_id
        df["fold_id"] = fold_ids
        return df

    def prepare_data(self):
        """Read train and test CSVs, assign folds and tokenise every excerpt.

        Returns:
            DataFrame indexed by ``id`` with columns ``fold_id``, ``inputs``,
            ``attention_masks`` and ``target``. Test rows have ``fold_id`` -1.

        NOTE(review): Lightning defines a hook with this name, so the Trainer
        presumably calls it again during ``fit`` and discards the result —
        confirm, and consider caching if the repeated tokenisation matters.
        """
        train_df = (pd.read_csv(self.data_path / "train.csv")
                    .pipe(self.set_folds, cv_obj=KFold(n_splits=CONFIG.n_folds, random_state=CONFIG.SEED, shuffle=True))
                    )
        test_df = (pd.read_csv(self.data_path / "test.csv")
                   .assign(fold_id=-1))
        df = pd.concat([train_df, test_df], sort=False).set_index('id')

        inputs_ids = dict()
        attention_masks = dict()
        for idx, excerpt in zip(df.index, df.excerpt):
            # Every excerpt is padded or truncated to CONFIG.max_length tokens.
            out = self.tokenizer(excerpt,
                                 add_special_tokens=True,
                                 return_tensors="pt",
                                 max_length=CONFIG.max_length,
                                 padding="max_length",
                                 truncation=True)
            inputs_ids[idx] = out['input_ids']
            attention_masks[idx] = out['attention_mask']

        df = (df
                  .join(pd.Series(inputs_ids).to_frame('inputs'))
                  .join(pd.Series(attention_masks).to_frame('attention_masks'))
                  .loc[:, ['fold_id', 'inputs', 'attention_masks', 'target']])
        return df

    def fetch_dataloader(self, typ: str):
        """Build a DataLoader for one split.

        Args:
            typ: 'trn', 'val', 'tst', or 'all' for every row (train and test).
                Also selects the loader settings in ``CONFIG.loader_params``.
        """
        if typ == 'all':
            dt = self.data
        else:
            dt = self.data.loc[self.masks[typ]]

        # Concatenate the per-row tensors into one tensor per column.
        ds = CommonLitDataset(inputs=torch.cat(dt['inputs'].tolist()),
                              masks=torch.cat(dt['attention_masks'].tolist()),
                              targets=torch.from_numpy(dt['target'].values)
                              )
        return DataLoader(ds, **CONFIG.loader_params[typ])

    def train_dataloader(self):
        """Return the loader over the training folds."""
        return self.fetch_dataloader(typ='trn')

    def val_dataloader(self):
        """Return the loader over the validation fold."""
        return self.fetch_dataloader(typ='val')

    def test_dataloader(self):
        """Return the loader over the test rows."""
        return self.fetch_dataloader(typ='tst')

    def configure_optimizers(self):
        """Return Adam with a cosine-annealing learning-rate schedule."""
        optimizer = optim.Adam(self.parameters(), lr=CONFIG.learning_rate)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, eta_min=1e-5, T_max=20)
        # Lightning looks the scheduler up under the key 'lr_scheduler'; under
        # any other key it is not picked up and the learning rate stays constant.
        return dict(optimizer=optimizer, lr_scheduler=scheduler)

    def forward(self, inputs, masks):
        """Return predictions of shape (batch, num_targets) for token ids and attention masks."""
        x = self.feature_extractor(inputs, masks)["logits"]
        x = self.head(x)
        x = self.fc(x)
        return x

    def shared_step(self, batch, typ):
        """Compute the MSE loss for ``batch`` and log it per epoch as ``{typ}_loss``."""
        inputs = batch['inputs']
        masks = batch['masks']
        targets = batch['targets']
        outputs = self(inputs, masks)
        loss = self.criterion(outputs, targets)
        self.log(f'{typ}_loss', loss, on_step=False, on_epoch=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Lightning hook: loss for one training batch (logged as ``trn_loss``)."""
        return self.shared_step(batch, typ='trn')

    def validation_step(self, batch, batch_idx):
        """Lightning hook: loss for one validation batch (logged as ``val_loss``)."""
        return self.shared_step(batch, typ='val')

    def save(self):
        """Write the state dict, tokenizer and config under ``self.model_path``.

        These are the files whose presence makes a later instance with the
        same arguments load as fine-tuned.
        """
        # Do not rely on the caller having created the directory.
        self.model_path.mkdir(parents=True, exist_ok=True)
        torch.save(self.state_dict(), self.state_dict_path)
        with open(self.tokenizer_path, 'wb') as f:
            pickle.dump(self.tokenizer, f)
        with open(self.config_path, 'wb') as f:
            pickle.dump(self.config, f)




In [None]:

results = []
heads = ['AttentionHead', 'CNNHead', 'LSTMHead', 'TransformerHead']
# Backbones to try; uncomment entries to include them. Every entry ends with a
# comma so that uncommenting one cannot silently concatenate adjacent strings.
models = [
#     'roberta-base',
    'distilbert-base-uncased',
#     'xlnet-base-cased',
]


dir_path = pathlib.Path('./models')

fold_id = 0
# Evaluate on the same kind of device the Trainer below trains on.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _validation_rmse(model, device):
    """Return the RMSE of ``model`` over its validation loader, computed on ``device``.

    The model is moved to ``device`` by the caller; here it is switched to eval
    mode and run without building autograd graphs.
    """
    model.eval()
    squared_errors = []
    with torch.no_grad():
        for batch in model.val_dataloader():
            preds = model(batch['inputs'].to(device), batch['masks'].to(device))
            diff = preds - batch['targets'].to(device)
            squared_errors.append(diff.pow(2).cpu().numpy().ravel())
    return np.sqrt(np.concatenate(squared_errors).mean())


models_dict = defaultdict(lambda: defaultdict(dict))
for model_name in models:
    model_path = dir_path / model_name
    model_path.mkdir(exist_ok=True, parents=True)
    for head in heads:
        try:
            model = CommonLitModel(model_name, fold_id, head=head)

            print(model_name, head)
            if model.is_fine_tuned:
                # Saved weights were found: evaluate only, do not retrain.
                model = model.to(device)
                print('\talready fine tuned')
                score = _validation_rmse(model, device)
                models_dict[model_name][head] = model
                print(model_name, head, score)
                # Record cached models too so the summary table is complete on re-runs.
                results.append([model_name, head, score])
                continue

            # Without `monitor`, the checkpoint callback does not rank epochs
            # by validation loss, so `best_model_path` would not be the best epoch.
            checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor='val_loss', mode='min', save_top_k=1)
            print_callback = PrintCallback()
            es_callback = pl.callbacks.EarlyStopping(monitor='val_loss', patience=2)

            trainer = pl.Trainer(gpus=1 if torch.cuda.is_available() else 0, max_epochs=50, fast_dev_run=False, callbacks=[checkpoint_callback, print_callback, es_callback])

            trainer.fit(model, model.train_dataloader(), model.val_dataloader())

            print_callback.plot()
            plt.show()

            # Reload the best checkpoint, persist it, then score it.
            model = CommonLitModel.load_from_checkpoint(checkpoint_callback.best_model_path)
            model = model.to(device)
            model.save()
            score = _validation_rmse(model, device)
            torch.cuda.empty_cache()
            models_dict[model_name][head] = model
            print(model_name, head, score)
            results.append([model_name, head, score])
        except Exception as e:
            # Best effort: one failing (model, head) pair must not abort the
            # sweep, but report which pair failed and with what error type.
            print(f'{model_name} / {head} failed: {type(e).__name__}: {e}')

In [None]:
# Summarise the collected scores: rows are heads, columns are models; max() keeps the largest score per (head, model) pair.
pd.DataFrame(results, columns=['model_name', 'head_name', 'score']).groupby(['head_name', 'model_name']).max().unstack()

In [None]:
!rm -rf ./lightning_logs