In [None]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.utilities.seed import seed_everything
import pandas as pd
import numpy as np
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, AdamW, RobertaConfig
from transformers.modeling_utils import ModuleUtilsMixin

In [None]:
# Fix the global random seed (42) through PyTorch Lightning's helper so that
# repeated runs of the notebook are comparable.
seed_everything(42)

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the tokenizer from the local roberta-base directory (path is relative
# to the notebook's working directory). It is used as a module-level global by
# the collate function and the embedding helper defined later.
tokenizer = AutoTokenizer.from_pretrained('../input/roberta-base')

In [None]:
# Read the training CSV into a frame that is previewed in the next cell.
df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')

In [None]:
# Preview the first rows of the training data.
df.head()

In [None]:
# Notebook-level configuration.
BATCH_SIZE = 16  # passed to Model(batch_size=...) for the train/val dataloaders
MAX_LENGTH = 210  # tokenizer max_length: sequences are padded/truncated to this many tokens
EMBED_SIZE = 768  # passed to Model(embed_dim=...): input width of the linear regression head
DATA_DIR = '/kaggle/input/commonlitreadabilityprize/'  # prefix for train.csv / test.csv (trailing slash required: paths are built by concatenation)

In [None]:
from torch.utils.data import DataLoader, Dataset

In [None]:
def generate_batch(data_batch):
    """Collate a list of samples into tokenized model inputs.

    Each sample is a sequence whose first element is the raw text. Labelled
    samples carry the regression target as their second element; unlabelled
    samples have the text only.

    Returns:
        ``(encoded, targets)`` when every sample has a target, where
        ``targets`` is a float32 tensor; otherwise just ``encoded``.
    """
    texts = [sample[0] for sample in data_batch]
    encoded = tokenizer(
        texts,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH,
    )

    # A sample without a second element means the batch is unlabelled.
    if any(len(sample) < 2 for sample in data_batch):
        return encoded

    labels = torch.tensor([sample[1] for sample in data_batch], dtype=torch.float32)
    return encoded, labels

### Build Model

In [None]:
class Model(pl.LightningModule):
    """RoBERTa encoder with a one-unit linear regression head.

    The prediction is computed from the hidden state of the first token of
    each sequence. The module also owns its data pipeline: ``setup`` reads
    ``train.csv`` / ``test.csv`` from ``data_dir`` and the ``*_dataloader``
    hooks wrap the resulting splits.

    Args:
        data_dir: Directory prefix (including the trailing separator) that is
            concatenated with ``'train.csv'`` / ``'test.csv'``.
        embed_dim: Width of the encoder's hidden state, i.e. the input size
            of the regression head.
        lr: Learning rate handed to the optimizer.
        batch_size: Batch size of the train and validation dataloaders.
        val_size: Number of training rows held out for validation.
    """

    def __init__(self, data_dir, embed_dim, lr=1e-3, batch_size=8, val_size=100):
        super(Model, self).__init__()
        self.data_dir = data_dir
        self.embed_dim = embed_dim
        self.roberta_model = AutoModel.from_pretrained('../input/roberta-base')
        # Optional: call self.freeze_embedder() here to keep the embeddings fixed.
        self.init_encoder()
        self.freeze_pooler()
        self.linear1 = nn.Linear(embed_dim, 1)
        self.lr = lr
        self.batch_size = batch_size
        self.val_size = val_size
        # Filled by test_step with one prediction per test sample.
        self.test_preds = []

    def freeze_embedder(self):
        """Exclude the embedding parameters from gradient updates."""
        for p in self.roberta_model.embeddings.parameters():
            p.requires_grad = False

    def init_encoder(self):
        """Re-initialize the final 4 layers of the encoder.

        Linear layers get Xavier-normal weights and zero biases; LayerNorm
        layers get unit weights and zero biases.
        """
        for m in self.roberta_model.encoder.layer[-4:].modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.LayerNorm):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)

    def freeze_pooler(self):
        """Exclude the pooler parameters from gradient updates.

        ``forward`` reads ``last_hidden_state`` only, so the pooler output is
        never part of the prediction.
        """
        for p in self.roberta_model.pooler.parameters():
            p.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Predict one value per sequence.

        Args:
            input_ids: Token ids of size (bsz, seq_len).
            attention_mask: Attention mask of size (bsz, seq_len).

        Returns:
            Tensor of size (bsz, 1).
        """
        hidden = self.roberta_model(
            input_ids, attention_mask=attention_mask).last_hidden_state
        first_token = hidden[:, 0, :]  # just take the <cls> token
        return self.linear1(first_token)

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters at ``self.lr``."""
        return AdamW(self.parameters(), lr=self.lr)

    def shared_step(self, batch, stage):
        """Run the forward pass shared by the train/val/test steps.

        Args:
            batch: ``(inputs, targets)`` for ``'train'`` / ``'val'``; just
                ``inputs`` for ``'test'``. ``inputs`` must provide
                ``'input_ids'`` and ``'attention_mask'``.
            stage: One of ``'train'``, ``'val'`` or ``'test'``.

        Returns:
            The MSE loss for ``'train'`` / ``'val'``; the flattened
            predictions for ``'test'``.

        Raises:
            ValueError: If ``stage`` is not one of the supported names.
        """
        if stage in ('train', 'val'):
            x, y = batch
        elif stage == 'test':
            x, y = batch, None
        else:
            raise ValueError(
                "stage must be 'train', 'val' or 'test', got {!r}".format(stage))

        y_hat = self(x['input_ids'], x['attention_mask']).flatten()
        if y is None:
            return y_hat
        return F.mse_loss(y_hat, y)

    def training_step(self, train_batch, batch_idx):
        """Compute, log (``train_loss``) and return the training loss."""
        loss = self.shared_step(train_batch, 'train')
        self.log('train_loss', loss)
        return loss

    def validation_step(self, val_batch, batch_idx):
        """Compute the validation loss and log it as ``val_loss``."""
        loss = self.shared_step(val_batch, 'val')
        self.log('val_loss', loss)

    def test_step(self, test_batch, batch_idx):
        """Append this batch's predictions to ``self.test_preds``."""
        preds = self.shared_step(test_batch, 'test')
        self.test_preds.extend(preds.flatten().cpu().numpy())

    ################
    # DATA HOOKS ###
    ###############
    def setup(self, stage=None):
        """Load the CSV data needed for ``stage``.

        ``'fit'`` builds the train/validation split from ``train.csv``;
        ``'test'`` loads ``test.csv``. ``None`` prepares both.

        Raises:
            ValueError: If ``val_size`` is negative or leaves no training rows.
        """
        if stage in (None, 'fit'):
            df = pd.read_csv(self.data_dir + 'train.csv')
            data = list(zip(df['excerpt'].tolist(), df['target'].tolist()))
            n_val = self.val_size
            n_train = len(data) - n_val
            if n_val < 0 or n_train <= 0:
                raise ValueError(
                    'val_size={} is incompatible with {} training rows'.format(
                        n_val, len(data)))
            # Sizes are derived from the data so the split does not depend on
            # one particular row count.
            self.train_data_split, self.val_data_split = torch.utils.data.random_split(
                data, [n_train, n_val])

        if stage in (None, 'test'):
            df = pd.read_csv(self.data_dir + 'test.csv')
            # One-element lists: the collate function reads the text at index 0
            # and treats the missing second element as "no target".
            self.test_data = [[text] for text in df['excerpt'].tolist()]

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return DataLoader(self.train_data_split, batch_size=self.batch_size,
                          shuffle=True, collate_fn=generate_batch)

    def val_dataloader(self):
        """Ordered loader over the validation split."""
        return DataLoader(self.val_data_split, batch_size=self.batch_size,
                          shuffle=False, collate_fn=generate_batch)

    def test_dataloader(self):
        """Ordered loader over the test excerpts (fixed batch size of 10)."""
        return DataLoader(self.test_data, batch_size=10,
                          shuffle=False, collate_fn=generate_batch)

In [None]:
# Checkpointing: keep the 3 checkpoints with the lowest `val_loss` (the metric
# logged in Model.validation_step) under ./saved_models/. The file name embeds
# the epoch number and the validation loss.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    dirpath='./saved_models/',
    filename='model-{epoch:02d}-{val_loss:.2f}',
    save_top_k=3,
    mode='min'
)

In [None]:
# Build the model from the notebook-level configuration.
model = Model(
    data_dir=DATA_DIR,
    embed_dim=EMBED_SIZE,
    batch_size=BATCH_SIZE)
# wandb_logger = WandbLogger()
trainer = pl.Trainer(
    # Use every visible GPU when CUDA is available; otherwise train on the CPU,
    # matching the fallback used for `device` earlier in the notebook.
    gpus=-1 if torch.cuda.is_available() else None,
    fast_dev_run=False,
    auto_lr_find=True,  # trainer.tune() below searches for a value of model.lr
    auto_scale_batch_size=None,
    accumulate_grad_batches=4,
    max_epochs=120,
    progress_bar_refresh_rate=0,  # 0 disables the progress bar
#     logger=wandb_logger,
    callbacks=[checkpoint_callback])
# Run the learning-rate finder, then train.
trainer.tune(model)
trainer.fit(model)

In [None]:
# Reload the best checkpoint recorded by the checkpoint callback. data_dir and
# embed_dim are supplied explicitly as constructor arguments.
model = Model.load_from_checkpoint(
    checkpoint_callback.best_model_path,
    data_dir=DATA_DIR,
    embed_dim=EMBED_SIZE)
# Move the restored model onto the device selected earlier in the notebook.
model.to(device)

In [None]:
def get_model_embeds(model, df, batch_size=32):
    """Embed every excerpt in ``df`` with the model's RoBERTa encoder.

    Args:
        model: Object exposing the encoder as ``model.roberta_model``.
        df: DataFrame with an ``excerpt`` column of texts.
        batch_size: Number of excerpts encoded per forward pass.

    Returns:
        Array of shape ``(len(df), hidden_size)`` holding the first-token
        (<CLS>) hidden state of each excerpt, in row order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1, got {}'.format(batch_size))

    texts = list(df.excerpt)
    # Follow the model's own placement instead of relying on a global device.
    model_device = next(model.parameters()).device

    model.eval()
    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            tok_output = tokenizer(texts[start:start + batch_size], return_tensors='pt',
                                   padding='max_length', truncation=True,
                                   max_length=MAX_LENGTH)
            input_ids = tok_output['input_ids'].to(model_device)
            attn_mask = tok_output['attention_mask'].to(model_device)
            hidden = model.roberta_model(input_ids, attn_mask).last_hidden_state
            # Keep only the <CLS> position of every sequence in the batch.
            chunks.append(hidden[:, 0].cpu().numpy())

    if not chunks:
        # Empty input: keep the 2-D shape downstream estimators expect.
        return np.empty((0, model.roberta_model.config.hidden_size), dtype=np.float32)
    return np.concatenate(chunks, axis=0)

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

In [None]:
# Support-vector regressor fitted on the embeddings produced by the reloaded model.
svr = SVR()

# Hyper-parameter grid: three kernels crossed with four values of C.
parameters = {'kernel':['linear', 'rbf', 'poly'], 'C':[0.01, 0.1, 1, 10]}

# 5-fold cross-validated grid search scored by negative RMSE.
gsv = GridSearchCV(estimator=svr, param_grid=parameters, scoring='neg_root_mean_squared_error', cv=5)

train_df = pd.read_csv(DATA_DIR+'train.csv')

# Feature matrix: one <CLS> embedding per training excerpt; y holds the targets.
X = get_model_embeds(model, train_df)
y = list(train_df.target)

gsv.fit(X, y)

In [None]:
# Embed the test excerpts with the same model and predict with the fitted grid search.
test_df = pd.read_csv(DATA_DIR + 'test.csv')

test_embeds = get_model_embeds(model, test_df)

preds = gsv.predict(test_embeds)

In [None]:
# Assemble the submission column by column (ids alongside their predictions)
# and write it without the index column.
submission = pd.DataFrame({'id': list(test_df.id), 'target': preds})
submission.to_csv('./submission.csv', index=False)