# Imports

In [None]:
! curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
! python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

In [None]:
! pip install transformers sentencepiece accelerate

In [None]:
from accelerate import Accelerator
from accelerate import notebook_launcher

In [None]:
import os
import pandas as pd
import numpy as np
import copy
import csv
import json
import random
from tqdm.auto import tqdm
import time
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from pprint import pprint
import pickle

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import torch

import transformers
from sklearn.metrics import (
    roc_auc_score
)
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

# Config

In [None]:
# Seed used for the train/validation shuffle and for torch.manual_seed.
SEED = 42

# paths
DATA_DIR = "../input/stumbleupon/"  # directory holding train.tsv / test.tsv
TF_MODEL_PATH = "roberta-base"  # Hugging Face checkpoint for tokenizer and encoder
TRAINED_MODELS_PATH = f"stumbleupon_{TF_MODEL_PATH}.pt"  # best-checkpoint file

# data
TITLE_MAX_LENGTH = 64
# roberta-base ships 514 position embeddings (512 usable tokens). Padding or
# truncating bodies to 1024 tokens would index past that table, so the body
# length is capped at the encoder's limit.
BODY_MAX_LENGTH = 512
TRAIN_BATCH_SIZE = 8
VAL_BATCH_SIZE = 8
VALIDATION_SPLIT = 0.2  # fraction of the training rows held out for validation
N_LABELS = 2

# model
TF_HIDDEN = 768  # encoder hidden size; must match TF_MODEL_PATH
FULL_FINETUNING = True
LR = 3e-5
OPTIMIZER = 'AdamW'  # attribute name looked up on torch.optim
CRITERION = 'CrossEntropyLoss'  # attribute name looked up on torch.nn
SAVE_BEST_MODEL = True
EPOCHS = 1

# Data Preprocessing

In [None]:
# Load the raw competition tables; both files are tab-separated.
df_train, df_test = (
    pd.read_csv(DATA_DIR + fname, sep='\t') for fname in ("train.tsv", "test.tsv")
)

In [None]:
# Bare expression: displays the raw training frame when run as a notebook cell.
df_train

In [None]:
def get_df_text(df, is_test=False):
    """Build a text-only frame (urlid, title, body[, label]) from the raw data.

    Each row's ``boilerplate`` column is parsed as JSON and its ``title`` and
    ``body`` entries are cleaned: English stop words are dropped
    (case-insensitively) and the remaining whitespace-separated tokens are
    lemmatized. A missing or empty title falls back to the JSON ``url`` entry.
    Any field that is missing, or empty after cleaning, becomes the literal
    string ``"None"``.

    Args:
        df: frame with ``urlid`` and ``boilerplate`` columns, plus ``label``
            unless ``is_test`` is true.
        is_test: when true, no ``label`` column is copied.

    Returns:
        A new DataFrame with columns ``urlid``, ``title``, ``body`` and, for
        training data, ``label``.
    """
    lem = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    placeholder = "None"

    def clean_text(s):
        """Remove stop words from ``s`` and lemmatize what is left."""
        kept = (tok for tok in s.split() if tok.lower() not in stop_words)
        return ' '.join(lem.lemmatize(tok) for tok in kept)

    def extract_field(di, keyword):
        """Return the cleaned ``keyword`` entry of the parsed boilerplate."""
        raw = di.get(keyword)
        if not raw and keyword == "title":
            # Only the title has a fallback: the page URL stored alongside it.
            raw = di.get("url")
        if not raw:
            return placeholder
        # Cleaning can strip every token (e.g. a title made of stop words);
        # use the same placeholder as for a missing field instead of "".
        return clean_text(raw) or placeholder

    # Parse each JSON blob once and reuse it for both fields.
    parsed = df["boilerplate"].apply(json.loads)

    df_text = pd.DataFrame()
    df_text["urlid"] = df["urlid"]
    df_text["title"] = parsed.apply(lambda di: extract_field(di, "title"))
    df_text["body"] = parsed.apply(lambda di: extract_field(di, "body"))

    if not is_test:
        df_text["label"] = df["label"]

    return df_text

In [None]:
# Build the cleaned title/body frames; the test split carries no label column.
df_train_text = get_df_text(df_train)
df_test_text = get_df_text(df_test, is_test=True)
# Bare expression: displays the cleaned training frame in the notebook.
df_train_text

# Dataset

In [None]:
class TransformerDataset(torch.utils.data.Dataset):
    """Tokenizes (title, body) text pairs on access for the dual-input model.

    Each item is a dict with ``"title"`` and ``"body"`` entries, each holding
    ``input_ids`` and ``attention_mask`` long tensors padded/truncated to
    ``TITLE_MAX_LENGTH`` / ``BODY_MAX_LENGTH``. Unless ``is_test`` is set, the
    item also carries a scalar long ``"labels"`` tensor.
    """

    def __init__(self, df, is_test=False):
        """Store the text (and label) columns of ``df`` and load the tokenizer.

        Args:
            df: frame with ``title`` and ``body`` columns, plus ``label``
                unless ``is_test`` is true.
            is_test: when true, items are returned without ``"labels"``.
        """
        super().__init__()

        self.titles = df["title"].values
        self.bodies = df["body"].values
        self.is_test = is_test
        if not self.is_test:
            self.labels = df["label"].values

        self.tokenizer = transformers.AutoTokenizer.from_pretrained(TF_MODEL_PATH)

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.titles)

    def _encode(self, text, max_length):
        """Tokenize ``text`` to exactly ``max_length`` tokens.

        Returns a dict with 1-D long ``input_ids`` and ``attention_mask``.
        """
        encoded = self.tokenizer.encode_plus(
            str(text),
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors="pt"
        )
        # return_tensors="pt" adds a leading batch dim of size 1; drop only
        # that dim so the default DataLoader collation can re-batch the items.
        return {
            "input_ids": encoded["input_ids"].squeeze(0).long(),
            "attention_mask": encoded["attention_mask"].squeeze(0).long(),
        }

    def __getitem__(self, index):
        """Return the tokenized title/body (and label) for row ``index``."""
        item = {
            "title": self._encode(self.titles[index], TITLE_MAX_LENGTH),
            "body": self._encode(self.bodies[index], BODY_MAX_LENGTH),
        }

        if not self.is_test:
            # Build the integer label directly instead of via a float tensor.
            item["labels"] = torch.tensor(int(self.labels[index]), dtype=torch.long)

        return item

# Model

In [None]:
class TFDualHeadModel(torch.nn.Module):
    """Classifier that encodes title and body with one shared transformer.

    Both inputs go through the same pretrained encoder (``self.tf``). Their
    pooled outputs pass through dropout, are concatenated, and a single linear
    layer maps them to ``N_LABELS`` logits.
    """

    def __init__(self):
        super().__init__()

        self.tf = transformers.AutoModel.from_pretrained(TF_MODEL_PATH)
        self.dropout = torch.nn.Dropout(p=0.3)
        # Take the width from the loaded encoder so the head stays correct if
        # TF_MODEL_PATH is switched to a model with a different hidden size;
        # TF_HIDDEN remains the fallback for configs without `hidden_size`.
        hidden_size = getattr(self.tf.config, "hidden_size", TF_HIDDEN)
        self.output = torch.nn.Linear(hidden_size * 2, N_LABELS)

    def forward(
        self,
        title_input_ids,
        title_attention_mask,
        body_input_ids,
        body_attention_mask
    ):
        """Return classification logits for a batch of (title, body) pairs.

        Args:
            title_input_ids: batched token ids of the titles.
            title_attention_mask: attention mask matching ``title_input_ids``.
            body_input_ids: batched token ids of the bodies.
            body_attention_mask: attention mask matching ``body_input_ids``.

        Returns:
            Output of the final linear layer: one row of ``N_LABELS`` logits
            per example.
        """
        title_tf_out = self.tf(
            input_ids=title_input_ids,
            attention_mask=title_attention_mask
        )
        title_drop = self.dropout(title_tf_out.pooler_output)

        body_tf_out = self.tf(
            input_ids=body_input_ids,
            attention_mask=body_attention_mask
        )
        body_drop = self.dropout(body_tf_out.pooler_output)

        # Concatenate along the feature axis: [title features | body features].
        combined = torch.cat([title_drop, body_drop], dim=1)
        return self.output(combined)

# Run

In [None]:
def get_model(model_path):
    """Rebuild a ``TFDualHeadModel`` and restore weights from a checkpoint.

    Args:
        model_path: path to a checkpoint dict with ``"state_dict"`` (model
            weights) and ``"roc"`` (the validation ROC-AUC stored with it).

    Returns:
        Tuple ``(model, roc)``: the model with loaded weights, still on CPU,
        and the stored ROC-AUC value.
    """
    print("\n-- Loading model")
    model = TFDualHeadModel()

    # Load onto CPU so a checkpoint written on an accelerator can be restored
    # on any machine; callers move the model to the target device afterwards.
    # The model is freshly built and never wrapped, so no Accelerator or
    # unwrap step is needed here.
    chkpt = torch.load(model_path, map_location="cpu")

    model.load_state_dict(chkpt["state_dict"])
    return model, chkpt["roc"]


def load_data(df):
    """Split ``df`` into shuffled train and validation frames.

    Row positions are shuffled with NumPy's global RNG after seeding it with
    ``SEED``. The first ``floor(VALIDATION_SPLIT * len(df))`` shuffled
    positions become the validation set and the rest the training set.

    Returns:
        Tuple ``(df_train, df_val)``.
    """
    n_rows = len(df)
    n_val = int(np.floor(VALIDATION_SPLIT * n_rows))

    order = list(range(n_rows))
    np.random.seed(SEED)
    np.random.shuffle(order)

    val_rows = order[:n_val]
    train_rows = order[n_val:]

    return df.iloc[train_rows], df.iloc[val_rows]


def _build_optimizer(model):
    """Create the optimizer named by ``OPTIMIZER`` for ``model``.

    With ``FULL_FINETUNING`` every parameter is trained, with weight decay
    disabled for biases and LayerNorm parameters. Otherwise the encoder is
    frozen and only the classification head is optimized.
    """
    if FULL_FINETUNING:
        no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")
        decay_params, no_decay_params = [], []
        for name, param in model.named_parameters():
            if any(nd in name for nd in no_decay):
                no_decay_params.append(param)
            else:
                decay_params.append(param)
        optimizer_parameters = [
            {"params": decay_params, "weight_decay": 0.001},
            {"params": no_decay_params, "weight_decay": 0.0},
        ]
    else:
        # Head-only training: freeze the encoder so no gradients are computed
        # for it, and hand just the output layer to the optimizer.
        for param in model.tf.parameters():
            param.requires_grad = False
        optimizer_parameters = [{"params": list(model.output.parameters())}]

    return getattr(torch.optim, OPTIMIZER)(optimizer_parameters, lr=LR)


def _forward_batch(model, batch):
    """Run ``model`` on one collated batch and return its logits."""
    return model(
        batch["title"]["input_ids"],
        batch["title"]["attention_mask"],
        batch["body"]["input_ids"],
        batch["body"]["attention_mask"],
    )


def run(model_path=None, checkpoint=None):
    """Train the dual-input model and keep the checkpoint with the best ROC-AUC.

    Splits ``df_train_text`` into train/validation parts, trains for
    ``EPOCHS`` epochs, and validates after each epoch. When
    ``SAVE_BEST_MODEL`` is set, the weights and their ROC-AUC are written to
    ``TRAINED_MODELS_PATH`` whenever the validation ROC-AUC improves.

    Args:
        model_path: optional checkpoint to resume from (see ``get_model``);
            when falsy a fresh pretrained model is used.
        checkpoint: unused; kept so existing callers keep working.
    """
    torch.manual_seed(SEED)

    # Initialize accelerator
    accelerator = Accelerator()

    df_train, df_val = load_data(df_train_text)
    train_data = TransformerDataset(df_train)
    val_data = TransformerDataset(df_val)

    train_dataloader = torch.utils.data.DataLoader(
        train_data,
        batch_size=TRAIN_BATCH_SIZE,
        drop_last=True
    )

    val_dataloader = torch.utils.data.DataLoader(
        val_data,
        batch_size=VAL_BATCH_SIZE,
    )

    # init model
    if model_path:
        # get_model takes only the path and returns (model, stored_roc);
        # the stored score is not needed to continue training.
        model, _ = get_model(model_path)
    else:
        accelerator.print('\n-- Initializing Model')
        model = TFDualHeadModel()

    criterion = getattr(torch.nn, CRITERION)()

    # define the parameters to be optimized and add regularization
    optimizer = _build_optimizer(model)

    model, optimizer, train_dataloader, val_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, val_dataloader
    )

    num_training_steps = len(train_dataloader) * EPOCHS
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )

    #######################
    # training & validation
    accelerator.print("\n-- Training")

    max_val_roc = float('-inf')
    for epoch in range(EPOCHS):
        progress_bar = tqdm(
            range(len(train_dataloader)),
            desc="Epoch " + str(epoch),
            disable=not accelerator.is_main_process
        )

        # Training
        # Validation leaves the model in eval mode, so switch back to train
        # mode at the start of every epoch.
        model.train()
        train_loss = 0
        for batch in train_dataloader:
            # batches from the prepared dataloader are used as delivered;
            # no manual device transfer is done here
            b_labels = batch['labels']

            # clear accumulated gradients
            optimizer.zero_grad()

            # forward pass
            logits = _forward_batch(model, batch)

            # calculate loss
            loss = criterion(logits, b_labels)
            train_loss += loss.item()

            # backward pass
            accelerator.backward(loss)

            # update weights
            optimizer.step()

            # update scheduler
            scheduler.step()

            progress_bar.update(1)
            progress_bar.set_postfix({"loss": loss.item()})

        progress_bar.close()

        avg_train_loss = train_loss / len(train_dataloader)
        accelerator.print('Training loss:', avg_train_loss)

        # Validation
        val_loss = 0
        scores = []
        labels = []

        model.eval()

        for batch in val_dataloader:
            b_labels = batch['labels']

            with torch.no_grad():
                logits = _forward_batch(model, batch)
                loss = criterion(logits, b_labels)

            val_loss += loss.item()

            # ROC-AUC ranks examples by score, so keep the positive-class
            # probability (column 1 of the N_LABELS=2 logits) rather than
            # the argmax label.
            b_scores = torch.softmax(logits, dim=1)[:, 1]

            scores.append(accelerator.gather(b_scores))
            labels.append(accelerator.gather(b_labels))

        # Gathering can include extra samples beyond the dataset size; keep
        # the first len(val_data) entries, then move to host memory for
        # scikit-learn.
        scores = torch.cat(scores)[:len(val_data)].cpu().numpy()
        labels = torch.cat(labels)[:len(val_data)].cpu().numpy()

        avg_val_loss = val_loss / len(val_dataloader)
        accelerator.print('Val loss:', avg_val_loss)
        val_roc = roc_auc_score(labels, scores)
        accelerator.print('Val roc-auc:', val_roc)

        if val_roc > max_val_roc:
            accelerator.print(f"-- Best Model. Val roc-auc: {max_val_roc} -> {val_roc}")
            max_val_roc = val_roc
            if SAVE_BEST_MODEL:
                accelerator.print("-- Saving model.")
                accelerator.wait_for_everyone()
                unwrapped_model = accelerator.unwrap_model(model)
                accelerator.save(
                    {
                        "state_dict": unwrapped_model.state_dict(),
                        # store a plain Python float, not a NumPy scalar
                        "roc": float(max_val_roc)
                    },
                    TRAINED_MODELS_PATH
                )

In [None]:
# Pass the function object itself: notebook_launcher is responsible for
# invoking it. Writing run() would train eagerly in this process and hand
# the launcher None.
try:
    notebook_launcher(run)
except ValueError as err:
    # Keep the cell from failing, but surface the problem instead of hiding it.
    print(f"-- notebook_launcher could not start training: {err}")

# Inference & Submission

In [None]:
def predict(model, roc):
    """Score the test set with ``model`` and write a submission CSV.

    The ``label`` column holds the predicted probability of class 1 rather
    than a hard 0/1 label: the run is tracked by ROC-AUC, which needs a
    ranking score to be meaningful.

    Args:
        model: trained ``TFDualHeadModel``.
        roc: validation ROC-AUC of ``model``; only used (as a percentage
            rounded to two decimals) in the output file name.
    """
    accelerator = Accelerator()

    test_data = TransformerDataset(df_test_text, is_test=True)
    test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=VAL_BATCH_SIZE)

    model, test_dataloader = accelerator.prepare(
        model, test_dataloader
    )

    preds = []
    model.eval()

    print("\n-- Predicting")
    for batch in tqdm(test_dataloader, total=len(test_dataloader)):
        with torch.no_grad():
            logits = model(
                batch["title"]['input_ids'],
                batch["title"]['attention_mask'],
                batch["body"]['input_ids'],
                batch["body"]['attention_mask']
            )

        # positive-class probability (column 1 of the N_LABELS=2 logits)
        b_scores = torch.softmax(logits, dim=1)[:, 1]
        # gather so every process ends up with the predictions of all shards
        preds.append(accelerator.gather(b_scores).detach().cpu())

    # Gathering can include extra samples beyond the dataset size; keep
    # exactly one prediction per test row.
    preds = torch.cat(preds)[:len(test_data)].numpy() if preds else []

    urlids = df_test_text["urlid"]
    submission = pd.DataFrame({"urlid": urlids, "label": preds})
    roc = round(roc * 100, 2)
    submission.to_csv(f"{TF_MODEL_PATH}_titlebody_roc{roc}.csv", index=False)

In [None]:
# Restore the saved checkpoint (weights plus its stored validation ROC-AUC)
# and write the test-set submission file.
model, roc = get_model(TRAINED_MODELS_PATH)
predict(model, roc)