# Libraries

In [None]:
import os
import pandas as pd
import random
import numpy as np

# tqdm: utility for progress bars
from tqdm.notebook import tqdm

# pytorch: helps us make model
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset

# ml_things: helps to generate the predictive report
!pip install git+https://github.com/gmihaila/ml_things
from ml_things import plot_confusion_matrix

# sklearn: helps to generate the predictive report
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split

# huggingface: Stores pre-trained models
from transformers import BertTokenizer, BertForSequenceClassification, set_seed
from transformers import pipeline, AdamW, get_linear_schedule_with_warmup

# wandb: kind of a VCS for reports
import wandb

# json: helps with parsing JSON
import json

# Environment Variables

In [None]:
# Start a Weights & Biases run for this project
wandb.init(project="stumbleupon")

# Hyperparameters assigned on wandb.config are stored with the run;
# each one is also bound to a plain variable for use in later cells
config = wandb.config

# Let's set a seed
## Ensures easy reproducibility: the same seed is given to the transformers
## helper, Python's random, numpy and torch (CPU and all CUDA devices)
config.seed = seed = 69
set_seed(config.seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Number of epochs to train for
config.epoch = epoch = 3 

# Batch size used by the training and validation dataloaders
config.batch = batch = 8

# Learning rate passed to the optimizer
config.lr = lr = 1e-5

# Number of warm-up steps for the learning-rate scheduler
# NOTE(review): "warup_steps" is a typo for "warmup_steps"; the name is kept
# because initHelper reads the variable under this spelling
config.warup_steps = warup_steps = 0

# Each model has a limit on how many tokens it can process;
# inputs are padded/truncated to this sequence length
config.maxLength = maxLength = 512

# This chooses the GPU for training if it's available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained checkpoint chosen from the huggingface/transformers library
config.modelName = modelName = 'bert-base-uncased'


# Data Preprocesing

In [None]:
# Load the competition files: the training set (it carries the `label` column
# used below), the test set to predict on, and the sample submission file
df = pd.read_table('/kaggle/input/stumbleupon/train.tsv')
dfTest = pd.read_table('/kaggle/input/stumbleupon/test.tsv')
dfSub = pd.read_csv('/kaggle/input/stumbleupon/sampleSubmission.csv')

BERT can only process 512 tokens at a time, so the boilerplate text needs to be trimmed down to its most crucial part.

In [None]:
def boilerplatePreproc(data, maxWords=400):
    '''
        Extract the text used for classification from one raw `boilerplate` JSON string.

        Only the 'body' of the article is taken into account, as the body contains the
        context needed to classify an article as evergreen or ephemeral. The last
        `maxWords` space-separated words of the body are kept, since the end of an
        article generally holds the concluding statements and hence a summary; trimming
        also leaves headroom under the model's 512-token limit.

        If the JSON has no usable body (key missing, null, or not a string) the
        'title' is returned instead.

        Args:
            data: JSON document (str) with optional 'body' and 'title' fields.
            maxWords: how many trailing words of the body to keep (default 400).

        Returns:
            The trimmed body, or the title when there is no body. The result is None
            when neither a body nor a title is available.

        Raises:
            ValueError: if `maxWords` is smaller than 1.
            json.JSONDecodeError: if `data` is not valid JSON.
    '''
    if maxWords < 1:
        # a slice of [-0:] would silently keep the whole body
        raise ValueError('maxWords must be a positive integer')

    jsonData = json.loads(data)

    # explicit type check instead of a bare `except`, which would also swallow
    # unrelated errors such as KeyboardInterrupt
    body = jsonData.get('body')
    if isinstance(body, str):
        return ' '.join(body.split(' ')[-maxWords:])

    # if JSON has no body, we will substitute the body with title
    return jsonData.get('title')

# applies above func and stores result in a new column
df['body'] = df.boilerplate.map(boilerplatePreproc)
# training rows that contain a missing value (e.g. no usable text) are dropped
df = df.dropna()
# test rows cannot be dropped because every urlid needs a prediction, so a
# missing text is replaced with an empty string the tokenizer can still encode
dfTest['body'] = dfTest.boilerplate.map(boilerplatePreproc).fillna('')

Let's check for class imbalance

In [None]:
# Count the rows per label to see whether the classes are balanced
df.label.value_counts()

Perfectly balanced, as all things should be.

Now let's move onto embeddings.

In [None]:
# hold out 15% of the rows (stratified on the label) to validate model performance
x_train, x_val, y_train, y_val = train_test_split(
    df.body.values,
    df.label.values,
    test_size=0.15,
    random_state=seed,
    stratify=df.label.values,
)

# init tokenizer
tokenizer = BertTokenizer.from_pretrained(modelName, do_lower_case=True)

# tokenizer settings shared by the train and validation splits:
# every text is padded/truncated to maxLength and returned as PyTorch tensors
_encode_kwargs = dict(
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    max_length=maxLength,
    return_tensors='pt',
    truncation=True,
)

# tokenize both splits
encoded_data_train = tokenizer.batch_encode_plus(list(x_train), **_encode_kwargs)
encoded_data_val = tokenizer.batch_encode_plus(list(x_val), **_encode_kwargs)

# pull out the tensors for the train split
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(list(y_train))

# ... and for the validation split
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(list(y_val))

# making dataset
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

# making dataloader: training batches are drawn in random order,
# validation batches in their original order
dataloader_train = DataLoader(
    dataset_train,
    batch_size=batch,
    sampler=RandomSampler(dataset_train),
)
dataloader_val = DataLoader(
    dataset_val,
    batch_size=batch,
    sampler=SequentialSampler(dataset_val),
)

# Model Helper Functions

In [None]:
# helper that init model obj
def initHelper():
    '''
        Create everything needed for fine-tuning.

        Loads the pretrained `modelName` checkpoint as a sequence classifier and
        moves it to `device`, then builds an AdamW optimizer (learning rate `lr`)
        and a linear schedule with `warup_steps` warm-up steps.

        Returns:
            (model, optimizer, scheduler)
    '''
    # the scheduler is stepped once per training batch, in every epoch
    total_steps = len(dataloader_train) * epoch

    net = BertForSequenceClassification.from_pretrained(modelName).to(device)
    opt = AdamW(net.parameters(), lr=lr, eps=2e-8)
    sched = get_linear_schedule_with_warmup(
        opt,
        num_warmup_steps=warup_steps,
        num_training_steps=total_steps,
    )

    return net, opt, sched

In [None]:
# helper that validates the model
def evaluate():
    '''
        Run the global `model` over `dataloader_val` without tracking gradients.

        Returns:
            (loss_val_avg, predictions, true_vals): the loss averaged over the
            validation batches, the logits of all batches concatenated along
            axis 0, and the matching label ids, both as numpy arrays.
    '''
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for step in dataloader_val:
        # each batch holds (input ids, attention mask, labels)
        ids, mask, labels = (t.to(device) for t in step)

        with torch.no_grad():
            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)

        # the model output carries the loss first, then the logits
        running_loss += outputs[0].item()
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(labels.cpu().numpy())

    loss_val_avg = running_loss / len(dataloader_val)

    predictions = np.concatenate(logit_chunks, axis=0)
    true_vals = np.concatenate(label_chunks, axis=0)

    return loss_val_avg, predictions, true_vals

In [None]:
# Helper that calcs f1 score
def f1_score_func(preds, labels):
    '''
        Weighted F1 score of the predicted classes against the true labels.

        `preds` holds one row of scores per sample (the class is the argmax along
        axis 1); `labels` holds the true class ids.
    '''
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes, average='weighted')

# Helper that trains the model
def trainHelper(epoch):
    '''
        Train the global `model` for one epoch, then checkpoint, validate and log.

        Steps: iterate over `dataloader_train` updating the weights (with gradient
        clipping and one scheduler step per batch), save the weights to
        `finetuned_BERT_epoch_{epoch}.pth` and upload them to wandb, run
        `evaluate()` and log training loss, validation loss and weighted F1.

        Args:
            epoch: number of the current epoch; used for the progress bar label,
                the checkpoint file name and the wandb log entry.
    '''
    # from_pretrained() hands back the model in eval mode and evaluate() puts it
    # in eval mode again after each epoch, so training mode (dropout enabled)
    # must be switched on explicitly at the start of every epoch
    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        # zero previous step's grad
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids':      batch[0],
            'attention_mask': batch[1],
            'labels':         batch[2],
        }

        outputs = model(**inputs)

        # calc loss
        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.float().backward()

        # clips the gradient norm to 1, prevents exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # perform optimizer and scheduler steps
        optimizer.step()
        scheduler.step()

        # show the batch loss as-is: len(batch) is the number of tensors in the
        # batch tuple (3), not a sample count, so dividing by it misreports the loss
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

    # save model
    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.pth')
    wandb.save(f'finetuned_BERT_epoch_{epoch}.pth')

    tqdm.write(f'\nEpoch {epoch}')

    # log loss
    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    # log val report
    val_loss, predictions, true_vals = evaluate()
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

    # log to wandb
    wandb.log({
        'epoch': epoch,
        'loss': loss_train_avg,
        'validation loss': val_loss,
        'f1 score': val_f1,
    })

# Train Model

In [None]:
# get model obj together with its optimizer and LR scheduler
model, optimizer, scheduler = initHelper()

# log model in wandb
wandb.watch(model)

# train model: one trainHelper call per epoch, epochs numbered from 1
for i in tqdm(range(1, epoch+1)):
    trainHelper(i)

# Creating Pipeline

In [None]:
# Helper that predicts the class of the article
def predLabels(data, batchSize=32):
    '''
        Predict a class id for every text in `data` with the global `model`.

        The texts are tokenized with the global `tokenizer` (padded/truncated to
        `maxLength`), fed through the model in order without gradient tracking,
        and the class with the highest logit is returned for each text.

        Args:
            data: iterable of strings (list, numpy array, ...).
            batchSize: number of texts per forward pass (default 32). Every text
                is padded to the same length, so this only affects speed and
                memory use.

        Returns:
            1-D numpy integer array with one predicted class id per input text;
            an empty array when `data` is empty.
    '''
    # the tokenizer expects a list; this also accepts arrays and other iterables
    texts = list(data)
    if not texts:
        # nothing to predict: avoid np.concatenate failing on an empty list
        return np.array([], dtype=np.intp)

    encoded_data = tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        max_length=maxLength,
        return_tensors='pt',
        truncation=True
    )

    dataset = TensorDataset(encoded_data['input_ids'], encoded_data['attention_mask'])
    dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batchSize)

    model.eval()

    predictions = []

    with torch.no_grad():
        for input_ids, attention_masks in dataloader:
            outputs = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_masks.to(device),
            )
            # no labels are passed, so the logits are the first output
            predictions.append(outputs[0].cpu().numpy())

    predictions = np.concatenate(predictions, axis=0)

    return np.argmax(predictions, axis=1).flatten()

In [None]:
# Validation texts and labels as plain lists
x_val_list = list(x_val)
y_val_list = list(y_val)

# Predict a class id for every validation text
pred = predLabels(x_val_list)
    
# plot the confusion matrix of true vs. predicted labels
plot_confusion_matrix(y_val_list, pred)

# log the same confusion matrix to wandb with readable class names
wandb.log({
    "conf_mat" : wandb.plot.confusion_matrix(
                    probs=None,
                    y_true=y_val_list,
                    preds=pred,
                    class_names=['ephemeral', 'evergreen']
                ),
})

# Generating Submission File

In [None]:
# Predict a class id for every article in the test set
preds = predLabels(dfTest.body.to_list())

# One row per test article: its urlid and the predicted label
subDict = {
    'urlid': dfTest.urlid.to_list(),
    'label': preds,
}

# Replaces the sample submission frame loaded earlier, then writes the file
# and uploads it to the wandb run
dfSub = pd.DataFrame.from_dict(subDict)
dfSub.to_csv('submission.csv', index = False)
wandb.save(f'submission.csv')
# dfSub.head()