# Import Libraries

In [None]:
!pip install pytorch_model_summary

In [None]:
import tqdm
import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import sklearn.metrics as skm

from pytorch_model_summary import summary
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer, AutoConfig, pipeline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"   # "last_expr"
random_seed=42

# Load DataSet

In [None]:
train_df = pd.read_csv('../input/stumbleupon/train.tsv', delimiter='\t')
train_df.head()

In [None]:
print(train_df.shape) # Shape of train dataset

## Replace `?` with NaN (the dataset uses `?` to mark missing values)

In [None]:
train_df.replace(to_replace='?', value=np.nan, inplace=True)

## Data info

In [None]:
train_df.info()

In [None]:
train_df['alchemy_category_score'] = train_df['alchemy_category_score'].astype(dtype='float')
train_df['news_front_page'] = train_df['news_front_page'].astype(dtype='float')
train_df['is_news'] = train_df['is_news'].astype(dtype='float')

## Describe Data

In [None]:
train_df.describe()

# EDA

* We will only use the text column for classifying the data
* We can use Python's `eval` function to convert each entry to a `dict` object and access its fields by key
* Some fields in the `boilerplate` column are `null`; we replace them with empty strings so that `eval` does not raise an error

## Clean the `boilerplate` text column

In [None]:
train_df['boilerplate'] = train_df['boilerplate'].replace(to_replace=':null', value=':""', regex=True) # replace null with empty strings

## Check Label Count Distribution

In [None]:
train_df['label'].value_counts()
train_df['label'].value_counts().plot(kind='barh', color='g')

## Concatenate all the boilerplate text

In [None]:
def concat_text(boilerplate_dict):
    """Merge every value of ``boilerplate_dict`` into one space-separated string.

    Values are rendered with their default string formatting, in the
    dictionary's iteration order; leading and trailing whitespace is
    stripped from the combined result.
    """
    pieces = (f"{value}" for value in boilerplate_dict.values())
    return " ".join(pieces).strip()

In [None]:
# Parse each `boilerplate` string into a dict and merge its values into a single text column.
# NOTE(review): `eval` executes arbitrary Python, so this is only safe while the TSV is trusted.
# Presumably the column holds JSON objects, in which case `json.loads` would be the safer parser - TODO confirm.
train_df['total_text'] = train_df['boilerplate'].apply(lambda x: concat_text(eval(x)))

## Stats of text length in words

In [None]:
train_df['total_text_length (words)'] = train_df['total_text'].apply(lambda x: len(x.split()))

In [None]:
train_df['total_text_length (words)'].describe()
train_df['total_text_length (words)'].plot(kind='hist', color='b')

* We use BERT's default sequence length of 512 for the tokenizer: the 75th percentile of the text length is around 632 words, so this length is a reasonable fit for most of the data.
* When tokenizing, sequences longer than this are truncated.

# Data preprocessing and DataLoader 

## Target Label Encoding

In [None]:
# Number the distinct labels in sorted order, then invert that mapping for encoding.
idx2cat = dict(enumerate(sorted(train_df['label'].unique())))
cat2idx = {label: idx for idx, label in idx2cat.items()}

## Defining Text Tokenizer using `BertTokenizer`

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

## Dataset iterator

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-tokenized texts with integer class ids.

    Every text in ``df[source_column]`` is tokenized once, up front, with the
    module-level ``tokenizer``; the labels in ``df[target_column]`` are mapped
    to integer ids through the module-level ``cat2idx``.

    Args:
        df: DataFrame holding the text and label columns.
        source_column: Name of the column containing the raw text.
        target_column: Name of the column containing the labels.
        max_length: Length every sequence is padded / truncated to.
        transform: Stored on the instance; not used by this class.
    """

    def __init__(self, df, source_column, target_column, max_length, transform=None):
        self.df = df
        self.transform = transform

        # Tokenize one text at a time, in row order. Callers squeeze dim 1 of
        # the resulting ``input_ids`` after batching.
        self.source_texts = [
            tokenizer(text, padding='max_length', max_length=max_length,
                      truncation=True, return_tensors="pt")
            for text in self.df[source_column]
        ]
        self.targets = self.df[target_column].map(cat2idx)

    def classes(self):
        """Return the encoded targets (a pandas Series sharing ``df``'s index)."""
        return self.targets

    def __len__(self):
        return len(self.targets)

    def get_batch_labels(self, idx):
        """Return the encoded label at position ``idx`` as a NumPy array."""
        # Positional lookup: ``source_texts`` is a positional list, so labels
        # must be positional too. Label-based ``self.targets[idx]`` raises
        # KeyError (or picks a different row) when ``df`` does not carry a
        # 0..n-1 index, e.g. right after a shuffled split.
        return np.array(self.targets.iloc[idx])

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at position ``idx``."""
        return self.source_texts[idx]

    def __getitem__(self, idx):
        """Return the ``(tokenized_text, label)`` pair at position ``idx``."""
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

## Train and Validation Split

In [None]:
# Stratified 85/15 split: features are every column except `label`, targets are `label`.
X_train, X_val, y_train, y_val = train_test_split(
    train_df.drop(columns=['label']),
    train_df['label'],
    test_size=0.15,
    stratify=train_df['label'],
    random_state=random_seed,
)

In [None]:
# Give each of the four splits a fresh 0..n-1 index, discarding the old one.
for split in (X_train, y_train, X_val, y_val):
    split.reset_index(drop=True, inplace=True)

In [None]:
X_train.shape
y_train.shape
X_val.shape
y_val.shape

In [None]:
y_train.value_counts(normalize=True)*100

y_train.value_counts().plot(kind='barh',color='green')

y_val.value_counts(normalize=True)*100

y_val.value_counts().plot(kind='barh',color='orange')

In [None]:
X_train['label'] = y_train
X_train.head()

In [None]:
X_val['label'] = y_val
X_val.head()

## Training Data Iterator

In [None]:
train_iter = Dataset(X_train, 'total_text', 'label', 512)
X_train['total_text_length (words)'][0]
train_iter[0]

## Validation Data Iterator

In [None]:
val_iter = Dataset(X_val, 'total_text', 'label', 512)
X_val['total_text_length (words)'][0]
val_iter[0]

## DataLoader

In [None]:
train_dataloader = torch.utils.data.DataLoader(train_iter, batch_size=32, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_iter, batch_size=32)

# Model Definition

## Metrics

In [None]:
def accuracy(outputs, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    ``outputs`` holds one row of class scores per sample and ``labels`` the
    matching class ids; the result is a 0-dim tensor.
    """
    predicted = torch.max(outputs, dim=1).indices
    n_correct = (predicted == labels).sum().item()
    return torch.tensor(n_correct / len(predicted))

def f1_score(outputs, labels):
    """Return the support-weighted F1 score of the arg-max predictions.

    Args:
        outputs: Tensor with one row of class scores per sample.
        labels: Tensor of true class ids, one per sample.

    Returns:
        A 0-dim tensor holding the weighted F1 score.
    """
    _, preds = torch.max(outputs, dim=1)
    # scikit-learn expects (y_true, y_pred). With average='weighted' the
    # per-class scores are weighted by the support of y_true, so the order
    # matters: passing predictions first weights by predicted-class counts.
    return torch.tensor(skm.f1_score(labels.cpu(), preds.cpu(), average='weighted'))


## Use GPU or not 

In [None]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Input Size to model and number of classes

In [None]:
input_size = 768
num_classes = len(cat2idx)

## Model Architecture

In [None]:
class LogisticRegression(nn.Module):
    """Linear classification head on top of BERT's pooled output.

    Relies on the module-level ``input_size``, ``num_classes`` and ``device``
    settings and on the ``accuracy`` / ``f1_score`` metric helpers.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.linear = nn.Linear(input_size, num_classes)

    def forward(self, input_id, mask):
        """Return class logits for a batch of token ids and attention mask.

        Args:
            input_id: Token ids, one row per sequence.
            mask: Attention mask. A stray singleton dim 1 (left over from
                batching per-sample tokenizer outputs) is removed so the mask
                has the same layout as ``input_id``.
        """
        if mask.dim() == 3 and mask.size(1) == 1:
            mask = mask.squeeze(1)
        # With return_dict=False BERT returns a tuple; the second element is
        # the pooled output fed to the linear head, the first is unused here.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        return self.linear(pooled_output)

    def _predict_batch(self, batch):
        """Move one ``(encoding, labels)`` batch to ``device`` and return ``(logits, labels)``."""
        bert_dict, labels = batch
        out = self(bert_dict['input_ids'].squeeze(1).to(device),
                   bert_dict['attention_mask'].to(device))
        return out, labels.to(device)

    def training_step(self, batch):
        """Return the cross-entropy loss for one training batch."""
        out, labels = self._predict_batch(batch)
        return F.cross_entropy(out, labels)

    def validation_step(self, batch):
        """Return loss, accuracy and F1 score for one validation batch."""
        # Validation never back-propagates, so skip building the autograd graph.
        with torch.no_grad():
            out, labels = self._predict_batch(batch)
            loss = F.cross_entropy(out, labels)
        acc = accuracy(out, labels)
        f1_sc = f1_score(out, labels)
        return {'val_loss': loss, 'val_acc': acc, 'f1_score': f1_sc}

    def validation_epoch_end(self, outputs):
        """Average the per-batch metrics into one dict of Python floats.

        Every batch counts equally in the mean, regardless of its size.
        """
        def _mean(key):
            return torch.stack([x[key] for x in outputs]).mean().item()

        return {'val_loss': _mean('val_loss'), 'val_acc': _mean('val_acc'), 'f1_score': _mean('f1_score')}

    def epoch_end(self, epoch, result):
        """Print the validation metrics of one epoch."""
        print(f"Epoch [{epoch}], val_loss: {result['val_loss']:.4f}, val_acc: {result['val_acc']:.4f} val_f1_score: {result['f1_score']:.4f}")
    

## Evaluate Model

In [None]:
def evaluate(model, val_loader):
    """Run ``model`` over ``val_loader`` and return its aggregated metrics.

    The model is switched to evaluation mode (so layers such as dropout are
    inactive) and gradient tracking is disabled while the batches are scored.
    The model's previous train/eval mode is restored afterwards.

    Args:
        model: Module exposing ``validation_step(batch)`` and
            ``validation_epoch_end(outputs)``.
        val_loader: Iterable of validation batches.

    Returns:
        Whatever ``model.validation_epoch_end`` returns for the list of
        per-batch results.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = [model.validation_step(batch) for batch in val_loader]
    finally:
        # Leave the model as we found it so an enclosing training loop is unaffected.
        model.train(was_training)
    return model.validation_epoch_end(outputs)

# Train Model

## Initialize Model

In [None]:
model = LogisticRegression()

## Freeze all BERT parameters (only the linear layer is trained)

In [None]:
# Exclude every BERT weight from gradient computation; `model.linear` stays trainable.
for bert_weight in model.bert.parameters():
    bert_weight.requires_grad = False

## Model Summary

In [None]:
if torch.cuda.is_available():
    model.cuda()

In [None]:
batch, labels  = next(iter(train_dataloader))
mask = batch['attention_mask'].to(device)
input_id = batch['input_ids'].squeeze(1).to(device)
print(summary(model, input_id, mask))

## Initial model loss and accuracy using model evaluation

In [None]:
evaluate(model, val_dataloader)

## Train Function

In [None]:
def fit(epochs, learning_rate, model, train_loader, val_loader, opt_func=torch.optim.SGD):
    """Train ``model`` and validate it after every epoch.

    Args:
        epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate handed to the optimizer.
        model: Module exposing ``training_step(batch)`` (returns the loss)
            and ``epoch_end(epoch, result)``, and usable with ``evaluate``.
        train_loader: Iterable of training batches.
        val_loader: Iterable of validation batches.
        opt_func: Optimizer factory called as
            ``opt_func(model.parameters(), learning_rate)``.

    Returns:
        list: One ``evaluate`` result per epoch, in order.
    """
    optimizer = opt_func(model.parameters(), learning_rate)
    history = []
    for epoch in range(epochs):

        # Training phase
        for batch in train_loader:
            # Clear gradients before the backward pass: anything left behind
            # by an interrupted earlier run or a stray backward call would
            # otherwise be accumulated into this update.
            optimizer.zero_grad()
            loss = model.training_step(batch)
            loss.backward()
            optimizer.step()

        # Validation phase
        result = evaluate(model, val_loader)
        model.epoch_end(epoch, result)
        history.append(result)

    return history

In [None]:
history1 = fit(5, 0.001, model, train_dataloader, val_dataloader)

In [None]:
history2 = fit(5, 0.001, model, train_dataloader, val_dataloader)

In [None]:
history3 = fit(5, 0.001, model, train_dataloader, val_dataloader)

In [None]:
history4 = fit(5, 0.001, model, train_dataloader, val_dataloader)

# Test Model and Submission

In [None]:
test_df = pd.read_csv('../input/stumbleupon/test.tsv', delimiter='\t')
test_df.head()

In [None]:
test_df['boilerplate'] = test_df['boilerplate'].replace(to_replace=':null', value=':""', regex=True) # replace null with empty strings

In [None]:
# Build the same derived columns as for the training data: merged text and its length in words.
# NOTE(review): `eval` executes arbitrary Python, so this is only safe while the TSV is trusted.
# Presumably the column holds JSON objects, in which case `json.loads` would be the safer parser - TODO confirm.
test_df['total_text'] = test_df['boilerplate'].apply(lambda x: concat_text(eval(x)))
test_df['total_text_length (words)'] = test_df['total_text'].apply(lambda x: len(x.split()))

In [None]:
predictions = []

# Inference only: no gradients are needed, and the model must be in eval mode
# so dropout is inactive and repeated runs give the same predictions.
# The model is left in eval mode afterwards.
with torch.no_grad():
    model.eval()
    for text in test_df['total_text']:
        # Same tokenizer settings as the training Dataset (pad / truncate to 512).
        bert_dict = tokenizer(text, padding='max_length', max_length=512, truncation=True,
                              return_tensors="pt")
        mask = bert_dict['attention_mask'].to(device)
        input_id = bert_dict['input_ids'].squeeze(1).to(device)
        output = model(input_id, mask)
        pred = output.argmax(dim=1)
        # One text per forward pass, so the batch holds a single prediction.
        predictions.append(pred.cpu().numpy()[0])


In [None]:
test_df['label'] = predictions
test_df.to_csv('submission.csv',columns=['urlid','label'],index=False)