In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from transformers import RobertaTokenizer, RobertaModel, RobertaTokenizerFast
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

from sklearn.utils.class_weight import compute_class_weight
import torch.utils
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import MultiStepLR

In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
# Load the competition files. Forward slashes resolve on Windows as well as on
# POSIX systems, whereas backslash-separated paths only resolve on Windows.
df_train = pd.read_csv("Local/Dataset/train.csv")
# Store the label column as a compact integer type.
df_train["target"] = df_train["target"].astype("int16")
df_test = pd.read_csv("Local/Dataset/test.csv")
sub_df = pd.read_csv("Local/Dataset/sample_submission.csv")

In [5]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [10]:
# Hold out 20% of the labelled rows for validation. The split is stratified on
# the target column and uses a fixed seed so it is repeatable.
train_df, val_df = train_test_split(
    df_train, test_size=0.2, random_state=42, stratify=df_train["target"]
)

In [4]:
# Fast tokenizer for the RoBERTa-large checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "FacebookAI/roberta-large",
    use_fast=True,
)

In [None]:
class TrainDataset(Dataset):
    """Map-style dataset yielding tokenized questions together with their labels.

    Args:
        train_dataset: DataFrame whose column at position 1 holds the question
            text and whose column at position 2 holds the binary target.
        tokenizer: Callable Hugging Face-style tokenizer.
        max_length: Token length every example is padded or truncated to.
    """

    def __init__(self, train_dataset, tokenizer, max_length=62):
        self.train_dataset = train_dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.train_dataset)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, target)`` for the row at ``idx``.

        The first two items are 1-D tensors of length ``max_length``; the
        target is a 0-d float tensor.
        """
        # Columns are addressed by position: 1 = question text, 2 = target.
        text = str(self.train_dataset.iloc[idx, 1])
        target = float(self.train_dataset.iloc[idx, 2])
        # Calling the tokenizer directly replaces the deprecated ``encode_plus``.
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        # Drop only the leading batch axis added by ``return_tensors="pt"``;
        # a bare ``squeeze()`` would also collapse the sequence axis when
        # ``max_length`` is 1.
        return (
            encoding["input_ids"].squeeze(0),
            encoding["attention_mask"].squeeze(0),
            torch.tensor(target),
        )

In [None]:
class TestDataset(Dataset):
    """Map-style dataset yielding tokenized questions without labels.

    Args:
        test_dataset: DataFrame whose column at position 1 holds the question
            text.
        tokenizer: Callable Hugging Face-style tokenizer.
        max_length: Token length every example is padded or truncated to.
    """

    def __init__(self, test_dataset, tokenizer, max_length=62):
        self.test_dataset = test_dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.test_dataset)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for the row at ``idx``.

        Both items are 1-D tensors of length ``max_length``.
        """
        # The question text is addressed by position (column 1).
        text = str(self.test_dataset.iloc[idx, 1])
        # Calling the tokenizer directly replaces the deprecated ``encode_plus``.
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        # Drop only the leading batch axis so the sequence axis survives even
        # when ``max_length`` is 1.
        return (
            encoding["input_ids"].squeeze(0),
            encoding["attention_mask"].squeeze(0),
        )

In [33]:
train_ds = TrainDataset(train_df, tokenizer)
val_ds = TrainDataset(val_df, tokenizer)
# Reshuffle the training examples every epoch; the validation order stays fixed.
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=32)

In [None]:
class BERT_MODEL(nn.Module):
    """Pretrained transformer encoder followed by a small classification head.

    Args:
        model_name: Checkpoint identifier passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super(BERT_MODEL, self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        # Size the head from the encoder's own configuration instead of a
        # fixed 768, so checkpoints with a different hidden size (for example
        # 1024 for "large" models) do not fail with a shape mismatch.
        hidden_size = self.model.config.hidden_size
        self.linear = nn.Sequential(
            nn.Linear(hidden_size, 1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, 1),
        )

    def forward(self, input_ids, attention_mask):
        """Return one raw (pre-sigmoid) logit per example, shape ``(batch, 1)``."""
        output = self.model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = getattr(output, "pooler_output", None)
        if pooled_output is None:
            # Encoders without a pooling head expose no pooled vector; use the
            # hidden state of the first token instead.
            pooled_output = output.last_hidden_state[:, 0]
        return self.linear(pooled_output)

In [14]:
# Exploratory lookup of the loss signature (including its ``pos_weight`` argument).
help(nn.BCEWithLogitsLoss)

Help on class BCEWithLogitsLoss in module torch.nn.modules.loss:

class BCEWithLogitsLoss(_Loss)
 |  BCEWithLogitsLoss(weight: Optional[torch.Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean', pos_weight: Optional[torch.Tensor] = None) -> None
 |  
 |  This loss combines a `Sigmoid` layer and the `BCELoss` in one single
 |  class. This version is more numerically stable than using a plain `Sigmoid`
 |  followed by a `BCELoss` as, by combining the operations into one layer,
 |  we take advantage of the log-sum-exp trick for numerical stability.
 |  
 |  The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
 |  
 |  .. math::
 |      \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
 |      l_n = - w_n \left[ y_n \cdot \log \sigma(x_n)
 |      + (1 - y_n) \cdot \log (1 - \sigma(x_n)) \right],
 |  
 |  where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
 |  (default ``'mean'``), then
 |  
 |  .. math::
 |      \ell

In [None]:
class trainer:
    """Bundle data loading, model construction and metric helpers for fine-tuning.

    Args:
        dataset_path: CSV file with ``question_text`` and ``target`` columns.
        model_card: Checkpoint name used for both the tokenizer and the model.
        device: Device the model is moved to; CUDA when available, else CPU.
        length_percentile: Percentile of the tokenized question lengths used as
            the padding/truncation length.
        batch_size: Batch size of both data loaders.
        epochs: Number of epochs; stored on the instance.
    """

    def __init__(
        self,
        dataset_path,
        model_card,
        device="cuda" if torch.cuda.is_available() else "cpu",
        length_percentile=99.9,
        batch_size=32,
        epochs=10,
    ):
        self.device = device
        self.batch_size = batch_size
        self.epochs = epochs
        print(f"Using {self.device} device")
        self.model_name = model_card
        self.train_dl, self.val_dl = self.get_dataloader(dataset_path, length_percentile)
        # Place the model on the device announced above.
        self.model = BERT_MODEL(self.model_name).to(self.device)
        self.criterion = nn.BCEWithLogitsLoss()

    def get_max_length(self, df_train, tokenizer, length_percentile=99.9):
        """Return the given percentile of tokenized question lengths as an int.

        This is a regular method (not a static one) because it declares
        ``self`` and is invoked as ``self.get_max_length(df, tokenizer, p)``.
        The input frame is left unmodified.

        Args:
            df_train: DataFrame with a ``question_text`` column.
            tokenizer: Callable returning a mapping with an ``"input_ids"``
                list holding one token-id sequence per input text.
            length_percentile: Percentile in ``[0, 100]``.
        """
        # Cast to str so a missing question cannot break tokenization.
        texts = df_train["question_text"].astype(str).tolist()
        token_ids = tokenizer(texts, truncation=True)["input_ids"]
        lengths = [len(ids) for ids in token_ids]
        return int(np.percentile(lengths, length_percentile))

    def get_dataloader(self, path, length_percentile=99.9):
        """Read the CSV at ``path`` and build the train/validation loaders.

        Returns:
            ``(train_dl, val_dl)``; only the training loader is shuffled.
        """
        df_train = pd.read_csv(path)
        df_train["target"] = df_train["target"].astype("int16")

        tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)
        max_length = self.get_max_length(df_train, tokenizer, length_percentile)

        # Stratified 80/20 split with a fixed seed.
        train_df, val_df = train_test_split(
            df_train,
            stratify=df_train.target,
            test_size=0.2,
            random_state=42,
        )

        train_ds = TrainDataset(train_df, tokenizer, max_length)
        val_ds = TrainDataset(val_df, tokenizer, max_length)

        train_dl = DataLoader(train_ds, batch_size=self.batch_size, shuffle=True)
        val_dl = DataLoader(val_ds, batch_size=self.batch_size)

        return train_dl, val_dl

    @staticmethod
    def find_best_f1(outputs, labels):
        """Scan thresholds 0.10, 0.11, ... (below 0.99) for the best F1 score.

        Args:
            outputs: Array of probabilities.
            labels: Array of binary ground-truth labels.

        Returns:
            ``(best_f1, threshold)``. Both are ``0.0`` when no threshold
            yields a positive F1; floats are used so the values can always be
            rendered with float format specifiers.
        """
        best_f1 = 0.0
        threshold = 0.0

        for candidate in np.arange(0.1, 0.99, 0.01):
            score = f1_score(labels, outputs > candidate)
            if score > best_f1:
                best_f1 = score
                threshold = candidate

        return best_f1, threshold

    @staticmethod
    def get_preds(logits, threshold):
        """Convert logits to binary predictions based on the threshold"""
        predictions = (torch.sigmoid(logits) > threshold).float()
        return predictions

In [20]:
def evaluate(model, val_dl):
    """Run ``model`` over a validation loader and collect per-batch probabilities.

    The model is switched to evaluation mode and left in that mode. Inputs are
    moved to the module-level ``device``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns logits.
        val_dl: Iterable of ``(input_ids, attention_mask, targets)`` batches.

    Returns:
        ``(val_outputs, val_targets)``: two lists with one 1-D NumPy array per
        batch, holding the sigmoid probabilities and the targets respectively.
    """
    val_outputs = []
    val_targets = []
    model.eval()
    with torch.no_grad():
        for input_ids, attention_mask, targets in tqdm(val_dl):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            output = model(input_ids, attention_mask)

            # reshape(-1) keeps every result 1-D, including for a batch with a
            # single example; a bare squeeze() would yield a 0-d array there,
            # which np.concatenate cannot handle.
            val_outputs.append(torch.sigmoid(output).reshape(-1).cpu().numpy())
            val_targets.append(targets.reshape(-1).cpu().numpy())
    return val_outputs, val_targets

In [21]:
# Weighted binary cross-entropy on raw logits: positive examples count 15x.
# The weight is a float tensor so it matches the floating-point logits without
# relying on implicit integer-to-float promotion.
# NOTE(review): 15 presumably approximates the negative/positive class ratio — confirm.
BCE = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(15.0, device=device))

In [22]:
def fit(epochs, model, train_dl, val_dl):
    """Fine-tune ``model`` and report the best validation F1 after every epoch.

    Relies on the module-level ``BCE`` loss, ``device``, ``evaluate`` and the
    static helpers ``trainer.find_best_f1`` / ``trainer.get_preds``.

    Every 10th batch the training F1 of the current batch and the learning
    rate are printed and the weights are saved to ``insincere_model.pth``.

    Args:
        epochs: Number of passes over ``train_dl``.
        model: Module called as ``model(input_ids, attention_mask)``.
        train_dl: Iterable of ``(input_ids, attention_mask, targets)`` batches.
        val_dl: Validation batches in the same layout.

    Returns:
        ``[val_f1, threshold, val_outputs, val_targets]`` from the last
        epoch's validation pass; all four are ``None`` when ``epochs`` is 0.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    # The scheduler is stepped once per batch, so the milestones are batch
    # counts: the learning rate is multiplied by 0.1 after 1500 and again
    # after 3000 batches.
    scheduler = MultiStepLR(optimizer, milestones=[1500, 3000], gamma=0.1)

    val_f1 = threshold = val_outputs = val_targets = None
    step = 1
    for epoch in range(epochs):
        # evaluate() leaves the model in eval mode, so training mode has to be
        # re-enabled at the start of every epoch; otherwise dropout would be
        # disabled from the second epoch onwards.
        model.train()
        for input_ids, attention_mask, targets in tqdm(train_dl):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            targets = targets.to(device)

            # reshape(-1) turns (batch, 1) into (batch,) and, unlike a bare
            # squeeze(), keeps one axis for a single-example batch so the
            # logits always match the shape of ``targets``.
            logits = model(input_ids, attention_mask).reshape(-1)
            loss = BCE(logits, targets)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5000)
            optimizer.step()
            scheduler.step()

            if step % 10 == 0:
                batch_logits = logits.detach()
                f1, thres = trainer.find_best_f1(
                    torch.sigmoid(batch_logits).cpu().numpy(),
                    targets.cpu().numpy(),
                )
                # A batch without a positive F1 yields an integer 0 threshold,
                # and the ".3" format spec below rejects integers.
                thres = float(thres)
                print(
                    f"Batch:{step} ; Loss: {loss.item():.3f}; Pred at {thres:.3}:{trainer.get_preds(batch_logits, thres)}; best_Train_f1:{f1:.3f}"
                )
                current_lr = optimizer.param_groups[0]["lr"]
                print("\nCurrent learning rate:", current_lr)
                torch.save(model.state_dict(), "insincere_model.pth")
            step += 1

        val_outputs, val_targets = evaluate(model, val_dl)
        # atleast_1d guards against 0-d arrays from a single-example batch.
        val_outputs = np.concatenate([np.atleast_1d(o) for o in val_outputs])
        val_targets = np.concatenate([np.atleast_1d(t) for t in val_targets])
        val_f1, threshold = trainer.find_best_f1(val_outputs, val_targets)
        print(
            "Epoch {}; Val F1: {:.3f}, Threshold: {:.3f}".format(
                epoch, val_f1, threshold
            )
        )
    return [val_f1, threshold, val_outputs, val_targets]

In [23]:
# NOTE(review): no cell visible above this one binds `model`, so on a fresh
# kernel this call raises NameError — build a BERT_MODEL and move it to
# `device` before calling fit().
results = fit(5, model, train_dl, val_dl)

0it [00:00, ?it/s]


AttributeError: 'TrainDataset' object has no attribute 'texts'

In [None]:
# results = fit(1,model,train_dl,val_dl) #### Hyper parameter tuning (again getting bad resuts)

In [None]:
# results = []

In [None]:
# results.append(fit(5,model,train_dl,val_dl))

In [None]:
# torch.save(model.state_dict(), 'insincere_model_final_successful.pth')

In [None]:
# results = evaluate(model,val_dl)

In [None]:
# val_outputs,val_targets = results
# val_outputs = np.concatenate(val_outputs)
# val_targets = np.concatenate(val_targets)
# val_f1, threshold = find_best_f1(val_outputs, val_targets)
# print("Val F1: {:.3f} at Threshold: {:.3f}".format(val_f1, threshold))

In [None]:
# Decision threshold applied to the sigmoid probabilities in test().
# NOTE(review): presumably taken from the validation threshold search — confirm.
best_threshold = 0.9

In [None]:
# encoding = bert_tokenizer.encode_plus(
#             "Why are IITs so bad at research?",
#             max_length=60,
#             padding='max_length',
#             truncation=True,
#             add_special_tokens=True,
#             return_tensors='pt'
#         ) 

In [None]:
# output = model(encoding['input_ids'].to(device), encoding['attention_mask'].to(device))

In [None]:
# get_preds(output.squeeze(),0.9)

In [None]:
# del model

In [None]:
# NOTE(review): BERT_MODEL.__init__ requires `model_name`, so this call raises
# TypeError as written — pass the checkpoint name the saved weights were
# trained with.
model = BERT_MODEL()
# model.to(device)

In [None]:
# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU also loads on a CPU-only machine.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
model.load_state_dict(
    torch.load(
        '/kaggle/input/insincere_question_final_model/pytorch/1.01/1/insincere_model_final_successful.pth',
        map_location=device,
    )
)

In [None]:
def test(df_test, model, test_tokenizer=None):
    """Predict a binary label for every row of ``df_test``.

    Relies on the module-level ``device`` and ``best_threshold``.

    Args:
        df_test: DataFrame whose column at position 1 holds the question text.
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns logits.
        test_tokenizer: Tokenizer used to encode the questions. Defaults to
            the notebook-level ``tokenizer``.

    Returns:
        List with one 1-D integer NumPy array of 0/1 predictions per batch.
    """
    if test_tokenizer is None:
        # Fall back to the tokenizer defined at notebook level.
        test_tokenizer = tokenizer
    test_dataset = TestDataset(df_test, tokenizer=test_tokenizer)
    test_dl = DataLoader(test_dataset, batch_size=32)
    preds = []
    # Inference: disable dropout and skip gradient bookkeeping.
    model.eval()
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(test_dl):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            output = model(input_ids, attention_mask)
            # reshape(-1) keeps the result 1-D even for a single-example batch.
            prediction = (torch.sigmoid(output).reshape(-1) > best_threshold).int()
            preds.append(prediction.cpu().numpy())
    print('test predictions generated successfully!!')
    return preds

In [None]:
def submission(sub_df, df_test, model):
    """Fill the ``prediction`` column of ``sub_df`` with the model's test labels.

    Args:
        sub_df: Submission DataFrame; it is modified in place.
        df_test: Test DataFrame forwarded to ``test``.
        model: Model forwarded to ``test``.

    Returns:
        The same ``sub_df`` object with its ``prediction`` column populated
        with integer labels.
    """
    preds = test(df_test, model)
    # Join all batches in one call; atleast_1d tolerates 0-d results coming
    # from a single-example batch.
    batches = [np.atleast_1d(pred) for pred in preds]
    if batches:
        predictions = np.concatenate(batches)
    else:
        predictions = np.array([], dtype=int)
    sub_df["prediction"] = predictions
    print('Submission Dataframe created successfully')
    return sub_df

In [None]:
# Move the model to the selected device before running inference.
model.to(device)

In [None]:
# Fill the sample-submission frame with predictions, store them as integers
# and write the result without the index column.
submission_df = submission(sub_df, df_test, model)
submission_df["prediction"] = submission_df["prediction"].astype(int)
submission_df.to_csv("submission.csv", index=False)