In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from transformers import RobertaTokenizer, RobertaModel
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, Dataset, TensorDataset
from tqdm import tqdm

from sklearn.utils.class_weight import compute_class_weight
import torch.utils
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.optim.lr_scheduler import MultiStepLR

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Read the three competition CSVs (train, test, sample submission) in that order.
df_train, df_test, sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/quora-insincere-questions-classification/train.csv",
        "/kaggle/input/quora-insincere-questions-classification/test.csv",
        '/kaggle/input/quora-insincere-questions-classification/sample_submission.csv',
    )
)

In [None]:
class TrainDataset(Dataset):
    """Map-style dataset yielding ``(input_ids, attention_mask, target)`` per question.

    Args:
        question_text: Indexable collection of questions (list, array or
            pandas Series). Each item is converted with ``str``.
        targets: Indexable collection of labels aligned with
            ``question_text``. Each item is converted with ``float``.
        tokenizer: Object exposing ``encode_plus`` with the keyword
            arguments used in ``__getitem__``.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, question_text, targets, tokenizer, max_length=60):
        self.texts = question_text
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    @staticmethod
    def _at(values, idx):
        """Return the ``idx``-th element of ``values`` by position.

        ``values[idx]`` on a pandas Series with an integer index is a *label*
        lookup, so a Series that kept its original (non 0..n-1) index would
        raise ``KeyError`` or return the wrong row. Use ``.iloc`` when it is
        available and plain indexing otherwise.
        """
        positional = getattr(values, 'iloc', None)
        return values[idx] if positional is None else positional[idx]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th question and return it with its target.

        Returns:
            Tuple of flattened ``input_ids``, flattened ``attention_mask`` and
            the target as a scalar tensor built from a Python float.
        """
        text = str(self._at(self.texts, idx))
        target = float(self._at(self.targets, idx))
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            add_special_tokens=True,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a leading batch axis; flatten drops it so
        # the DataLoader can stack samples itself.
        return (
            encoding['input_ids'].flatten(),
            encoding['attention_mask'].flatten(),
            torch.tensor(target),
        )

In [None]:
class TestDataset(Dataset):
    """Map-style dataset yielding ``(input_ids, attention_mask)`` per test question.

    Args:
        df_test: Object exposing a ``question_text`` attribute (e.g. a pandas
            DataFrame with that column). Each item is converted with ``str``.
        tokenizer: Object exposing ``encode_plus`` with the keyword
            arguments used in ``__getitem__``.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self,df_test, tokenizer, max_length = 60):
        self.texts = df_test.question_text
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self,idx):
        """Tokenize the ``idx``-th question (by position) and return its tensors."""
        # `series[idx]` is a label lookup on an integer-indexed Series; use
        # .iloc so a filtered/shuffled frame is still read by position.
        positional = getattr(self.texts, 'iloc', None)
        raw_text = self.texts[idx] if positional is None else positional[idx]
        text = str(raw_text)
        encoding = self.tokenizer.encode_plus(
            text,
            max_length = self.max_length,
            padding = 'max_length',
            truncation = True,
            add_special_tokens = True,
            return_tensors = 'pt'
        )
        # return_tensors='pt' adds a leading batch axis; flatten drops it so
        # the DataLoader can stack samples itself.
        return (encoding['input_ids'].flatten(),encoding['attention_mask'].flatten())

In [None]:
# Load the tokenizer object from a file in the attached Kaggle dataset.
# NOTE(review): presumably a BertTokenizer that was stored with torch.save — confirm.
# torch.load unpickles arbitrary Python objects, so only point this at a trusted file.
bert_tokenizer = torch.load('/kaggle/input/bert_tokenizer/pytorch/1.01/1/Bert Tokenizer.pth')

In [None]:
# Load pre-trained BERT tokenizer
# bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# train_inputs, val_inputs,train_targets, val_targets = train_test_split(df_train.question_text, df_train.target,stratify=df_train.target,test_size=0.2, random_state=42)

In [None]:
# train_inputs.reset_index(drop=True,inplace=True)
# train_targets.reset_index(drop=True,inplace=True)
# val_targets.reset_index(drop=True,inplace=True)
# val_inputs.reset_index(drop=True,inplace=True)

In [None]:
# train_ds = TrainDataset(train_inputs,train_targets,bert_tokenizer)
# val_ds = TrainDataset(val_inputs,val_targets,bert_tokenizer)
# train_dl = DataLoader(train_ds,  batch_size=256)
# val_dl = DataLoader(val_ds, batch_size=512)

In [None]:
# class BERT_MODEL(nn.Module):
#     def __init__(self):
#         super(BERT_MODEL,self).__init__()
#         self.bert_model = BertModel.from_pretrained('bert-base-uncased')
#         self.linear = nn.Sequential(
#         nn.Linear(768,1024),
#         nn.ReLU(),
#         nn.Dropout(0.3),
#         nn.Linear(1024,1)
#         )
    
#     def forward(self,input_ids,attention_mask):   
#         output = self.bert_model(input_ids = input_ids,attention_mask = attention_mask)
#         pooled_output = output.pooler_output
#         output = self.linear(pooled_output)
#         return output

In [None]:
class BERT_MODEL(nn.Module):
    """Pickled encoder followed by an MLP head that emits one logit per input row.

    Args:
        bert_path: Path of the pickled encoder module passed to
            ``torch.load``. ``None`` (the default) uses ``DEFAULT_BERT_PATH``,
            the Kaggle dataset location this notebook was written against.

    The encoder is expected to accept ``input_ids``/``attention_mask`` keyword
    arguments and return an object with a 768-wide ``pooler_output``.
    """

    DEFAULT_BERT_PATH = '/kaggle/input/bert-base-uncased/pytorch/1.01/1/BertModel_base_uncased.pth'

    def __init__(self, bert_path=None):
        super().__init__()
        if bert_path is None:
            bert_path = self.DEFAULT_BERT_PATH
        # NOTE(security): torch.load unpickles arbitrary objects; only load
        # encoder files you trust.
        # map_location='cpu' lets an encoder saved from a GPU session load on
        # CPU-only machines; callers move the whole model with `.to(device)`.
        self.bert_model = torch.load(bert_path, map_location='cpu')
        # Attribute names and layer order are kept so saved state-dict keys
        # (`bert_model.*`, `linear.0.*`, `linear.3.*`) still match.
        self.linear = nn.Sequential(
            nn.Linear(768,1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024,1)
        )

    def forward(self,input_ids,attention_mask):
        """Return raw logits of shape ``(rows, 1)`` for the given token tensors."""
        output = self.bert_model(input_ids = input_ids,attention_mask = attention_mask)
        pooled_output = output.pooler_output
        output = self.linear(pooled_output)
        return output

In [None]:
# model = BERT_MODEL()

In [None]:
# model.to(device)

In [None]:
# def find_best_f1(outputs, labels):
#     tmp = [0, 0, 0]  # idx, cur, max
#     threshold = 0

#     for tmp[0] in np.arange(0.1, 0.99, 0.01):
#         tmp[1] = f1_score(labels, outputs > tmp[0])
#         if tmp[1] > tmp[2]:
#             threshold = tmp[0]
#             tmp[2] = tmp[1]

#     return tmp[2], threshold

In [None]:
# def get_preds(logits,threshold):
    
#     # Convert logits to binary predictions based on the threshold
#     predictions = (torch.sigmoid(logits) > threshold).float()
    
#     return predictions

In [None]:
# def evaluate(model,val_dl):
#     losses=[]
#     val_outputs = []
#     val_targets = []
#     accuracy = []
#     f1 = []
#     i=1
#     model.eval()
#     with torch.no_grad():
#         for batch in tqdm(val_dl):
#             input_ids,attention_mask, targets = batch
#             input_ids = input_ids.to(device)
#             attention_mask = attention_mask.to(device)
#             targets = targets.to(device)## target is of type 0.0 and 1.0
            
#             output = model(input_ids,attention_mask)
            
#             loss = BCE(output.squeeze(), targets.float())
            
#             val_outputs.append(torch.sigmoid(output).squeeze().cpu().numpy())
#             val_targets.append(targets.cpu().numpy())
# #             losses.append(loss.item())
# #             f1_Score = f1_score(output.cpu().numpy(),targets.cpu().numpy().astype(int))
# #             f1.append(f1_Score)
# #             if i==5:
# #                 return  val_outputs,val_targets
# #                 break
# #             i+=1
# #             print(f'Val_loss: {loss},val_f1_Score:{f1_Score}')
#     return  val_outputs,val_targets

In [None]:
# BCE = nn.BCEWithLogitsLoss(pos_weight = torch.tensor(15,device=device))


In [None]:
# def fit(epochs,model,train_dl,val_dl):
#     optimizer = torch.optim.AdamW(model.parameters(),lr=1e-4)
#     history = []
#     losses = []
#     train_outputs = []
#     i = 1
#     milestones = [1500, 3000]
#     scheduler = MultiStepLR(optimizer, milestones= milestones, gamma=0.1)
#     model.train()
#     for epoch in range(epochs):
#         for batch in tqdm(train_dl):
#             input_ids, attention_mask, targets = batch
            
#             input_ids = input_ids.to(device)
#             attention_mask = attention_mask.to(device)
#             targets = targets.to(device) 
            
#             output = model(input_ids, attention_mask)
            
#             loss = BCE(output.squeeze(),targets)
#             losses.append(loss.item())
            
#             optimizer.zero_grad()
#             loss.backward()
#             torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5000)
#             optimizer.step()
#             scheduler.step()
            
#             if (i % 10 == 0):
# #                 for name, param in model.named_parameters():
# #                     if (param.grad is not None) & (param.grad.abs().sum()>5000):
# #                         print(name, param.grad.abs().sum())
# #                         torch.save(model.state_dict(), 'insincere_model.pth')
#                 f1, thres = find_best_f1(torch.sigmoid(output.detach()).squeeze().cpu().numpy(), targets.cpu().numpy())
#                 print(f'Batch:{i} ; Loss: {loss:.3f}; Pred at {thres:.3}:{get_preds(output.squeeze(),thres)}; best_Train_f1:{f1:.3f}')
                
#             if (i % 10 == 0):
#                 current_lr = optimizer.param_groups[0]['lr']
#                 print("\nCurrent learning rate:", current_lr)
#                 torch.save(model.state_dict(), 'insincere_model.pth')
#             i += 1
         
#         val_outputs,val_targets = evaluate(model,val_dl)
#         val_outputs = np.concatenate(val_outputs)
#         val_targets = np.concatenate(val_targets)
#         val_f1, threshold = find_best_f1(val_outputs, val_targets)
#         print("Epoch {}; Val F1: {:.3f}, Threshold: {:.3f}".format(epoch,val_f1, threshold))
#     return [val_f1,threshold,val_outputs,val_targets]

In [None]:
# results = fit(1,model,train_dl,val_dl)  # Hyperparameter tuning run; gave bad results

In [None]:
# results = fit(1,model,train_dl,val_dl)  # Hyperparameter tuning (again gave bad results)

In [None]:
# results = []

In [None]:
# results.append(fit(5,model,train_dl,val_dl))

In [None]:
# torch.save(model.state_dict(), 'insincere_model_final_successful.pth')

In [None]:
# results = evaluate(model,val_dl)

In [None]:
# val_outputs,val_targets = results
# val_outputs = np.concatenate(val_outputs)
# val_targets = np.concatenate(val_targets)
# val_f1, threshold = find_best_f1(val_outputs, val_targets)
# print("Val F1: {:.3f} at Threshold: {:.3f}".format(val_f1, threshold))

In [None]:
# Probability cut-off: `test` labels a question 1 when sigmoid(logit) exceeds this value.
# NOTE(review): presumably taken from the validation threshold search in the
# commented-out cells above — confirm before reusing with a different checkpoint.
best_threshold = 0.9

In [None]:
# encoding = bert_tokenizer.encode_plus(
#             "Why are IITs so bad at research?",
#             max_length=60,
#             padding='max_length',
#             truncation=True,
#             add_special_tokens=True,
#             return_tensors='pt'
#         ) 

In [None]:
# output = model(encoding['input_ids'].to(device), encoding['attention_mask'].to(device))

In [None]:
# get_preds(output.squeeze(),0.9)

In [None]:
# del model

In [None]:
# Instantiate the classifier; the fine-tuned weights are loaded in the next cell
# and the model is moved to `device` further down, just before inference.
model = BERT_MODEL()
# model.to(device)

In [None]:
# Restore the fine-tuned weights. map_location='cpu' lets a checkpoint written
# from a GPU session load on CPU-only machines (the `device` fallback); the
# model itself is moved to `device` in a later cell.
model.load_state_dict(
    torch.load(
        '/kaggle/input/insincere_question_final_model/pytorch/1.01/1/insincere_model_final_successful.pth',
        map_location='cpu',
    )
)

In [None]:
def test(df_test,model):
    """Predict 0/1 labels for every question in ``df_test``.

    Args:
        df_test: Frame handed to ``TestDataset`` (read through its
            ``question_text`` attribute).
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one logit per row.

    Returns:
        List with one 1-D integer NumPy array per batch of 32 rows; an entry
        is 1 where ``sigmoid(logit) > best_threshold``.
    """
    test_dataset = TestDataset(df_test,tokenizer = bert_tokenizer)
    test_dl = DataLoader(test_dataset, batch_size = 32)
    preds = []

    # Inference must run in eval mode: otherwise Dropout stays active and the
    # predictions change from run to run. Restore the caller's mode afterwards.
    was_training = model.training
    model.eval()
    try:
        # no_grad avoids building autograd graphs for every batch.
        with torch.no_grad():
            for input_ids, attention_mask in tqdm(test_dl):
                output = model(input_ids.to(device), attention_mask.to(device))
                # reshape(-1) keeps a 1-D result even when the final batch has
                # a single row; squeeze() would collapse it to a 0-d array,
                # which np.concatenate cannot handle downstream.
                probabilities = torch.sigmoid(output).reshape(-1)
                prediction = (probabilities > best_threshold).int()
                preds.append(prediction.cpu().numpy())
    finally:
        model.train(was_training)

    print('test predictions generated successfully!!')
    return preds

In [None]:
def submission(sub_df,df_test,model):
    """Fill ``sub_df['prediction']`` with the model's labels for ``df_test``.

    Args:
        sub_df: Submission frame with one row per test question; it is
            modified in place and also returned.
        df_test: Test frame forwarded to ``test``.
        model: Model forwarded to ``test``.

    Returns:
        ``sub_df`` with an integer ``prediction`` column.

    Raises:
        ValueError: If the number of predictions differs from ``len(sub_df)``.
    """
    preds = test(df_test,model)
    if len(preds) > 0:
        # Flatten each batch first so a 0-d array (single-row batch) is
        # accepted, then join everything in one pass instead of re-copying
        # the growing array for every batch.
        predictions = np.concatenate([np.ravel(pred) for pred in preds]).astype(int)
    else:
        predictions = np.array([], dtype=int)

    if len(predictions) != len(sub_df):
        raise ValueError(
            f'Got {len(predictions)} predictions for {len(sub_df)} submission rows'
        )

    # Item assignment creates the column if it is missing; attribute
    # assignment would only set a Python attribute in that case.
    sub_df['prediction'] = predictions
    print('Submission Dataframe created successfully')
    return sub_df

In [None]:
# Move the model's parameters to the selected device before running inference.
model.to(device)

In [None]:
# Predict on the test set, store the labels as integers and write the CSV
# that Kaggle picks up as the notebook's output.
submission_df = submission(sub_df, df_test, model)
submission_df['prediction'] = submission_df['prediction'].astype(int)
submission_df.to_csv('submission.csv', index=False)