# Quora Insincere Questions Classification
## Detect toxic content to improve online conversations



In [None]:
!wget https://github.com/ravi-ilango/acm-dec-2020-nlp/blob/main/lab2_1/quora_data.zip?raw=true -O quora_data.zip

!unzip quora_data.zip

In [None]:
import random
import os
from datetime import datetime
import pandas as pd

import torch   
from torchtext import data 
from torch.autograd import Variable

In [None]:
#Reproducing same results
SEED = 2315

#Python's built-in RNG (the `random` module is imported at the top of the file)
random.seed(SEED)

#NumPy's global RNG, which scikit-learn falls back to when no random_state is given
import numpy as np
np.random.seed(SEED)

#Torch
torch.manual_seed(SEED)

#Cuda algorithms: request deterministic kernels and disable the auto-tuner,
#which may otherwise pick different kernels from run to run
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

### Load custom dataset

In [None]:
# Read quora_data/train.csv into a DataFrame.
df = pd.read_csv('quora_data/train.csv')

print (f"Number of records: {len(df)}")

# Preview the first rows.
df.head()

In [None]:
# Word count of each question, measured by splitting on whitespace.
df['length'] = df['question_text'].apply(lambda s: len(s.split()))
# Order the rows longest-first, in place.
df.sort_values(by=['length'], ascending=False, inplace=True)

In [None]:
# Distribution of question lengths.
df['length'].hist()

In [None]:
# Keep only questions that are 5 to 30 words long (both bounds inclusive).
df = df[(df['length'] >= 5) & (df['length'] <= 30)]

In [None]:
# Preview the filtered frame.
df.head()

In [None]:
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

tokenizer = get_tokenizer('basic_english')


In [None]:
# This step can take a while on the full dataset.

# Count every token the tokenizer produces across all remaining questions.
# Iterating over the text column directly avoids DataFrame.iterrows(), which
# materialises a Series for every row just to read a single field.
counter = Counter()
for question in df['question_text']:
    counter.update(tokenizer(question))

# min_freq=1 keeps every token that occurs at least once.
vocab = Vocab(counter, min_freq=1)

In [None]:
# Report the vocabulary size and the ten most frequent tokens.
print(f"Size of TEXT vocabulary: {len(vocab)}\n")

print(f"Commonly used words: {vocab.freqs.most_common(10)}\n")

In [None]:
def text_pipeline(x):
    """Tokenize *x* and return the vocabulary index of each token, in order."""
    token_ids = []
    for token in tokenizer(x):
        token_ids.append(vocab[token])
    return token_ids


def label_pipeline(x):
    """Cast a raw label value to ``int``."""
    return int(x)

### Custom DataLoader

In [None]:
#Split into training, validation and test datasets

from sklearn.model_selection import train_test_split

# random_state pins the shuffling so the three splits are identical on every
# run. Resulting proportions of df: 50% test, 30% train, 20% validation.
# NOTE: the name `train` is rebound later in this file by the `train()`
# function, so this cell (and the DataLoader cell) must run before that one.
train, test = train_test_split(df, test_size=0.5, random_state=SEED)
train, valid = train_test_split(train, test_size=0.4, random_state=SEED)

# Longest questions first within each split.
train.sort_values(by=['length'], ascending=False, inplace=True)
test.sort_values(by=['length'], ascending=False, inplace=True)
valid.sort_values(by=['length'], ascending=False, inplace=True)

In [None]:
test

In [None]:
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from torch.utils.data import Dataset

class PandasDataset(Dataset):
    """Dataset view over a pandas DataFrame, yielding one row per index."""

    def __init__(self, dataframe):
        # Only a reference is stored; rows are looked up on access.
        self.dataframe = dataframe

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.dataframe.index)

    def __getitem__(self, index):
        """Return the row at integer position *index* (positional, not by label)."""
        by_position = self.dataframe.iloc
        return by_position[index]

def generate_batch(batch):
    """Collate dataset rows into padded tensors for one mini-batch.

    Args:
        batch: sequence of rows, each indexable by 'question_text' and 'target'.

    Returns:
        A ``(labels, token_ids, lengths)`` tuple:
        labels    -- float32 tensor, one entry per row, moved to ``device``;
        token_ids -- int64 tensor of shape [rows, longest row], moved to ``device``;
        lengths   -- int64 tensor with each row's unpadded token count, left on
                     the CPU because pack_padded_sequence expects CPU lengths.
    """
    # Padding value used by the original code; presumably the vocabulary's
    # padding index -- TODO confirm against the Vocab in use.
    pad_index = 1

    labels = [label_pipeline(row['target']) for row in batch]
    token_rows = [text_pipeline(row['question_text']) for row in batch]
    lengths = [len(tokens) for tokens in token_rows]

    # Pad to the longest *tokenized* row. Sizing the batch from the whitespace
    # word count of the first row (as before) cut off tokens, because the
    # tokenizer can emit more tokens than there are whitespace-separated words.
    seq_size = max(lengths, default=0)
    padded_rows = [tokens + [pad_index] * (seq_size - len(tokens))
                   for tokens in token_rows]

    label_list = torch.tensor(labels, dtype=torch.float32)
    text_list = torch.tensor(padded_rows, dtype=torch.int64)
    text_length_list = torch.tensor(lengths, dtype=torch.int64)
    return label_list.to(device), text_list.to(device), text_length_list
    
batch_size = 64


def _make_loader(frame):
    """Wrap *frame* in a DataLoader that serves rows in the frame's own order."""
    return DataLoader(
        PandasDataset(frame),
        batch_size=batch_size,
        shuffle=False,
        collate_fn=generate_batch,
    )


dataloader_train = _make_loader(train)

dataloader_test = _make_loader(test)

dataloader_valid = _make_loader(valid)

In [None]:
len(dataloader_train)

### LSTM Classifier

In [None]:
import torch.nn as nn

class classifier(nn.Module):
    """LSTM text classifier: embedding -> (bi)LSTM -> linear -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output units of the final linear layer.
        num_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability between stacked LSTM layers.
    """

    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers,
                 bidirectional, dropout):

        #Constructor
        super().__init__()

        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        #lstm layer
        #nn.LSTM applies dropout only between stacked layers, so it is disabled
        #for a single layer (where it has no effect and only triggers a warning)
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=num_layers,
                            bidirectional=bidirectional,
                            dropout=dropout if num_layers > 1 else 0.0,
                            batch_first=True)

        #dense layer: its input is one final hidden state per direction
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        #activation function
        self.act = nn.Sigmoid()

    def forward(self, text, text_lengths):
        """Return sigmoid outputs of shape [batch size, output_dim].

        Args:
            text: int64 tensor of token ids, [batch size, sent_length].
            text_lengths: unpadded length of each row (tensor or list of ints).
        """
        #text = [batch size, sent_length]
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]

        #pack_padded_sequence expects the lengths on the CPU
        lengths = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()

        #packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded,
                                                            lengths,
                                                            batch_first=True,
                                                            enforce_sorted=False)
        _, (hidden, _) = self.lstm(packed_embedded)

        #hidden = [num layers * num directions, batch size, hid dim]
        #(batch_first only affects the input/output tensors, not the states)

        if self.lstm.bidirectional:
            #concat the last layer's final forward and backward hidden states
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            #single direction: the last entry is the top layer's final state
            hidden = hidden[-1, :, :]

        #hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)

        #Final activation function
        outputs = self.act(dense_outputs)

        return outputs


#### Instantiate an LSTM Classifier model

In [None]:
#define hyperparameters
size_of_vocab = len(vocab)
embedding_dim = 100
hidden_dim = 32
output_dim = 1
num_layers = 2
bidirection = True
dropout = 0.2

#instantiate the model
#pass the `bidirection` setting defined above instead of a hard-coded literal,
#so that changing the hyperparameter actually changes the model
model = classifier(size_of_vocab, embedding_dim, hidden_dim, output_dim, num_layers,
                   bidirectional = bidirection, dropout = dropout)



In [None]:
#architecture
print(model)

#No. of trainable parameters
def count_parameters(model):
    """Return the total element count of the model's gradient-tracked parameters."""
    total = 0
    for param in model.parameters():
        # Parameters with requires_grad disabled are not counted.
        if param.requires_grad:
            total += param.numel()
    return total
    
print(f'The model has {count_parameters(model):,} trainable parameters')

# #Initialize the pretrained embedding  ()
# pretrained_embeddings = TEXT.vocab.vectors
# # model.embedding.weight.data.copy_(pretrained_embeddings) # TODO PLEASE USE THIS FOR LSTM

# print(pretrained_embeddings.shape)

In [None]:
import torch.optim as optim

#define optimizer and loss
# Adam over all model parameters; no optimizer hyperparameters are overridden.
optimizer = optim.Adam(model.parameters())
# Binary cross-entropy computed on the model's sigmoid outputs.
criterion = nn.BCELoss()

#define metric
def binary_accuracy(preds, y):
    """Return, as a tensor, the share of rounded predictions that equal *y*."""
    # torch.round snaps each prediction to the nearest integer before comparing
    hits = torch.eq(torch.round(preds), y).float()
    return hits.sum() / len(hits)
    
#push to cuda if available
# `device` is "cuda" when torch.cuda.is_available() and "cpu" otherwise.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
device

### Model Train function 

In [None]:
def train(model, dataloader, optimizer, criterion):
    """Run one training epoch over *dataloader*.

    Args:
        model: module called as ``model(text, text_lengths)``.
        dataloader: yields ``(label, text, text_lengths)`` batches.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(predictions, label)``.

    Returns:
        ``(mean loss, mean accuracy)``, each averaged over the number of batches.
    """
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #set the model in training phase
    model.train()

    for idx, (label, text, text_lengths) in enumerate(dataloader):

        #resets the gradients after every batch
        optimizer.zero_grad()

        #match the label's 1D shape. A bare squeeze() would also drop the batch
        #dimension when a batch holds a single sample, leaving a 0-d tensor
        #whose shape no longer agrees with the label.
        predictions = model(text, text_lengths).reshape(label.shape)

        #compute the loss
        loss = criterion(predictions, label)

        #compute the binary accuracy
        acc = binary_accuracy(predictions, label)

        #backpropagate the loss and compute the gradients
        loss.backward()

        #update the weights
        optimizer.step()

        #loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += acc.item()

        #progress report every 1000 batches
        if idx % 1000 == 0:
            print (f"Completed {idx}/{len(dataloader)}")
    return epoch_loss / len(dataloader), epoch_acc / len(dataloader)

### Model Evaluate function

In [None]:
def evaluate(model, dataloader, criterion):
    """Score *model* on *dataloader* without updating any weights.

    Args:
        model: module called as ``model(text, text_lengths)``.
        dataloader: yields ``(label, text, text_lengths)`` batches.
        criterion: loss called as ``criterion(predictions, label)``.

    Returns:
        ``(mean loss, mean accuracy)``, each averaged over the number of batches.
    """
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()

    #deactivates autograd
    with torch.no_grad():

        for label, text, text_lengths in dataloader:

            #match the label's 1D shape. A bare squeeze() would also drop the
            #batch dimension when a batch holds a single sample, leaving a 0-d
            #tensor whose shape no longer agrees with the label.
            predictions = model(text, text_lengths).reshape(label.shape)

            #compute loss and accuracy
            loss = criterion(predictions, label)
            acc = binary_accuracy(predictions, label)

            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(dataloader), epoch_acc / len(dataloader)

### Check model's forward pass

In [None]:
#Check model device type
next(model.parameters()).is_cuda, device

In [None]:
len(dataloader_train)

In [None]:
# Smoke-test the forward pass on the first training batch only.
for idx, (label, text, text_lengths) in enumerate(dataloader_train):
    print ("text.shape: ", text.shape)
    predictions = model(text, text_lengths)
    print ("predictions.shape: ", predictions.shape)
    # One batch is enough for a shape check.
    break

In [None]:
# First sample's token ids, then the first 10 predictions next to their labels.
# predictions is [batch size, 1], so index column 0 across the first 10 rows;
# predictions[0][:10] would only ever show the single output of sample 0.
text[0], predictions[:10, 0], label[:10]

### Train the model

This step takes about 4 minutes.

In [None]:
# File that receives the weights of the best epoch so far.
model_path = 'saved_weights.pt'

N_EPOCHS = 3
# Lowest validation loss seen so far; starts at infinity so the first epoch always saves.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # Timestamp each epoch so the log shows how long an epoch takes.
    ts_string = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
    print(f'\n {ts_string} Epoch: {epoch}')
    #train the model
    train_loss, train_acc = train(model, dataloader_train, optimizer, criterion)
    
    #evaluate the model
    valid_loss, valid_acc = evaluate(model, dataloader_valid, criterion)
    
    #save the best model
    # Only a strictly lower validation loss overwrites the checkpoint.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model_path)

    print(f'\t Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

### Predict

In [None]:
#load weights
# NOTE(review): the restore below is disabled, so the predictions that follow
# use the in-memory weights from the last epoch, not the lowest-validation-loss
# checkpoint written to model_path by the training loop.
#model.load_state_dict(torch.load(model_path));
# Switch the model to evaluation mode before predicting.
model.eval();

def prepare_text(sentence):
    """Turn one sentence into a ``(token_ids, length)`` pair shaped as a batch of one.

    The token-id tensor is moved to ``device``; the length tensor is returned
    without being moved.
    """
    token_ids = text_pipeline(sentence)
    # Wrapping each value in a list supplies the leading batch dimension of size 1.
    batch_ids = torch.tensor([token_ids], dtype=torch.int64)
    batch_length = torch.tensor([len(token_ids)], dtype=torch.int64)
    return batch_ids.to(device), batch_length

def predict(model, sentence):
    """Return the model's output for a single sentence as a Python float.

    Args:
        model: module called as ``model(token_ids, lengths)``.
        sentence: raw text to classify.
    """
    tensor, length = prepare_text(sentence)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        prediction = model(tensor, length)                  #prediction
    return prediction.item()

In [None]:
def insincere_or_not(pred):
    """Map a prediction score to its label; scores strictly above 0.5 are insincere."""
    if pred > .5:
        return 'Insincere Question'
    return 'Normal Question'

In [None]:
# Example intended as a sincere question.
pred = predict(model, "What is your favorite person in history?")
print (insincere_or_not(pred))

In [None]:
# Example intended as an insincere question.
pred = predict(model, "Why Indian girls go crazy about marrying Shri. Rahul Gandhiji?")
print (insincere_or_not(pred))

### Note

This notebook uses data and code from a blog post on https://www.analyticsvidhya.com