In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from zipfile import ZipFile 
import seaborn as sns
from sklearn.metrics import confusion_matrix
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
input_root = '/kaggle/input'
for folder, _, names in os.walk(input_root):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

    
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



### Set the random seed

In [None]:
import random

import numpy as np
import torch

# Seed every random number generator this notebook draws from, not just Python's.
# NOTE: GPU kernels can still be non-deterministic, so runs may differ slightly.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)     # global NumPy RNG: used by pandas .sample() and scikit-learn when no random_state is given
torch.manual_seed(SEED)  # PyTorch RNG: weight initialisation and dropout masks

In [None]:
import torch
import torch.nn as nn

torch.__version__

# **Unzip word embeddings**

In [None]:
# Extract only the GloVe vectors from the embeddings archive.
# The extraction is skipped when a file of the expected size is already present,
# so re-running the notebook does not unpack the archive again.
glove_member = 'glove.840B.300d/glove.840B.300d.txt'
with ZipFile('/kaggle/input/quora-insincere-questions-classification/embeddings.zip') as z:
    member_info = z.getinfo(glove_member)
    already_extracted = (
        os.path.isfile(glove_member)
        and os.path.getsize(glove_member) == member_info.file_size
    )
    if not already_extracted:
        z.extract(member_info)

**Read embeddings and store in dictionary.**

In [None]:
# Parse the GloVe text file into a dictionary {token: float32 vector}.
# Each line is expected to look like "<token> <v1> <v2> ... <vN>", separated by single spaces.
word2vec = {}
# `with` closes the file even if parsing fails; the explicit encoding keeps the
# result independent of the platform's default locale.
with open('glove.840B.300d/glove.840B.300d.txt', encoding='utf-8') as f:
    for line in f:
        # Split on a literal space (not on arbitrary whitespace) after dropping the newline.
        values = line.rstrip('\n').split(" ")
        word2vec[values[0]] = np.asarray(values[1:], dtype='float32')

**Read and split data into train and val sets.**

In [None]:
# Load the labelled questions and hold out 10% of them as a validation set.
train_df = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/train.csv')
train_df, val_df = train_test_split(
    train_df,
    test_size=0.1,
    random_state=1234,            # fixed seed -> the same split on every run
    stratify=train_df['target'],  # keep the class ratio identical in train and validation
)
train_df.head()

**Plot the distribution of sentence lengths, then keep only sentences of at most 40 words (for performance reasons).**

# Data analysis

In [None]:
max_length = 40

# Number of space-separated words in every training question (computed once, vectorised).
lengths = train_df['question_text'].str.split(' ').str.len()

# distribution of sentence lengths
# `sns.distplot` is deprecated; `histplot` with a KDE overlay and density
# normalisation draws the equivalent histogram + density curve.
ax = sns.histplot(lengths, kde=True, stat='density')
ax.set(title='Distribution of question lengths', xlabel='words per question')

# Keep only the questions that have at most `max_length` words.
train_df = train_df[lengths <= max_length]

## Distribution of positive and negative sentences

In [None]:
import plotly.express as px

# Count the rows per class first, so that the figure only carries one row per class
# instead of the whole training frame.
target_counts = (
    train_df['target']
    .value_counts()
    .rename_axis('target')
    .reset_index(name='count')
)
fig = px.pie(target_counts, names='target', values='count',
             title='Distribution of sentiment', width=600, height=400)
fig.show()

As we can see, the data is heavily imbalanced; that's why we use the F1 score rather than plain accuracy.

## WordCloud for negative sentences (target = 0)

In [None]:
# Concatenate every question with target == 0 into one string and build its word cloud.
negative_text = ' '.join(train_df.loc[train_df['target'] == 0, 'question_text'])
wordcloud = WordCloud().generate(negative_text)

# Render the cloud on a wide figure with the axes hidden.
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

## WordCloud for positive sentences (target = 1)

In [None]:
# Select the questions labelled target == 1 and merge them into a single text.
is_positive = train_df['target'] == 1
positive_questions = train_df[is_positive]['question_text']
wordcloud = WordCloud().generate(' '.join(positive_questions))

# Show the generated image without axis ticks or frame.
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

**Batch generator: yields embedding tensors of shape [batch_size, sentence_length, embedding_size] together with the matching targets, where sentence_length = 40 and embedding_size = 300.**

In [None]:
def put_embeddings(text):
    """Turn a sentence into a fixed-size matrix of word embeddings.

    The text is split on single spaces. Each of the first ``max_length`` words is
    looked up in ``word2vec``, first as written and then lower-cased; words that
    are not found, and the padding positions of short sentences, stay all-zero.

    Args:
        text: the sentence to embed (str).

    Returns:
        A ``float32`` NumPy array of shape ``(max_length, 300)``.
    """
    emb_dim = 300
    # Pre-allocating the zero matrix gives a consistent dtype and avoids building
    # a fresh zero vector for every word.
    sentence = np.zeros((max_length, emb_dim), dtype=np.float32)
    # Words beyond max_length are discarded anyway, so they are never looked up.
    for pos, word in enumerate(text.split(' ')[:max_length]):
        vector = word2vec.get(word)
        if vector is None:
            # Fall back to the lower-cased form when the exact spelling is unknown.
            vector = word2vec.get(word.lower())
        if vector is not None:
            sentence[pos] = vector
    return sentence

def batch_gen(df, batch_size, device=None):
    """Yield shuffled mini-batches of embedded questions and their targets.

    Args:
        df: DataFrame with a ``question_text`` and a ``target`` column.
        batch_size: maximum number of rows per batch; the last batch may be smaller.
        device: torch device (or device string) the tensors are created on.
            Defaults to ``cuda:0`` when CUDA is available, otherwise the CPU.

    Yields:
        ``(embeddings, targets)`` where ``embeddings`` is a float32 tensor with one
        ``put_embeddings`` matrix per row and ``targets`` holds the matching labels.
    """
    if device is None:
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    shuffled = df.sample(frac=1.)  # Shuffle the data.
    texts = shuffled['question_text'].to_list()
    targets = shuffled['target'].to_list()

    for start in range(0, len(texts), batch_size):
        stop = start + batch_size
        # Build the batch directly as float32 (the dtype the model consumes)
        # instead of creating a float64 copy that is cast again later.
        embs = np.stack([put_embeddings(txt) for txt in texts[start:stop]])
        embs = embs.astype(np.float32, copy=False)
        yield (torch.as_tensor(embs, device=device),
               torch.tensor(targets[start:stop], device=device))

# Training

### LSTM model built on PyTorch's LSTM module: dropout is applied to the input embeddings, and a linear layer maps the final hidden state to 2 outputs (one per class).

In [None]:
class LSTMModel(nn.Module):
    """LSTM classifier over pre-computed word embeddings.

    Dropout is applied to the input, the sequence is run through a batch-first
    LSTM, and the LSTM's final hidden state is projected to 2 output values.
    """

    def __init__(self, embedding_dim, hidden_dim, dropout):
        """Create the layers.

        Args:
            embedding_dim: size of each input embedding vector.
            hidden_dim: size of the LSTM hidden state.
            dropout: dropout probability applied to the input.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        # The parameterised layers are created in the order LSTM -> linear.
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(p=dropout)
        self.classifier = nn.Linear(in_features=hidden_dim, out_features=2)

    def forward(self, inp):
        """Return the classifier output computed from the LSTM's final hidden state.

        The hidden state keeps its leading dimension, which callers remove
        themselves by squeezing the result.
        """
        dropped = self.dropout(inp)
        _outputs, (final_hidden, _final_cell) = self.lstm(dropped)
        logits = self.classifier(final_hidden)
        return logits

### Classical training loop with a learning-rate scheduler that adapts the learning rate. I only train for 30 epochs, but increasing the number of epochs gives better results.

In [None]:
def get_score(y_pred, y):
    """Compute the F1 score of two-column model outputs against the true labels.

    Args:
        y_pred: tensor with one row per sample and two columns (one score per class).
        y: tensor with the true labels (0 or 1).

    Returns:
        The F1 score as returned by ``f1_score`` (0 when it is undefined).
    """
    # Class 0 is predicted only when its score is strictly larger than class 1's;
    # everything else (including ties) is class 1. The comparison is done on whole
    # columns instead of looping over the rows in Python.
    predicted = (~(y_pred[:, 0] > y_pred[:, 1])).long().cpu().numpy()
    return f1_score(y.cpu(), predicted, zero_division=0)

def compute_perplexity(model, dl):
    """Evaluate the model on a data iterator.

    Args:
        model: the network; called on float32 inputs, its output is squeezed on dim 0.
        dl: iterable of ``(x, y)`` batches of any size.

    Returns:
        ``(perplexity, f1)`` where ``perplexity`` is ``exp`` of the mean per-sample
        cross-entropy over all samples and ``f1`` is ``get_score`` computed once
        over all predictions.

    Raises:
        ValueError: if ``dl`` yields no samples.
    """
    was_training = model.training
    model.eval()

    total_loss = 0.0
    n_samples = 0
    all_logits = []
    all_targets = []
    try:
        # No gradients are needed for evaluation, so do not record the graph.
        with torch.no_grad():
            for x, y in dl:
                logits = model(x.float()).squeeze(0)
                # Sum (not mean) per batch so that a smaller last batch is weighted
                # correctly when dividing by the number of samples below.
                total_loss += torch.nn.functional.cross_entropy(logits, y, reduction='sum').item()
                n_samples += y.shape[0]
                all_logits.append(logits.cpu())
                all_targets.append(y.cpu())
    finally:
        # Put the model back into the mode it was in before evaluation.
        model.train(was_training)

    if n_samples == 0:
        raise ValueError('compute_perplexity received no samples to evaluate')

    score = get_score(torch.cat(all_logits), torch.cat(all_targets))
    return np.exp(total_loss / n_samples), score
 
    
def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Returns ``None`` when the optimizer has no parameter groups.
    """
    learning_rates = (group['lr'] for group in optimizer.param_groups)
    return next(learning_rates, None)
    
def train_loop(model, epochs=30, lr=1e-4, weight_decay=1e-4):
    """Train ``model`` on ``train_df`` and report validation metrics after every epoch.

    Uses Adam with a ``ReduceLROnPlateau`` scheduler that is stepped on the
    validation perplexity. One line with the average training loss, validation
    perplexity, current learning rate and F1 score is printed per epoch.

    Args:
        model: the network to train (updated in place).
        epochs: number of passes over the training data.
        lr: initial learning rate for Adam.
        weight_decay: weight decay passed to Adam.
    """
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, min_lr=1e-6, patience=10)
    crit = nn.CrossEntropyLoss(reduction='mean')
    bs = 128  # batch size used for both the training and the validation batches

    for epoch in range(epochs):
        total_loss = 0.0
        batch_num = 0

        # batch_gen is called again every epoch, which builds a fresh set of batches.
        for x, y in batch_gen(train_df, bs):
            optimizer.zero_grad()
            y_pred = model(x.float()).squeeze(0)
            loss = crit(y_pred, y)
            loss.backward()

            # doing gradient descent step.
            optimizer.step()

            total_loss += loss.item()
            batch_num += 1

        curr_perplexity, score = compute_perplexity(model, batch_gen(val_df, bs))
        # The scheduler lowers the learning rate when the validation perplexity stops improving.
        lr_scheduler.step(curr_perplexity)

        # Guard against an empty training set instead of dividing by zero.
        avg_train_loss = total_loss / batch_num if batch_num else float('nan')
        print('Epoch', epoch + 1, '| Avg Train Loss', avg_train_loss, '| Dev Perplexity', curr_perplexity, '| LR ', get_lr(optimizer), '| F1 score', score)

# Hyper-parameters of the classifier.
embedding_dim = 300  # size of each word vector fed to the LSTM
hidden_dim = 30      # size of the LSTM hidden state
dropout = 0.2        # dropout probability

# Build the model, move it to the GPU and train it.
model = LSTMModel(embedding_dim=embedding_dim, hidden_dim=hidden_dim, dropout=dropout)
model = model.cuda()
train_loop(model)

# Testing

### Read test data frame

In [None]:
import math

# Load the test questions and report how many rows there are.
test_csv_path = "/kaggle/input/quora-insincere-questions-classification/test.csv"
test_df = pd.read_csv(test_csv_path)

print(test_df.shape[0])

### Compute model predictions for every test question and write the output to submission.csv

In [None]:
def getBatchedTest(batch_size, device=None):
    """Yield the test questions as batches of embedded sentences, in file order.

    Args:
        batch_size: maximum number of questions per batch; the last batch may be smaller.
        device: torch device (or device string) for the yielded tensors.
            Defaults to ``cuda:0`` when CUDA is available, otherwise the CPU.

    Yields:
        A float32 tensor with one ``put_embeddings`` matrix per question.
    """
    if device is None:
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Select the text column by name rather than by position.
    texts = test_df['question_text'].to_list()
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        # Pass the full text: put_embeddings already limits the sentence to
        # max_length words. Slicing the string here would cut it to max_length
        # *characters*, so the model would see only the first few words.
        embs = np.stack([put_embeddings(txt) for txt in chunk]).astype(np.float32, copy=False)
        yield torch.as_tensor(embs, device=device)
        
        
# Predict a class for every test question, batch by batch, without tracking gradients.
all_preds = []
model.eval()
with torch.no_grad():
    for X_batch in getBatchedTest(256):
        y_test_pred = model(X_batch).squeeze(0)
        # Class 0 only when its score is strictly larger, otherwise class 1.
        # Comparing whole columns avoids a Python-level loop over every row.
        batch_preds = (~(y_test_pred[:, 0] > y_test_pred[:, 1])).long()
        all_preds.extend(batch_preds.cpu().tolist())

# Write one prediction per question id to the submission file.
submit_df = pd.DataFrame({"qid": test_df["qid"], "prediction": all_preds})
submit_df.to_csv("submission.csv", index=False)

Test the model on custom sentences

In [None]:

def get_prediction(sentence):
    """Classify a single sentence with the global ``model``.

    The sentence is embedded with ``put_embeddings`` and run through the model in
    evaluation mode (dropout disabled) without tracking gradients. The model's
    previous train/eval mode is restored afterwards.

    Args:
        sentence: the text to classify (str).

    Returns:
        ``0`` if the first model output is strictly larger than the second, else ``1``.
    """
    # Create the input on the same device as the model instead of assuming a GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    inp = torch.as_tensor(put_embeddings(sentence), dtype=torch.float32, device=device).unsqueeze(0)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            y_pred = model(inp).squeeze()
    finally:
        model.train(was_training)

    return 0 if y_pred[0] > y_pred[1] else 1

# Example questions printed under the 'negatives' header.
neg = [
    'Why do so many women become so rude and arrogant when they get just a little bit of wealth and power?',
    'Why do Bengali and Kolkata people dominate Hindi language speaking people from Delhi, Mumbai and other parts of India?',
]

# Example questions printed under the 'positives' header.
pos = [
    'When should I apply for RV college of engineering and BMS college of engineering? Should I wait for the COMEDK result or am I supposed to apply before the result?',
    'What kind of knife can I own in California?',
]

# Print every example followed by the class predicted for it.
print('negatives')
for question in neg:
    print(question, ' prediction --', get_prediction(question), '\n')
print('positives')
for question in pos:
    print(question, ' prediction --', get_prediction(question), '\n')
