In [None]:
from zipfile import *
from tqdm import tqdm
import os
import random

import seaborn as sns

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter

**Set the random seeds for deterministic results.**

In [None]:
SEED = 1234

# Feed the same seed to every random number generator the notebook relies on:
# Python's `random`, NumPy's global generator, and PyTorch (CPU and CUDA).
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(SEED)

# Request deterministic cuDNN behaviour.
torch.backends.cudnn.deterministic = True

In [None]:
# Kaggle starter snippet: walk the input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Data analysis

In [None]:
# Load train.csv and split it into a training part (90%) and a validation part (10%).
# random_state pins the split, so re-running this cell yields the same rows no matter how much of
# NumPy's global RNG has already been consumed; stratify keeps the insincere/sincere ratio the
# same in both parts, which matters because the target is heavily imbalanced.
full_train_df = pd.read_csv("/kaggle/input/quora-insincere-questions-classification/train.csv")
train_df, val_df = train_test_split(
    full_train_df,
    test_size=0.1,
    random_state=SEED,
    stratify=full_train_df['target'],
)

### Draw graph which shows number of words in sentences of train and validation sets.

In [None]:
# Count words with str.split() (splits on any whitespace run), i.e. the same tokenisation that
# to_emb applies, so the plot reflects the sequence lengths the model will actually see.
train_sentences_lens = train_df['question_text'].str.split().str.len().tolist()
val_sentences_lens = val_df['question_text'].str.split().str.len().tolist()

# Draw both distributions on one labelled axis so the two curves can be told apart.
# NOTE: distplot is deprecated in recent seaborn releases; histplot/kdeplot replace it when upgrading.
fig, ax = plt.subplots()
sns.distplot(train_sentences_lens, ax=ax, label='train')
sns.distplot(val_sentences_lens, ax=ax, label='validation')
ax.set(xlabel='Words per question', ylabel='Density', title='Question length distribution')
ax.legend();

As the graph shows, very few questions contain more than 40 words, so we cap the sentence length at 40 words.

In [None]:
# Number of tokens kept per question: to_emb truncates longer questions to this length
# and zero-pads shorter ones up to it.
SENTENCE_MAX_LEN = 40

### Draw pie chart which shows distribution of positive and negative examples

In [None]:
def pie_chart(positive_samples_num, negative_samples_num, set_type):
    """Show a pie chart of the insincere vs. sincere share of one data set.

    Args:
        positive_samples_num: number of insincere (target == 1) questions.
        negative_samples_num: number of sincere (target == 0) questions.
        set_type: text used as the chart title, e.g. 'train set'.
    """
    figure, axes = plt.subplots()
    axes.set_title(set_type)
    axes.pie(
        [positive_samples_num, negative_samples_num],
        labels=('Insincere', 'Sincere'),
        autopct='%1.1f%%',
        shadow=True,
        startangle=90,
    )

    plt.show()

# Number of insincere (target == 1) questions in each split; Series.sum() is vectorised,
# unlike the built-in sum() over the underlying array.
train_positive_samples_num = int(train_df['target'].sum())
val_positive_samples_num = int(val_df['target'].sum())

# The negative count of each split must be derived from that split's own size.
pie_chart(train_positive_samples_num, len(train_df) - train_positive_samples_num, 'train set')
pie_chart(val_positive_samples_num, len(val_df) - val_positive_samples_num, 'validation set')

From the charts we can see that the data is imbalanced, so later we need to use some techniques to avoid overfitting.
For imbalanced data like this, accuracy is not the best metric for evaluating the model.

## Word clouds

In [None]:
# Concatenate all training questions of each class into one space-separated string
# (the input format expected by the word-cloud generator below).
sincere_questions = train_df.loc[train_df['target'] == 0, 'question_text']
insincere_questions = train_df.loc[train_df['target'] == 1, 'question_text']

sincere_text = ' '.join(sincere_questions.tolist())
insincere_text = ' '.join(insincere_questions.tolist())

In [None]:
def draw_word_cloud(text):
    """Render a word cloud of `text` on a white background.

    At most 100 words are drawn and words shorter than 5 characters are ignored.
    """
    cloud = WordCloud(background_color="white", max_words=100, min_word_length=5)
    rendered = cloud.generate(text)

    plt.figure(figsize=(15, 10))
    plt.imshow(rendered, interpolation='bilinear')
    plt.axis("off")
    plt.show()

### Sincere words cloud

In [None]:
# Word cloud built from the concatenated sincere (target == 0) training questions.
draw_word_cloud(sincere_text)

As the sincere word cloud shows, there are words like use, work, help, make, and so on. Such words are used very frequently in normal questions; they are mostly neutral and indicate someone who is actually looking for advice.

### Insincere words cloud

In [None]:
# Word cloud built from the concatenated insincere (target == 1) training questions.
draw_word_cloud(insincere_text)

We can see a difference from the previous cloud: in this case the most frequently used words are mostly political (for example, Donald Trump) rather than generic advice topics.

It can also be seen that the words themselves are mostly not negative; for example, the word "people" is neither positive nor negative. Coupled with other words, however, it should be possible to perform useful topic analysis. That is why simple sentiment analysis of individual words is not sufficient and we need something more powerful, for example an LSTM.

# Word2vec functions

**Store the embedding of every word in a dictionary,
`embeddings_index` (the key is the word and the value is its embedding array).**


In [None]:
# Build the word -> vector lookup from the GloVe text file stored inside the embeddings archive.
# Each line of the file is "<word> <v1> <v2> ...", separated by single spaces.
embeddings_index = {}

with ZipFile('/kaggle/input/quora-insincere-questions-classification/embeddings.zip') as myzip:
    with myzip.open('glove.840B.300d/glove.840B.300d.txt') as myfile:
        # Stream the archive member line by line: readlines() would keep the whole file in
        # memory as a list of bytes objects on top of the dictionary being built.
        for line in tqdm(myfile):
            values = line.decode().rstrip('\r\n').split(" ")
            word = values[0]
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')

In [None]:
# this function takes text as argument and returns array of embeddings.
def to_emb(text, embedding_dim, max_len=None, index=None):
    """Convert one question into a fixed-size (max_len, embedding_dim) float32 array.

    Args:
        text: the question string.
        embedding_dim: length of every embedding vector.
        max_len: number of rows in the result; defaults to SENTENCE_MAX_LEN.
        index: mapping word -> embedding vector; defaults to embeddings_index.

    Returns:
        np.ndarray of dtype float32. Row i is the embedding of the i-th token;
        unknown tokens and padding rows are all zeros. Questions longer than
        max_len tokens are truncated.
    """
    if max_len is None:
        max_len = SENTENCE_MAX_LEN
    if index is None:
        index = embeddings_index

    # Zero vector shared by unknown words and padding; float32 matches the stored embeddings.
    zero_emb = np.zeros(embedding_dim, dtype=np.float32)

    # Drop only trailing whitespace / sentence-final punctuation so the last word can be looked up.
    # Slicing off the last character unconditionally would damage the last word of any
    # question that does not end with a punctuation mark.
    tokens = text.rstrip().rstrip('?!.').split()[:max_len]
    embeds = [index.get(token, zero_emb) for token in tokens]

    embeds += [zero_emb] * (max_len - len(embeds))
    return np.array(embeds, dtype=np.float32)

In [None]:
# Batch counterpart of to_emb: takes an array of texts and returns one tensor with all their embeddings.
def to_embeddings(texts, embedding_dim):
    """Embed every text with to_emb and return the stacked result as a float tensor on the GPU."""
    per_text = []
    for text in texts:
        per_text.append(to_emb(text, embedding_dim))

    stacked = torch.tensor(np.array(per_text))
    return stacked.float().cuda()

In [None]:
# this function works like iterator, returns next batch_size arrays of texts and targets on every call
def batch_iterator(frame, batch_size, device='cuda'):
    """Yield shuffled (texts, targets) mini-batches from `frame`.

    Args:
        frame: DataFrame with 'question_text' and 'target' columns.
        batch_size: maximum number of rows per batch; the last batch may be smaller.
        device: device the target tensor is moved to. Defaults to 'cuda'.

    Yields:
        (x, y) where x is a NumPy array of question strings and y is a tensor
        holding the matching targets on `device`.
    """
    # Shuffle once per call; the original frame is left untouched.
    shuffled = frame.sample(frac=1).reset_index(drop=True)
    texts = shuffled['question_text'].values
    targets = shuffled['target'].values

    for start in range(0, len(shuffled), batch_size):
        stop = start + batch_size
        yield texts[start:stop], torch.tensor(targets[start:stop]).to(device)

# Model

In [None]:
class LSTMModel(nn.Module):
    """LSTM classifier that works directly on raw question strings.

    The texts are turned into pre-trained embeddings inside forward() through
    to_embeddings, so the module itself contains no embedding layer. The final
    hidden state of the LSTM is fed to a linear layer that produces one score
    per output class.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, dropout):
        super().__init__()

        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim

        # Dropout applied to the embedded input to reduce overfitting.
        self.dropout = nn.Dropout(dropout)

        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            bias=True,
            batch_first=True,
        )

        self.classifier = nn.Linear(hidden_dim, output_dim)

    def forward(self, inp):
        """Return a (batch_size, output_dim) tensor of class scores for the texts in `inp`."""
        # texts -> embeddings, then dropout on the embedded input
        embedded = self.dropout(to_embeddings(inp, self.embedding_dim))
        batch_size = embedded.size(0)

        # Explicit all-zero initial hidden and cell states, created on the input's device.
        state_shape = (1, batch_size, self.hidden_dim)
        initial_hidden = torch.zeros(state_shape, device=embedded.device)
        initial_cell = torch.zeros(state_shape, device=embedded.device)

        _, (last_hidden, _) = self.lstm(embedded, (initial_hidden, initial_cell))

        # Flatten the final hidden state to (batch_size, hidden_dim) before classifying.
        logits = self.classifier(last_hidden.reshape(-1, self.hidden_dim))
        return logits.view(batch_size, self.output_dim)

# Training

This part is similar to the one discussed in the lecture; the difference is that we also track the F1 score metric.

In [None]:
# Training hyper-parameters.
# Many epochs are not needed: after 2-3 epochs the model stops learning anything new.
EPOCHS = 3
BS = 128             # training batch size; validation uses 4 * BS
EMBEDDING_DIM = 300  # length of one word vector; presumably matches the 300d GloVe file loaded above

In [None]:
# this function returns arg max of predictions.
def get_predictions_from_prob(y_pred):
    """Turn two-class scores into hard 0/1 labels.

    Args:
        y_pred: tensor (or nested sequence) with one row per example, where
            column 0 is the score of class 0 and column 1 the score of class 1.

    Returns:
        A list of Python ints, one per row: 1 when the class-1 score is
        strictly greater than the class-0 score, otherwise 0 (ties give 0).
    """
    scores = torch.as_tensor(y_pred)
    if scores.numel() == 0:
        return []

    # One vectorised comparison instead of a Python-level comparison per row,
    # which would force a separate host/device round trip for every example.
    return (scores[:, 1] > scores[:, 0]).long().tolist()

In [None]:
def compute_f1_and_perplexity(model, val_df):
  """Evaluate `model` on `val_df` and return (F1 score, perplexity).

  Args:
    model: callable mapping an array of question texts to per-class scores.
    val_df: DataFrame with 'question_text' and 'target' columns.

  Returns:
    Tuple (f1, perplexity): the F1 score over all rows of val_df and
    exp(mean per-example cross-entropy).

  Raises:
    ValueError: if val_df is empty.

  The model is switched to eval mode for the evaluation and put back into
  train mode before returning.
  """
  if len(val_df) == 0:
    raise ValueError('val_df is empty: cannot compute F1 score and perplexity')

  model.eval()

  val_BS = 4 * BS
  loss_sum = 0.0
  targets = []
  predictions = []
  try:
    with torch.no_grad(): # no gradients are needed here, so intermediate values are not stored for a backward pass.
      for x, y in batch_iterator(val_df, val_BS):
        y_pred = model(x)
        # Sum (not mean) per batch so that a shorter last batch is weighted by its real size.
        loss_sum += torch.nn.functional.cross_entropy(y_pred, y, reduction='sum').item()
        targets += y.cpu().tolist()
        predictions += get_predictions_from_prob(y_pred)
  finally:
    model.train()

  # F1 of a set is not the average of per-batch F1 values, so the predictions
  # are pooled and scored once.
  score = f1_score(targets, predictions, zero_division=0)
  return score, np.exp(loss_sum / len(targets))

In [None]:
def get_lr(optimizer):
  """Return the learning rate of the optimizer's first parameter group (None if it has no groups)."""
  first_group = next(iter(optimizer.param_groups), None)
  if first_group is None:
    return None
  return first_group['lr']

In [None]:
def train_loop(model, train_df, val_df):
  """Train `model` on `train_df` for EPOCHS epochs, validating on `val_df`.

  Every LOG_EVERY iterations the average training loss is printed; every
  EVAL_EVERY iterations the validation F1 score and perplexity are computed,
  the learning-rate scheduler is stepped on the perplexity, and the weights
  are checkpointed when the perplexity improves. When training ends, the
  best checkpoint (if one was written) is loaded back into `model`, so the
  caller ends up with the best weights rather than whatever the last step
  produced.

  Args:
    model: the network to train (modified in place).
    train_df: DataFrame with 'question_text' and 'target' columns.
    val_df: DataFrame with the same columns, used for validation.
  """
  LOG_EVERY = 1000           # iterations between log lines
  EVAL_EVERY = 2000          # iterations between validations; must be a multiple of LOG_EVERY
  CHECKPOINT_PATH = 'model'  # file that receives the best weights

  model.train()

  # we add weight decay (L2 regularization) to avoid overfitting.
  optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-4)

  # the learning rate is multiplied by `factor` when the validation perplexity stops improving.
  lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, min_lr=1e-6, patience=10)

  crit = nn.CrossEntropyLoss(reduction='mean')

  it = 1
  total_loss = 0
  curr_perplexity = None
  best_perplexity = None
  # Named val_f1 (not f1_score) so the imported sklearn function is not shadowed.
  val_f1 = None

  for epoch in range(EPOCHS):
    for x, y in batch_iterator(train_df, BS):

      optimizer.zero_grad()

      # forward pass; intermediate results are kept for backprop.
      y_pred = model(x)

      loss = crit(y_pred, y)
      total_loss += loss.item()

      # backprop followed by one optimizer step.
      loss.backward()
      optimizer.step()

      if it % LOG_EVERY == 0:

        if it % EVAL_EVERY == 0:
          val_f1, curr_perplexity = compute_f1_and_perplexity(model, val_df)

          lr_scheduler.step(curr_perplexity)

          # checkpoint the weights whenever the validation perplexity improves.
          if best_perplexity is None or curr_perplexity < best_perplexity:
            torch.save(model.state_dict(), CHECKPOINT_PATH)
            best_perplexity = curr_perplexity

        print('Epoch', epoch + 1, '| Iter', it, '| Avg Train Loss', total_loss / LOG_EVERY, '| F1 score', val_f1, '| Dev Perplexity', curr_perplexity, '| LR ', get_lr(optimizer))
        total_loss = 0

      it += 1

  # Restore the best checkpoint; skipped when training was too short to reach a validation step.
  if best_perplexity is not None:
    model.load_state_dict(torch.load(CHECKPOINT_PATH))
    
# Build the classifier (100 hidden units, 2 output classes, 10% input dropout), move it to the GPU and train it.
model = LSTMModel(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=100,
    output_dim=2,
    dropout=0.1,
).cuda()
train_loop(model, train_df, val_df)

# Testing

In [None]:
# Read the test questions from test.csv and preview the first rows.
test_df = pd.read_csv("/kaggle/input/quora-insincere-questions-classification/test.csv")
test_df.head()

In [None]:
# Predicts a 0/1 target for every question in question_texts.
def get_predictions(model, question_texts, batch_size):
    """Run `model` over `question_texts` in chunks of `batch_size` and return all predicted labels.

    The model is put into eval mode and gradients are disabled for the whole
    pass. The result is a flat list with one 0/1 label per input text, in
    input order.
    """
    model.eval()
    predicted = []
    chunk_starts = range(0, len(question_texts), batch_size)
    with torch.no_grad():
        for start in chunk_starts:
            chunk = question_texts[start : start + batch_size]
            predicted.extend(get_predictions_from_prob(model(chunk)))
    return predicted

# Predict a label for every test question, 1000 questions per forward pass.
predicitons = get_predictions(model, test_df['question_text'].values, 1000)

In [None]:
# Write the predicted labels into a new 'prediction' column (this modifies test_df in place).
test_df['prediction'] = predicitons

In [None]:
# Preview the two columns that are written to the submission file.
test_df[['qid', 'prediction']]

In [None]:
# Save the results to submission.csv (without the index column); this is the file that should be committed.
test_df[['qid', 'prediction']].to_csv('submission.csv', index=False)

# Evaluation

Evaluate our model on some examples.

In [None]:
def evaluate_some_examples(model, texts, realTarget):
    """Print the model's prediction for each text next to its real target.

    Args:
        model: trained model passed on to get_predictions.
        texts: list of question strings that all share the same real target.
        realTarget: the true label of every text (0 = sincere, otherwise insincere).
    """
    print('{} questions evaluation'.format('Sincere' if realTarget == 0 else 'Insincere'))

    # All texts go through the model as a single batch. An empty list is handled here
    # because a batch size of 0 would make range() inside get_predictions raise.
    predictions = get_predictions(model, texts, len(texts)) if len(texts) > 0 else []

    for text, prediction in zip(texts, predictions):
        print('For text "{}" Model predicts {} and real target is {}'.format(text, prediction, realTarget))
    print('\n')

In [None]:
# Hand-picked examples for a quick sanity check of the trained model.
# The sincere texts are real Quora questions.
sincere_text_examples = ['Why are there crushed stones alongside rail tracks?',
                        'What are some of the most inspirational photos ever taken?',
                        'What is the greatest single image in movie history?',
                        'What are the best proggraming blogs?',
                        'What is the most horrific picture you have ever seen?']

# Examples that are expected to be classified as insincere (target 1).
insincere_text_examples = ['Why Jews Did not Leave Europe?',
                          'What are some of the false things the U.S. government claimed about the conflict in the Ukraine?',
                          'Why are women shameless?']



# Print the prediction for every example next to its expected target.
evaluate_some_examples(model, sincere_text_examples, 0)
evaluate_some_examples(model, insincere_text_examples, 1)