In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



In [None]:
from zipfile import ZipFile 
  
with ZipFile('/kaggle/input/quora-insincere-questions-classification/embeddings.zip', 'r') as embd_zip: 
    print(embd_zip.namelist())


**START WORKING ON DATA**

In [None]:
def DEBUG_DICTIONARY(dct, limit=10):
    """Print the first `limit` key/value pairs of a dictionary.

    Pairs are printed in the dictionary's iteration order, one per line,
    as ``key value``.

    Args:
        dct: mapping to inspect.
        limit: maximum number of entries to print.
    """
    for i, (key, value) in enumerate(dct.items()):
        # `>=` (not `>`) so that exactly `limit` entries are shown
        if i >= limit: break
        print(key, value)

In [None]:
from sklearn.model_selection import train_test_split

# configure train and validation data
train_data, val_data = train_test_split(pd.read_csv('/kaggle/input/quora-insincere-questions-classification/train.csv'), test_size=0.2, random_state=42)
sentences, targets = train_data['question_text'], train_data['target']
val_sentences, val_targets = val_data['question_text'], val_data['target']

In [None]:
train_data.head()

In [None]:
targets.value_counts(), val_targets.value_counts()

In [None]:
# for each word - counts how many times it occurs totally in sentences 
def configure_sentences(sentences, lower = True):
    """Build a word -> occurrence-count dictionary over all sentences.

    Each sentence is split on whitespace; when `lower` is true every token
    is lower-cased before it is counted.
    """
    counts = {}
    for text in sentences:
        tokens = text.split()
        if lower:
            tokens = [token.lower() for token in tokens]
        for token in tokens:
            if token in counts:
                counts[token] += 1
            else:
                counts[token] = 1
    return counts

words = configure_sentences(sentences)
# look to frequencies of words in sentences
DEBUG_DICTIONARY(words)

In [None]:
# find out what kind of words are frequently used
DEBUG_DICTIONARY({word: cnt for word, cnt in sorted(words.items(), key=lambda item: item[1], reverse=True)})

In [None]:
# indexing words - in case needed
def configure_words(words):
    """Map every key of `words` to its position in the dictionary's iteration order."""
    return {token: position for position, token in enumerate(words.keys())}

vocabulary = configure_words(words)
DEBUG_DICTIONARY(vocabulary)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# returns min, avrg and max sentence length - and also displays plot(histogram) for length distribution 
def configure_sentence_statistic(sentences):
    """Plot the sentence-length histogram and return (min, rounded mean, max) length.

    Length is the number of whitespace-separated tokens in a sentence.

    Args:
        sentences: object exposing ``apply`` (e.g. a pandas Series of strings).

    Returns:
        Tuple ``(minimum, rounded average, maximum)`` of the sentence lengths.
    """
    def sentence_len(s):
        return len(s.split())

    # compute the lengths once and reuse them for the plot and all three statistics
    lengths = sentences.apply(sentence_len)
    lengths.plot(title='Sentence Length Distribution',y='Length Frequency',kind='hist', colormap='autumn', logy=True);
    return np.min(lengths), np.round(np.mean(lengths)), np.max(lengths)

# Use dedicated names: binding the results to `min` / `max` would replace the
# Python builtins for every cell executed afterwards in this kernel.
min_len, avg_len, max_len = configure_sentence_statistic(sentences)

print('minimum sentence length {} - average sentence length {} - maximum sentence length {}'.format(min_len, avg_len, max_len))

In [None]:
HIDDEN_SIZE = 30

The average sentence length is 13 words, but a cut-off length of about 30-35 should work better, because lengths of 30-35 sit in the middle of the observed range. (Note that `HIDDEN_SIZE` is used below both as the number of words kept per sentence and as the LSTM hidden dimension.)
If we used 13, most sentences would be truncated and 70-80% of their content would be lost. The truncated parts may also matter for the final target, but I think there is no need to filter such sentences out: the first 30 words should carry enough content to predict the target. If they do not, some almost identical (truncated) sentences will end up with different targets, and I think the model will still learn in the right direction.

In [None]:
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

# display how the targets are distributed
def configure_target_statistic(targets):
    """Show a pie chart of the target percentages and return the raw value counts."""
    trg_cnt = targets.value_counts()

    # slice labels are the distinct target values; slice sizes are their share in percent
    slice_labels = np.array(trg_cnt.index)
    slice_sizes = np.array(100*(trg_cnt/trg_cnt.sum()))

    pie = go.Pie(labels=slice_labels, values=slice_sizes)
    layout = go.Layout(title='Target Distribution',font=dict(size=15),width=500, height=500)
    py.iplot(go.Figure(data=[pie], layout=layout))
    return trg_cnt

configure_target_statistic(targets)

It seems that only 6.2% of the targets are 1, so the classes are heavily imbalanced and we need to be careful that the model does not go overboard and overfit ^^

In [None]:
# filters data according to given parameters
def filter_and_display_data(sentences, targets, target=0, min_len=5, max_len=30, limit=3):
    """Print up to `limit` sentences that have the given target and a length in range.

    Args:
        sentences: iterable of sentence strings.
        targets: iterable of labels aligned position-by-position with `sentences`.
        target: label value to select (0 is reported as GOOD, anything else as BAD).
        min_len: minimum sentence length in whitespace-separated words (inclusive).
        max_len: maximum sentence length in whitespace-separated words (inclusive).
        limit: maximum number of sentences to display.

    Returns:
        None. The result is only printed.
    """
    result = []
    # zip pairs sentences and labels by position, so it does not matter whether
    # `targets` is a list, an array or a pandas Series with a shuffled index
    for sentence, label in zip(sentences, targets):
        # split() (any whitespace) matches how sentence length is measured elsewhere in this notebook
        sent_len = len(sentence.split())
        if min_len <= sent_len <= max_len and label == target:
            result.append(sentence)
            if len(result) >= limit: break

    if not result:
        print('no such sequencies found.')
        return

    # report how many sentences were actually found, which may be fewer than `limit`
    print('{} {} sentences with length between {}-{}:\n'.format(len(result), 'GOOD' if target == 0 else 'BAD', min_len, max_len))
    for i, s in enumerate(result):
        print(str(i+1)+")",s)


Let's see some examples of our data.

In [None]:
filter_and_display_data(sentences, np.asarray(targets, dtype='int'), target=0)

In [None]:
filter_and_display_data(sentences, np.asarray(targets, dtype='int'), target=1)

For lengths <= 30, our data seems normal.

In [None]:
filter_and_display_data(sentences, np.asarray(targets, dtype='int'), target=0, min_len=120, max_len=140)

In [None]:
filter_and_display_data(sentences, np.asarray(targets, dtype='int'), target=1, min_len=120, max_len=140)

For lengths of 120-140, our data seems rather hard - the target is difficult to determine even for a human - and for target = 1 we don't have any examples at all. It is bad for the training data not to cover all the basic kinds of examples, but never mind.

In [None]:
# reads and returns dictionary - key: word; value: word's embedding vector (vec. length=300)
def confnigure_embeddings(embd_path, zip_path='/kaggle/input/quora-insincere-questions-classification/embeddings.zip'):
    """Read a text embedding file stored inside a zip archive.

    Every non-empty line is expected to look like ``word v1 v2 ... vN``,
    separated by single spaces.

    Args:
        embd_path: path of the embedding text file inside the archive.
        zip_path: path of the zip archive; defaults to the competition's
            embeddings archive.

    Returns:
        dict mapping each word to its embedding as a float32 numpy array.
    """
    word2vecs = {}
    with ZipFile(zip_path) as embd_zip:
        # open the member in a `with` block so its handle is closed deterministically
        with embd_zip.open(embd_path, 'r') as embd_file:
            for raw_line in embd_file:
                # strip the line ending so the last component is a clean number
                parts = raw_line.decode('utf-8').rstrip().split(' ')
                if len(parts) < 2:
                    continue  # blank line or a word without any vector values
                word2vecs[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return word2vecs
            
word2vecs = confnigure_embeddings('glove.840B.300d/glove.840B.300d.txt')
DEBUG_DICTIONARY(word2vecs, limit=1)

In [None]:
# in each sentence replaces words with its own embedding vectors 
def configure_word2vecs(sentences, word2vecs, max_len=None, emb_dim=300):
    """Replace the words of every sentence with their embedding vectors.

    Each sentence is split on whitespace, every word is lower-cased and looked
    up in `word2vecs`; unknown words become a zero vector. Sentences are
    truncated / zero-padded at the end to exactly `max_len` vectors.

    Args:
        sentences: iterable of sentence strings.
        word2vecs: dict mapping a word to its embedding vector.
        max_len: number of vectors per sentence; defaults to the notebook-level
            HIDDEN_SIZE when omitted.
        emb_dim: length of one embedding vector (used for the zero vector).

    Returns:
        list with one entry per sentence; each entry is a list of `max_len`
        numpy vectors.
    """
    if max_len is None:
        max_len = HIDDEN_SIZE

    # one shared float32 zero vector serves both unknown words and padding;
    # it is never mutated, so sharing the object is safe
    pad = np.zeros(emb_dim, dtype='float32')

    def configure_sentence(sentence):
        vectors = [word2vecs.get(word.lower(), pad) for word in sentence.split()[:max_len]]
        return vectors + [pad] * (max_len - len(vectors))

    return [configure_sentence(sentence) for sentence in sentences]

# embedding_sentences = configure_word2vecs(sentences, word2vecs)
# print(embedding_sentences[:10])

**START WORKING ON MODEL**

In [None]:
import torch
import torch.nn as nn

import seaborn as sns
import numpy as np
import pandas as pd

In [None]:
BATCH_SIZE = 256
BATCHES = (len(sentences)+BATCH_SIZE-1)//BATCH_SIZE

EPOCHS = 2 # gpu :(
EMBD_SIZE = 300

In [None]:
gpu = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpu, torch.cuda.is_available()

In [None]:
# A long short-term memory (LSTM) network seems the best fit for this case: we have 1M+ training questions and embedding vectors of length 300,
# so converting everything to tensors at once would need more than 16 GB of RAM and far more resources to train on.
# A linear layer on top should be good enough because there are no complicated dependencies - if a sentence contains a 'bad' word, its target is most likely 1.
# A dropout layer would also be good, but since we use only one LSTM layer (because of the CPU) we do not add it.

class LSTM(nn.Module):
    """LSTM encoder followed by a linear layer that emits one logit per sentence.

    Args:
        input_dim: unused; kept so existing calls keep working.
        emb_dim: size of one input embedding vector.
        hid_dim: size of the LSTM hidden state.
        n_layers: number of stacked LSTM layers.
        output_dim: number of output logits per sentence.
        dropout: dropout probability between stacked LSTM layers; only applied
            when n_layers > 1.

    NOTE(review): the inputs built by configure_word2vecs are zero-padded at the
    end, so the final hidden state is taken after the padding steps. Consider
    packed sequences if this hurts quality.
    """
    def __init__(self, input_dim=1, emb_dim=EMBD_SIZE, hid_dim=HIDDEN_SIZE, n_layers=1, output_dim=1, dropout=0.3):
        super().__init__()
        self.hid_dim, self.n_layers = hid_dim, n_layers

        # nn.LSTM applies its dropout only between stacked layers, so it is
        # passed through only when there is more than one layer
        self.lstm = nn.LSTM(emb_dim, hid_dim, n_layers, batch_first=True,
                            dropout=dropout if n_layers > 1 else 0.0)
        self.linear = nn.Linear(hid_dim, output_dim)

    def forward(self, src):
        outputs, (hidden, cell) = self.lstm(src)
        # `hidden` stacks the final state of every layer along dim 0; classify
        # from the last layer only so the output keeps one row per sentence
        # even when n_layers > 1
        return self.linear(hidden[-1].reshape(-1, self.hid_dim))


In [None]:
# craete model - with lstm and linear layers
model = LSTM().to(gpu)

# init loss function
loss_function = nn.BCEWithLogitsLoss().to(gpu) #nn.MSELoss()

# init optimizer with learning rate 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
model

In [None]:
# evaluates and returns accuracy for predicted Y by model 
def acc_function(y_pred, y_test):
    """Accuracy in percent (rounded to a whole number) for raw model outputs.

    Args:
        y_pred: tensor of logits; passed through a sigmoid and rounded to 0/1.
        y_test: tensor of 0/1 labels with the same number of elements as
            `y_pred`, on the same device.

    Returns:
        0-d tensor holding the rounded accuracy percentage.
    """
    y_hat = torch.round(torch.sigmoid(y_pred))
    # align shapes explicitly: comparing (N, 1) with (N,) would broadcast to (N, N)
    y_true = y_test.reshape(y_hat.shape)
    correct = (y_hat == y_true).sum().float()
    return torch.round(100*(correct/y_hat.shape[0]))

# generates and returns idx-th batch as torch tensor according to given data(sentences and targets)
def get_batch(sentences, targets, idx):
    """Build the idx-th batch as a pair of float tensors (embedded sentences, labels) on `gpu`."""
    start = BATCH_SIZE*idx
    stop = BATCH_SIZE*(idx+1)

    embedded = configure_word2vecs(sentences[start:stop], word2vecs)
    labels = np.asarray(targets[start:stop], dtype='bool')

    batch_x = torch.FloatTensor(embedded).to(gpu)
    batch_y = torch.FloatTensor(labels).to(gpu)
    return batch_x, batch_y

# evaluates and returns f1 score for predicted Y by model 
def f1_score(y_pred, y_test):
    """F1 score of the positive class for raw model outputs.

    The predictions are binarized with sigmoid + round, the same way
    acc_function does, so logits can be passed directly. Predictions that are
    already exactly 0/1 give the same result.

    Args:
        y_pred: tensor of logits (or exact 0/1 predictions).
        y_test: tensor of 0/1 labels with the same number of elements.

    Returns:
        0-d float tensor with the F1 score.
    """
    y_hat = torch.round(torch.sigmoid(y_pred))
    # align shape and dtype so the products below are element-wise
    y_true = y_test.reshape(y_hat.shape).to(y_hat.dtype)

    tp = (y_true * y_hat).sum().to(torch.float32)
    fp = ((1 - y_true) * y_hat).sum().to(torch.float32)
    fn = (y_true * (1 - y_hat)).sum().to(torch.float32)

    epsilon = 1e-7 # avoids division by zero when there are no positives
    precision, recall = tp / (tp + fp + epsilon), tp / (tp + fn + epsilon)

    return 2*(precision*recall)/(precision + recall + epsilon)
    

**START TRAINING OF MODEL**

In [None]:
# ready for training
model.train()

VALIDATION_BATCHES = 10
# Build a fixed validation tensor used to monitor accuracy during training.
# Only the first VALIDATION_BATCHES * BATCH_SIZE rows are used because the whole validation set is too big.
val_sents = configure_word2vecs(val_sentences[:VALIDATION_BATCHES*BATCH_SIZE], word2vecs)
val_targs = np.asarray(val_targets[:VALIDATION_BATCHES*BATCH_SIZE], dtype='bool')

val_batch = torch.FloatTensor(val_sents).to(gpu)
val_target = torch.FloatTensor(val_targs).to(gpu)
# inspect the tensors that were just built (val_target), not the full pandas Series (val_targets)
print(type(val_batch), val_batch.shape, type(val_target), val_target.shape)

In [None]:
BATCHES, BATCH_SIZE, get_batch(sentences, targets, 0)[0].shape, get_batch(sentences, targets, 0)[1].shape

In [None]:
# training
for e in range(EPOCHS):
    # running sums of the per-batch training loss and validation accuracy for this epoch
    epoch_loss, epoch_acc = 0, 0
    for b in range(BATCHES):
        # get current batch from data
        X_batch, y_batch = get_batch(sentences, targets, b)

        # reset the gradients so they do not accumulate across batches
        optimizer.zero_grad()

        # predict targets for the current batch and compare them to the real targets
        y_pred = model(X_batch)
        loss = loss_function(y_pred, y_batch.unsqueeze(1))

        # The validation pass is for monitoring only: run it without autograd so no
        # graph is built for the validation tensor on every training step.
        with torch.no_grad():
            val_pred = model(val_batch)
            acc = acc_function(val_pred, val_target.unsqueeze(1))

        # backpropagate the training loss; gradients are stored on the parameters
        loss.backward()

        # update the model parameters
        optimizer.step()

        # accumulate batch loss and accuracy for the epoch averages
        epoch_loss += loss.item()
        epoch_acc += acc.item()

        if b == 0 or (b+1) % 100 == 0:
            print(f'Epoch {(e+1)+0:03} | Batch {(b+1)+0:04}: | Loss: {epoch_loss/(b+1):.5f} | Acc: {epoch_acc/(b+1):.3f} | F1: {f1_score(val_pred, val_target.unsqueeze(1)):.3f}')

    print(f'Epoch {(e+1)+0:03}: | Epoch Loss: {epoch_loss/BATCHES:.5f} | Epoch Acc: {epoch_acc/BATCHES:.3f}')

**START WORKING ON TEST DATA**

In [None]:
# init test data
# NOTE: `sentences` and `targets` are rebound here from the training split to the test set.
# `targets` starts as an empty list and is filled with predictions by the prediction cell below.
# Re-running the training cells after this one would therefore read the test data instead.
test_data = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/test.csv')
sentences, targets = test_data['question_text'], []
# number of batches needed to cover the test set (ceiling division)
TEST_BATCHES = (len(sentences)+BATCH_SIZE-1)//BATCH_SIZE

In [None]:
test_data.head()

In [None]:
# Use dedicated names: binding the results to `min` / `max` would replace the
# Python builtins for every cell executed afterwards in this kernel.
test_min_len, test_avg_len, test_max_len = configure_sentence_statistic(sentences)

print('minimum sentence length {} - average sentence length {} - maximum sentence length {}'.format(test_min_len, test_avg_len, test_max_len))

In [None]:
len(sentences), len(targets), TEST_BATCHES

**PREDICT TEST DATA ACCORDING TO OUR MODEL**

In [None]:
model.eval()
# start from an empty list so that re-running this cell does not append duplicate predictions
targets = []
with torch.no_grad():
    for b in range(TEST_BATCHES):
        # get current batch
        X_batch = torch.FloatTensor(configure_word2vecs(sentences[BATCH_SIZE*b:BATCH_SIZE*(b+1)], word2vecs)).to(gpu)

        # Predict the batch with the trained model.
        # reshape(-1) always yields a 1-d array; squeeze() would return a 0-d array
        # for a batch holding a single sentence, which list.extend cannot iterate.
        trg = torch.round(torch.sigmoid(model(X_batch))).cpu().numpy().reshape(-1)
        targets.extend(trg)

        if b == 0 or (b+1) % 100 == 0: print(f'Batch {(b+1)+0:04} predicted')

In [None]:
# save data to submit
# `np.int` was only an alias of the builtin `int` and was removed in NumPy 1.24, so use `int` directly
test_targets = (np.array(targets) >= 0.5).astype(int)

submit = pd.DataFrame({"qid": test_data['qid'], "prediction": test_targets})
submit.to_csv("submission.csv", index=False)

In [None]:
# display results
submit.head()

The share of each class in our predictions (see the chart below) looks almost the same as in the training data, which seems good ^^

Let's see some examples of our predictions.

In [None]:
configure_target_statistic(submit['prediction'])

In [None]:
filter_and_display_data(sentences, np.asarray(targets, dtype='int'), target=0)

In [None]:
filter_and_display_data(sentences, np.asarray(targets, dtype='int'), target=1)

At first glance, our model seems to be working well.

In [None]:
# long sentences predicted as target 0; the next cell shows the target 1 counterpart
filter_and_display_data(sentences, np.asarray(targets, dtype='int'), target=0, min_len=100, max_len=150)

In [None]:
filter_and_display_data(sentences, np.asarray(targets, dtype='int'), target=1, min_len=100, max_len=150)

If there are no long sequences in the test set, that is just fine.