In [None]:
import pickle
import re
import pandas as pd
import numpy as np

## Torch imports
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## Sklearn imports
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, log_loss

## NLP Libraries
import spacy
from sklearn.model_selection import train_test_split
from nltk import download
import gensim
from nltk.corpus import stopwords
# Load spaCy's 'en' pipeline and fetch the NLTK stop-word corpus that
# clean_text() reads through stopwords.words("english").
# NOTE(review): spacy_en is not referenced anywhere else in the visible notebook — confirm it is still needed.
spacy_en = spacy.load('en')
download('stopwords')

In [None]:
# Read the labelled training comments, report how many rows were loaded
# and display the last few of them.
train = pd.read_csv('train.csv', sep=",")
print("Train size: {}".format(train.shape[0]))
train.tail()

In [None]:
# Read the unlabelled test comments, report how many rows were loaded
# and display the last few of them.
test = pd.read_csv('test.csv', sep=",")
print("Test size: {}".format(test.shape[0]))
test.tail()

In [None]:
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def clean_text(text, do_stop=False):
    """Normalise a raw comment into a lower-case, space-separated token string.

    Steps, in order: cast to ``str``, drop digits, replace non-ASCII
    characters with a space, turn line breaks into spaces, delete the
    symbols ``= : # % & "``, collapse whitespace, lower-case, and
    optionally remove English stop words.

    Parameters
    ----------
    text : object
        The raw comment; anything accepted by ``str()``.
    do_stop : bool, default False
        When true, tokens found in NLTK's English stop-word list are removed.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (may be empty).
    """
    text = str(text)
    text = gensim.parsing.preprocessing.strip_numeric(text)  # Strip all the numerics
    text = re.sub(r'[^\x00-\x7f]', r' ', text)  # Replace non-ASCII chars with a space
    # Line breaks become spaces: deleting them outright would glue the last
    # word of one line onto the first word of the next.
    text = text.replace("\n", " ")
    for symbol in ("=", ":", "#", "%", "&", '"'):
        text = text.replace(symbol, "")
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)  # Strip multiple whitespaces

    words = text.lower().split()
    if do_stop:
        # The stop-word set is cached so it is not re-read for every comment.
        stops = _english_stopwords()
        words = [word for word in words if word not in stops]
    return " ".join(words)

In [None]:
# Add a 'cleaned_comment' column holding the stop-word-filtered version of
# every training comment, then display the last rows.
train['cleaned_comment'] = train['comment_text'].map(
    lambda comment: clean_text(comment, do_stop=True)
)
train.tail()

In [None]:
# Add a 'cleaned_comment' column holding the stop-word-filtered version of
# every test comment, then display the first rows.
test['cleaned_comment'] = test['comment_text'].map(
    lambda comment: clean_text(comment, do_stop=True)
)
test.head()

In [None]:
# Hold out 20% of the training comments for validation. Each target is a
# 6-tuple of the label columns, in the order listed in LABEL_COLUMNS.
# A fixed random_state makes the partition (and therefore every validation
# number reported later) reproducible across kernel restarts.
SPLIT_SEED = 42
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
x_train, x_valid, y_train, y_valid = train_test_split(
    train['cleaned_comment'],
    list(zip(*(train[column] for column in LABEL_COLUMNS))),
    test_size=0.2,
    random_state=SPLIT_SEED,
)

In [None]:
# Copy the cleaned test comments into a NumPy array; the bare name on the
# last line displays it as the cell output.
x_test = np.array(test['cleaned_comment'])
x_test

In [None]:
# Pair every training comment with its label tuple.
train_data=list(zip(x_train,y_train))
# Displays four pairs: the slice starts five from the end and stops before the last one.
train_data[-5:-1]

In [None]:
# Pair every validation comment with its label tuple.
valid_data=list(zip(x_valid,y_valid))
# Displays four pairs: the slice starts five from the end and stops before the last one.
valid_data[-5:-1]

In [None]:
## Build Vocabulary
word_to_ix = {}
for (sent) in list(x_train) + list(x_valid)+list(x_test):
    for word in sent.split():
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

In [None]:
# Number of distinct tokens collected in word_to_ix.
VOCAB_SIZE = len(word_to_ix)
# One output per label column used when building the targets (six columns).
NUM_LABELS = 6
VOCAB_SIZE,NUM_LABELS

## Loading word vectors

In [None]:
from gensim.models import KeyedVectors

In [None]:
# Shell escape: list the contents of the directory the word vectors are loaded from.
!ls ../../vectors/

In [None]:
# Load the pre-trained vectors from the word2vec-format file (binary=True selects the binary layout).
w2v = KeyedVectors.load_word2vec_format('../../vectors/GoogleNews-vectors-negative300.bin', binary = True)

In [None]:
# Start every vocabulary row from N(0, 1/sqrt(W2V_DIM)) noise, stored as
# float32; rows for words that have a pre-trained vector are overwritten later.
W2V_DIM = 300
sd = 1 / np.sqrt(W2V_DIM)  ## standard deviation to use
weights = np.random.normal(loc=0, scale=sd, size=(VOCAB_SIZE, W2V_DIM)).astype(np.float32)

In [None]:
# Overwrite the random row of every vocabulary word that has a pre-trained
# vector. Words missing from w2v keep the random initialisation that
# `weights` was created with, so no fallback assignment is needed.
#
# Membership is tested explicitly instead of wrapping the lookup in a bare
# `except`: a bare except also hides unrelated failures (for example an
# AttributeError from an API that no longer exists), which would silently
# leave the whole matrix random.
for word, row in word_to_ix.items():
    if word in w2v:
        weights[row] = w2v[word]

In [None]:
# Sanity check: show the vocabulary index assigned to the token 'sky'.
word_to_ix['sky']

In [None]:
# First 50 components of the pre-trained vector for "delete".
# The KeyedVectors object is indexed directly rather than through the
# `.wv` alias, which newer gensim releases no longer provide.
w2v["delete"][0:50]

In [None]:
# Despite its name, `word` holds the integer vocabulary index of 'delete';
# the following cell uses it to index the rows of `weights`.
word=word_to_ix['delete']
word

In [None]:
# First 50 components of the embedding-matrix row selected by the index stored in `word`.
weights[word][0:50]

## Defining model

In [None]:
class GruClassifierW2vec(nn.Module):
    """GRU sentence classifier on top of pre-trained word embeddings.

    The model processes one sentence per call (batch size 1): token ids are
    embedded, fed through a (possibly multi-layer) GRU, and the GRU output
    at the last time step is projected to ``label_size`` scores.

    The returned scores are raw logits. They are meant to be passed to a
    loss that applies its own sigmoid (such as ``nn.BCEWithLogitsLoss``), so
    no softmax or sigmoid is applied here.

    Attributes
    ----------
    hidden : torch.Tensor
        GRU state of shape ``(num_layers, 1, hidden_dim)``. ``forward`` uses
        it as the initial state and replaces it with the (detached) final
        state, so callers reset it with ``model.hidden = model.init_hidden()``
        before each independent sentence.
    """

    def __init__(self, embedding_dim, hidden_dim, num_layers, vocab_size, label_size, pre_trained_weights, dropout):
        """Build the layers and load the pre-trained embedding matrix.

        Parameters
        ----------
        embedding_dim : int
            Width of one word vector.
        hidden_dim : int
            Width of the GRU hidden state.
        num_layers : int
            Number of stacked GRU layers.
        vocab_size : int
            Number of rows in the embedding table.
        label_size : int
            Number of output scores.
        pre_trained_weights : array-like
            Initial embedding matrix of shape ``(vocab_size, embedding_dim)``.
        dropout : float
            Dropout probability between GRU layers.
        """
        super(GruClassifierW2vec, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # copy_ fills the existing parameter in place, so the tensor the
        # optimizer sees is unchanged and a wrongly shaped matrix is rejected.
        self.word_embeddings.weight.data.copy_(
            torch.as_tensor(pre_trained_weights, dtype=torch.float32)
        )
        self.gru = nn.GRU(input_size=embedding_dim,
                          hidden_size=hidden_dim,
                          num_layers=num_layers,
                          dropout=dropout)
        self.hidden2label = nn.Linear(hidden_dim, label_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return an all-zero GRU state on the same device as the parameters."""
        device = next(self.parameters()).device
        return torch.zeros(self.num_layers, 1, self.hidden_dim, device=device)

    def forward(self, sentence):
        """Score one sentence.

        Parameters
        ----------
        sentence : torch.LongTensor
            1-D tensor of token ids.

        Returns
        -------
        torch.Tensor
            Raw logits of shape ``(1, label_size)``.
        """
        # The stored state may predate a later model.cuda()/model.cpu() call.
        hidden = self.hidden.to(self.word_embeddings.weight.device)
        if sentence.numel() == 0:
            # No tokens: the final state equals the initial state.
            top = hidden[-1]
        else:
            embeds = self.word_embeddings(sentence)
            x = embeds.view(len(sentence), 1, -1)
            # nn.GRU already stacks num_layers layers internally, so a single
            # call runs the whole stack over the sequence.
            gru_out, hidden = self.gru(x, hidden)
            top = gru_out[-1]
        # Detach so the stored state does not keep earlier graphs alive.
        self.hidden = hidden.detach()
        return self.hidden2label(top)

In [None]:
def make_context_vector(seq, to_ix):
    """Convert a whitespace-separated sentence into a tensor of token ids.

    Parameters
    ----------
    seq : str
        Sentence whose tokens are separated by whitespace.
    to_ix : mapping
        Token -> integer id lookup; a token missing from it raises ``KeyError``.

    Returns
    -------
    torch.LongTensor
        1-D tensor with one id per token, in sentence order.
    """
    token_ids = []
    for token in seq.split():
        token_ids.append(to_ix[token])
    return torch.LongTensor(token_ids)

In [None]:
def make_target(label):
    """Turn a sequence of label values into a float tensor with a single row.

    Parameters
    ----------
    label : sequence of numbers
        The label values of one sample.

    Returns
    -------
    torch.FloatTensor
        Tensor of shape ``(1, number_of_values)``.
    """
    values = torch.FloatTensor(label)
    return values.view(1, -1)

In [None]:
# Hyper-parameters handed to GruClassifierW2vec when the model is built.
W2V_DIM = 300     # embedding width (same value the weight matrix was created with)
HIDDEN_DIM = 80   # GRU hidden-state width
NUM_LAYERS = 2    # number of stacked GRU layers
DROPOUT = 0.3     # dropout probability passed to nn.GRU

In [None]:
# Instantiate the classifier with the hyper-parameters defined above and
# the embedding matrix `weights` as its initial word embeddings.
model = GruClassifierW2vec(embedding_dim=W2V_DIM,
                            hidden_dim=HIDDEN_DIM,
                            num_layers=NUM_LAYERS,
                            vocab_size=VOCAB_SIZE,
                            label_size=NUM_LABELS,
                            pre_trained_weights = weights,
                            dropout = DROPOUT)

In [None]:
# Move the model to the GPU; as the cell's last expression, the returned module is displayed.
model.cuda()

In [None]:
# Text of the first training pair, used below as a smoke-test input.
msg=train_data[0][0]
msg

In [None]:
# Encode the sample text as a tensor of token ids and move it to the GPU.
samp_1=Variable(make_context_vector(msg,word_to_ix)).cuda()
samp_1

In [None]:
# Smoke test: run one forward pass on the encoded sample and display the output.
# forward() reads and overwrites model.hidden, so this call leaves a
# non-initial state behind for whichever call comes next.
out=model(samp_1)
out

In [None]:
# Loss and optimiser used by the training loop.
# BCEWithLogitsLoss applies the sigmoid itself, so it expects raw scores.
loss_function = nn.BCEWithLogitsLoss()
learning_rate = 0.001
# Adam updates every parameter of the model with the learning rate above.
optimizer = optim.Adam(model.parameters(),lr = learning_rate)

In [None]:
# Text of the third training pair, used below as a second smoke-test input.
sample=train_data[2][0]
sample

In [None]:
# Encode the second sample as a tensor of token ids and move it to the GPU.
sample_context=Variable(make_context_vector(sample,word_to_ix)).cuda()
sample_context

In [None]:
# Forward pass on the second sample; model.hidden is not reset in this cell,
# so the call starts from the state stored by the previous forward pass.
out=model(sample_context)
out

In [None]:
# Show the first two training pairs: the cleaned text, the raw label tuple
# and the (1, n_labels) float tensor that make_target builds from it.
for sent, label in train_data[:2]:
    print(sent)
    print(label)
    print(Variable(make_target(label)))
    print("----------------------------")

In [None]:
# Number of comments in the training part of the split.
len(x_train)

In [None]:
# Number of rows in the full training frame (before the train/validation split).
len(train)

In [None]:
# Training-schedule settings.
# batch_size and n_iters are kept for reference, but the training loop
# updates the model one sentence at a time and is driven by num_epochs only.
batch_size = 10
n_iters = 2000
# The former derivation n_iters/(len(x_train))/batch_size divided by the
# batch size instead of multiplying by it, truncated to 0 for any realistic
# dataset size, and was immediately overwritten; the effective value is
# stated directly instead.
num_epochs = 1

In [None]:
# Display the first five (comment, labels) validation pairs.
valid_data[0:5]

In [None]:
def _validation_accuracy(model, data, to_ix):
    """Return the per-label accuracy (in percent) of ``model`` on ``data``.

    Each sample has several independent 0/1 labels, so a prediction is made
    for every label by thresholding its logit at 0 (sigmoid at 0.5) and the
    accuracy is the share of individual labels predicted correctly.

    Evaluation runs with dropout disabled and without building autograd
    graphs; the hidden state is reset before every sentence so samples do
    not influence each other. Sentences with no tokens are skipped. The
    model's previous train/eval mode is restored afterwards.

    Returns ``nan`` when no sample could be scored.
    """
    device = next(model.parameters()).device
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for sent, label in data:
                context_vec = make_context_vector(sent, to_ix).to(device)
                if context_vec.numel() == 0:
                    continue
                target = make_target(label).to(device)
                model.hidden = model.init_hidden()
                output = model(context_vec)
                predicted = (output > 0).float()
                correct += (predicted == target).sum().item()
                total += target.numel()
    finally:
        model.train(was_training)
    return 100.0 * correct / total if total else float("nan")


# Per-sentence training: one optimiser step per (comment, labels) pair,
# with a full validation pass every 500 steps.
device = next(model.parameters()).device
iteration = 0  # counts optimiser steps (named to avoid shadowing the builtin `iter`)
for epoch in range(num_epochs):
    for (sent, label) in train_data:
        ## Step 1 - Prepare input and label
        context_vec = make_context_vector(sent, word_to_ix).to(device)
        if context_vec.numel() == 0:
            # Cleaning can leave a comment with no tokens; there is nothing to feed the GRU.
            continue
        target = make_target(label).to(device)

        # Step 2 - clear the gradients and start from a fresh hidden state
        optimizer.zero_grad()
        model.hidden = model.init_hidden()

        # Step 3 - Run forward pass
        output = model(context_vec)

        # Step 4 - Compute loss, gradients, update parameters
        loss = loss_function(output, target)
        loss.backward()
        optimizer.step()

        iteration += 1
        ## Report validation accuracy periodically
        if iteration % 500 == 0:
            accuracy = _validation_accuracy(model, valid_data, word_to_ix)
            print('Iterations: {}. Loss: {}. Accuracy: {}'.format(iteration, loss.item(), accuracy))