In [2]:
import pickle
import re
import pandas as pd
import numpy as np

## Torch imports
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## Sklearn imports
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, log_loss

## NLP Libraries
from sklearn.model_selection import train_test_split
from nltk import download
import gensim
from nltk.corpus import stopwords
download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>


False

In [3]:
!ls

pytorch-baseline.ipynb     test.csv
pytorch-gru-word2vec.ipynb train.csv


In [4]:
train = pd.read_csv('train.csv', delimiter=",")
print("Train size: {}".format(len(train)))
train.tail()

Train size: 95851


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
95846,999977655955,"""\nI have discussed it, unlike most of those w...",0,0,0,0,0,0
95847,999982426659,"ps. Almost forgot, Paine don't reply back to t...",1,0,1,0,0,0
95848,999982764066,Mamoun Darkazanli\nFor some reason I am unable...,0,0,0,0,0,0
95849,999986890563,Salafi would be a better term. It is more poli...,0,0,0,0,0,0
95850,999988164717,making wikipedia a better and more inviting pl...,0,0,0,0,0,0


In [5]:
test = pd.read_csv('test.csv', delimiter=",")
print("Test size: {}".format(len(test)))
test.tail()

Test size: 226998


Unnamed: 0,id,comment_text
226993,999966872214,*{Persondata
226994,999968525410,'' — is wishing you a [WIKI_LINK: Mary Poppin...
226995,999980053494,==Fair use rationale for [WIKI_LINK: Image:D.R...
226996,999980680364,== Employment Practices at Majestic ==
226997,999997819802,Welcome to Wikipedia. Although everyone is wel...


## Cleaning a little bit

In [6]:
def clean_text(text, do_stop=False):
    text = str(text)
    text = gensim.parsing.preprocessing.strip_numeric(text)  # Strip all the numerics
    text = re.sub(r'[^\x00-\x7f]',r' ',text) # Removing non ASCII chars
    text = text.replace("\n","") # Removing line breaks
    text = text.replace("=","") # Removing =
    text = text.replace(":","") # Removing :
    text = text.replace("#","") # Removing #
    text = text.replace("%","") # Removing #
    text = text.replace("&","") # Removing #
    text = text.replace('"',"") # Removing #
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)# Strip multiple whitespaces

    text = text.lower()
    stops = set(stopwords.words("english"))
    if (do_stop==True):
        filtered_words = [word for word in text.split() if word not in stops]
    else:
        filtered_words = [word for word in text.split()]
    text = " ".join(filtered_words)
    return text

In [7]:
msg = "\n ##?? %&that is not cool"
clean_text(msg)

'?? that is not cool'

In [8]:
train['cleaned_comment']=train['comment_text'].apply(lambda x:clean_text(x, do_stop=True))
train.tail()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned_comment
95846,999977655955,"""\nI have discussed it, unlike most of those w...",0,0,0,0,0,0,"discussed it, unlike revert (heonsi pure sockp..."
95847,999982426659,"ps. Almost forgot, Paine don't reply back to t...",1,0,1,0,0,0,"ps. almost forgot, paine reply back shit, want..."
95848,999982764066,Mamoun Darkazanli\nFor some reason I am unable...,0,0,0,0,0,0,mamoun darkazanlifor reason unable fix bold fo...
95849,999986890563,Salafi would be a better term. It is more poli...,0,0,0,0,0,0,salafi would better term. politically correct ...
95850,999988164717,making wikipedia a better and more inviting pl...,0,0,0,0,0,0,making wikipedia better inviting place.


In [9]:
test['cleaned_comment']=test['comment_text'].apply(lambda x:clean_text(x, do_stop=True))
test.head()

Unnamed: 0,id,comment_text,cleaned_comment
0,6044863,==Orphaned non-free media (Image:41cD1jboEvL. ...,orphaned non-free media (imagecdjboevl. ss .jpg)
1,6102620,::Kentuckiana is colloquial. Even though the ...,kentuckiana colloquial. even though area often...
2,14563293,"Hello fellow Wikipedians,\nI have just modifie...","hello fellow wikipedians,i modified [wiki_link..."
3,21086297,"AKC Suspensions \nThe Morning Call - Feb 24, 2...","akc suspensions morning call - feb , ..."
4,22982444,== [WIKI_LINK: Talk:Celts] ==,[wiki_link talkcelts]


## Analysing classes, inbalance, most command words

In [None]:
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter
from pprint import pprint

### Toxic category

In [None]:
toxic=train[train['toxic']==1].reset_index(drop=True)
toxic.drop(labels=['severe_toxic','obscene','threat','insult','identity_hate'], axis=1, inplace=True)
print(len(toxic))
toxic.tail()

### Severe Toxic category

In [None]:
severe_toxic=train[train['severe_toxic']==1].reset_index(drop=True)
severe_toxic.drop(labels=['toxic','obscene','threat','insult','identity_hate'], axis=1, inplace=True)
print(len(severe_toxic))
severe_toxic[0:10]

In [None]:
obscene=train[train['obscene']==1].reset_index(drop=True)
obscene.drop(labels=['toxic','severe_toxic','threat','insult','identity_hate'], axis=1, inplace=True)
print(len(obscene))
obscene[0:10]

In [None]:
threat=train[train['threat']==1].reset_index(drop=True)
threat.drop(labels=['toxic','severe_toxic','obscene','insult','identity_hate'], axis=1, inplace=True)
print(len(threat))
threat[0:10]

In [None]:
insult=train[train['insult']==1].reset_index(drop=True)
insult.drop(labels=['toxic','severe_toxic','obscene','threat','identity_hate'], axis=1, inplace=True)
print(len(insult))
insult[0:10]

In [None]:
identity_hate=train[train['identity_hate']==1].reset_index(drop=True)
identity_hate.drop(labels=['toxic','severe_toxic','obscene','threat','insult'], axis=1, inplace=True)
print(len(identity_hate))
identity_hate[0:10]

In [None]:
train[0:10]

## Train/test split

In [None]:
x_train, x_valid, y_train, y_valid = train_test_split(train['cleaned_comment'],
                                                    list(zip(train['toxic'], train['severe_toxic'],
                                                             train['obscene'], train['threat'],
                                                             train['insult'], train['identity_hate'])),
                                                      test_size=0.2)

In [None]:
x_test = np.array(test['cleaned_comment'])
x_test

In [None]:
train_data=list(zip(x_train,y_train))
train_data[0:5]

In [None]:
valid_data=list(zip(x_valid,y_valid))
valid_data[-5:-1]

In [None]:
## Build Vocabulary
word_to_ix = {}
for (sent) in list(x_train) + list(x_valid)+list(x_test):
    for word in sent.split():
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

In [None]:
len(word_to_ix)

In [None]:
VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = 6
VOCAB_SIZE,NUM_LABELS

## Model 2 - BoW Classifier with Handcrafted features

In [None]:
class BoWClassifier(nn.Module):
    def __init__(self, num_labels, vocab_size):
        super(BoWClassifier, self).__init__()
        
        ## Defining parameters for linear model
        self.linear = nn.Linear(vocab_size, num_labels)
    
    def forward(self, bow_vec):
        ## do the foward pass and implement non-linearity
        return F.log_softmax(self.linear(bow_vec),dim=1)

In [None]:
def make_bow_vector(sentence, word_to_ix):
    vec = torch.zeros(len(word_to_ix))
    for word in sentence.split():
        if word in word_to_ix:
            vec[word_to_ix[word]] += 1
    return vec.view(1,-1)

In [None]:
def make_target(label):
    return torch.FloatTensor(label).view(1,-1)

In [None]:
train_data[1][0]

In [None]:
n=4
sample_phrase=make_bow_vector(train_data[n][0],word_to_ix)
print(">> SENTENCE: {}".format(train_data[n][0]))
#$print(">> SENTIMENT: {}".format(y_train[n]))
print(">> INPUT SIZE: {}".format(sample_phrase.size()))
print(">> INPUT FORMAT: {}".format(type(sample_phrase)))

In [None]:
sample_phrase

In [None]:
model = BoWClassifier(NUM_LABELS, VOCAB_SIZE)
model.cuda()

In [None]:
## simple forward to see if its working
out=model(Variable(sample_phrase).cuda())
out

## Training

In [None]:
loss_function = nn.BCEWithLogitsLoss()
learning_rate = 0.01
optimizer = optim.SGD(params=model.parameters(), lr=learning_rate)

In [None]:
train_data[0][1]

In [None]:
make_target(train_data[0][1])

In [None]:
batch_size = 50
n_iters = 5000000
num_epochs = n_iters/(len(x_train))/batch_size
num_epochs=int(num_epochs)
num_epochs

In [None]:
train_data[1][0]

In [None]:
train_data[0][1]

In [None]:
bow_vec = Variable(make_bow_vector(train_data[0][0],word_to_ix)).cuda()
bow_vec

In [None]:
target = Variable(make_target(train_data[0][1])).cuda()
target

In [None]:
output = model(bow_vec)
output

In [None]:
loss = loss_function(output, target)
loss

In [None]:
len(train_data)

In [None]:
len(valid_data)

In [None]:
iter = 0
for epoch in range(num_epochs):
    for (sent,label) in train_data:
        # Step 1 - clear the gradients
        model.zero_grad()
        optimizer.zero_grad()
       
        ## Step 2- Prepare input and label
        bow_vec = Variable(make_bow_vector(sent,word_to_ix)).cuda()
        target = Variable(make_target(label)).cuda()
        
        # Step 3 - Run forward pass
        output = model(bow_vec)
        
        # Step 4 - Compute loss, gradients, update parameters
        loss = loss_function(output, target)
        loss.backward()
        optimizer.step()
        
        iter+=1      
        ## Calculate final accuracy
        if iter % 2000 ==0:
            print("I'm validating now!")
            correct = 0
            total = 0
            i=0
            for (sent,label) in valid_data:
                i+=1
                bow_vec = Variable(make_bow_vector(sent,word_to_ix)).cuda()
                target = Variable(make_target(label)).cuda()
                output = model(bow_vec)
                #_,predicted = torch.max(output.data,1)
                #total += target.size(0)
                #correct += (predicted[0] == make_target(label)).sum()
            #accuracy = 100 * correct/total
            print('Iterations: {}. Loss: {}'.format(iter,loss.data[0]))