In [1]:
import pickle
import re
import pandas as pd
import numpy as np
from string import punctuation

## Torch imports
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## Sklearn imports
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, log_loss,roc_auc_score

## NLP Libraries
# Spacy
import spacy
spacy_en = spacy.load('en')
# NLTK
from nltk import download
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemmatizer = WordNetLemmatizer()
download('stopwords')
download('wordnet')
# Gensim
import gensim
from gensim import utils

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
print(spacy.__version__)

2.0.5


In [3]:
train = pd.read_csv('train.csv', delimiter=",")
print("Train size: {}".format(len(train)))
train.tail()

Train size: 95851


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
95846,999977655955,"""\nI have discussed it, unlike most of those w...",0,0,0,0,0,0
95847,999982426659,"ps. Almost forgot, Paine don't reply back to t...",1,0,1,0,0,0
95848,999982764066,Mamoun Darkazanli\nFor some reason I am unable...,0,0,0,0,0,0
95849,999986890563,Salafi would be a better term. It is more poli...,0,0,0,0,0,0
95850,999988164717,making wikipedia a better and more inviting pl...,0,0,0,0,0,0


In [4]:
test = pd.read_csv('test.csv', delimiter=",")
print("Test size: {}".format(len(test)))
test.tail()

Test size: 226998


Unnamed: 0,id,comment_text
226993,999966872214,*{Persondata
226994,999968525410,'' — is wishing you a [WIKI_LINK: Mary Poppin...
226995,999980053494,==Fair use rationale for [WIKI_LINK: Image:D.R...
226996,999980680364,== Employment Practices at Majestic ==
226997,999997819802,Welcome to Wikipedia. Although everyone is wel...


## Cleaning text

In [6]:
def lemmatizer_spacy(text):        
    sent = []
    doc = spacy_en(text)
    #print(doc)
    for word in doc:
        if word.lemma_ == "-PRON-":
            sent.append(word.text)
        else:
            sent.append(word.lemma_)
    return " ".join(sent)

In [7]:
def strip_punctuation(text):
    return ''.join(c for c in text if c not in punctuation)

In [8]:
def clean_text(text, do_stop=False, do_lemma= False):
    text = str(text)
    #text = gensim.parsing.preprocessing.strip_numeric(text)  # Strip all the numerics
    text = re.sub(r'[^\x00-\x7f]',r' ',text) # Removing non ASCII chars
    text = text.replace("\n"," ") # Removing line breaks

    # Remove the punctuation
    text = strip_punctuation(text)

    text = text.lower()
    stops = set(stopwords.words("english"))
    if (do_stop==True):
        filtered_words = [word for word in text.split() if word not in stops]
    else:
        filtered_words = [word for word in text.split()]
    text = " ".join(filtered_words)
    
    ## Lemmatization
    if (do_lemma==True):
    #    text = lemmatizer_spacy(text)
        text = lemmatizer.lemmatize(text) ## using NLTK lemmatizer
        
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)# Strip multiple whitespaces
    return text

In [9]:
msg = "\n ##?? %&that is not cool"
clean_text(msg, do_lemma = True)

'that is not cool'

In [10]:
train['cleaned_comment']=train['comment_text'].apply(lambda x:clean_text(x, do_lemma = True))
train.tail()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned_comment
95846,999977655955,"""\nI have discussed it, unlike most of those w...",0,0,0,0,0,0,i have discussed it unlike most of those who r...
95847,999982426659,"ps. Almost forgot, Paine don't reply back to t...",1,0,1,0,0,0,ps almost forgot paine dont reply back to this...
95848,999982764066,Mamoun Darkazanli\nFor some reason I am unable...,0,0,0,0,0,0,mamoun darkazanli for some reason i am unable ...
95849,999986890563,Salafi would be a better term. It is more poli...,0,0,0,0,0,0,salafi would be a better term it is more polit...
95850,999988164717,making wikipedia a better and more inviting pl...,0,0,0,0,0,0,making wikipedia a better and more inviting place


In [11]:
test['cleaned_comment']=test['comment_text'].apply(lambda x:clean_text(x, do_lemma=True))
test.head()

Unnamed: 0,id,comment_text,cleaned_comment
0,6044863,==Orphaned non-free media (Image:41cD1jboEvL. ...,orphaned nonfree media image41cd1jboevl ss500 jpg
1,6102620,::Kentuckiana is colloquial. Even though the ...,kentuckiana is colloquial even though the area...
2,14563293,"Hello fellow Wikipedians,\nI have just modifie...",hello fellow wikipedians i have just modified ...
3,21086297,"AKC Suspensions \nThe Morning Call - Feb 24, 2...",akc suspensions the morning call feb 24 2001 7...
4,22982444,== [WIKI_LINK: Talk:Celts] ==,wikilink talkcelts


## Train/Test split

In [12]:
labels = list(zip(train['toxic'], 
                    train['severe_toxic'],
                    train['obscene'], 
                    train['threat'],
                    train['insult'], 
                    train['identity_hate']))

In [13]:
x_train, x_valid, y_train, y_valid = train_test_split(train['cleaned_comment'],
                                                      labels, 
                                                      test_size=0.2,random_state=2017)

In [14]:
x_test = np.array(test['cleaned_comment'])

In [15]:
train_data=list(zip(x_train,y_train))
train_data[0:5]

[('filmfare award for best male debut venky that editor shshshis not updating the correct years for filmfare award for best male debut please check my addition done on 2325 17 june 2011 which is absolutely true please inform shshto update correct years also please check my talk in shsh page regarding same header',
  (0, 0, 0, 0, 0, 0)),
 ('charles sumner article hello dr jensen i have recently been making edits on the charles sumner article i have expanded on the dominican republic annexation treaty and information on president grant are there any other areas that need work on the cs article',
  (0, 0, 0, 0, 0, 0)),
 ('nsa conspiracy theory please contain all discussion of the dubious sourcestatements here',
  (0, 0, 0, 0, 0, 0)),
 ('noted ill try to dig around a bit more to see if theres anything else that could be used to address this additionally ive added classical to the genre bit as per his choices and flowers release this makes me wonder if new age could also be used which would

In [16]:
valid_data=list(zip(x_valid,y_valid))
valid_data[-5:-1]

[('reversing her early casual antisemitism when did this get added and where was it discussed meantime i have taken it out',
  (0, 0, 0, 0, 0, 0)),
 ('i dont fix disambig by awb but just and it is working well',
  (0, 0, 0, 0, 0, 0)),
 ('the phrase in europe it is refered to as white spirit was especially helpful for me it let me know that white spirit and mineral spirits are the same',
  (0, 0, 0, 0, 0, 0)),
 ('fhu editing please explain to me how my editing of the freedhardeman university page was biased and not neutral preceding unsigned comment added by talk contribs',
  (0, 0, 0, 0, 0, 0))]

In [17]:
## Build vocabulary of words
word_to_ix = {}
for (sent) in list(x_train) + list(x_valid) + list(x_test):
    for word in sent.split():
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

In [18]:
print(len(word_to_ix))
word_to_ix

468842


{'filmfare': 0,
 'award': 1,
 'for': 2,
 'best': 3,
 'male': 4,
 'debut': 5,
 'venky': 6,
 'that': 7,
 'editor': 8,
 'shshshis': 9,
 'not': 10,
 'updating': 11,
 'the': 12,
 'correct': 13,
 'years': 14,
 'please': 15,
 'check': 16,
 'my': 17,
 'addition': 18,
 'done': 19,
 'on': 20,
 '2325': 21,
 '17': 22,
 'june': 23,
 '2011': 24,
 'which': 25,
 'is': 26,
 'absolutely': 27,
 'true': 28,
 'inform': 29,
 'shshto': 30,
 'update': 31,
 'also': 32,
 'talk': 33,
 'in': 34,
 'shsh': 35,
 'page': 36,
 'regarding': 37,
 'same': 38,
 'header': 39,
 'charles': 40,
 'sumner': 41,
 'article': 42,
 'hello': 43,
 'dr': 44,
 'jensen': 45,
 'i': 46,
 'have': 47,
 'recently': 48,
 'been': 49,
 'making': 50,
 'edits': 51,
 'expanded': 52,
 'dominican': 53,
 'republic': 54,
 'annexation': 55,
 'treaty': 56,
 'and': 57,
 'information': 58,
 'president': 59,
 'grant': 60,
 'are': 61,
 'there': 62,
 'any': 63,
 'other': 64,
 'areas': 65,
 'need': 66,
 'work': 67,
 'cs': 68,
 'nsa': 69,
 'conspiracy': 70,
 '

In [19]:
VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = 6
VOCAB_SIZE,NUM_LABELS

(468842, 6)

## Loading Embeddings

In [20]:
!ls ../../vectors/

GoogleNews-vectors-negative300.bin


In [21]:
from gensim.models import KeyedVectors

In [22]:
w2v = KeyedVectors.load_word2vec_format('../../vectors/GoogleNews-vectors-negative300.bin', binary = True)

In [23]:
w2v.word_vec('test')[0:50]

array([-0.14257812, -0.03686523,  0.13574219, -0.06201172,  0.07958984,
        0.01904297, -0.08154297, -0.12792969, -0.02954102,  0.23632812,
       -0.12158203, -0.21484375,  0.12988281, -0.02709961, -0.05200195,
        0.21582031, -0.18164062,  0.05102539, -0.16015625, -0.17675781,
        0.01831055, -0.04125977, -0.23242188, -0.01031494,  0.14550781,
        0.05249023, -0.39648438, -0.01928711,  0.0025177 , -0.01269531,
       -0.04394531,  0.03076172,  0.09570312, -0.17578125,  0.01043701,
        0.18945312, -0.23632812,  0.04370117,  0.28125   , -0.02075195,
       -0.18164062, -0.21777344,  0.23339844,  0.05297852, -0.11376953,
        0.00939941, -0.14941406,  0.19921875, -0.17578125,  0.31640625],
      dtype=float32)

In [24]:
W2V_DIM=300
sd = 1/np.sqrt(W2V_DIM) ## standard deviation to use
weights = np.random.normal(0, scale=sd, size=[VOCAB_SIZE, W2V_DIM])
weights = weights.astype(np.float32)

In [25]:
no_w2v_count = 0
oov_words = []
for word in word_to_ix:
    id = word_to_ix.get(word)
    #print(word,id)
    if id is not None:
        try:
            weights[id]=w2v.word_vec(word)
            #print(weights[id][0:20])
            #weights[id]=glove_vector[word]
        except:
            #print("OOV word - {}".format(word))
            no_w2v_count +=1
            oov_words.extend([word])
            weights[id]=np.random.normal(0, scale=sd, size=[1, W2V_DIM]) ## If word not present, initialize randomly

In [26]:
print(">> Total vocabulary: {}, OOV: {}".format(len(word_to_ix),no_w2v_count))
print(">> % of OOV words: {:0.4f} %".format(no_w2v_count/len(word_to_ix)))

>> Total vocabulary: 468842, OOV: 385950
>> % of OOV words: 0.8232 %


## Model - BoW Classifier - Bag of Embedding features

In [27]:
class BoWClassifierEmbeddings(nn.Module):
    def __init__(self, embedding_dim, vocab_size, num_labels):
        super(BoWClassifierEmbeddings, self).__init__()
        
        self.embeddings_bag = nn.EmbeddingBag(VOCAB_SIZE, W2V_DIM,scale_grad_by_freq=True, mode='sum')
        self.embeddings_bag.weight.data=torch.Tensor(weights)
        self.linear = nn.Linear(embedding_dim, num_labels)
        self.softmax = nn.Softmax(dim=None)
    
    def forward(self, sentence):
      
        embeds = self.embeddings_bag(sentence,Variable(torch.LongTensor([0]).cuda()))
        linear = self.linear(embeds)
        log_soft = F.log_softmax(linear, dim=None)
        out = self.softmax(log_soft)
        return out

In [28]:
def make_sentence_embeddings(seq, to_ix):
    idxs = [to_ix[w] for w in seq.split()]    
    tensor = torch.LongTensor(idxs)
    return tensor 

In [29]:
def make_target(label):
    return torch.LongTensor([label])

In [30]:
W2V_DIM = 300
VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = 6

In [31]:
torch.cuda.is_available()

True

In [32]:
model = BoWClassifierEmbeddings(embedding_dim = W2V_DIM,
                        vocab_size=VOCAB_SIZE,
                        num_labels=NUM_LABELS)
model.cuda()

BoWClassifierEmbeddings(
  (embeddings_bag): EmbeddingBag(468842, 300, scale_grad_by_freq=True, mode=sum)
  (linear): Linear(in_features=300, out_features=6)
  (softmax): Softmax()
)

In [34]:
n=0
print(train_data[n][0])
sample_phrase=Variable(make_sentence_embeddings(train_data[n][0],word_to_ix)).cuda()
print(sample_phrase)

filmfare award for best male debut venky that editor shshshis not updating the correct years for filmfare award for best male debut please check my addition done on 2325 17 june 2011 which is absolutely true please inform shshto update correct years also please check my talk in shsh page regarding same header
Variable containing:
  0
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
  2
  0
  1
  2
  3
  4
  5
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 15
 29
 30
 31
 13
 14
 32
 15
 16
 17
 33
 34
 35
 36
 37
 38
 39
[torch.cuda.LongTensor of size 53 (GPU 0)]



In [35]:
## Testing Bag of Embeddings

In [36]:
embeddings_bag = nn.EmbeddingBag(VOCAB_SIZE, W2V_DIM, mode='mean')
embeddings_bag.weight.data=torch.Tensor(weights)

In [37]:
n=3
print(train_data[n][0])
sample_phrase=Variable(make_sentence_embeddings(train_data[n][0],word_to_ix)).cuda()
sample_phrase

noted ill try to dig around a bit more to see if theres anything else that could be used to address this additionally ive added classical to the genre bit as per his choices and flowers release this makes me wonder if new age could also be used which would kind of help fill the search for something that details his music other than the widely applicable hiphop thoughts friend


Variable containing:
  79
  80
  81
  82
  83
  84
  85
  86
  87
  82
  88
  89
  90
  91
  92
   7
  93
  94
  95
  82
  96
  97
  98
  99
 100
 101
  82
  12
 102
  86
 103
 104
 105
 106
  57
 107
 108
  97
 109
 110
 111
  89
 112
 113
  93
  32
  94
  95
  25
 114
 115
  75
 116
 117
  12
 118
   2
 119
   7
 120
 105
 121
  64
 122
  12
 123
 124
 125
 126
 127
[torch.cuda.LongTensor of size 70 (GPU 0)]

In [38]:
out=model(sample_phrase)
out

  
  from ipykernel import kernelapp as app


Variable containing:
 0.0024  0.0018  0.0001  0.0045  0.0043  0.9869
[torch.cuda.FloatTensor of size 1x6 (GPU 0)]

In [39]:
print(torch.__version__)

0.3.0.post4


## Training model 

In [40]:
loss_function = nn.MultiLabelMarginLoss()
learning_rate = 0.001
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [41]:
valid_data[0:5]

[('sigh i see domer and big dunc are doing their tag thing routine again i dont know how you have the energy to persevere here your a better man than i',
  (0, 0, 0, 0, 0, 0)),
 ('april 2010 please stop if you continue to add promotional material to wikipedia you will be blocked from editing talk',
  (0, 0, 0, 0, 0, 0)),
 ('we also have this link it explains the negative stance on cair by investors weekly maybe its useful',
  (0, 0, 0, 0, 0, 0)),
 ('no because i realized that adding tables to every section is sort of cruft mayby if we moved the fcw roster somewhere else then the page would be smaller and mayby one big table could be work also citations are just needed for their job the stuff written in italics by each superstar the roster pages cover really who is on what brand co',
  (0, 0, 0, 0, 0, 0)),
 ('it wasnt me that edited those pages httpurdirtcom20091001ufc108silvavsbelfortrumors its ufc 108 silva vs belfort until further notice now please leave it alone and have a nice day'

In [42]:
len(train_data)

76680

In [43]:
batch_size = 10
n_iters = 1000000
num_epochs = n_iters/(len(x_train))/batch_size
num_epochs=int(num_epochs)
num_epochs

1

In [44]:
## Checking incorrect predictions
incorrect = 0
incorrect_vect = pd.DataFrame(columns={'wrong_pred_intent', 'correct_intent','usersays'})
incorrect_vect=incorrect_vect[['wrong_pred_intent', 'correct_intent','usersays']]
incorrect_vect
incorrect = 0

In [45]:
iter = 0
for epoch in range(num_epochs):
    for (sent,label) in train_data:
        # Step 1 - clear the gradients
        model.zero_grad()

        ## Step 2- Prepare input and label
        sample = Variable(make_sentence_embeddings(sent,word_to_ix)).cuda()
        target = Variable(make_target(label)).cuda()
    
        # Step 3 - Run forward pass
        output = model(sample)
        #print("Log probabilities - {}".format(log_probs))
        
        # Step 4 - Compute loss, gradients, update parameters
        loss = loss_function(output, target)
        loss.backward()
        optimizer.step()
        
        iter+=1      
        ## Calculate final accuracy
        if iter % 1000 ==0:
            correct = 0
            total = 0
            for (sent,label) in valid_data:
                sample = Variable(make_sentence_embeddings(sent,word_to_ix)).cuda()
                target = Variable(make_target(label)).cuda()
                output = model(sample)
                #print(output)
                #_,predicted = torch.max(output.data,1)
                #pred_label=list(label_to_ix.keys())[list(label_to_ix.values()).index(predicted[0])]
                #total += target.size(0)
                #if (pred_label == label):
                #    correct += (predicted[0] == make_target(label, label_to_ix)).sum()
                #else:
                #    incorrect_vect.loc[incorrect] = [pred_label, label, sent]
                #    incorrect +=1
            #accuracy = 100 * correct/total
            print('Iterations: {}. Loss: {}'.format(iter,loss.data[0]))

  
  from ipykernel import kernelapp as app


Iterations: 1000. Loss: 4.36184047864982e-13
Iterations: 2000. Loss: 2.6666667461395264
Iterations: 3000. Loss: 1.1798955202102661
Iterations: 4000. Loss: 0.0
Iterations: 5000. Loss: 0.0
Iterations: 6000. Loss: 0.0
Iterations: 7000. Loss: 0.0
Iterations: 8000. Loss: 0.0
Iterations: 9000. Loss: 0.0
Iterations: 10000. Loss: 0.00011645290214801207
Iterations: 11000. Loss: 0.0
Iterations: 12000. Loss: 5.54816580784719e-36
Iterations: 13000. Loss: 1.2364866262310869e-28
Iterations: 14000. Loss: 0.0
Iterations: 15000. Loss: 2.0
Iterations: 16000. Loss: 0.0
Iterations: 17000. Loss: 0.0
Iterations: 18000. Loss: 7.145776521610969e-07
Iterations: 19000. Loss: 2.646989615343043e-13
Iterations: 20000. Loss: 0.0
Iterations: 21000. Loss: 1.463844019981491e-39
Iterations: 22000. Loss: 0.0
Iterations: 23000. Loss: 0.0
Iterations: 24000. Loss: 0.0
Iterations: 25000. Loss: 5.426860050207392e-23
Iterations: 26000. Loss: 0.0
Iterations: 27000. Loss: 0.0
Iterations: 28000. Loss: 0.0
Iterations: 29000. Loss

In [44]:
torch.save(model,'model_1.pt')

  "type " + obj.__name__ + ". It won't be checked "


In [None]:
modelito = torch.load('model_1.pt')

In [None]:
def make_preds(model,test):
    my_sub = pd.DataFrame(columns={'id', 'toxic','severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'})
    my_sub=my_sub[['id', 'toxic','severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
    for i in range(len(test['cleaned_comment'])):
        sample=test['cleaned_comment'][i]
        if (sample != ""):
            sample_context=Variable(make_sentence_embeddings(sample,word_to_ix)).cuda()
            out=model(sample_context).data.cpu().numpy()[0]
            #print("{} - {}".format(test.loc[i]['id'],out))
        else:
            out = [1, 0, 0, 0, 0, 0]
        my_sub.loc[i] = [test.loc[i]['id'], out[0], out[1], out[2], out[3], out[4], out[5]]
    return my_sub

In [None]:
preds = make_preds(model,test)