In [16]:
import pickle
import re
import pandas as pd
import numpy as np
from string import punctuation

## Torch imports
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## Sklearn imports
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, log_loss

## NLP Libraries
import spacy
from sklearn.model_selection import train_test_split
from nltk import download
import gensim
from nltk.corpus import stopwords
spacy_en = spacy.load('en')
download('stopwords')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
train = pd.read_csv('train.csv', delimiter=",")
print("Train size: {}".format(len(train)))
train.tail()

Train size: 95851


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
95846,999977655955,"""\nI have discussed it, unlike most of those w...",0,0,0,0,0,0
95847,999982426659,"ps. Almost forgot, Paine don't reply back to t...",1,0,1,0,0,0
95848,999982764066,Mamoun Darkazanli\nFor some reason I am unable...,0,0,0,0,0,0
95849,999986890563,Salafi would be a better term. It is more poli...,0,0,0,0,0,0
95850,999988164717,making wikipedia a better and more inviting pl...,0,0,0,0,0,0


In [4]:
test = pd.read_csv('test.csv', delimiter=",")
print("Test size: {}".format(len(test)))
test.tail()

Test size: 226998


Unnamed: 0,id,comment_text
226993,999966872214,*{Persondata
226994,999968525410,'' — is wishing you a [WIKI_LINK: Mary Poppin...
226995,999980053494,==Fair use rationale for [WIKI_LINK: Image:D.R...
226996,999980680364,== Employment Practices at Majestic ==
226997,999997819802,Welcome to Wikipedia. Although everyone is wel...


## Cleaning text

In [86]:
def lemmatizer_spacy(text):        
    sent = []
    doc = spacy_en(text)
    #print(doc)
    for word in doc:
        if word.lemma_ == "-PRON-":
            sent.append(word.text)
        else:
            sent.append(word.lemma_)
    return " ".join(sent)

In [87]:
def strip_punctuation(text):
    return ''.join(c for c in text if c not in punctuation)

In [88]:
def clean_text(text, do_stop=False, do_lemma= False):
    text = str(text)
    #text = gensim.parsing.preprocessing.strip_numeric(text)  # Strip all the numerics
    text = re.sub(r'[^\x00-\x7f]',r' ',text) # Removing non ASCII chars
    text = text.replace("\n","") # Removing line breaks
    
    # Remove the punctuation
    text = strip_punctuation(text)
    
    text = text.lower()
    
    ## Removing Stopwords
    #stops = set(stopwords.words("english"))
    #if (do_stop==True):
    #    filtered_words = [word for word in text.split() if word not in stops]
    #else:
    #    filtered_words = [word for word in text.split()]
    #text = " ".join(filtered_words)
    
    ## Lemmatization
    if (do_lemma==True):
        text = lemmatizer_spacy(text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)

    return text

In [89]:
spacy.__version__

'2.0.9'

In [90]:
msg = "\n ##?? %&that is not cool"
clean_text(msg, do_lemma = True)

' that be not cool'

In [None]:
train['cleaned_comment']=train['comment_text'].apply(lambda x:clean_text(x, do_lemma = True))
train.tail()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned_comment
95846,999977655955,"""\nI have discussed it, unlike most of those w...",0,0,0,0,0,0,i have discuss it unlike most of those who rev...
95847,999982426659,"ps. Almost forgot, Paine don't reply back to t...",1,0,1,0,0,0,ps almost forget paine do not reply back to th...
95848,999982764066,Mamoun Darkazanli\nFor some reason I am unable...,0,0,0,0,0,0,mamoun darkazanlifor some reason i be unable t...
95849,999986890563,Salafi would be a better term. It is more poli...,0,0,0,0,0,0,salafi would be a good term it be more politic...
95850,999988164717,making wikipedia a better and more inviting pl...,0,0,0,0,0,0,make wikipedia a good and more invite place


In [None]:
test['cleaned_comment']=test['comment_text'].apply(lambda x:clean_text(x, do_lemma=True))
test.head()

## Train/Test split

In [None]:
x_train, x_valid, y_train, y_valid = train_test_split(train['cleaned_comment'],
                                                      list(zip(train['toxic'], 
                                                               train['severe_toxic'],
                                                               train['obscene'], 
                                                               train['threat'],
                                                               train['insult'], 
                                                               train['identity_hate']), 
                                                      test_size=0.2,random_state=2017)

In [None]:
x_test = np.array(test['cleaned_comment'])
x_test

In [None]:
x_test = np.array([train['toxic'],
                  train['severe_toxic'],
                  train['obscene'],
                  train['threat'],
                  train['insult'],
                  train['identity_hate']])

In [None]:
train_data=list(zip(x_train,y_train))
train_data[0:5]

In [None]:
valid_data=list(zip(x_valid,y_valid))
valid_data[-5:-1]

In [None]:
## Build vocabulary of words
word_to_ix = {}
for (sent) in list(x_train) + list(x_valid) + list(x_test):
    for word in sent.split():
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

In [None]:
print(len(word_to_ix))
word_to_ix