## Import

First we need to import some popular python libraries.

In [1]:
import csv
import errno
import glob
import os
import re
import string
import sys

from collections import OrderedDict, Counter
from subprocess import check_call
from shutil import copyfile

from tqdm import tqdm

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import xgboost as xgb

In [3]:
from sklearn import decomposition, ensemble, metrics, model_selection, naive_bayes, preprocessing, pipeline
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import SGDClassifier as sgd
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder



In [4]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tag import StanfordNERTagger



In [5]:
from keras import initializers
from keras import backend as K
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import GlobalAveragePooling1D,Merge,Lambda,Input,GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D,TimeDistributed
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.merge import concatenate
from keras.layers.normalization import BatchNormalization
from keras.layers.recurrent import LSTM, GRU
from keras.preprocessing import sequence, text
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils

Using TensorFlow backend.


In [6]:
# from sner import Ner 

## Setting

We let IPython to show all the variables automatically rather than only the last one as default.

In [7]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Then we mute the `pandas` when we try to use chained assignments. Check this for reference: <https://www.dataquest.io/blog/settingwithcopywarning/>

In [8]:
pd.options.mode.chained_assignment = None

## Read CSV

Here is how we use pandas to read csv files to get the training data and test data.

In [9]:
train_path = '../input/train.csv'
test_path = '../input/test.csv'


## Read the train and test dataset and check the top few lines ##
# TODO:
train_df = pd.read_csv(train_path).fillna('NANN').head(5000)
test_df = pd.read_csv(test_path).fillna('NANN')

train_df.shape[0]
test_df.shape[0]
train_df.head()
test_df.head()

5000

226998

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


Unnamed: 0,id,comment_text
0,6044863,==Orphaned non-free media (Image:41cD1jboEvL. ...
1,6102620,::Kentuckiana is colloquial. Even though the ...
2,14563293,"Hello fellow Wikipedians,\nI have just modifie..."
3,21086297,"AKC Suspensions \nThe Morning Call - Feb 24, 2..."
4,22982444,== [WIKI_LINK: Talk:Celts] ==


We can also access the target labels and select the content of particular labels:

In [10]:
labels = list(train_df)[2:]
len(labels)
labels
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train_df[labels].head()

label0 = labels[0]
train_df[label0].head()

train_df[train_df['toxic']==1]['comment_text'].head()

6

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,1,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0


0    1
1    0
2    0
3    0
4    0
Name: toxic, dtype: int64

0     Nonsense?  kiss off, geek. what I said is true...
20    Why hasn't Alitalia been removed rom the allia...
26    "\nThe Graceful Slick....\nIs non other than a...
30    "\n\n Stupid? \n\nAs soon as I saw the phrase ...
32    "\nBan one side of an argument by a bullshit n...
Name: comment_text, dtype: object

Then we store the `id` field of test file for submission, then drop it from train and test data frame for simplicity. We also get the target labels as `train_y`, and drop them from train data frame.

In [11]:
## Prepare the data for modeling ###
train_y = train_df[labels].values
test_id = test_df['id'].values
labels
train_y

cols_to_drop = ['id']
train_df = train_df.drop(cols_to_drop + labels, axis=1)
test_df = test_df.drop(cols_to_drop, axis=1)

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

array([[1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ..., 
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

## Tokenize

In [12]:
for df in [train_df, test_df]:
    df['split'] = df['comment_text'].apply(word_tokenize)
    print('split finished...')

split finished...
split finished...


## Named Entity

In [13]:
def feature_ner(df):
    st = StanfordNERTagger('../stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz',
                          '../stanford-ner-2017-06-09/stanford-ner.jar',
                          encoding='utf-8')
    st = Ner(host='localhost',port=9199)
    df['st'] = df['comment_text'].apply(lambda x: [w[1] for w in st.get_entities(x)])
    df['n_person'] = df['st'].apply(lambda x: x.count('PERSON'))
    df['n_location'] = df['st'].apply(lambda x: x.count('LOCATION'))
    print('NER finished...')

## Sentiment Intensity

In [14]:
def feature_sid(df):
    sid = SentimentIntensityAnalyzer()
    df['sid'] = df['comment_text'].apply(sid.polarity_scores)
    for k in ['neu', 'compound', 'pos', 'neg']:
        df['sid_'+k] = df['sid'].apply(lambda x: x[k])
    print('polarity_scores finished...') 

## Part of Speech

In [15]:
def feature_pos(df): 
    df['pos_tag'] = df['split'].apply(lambda x: [w[1] for w in nltk.pos_tag(x)])
    for pos in ['CC', 'RB', 'IN', 'NN', 'VB', 'VBP', 'JJ', 'PRP', 'TO', 'DT']:
        df['n_pos_' + pos] = df['pos_tag'].apply(lambda x: x.count(pos))
    print('pos_tag finished...')

In [16]:
def feature_cnt(df):
    ## Number of words in the text ##
    df["num_words"] = df["split"].apply(len)
    
    return
    ## Number of unique words in the text ##
    df["num_unique_words"] = df["split"].apply(lambda x: len(set(x)))
    
    ## Number of characters in the text ##
    df["num_chars"] = df['comment_text'].apply(len)
    
    ## Number of stopwords in the text ##
    #eng_stopwords = set(stopwords.words("english"))
    df["num_stopwords"] = df["split"].apply(lambda x: len([w for w in x if w in eng_stopwords]))
    
    ## Number of punctuations in the text ##
    df["num_punctuations"] = df['split'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )
    
    ## Number of title case words in the text ##
    df["num_words_upper"] = df["split"].apply(lambda x: len([w for w in x if w.isupper()]))
    
    ## Number of title case words in the text ##
    df["num_words_title"] = df["split"].apply(lambda x: len([w for w in x if w.istitle()]))
    
    ## Average length of the words in the text ##
    df["mean_word_len"] = df["split"].apply(lambda x: np.mean([len(w) for w in x]))


    anchor_words = ['the', 'a', 'appear', 'little', 'was', 'one', 'two', 'three', 'ten', 'is', 
                    'are', 'ed', 'however', 'to', 'into', 'about', 'th', 'er', 'ex', 'an', 
                    'ground', 'any', 'silence', 'wall']

    gender_words = ['man', 'woman', 'he', 'she', 'her', 'him', 'male', 'female']

    for word in anchor_words + gender_words:
        df['n_'+word] = df["split"].apply(lambda x: len([w for w in x if w.lower() == word]))

In [17]:
%%time
for df in [train_df, test_df]:
    feature_cnt(df)

Wall time: 117 ms


In [18]:
list(train_df)

['comment_text', 'split', 'num_words']

## Topic Model

In [19]:
def feature_lda(train_df, test_df): 
    
    ### Fit transform the tfidf vectorizer ###
    tfidf_vec = CountVectorizer(stop_words='english', ngram_range=(1,3))
    full_tfidf = tfidf_vec.fit_transform(train_df['comment_text'].values.tolist() + test_df['comment_text'].values.tolist())
    train_tfidf = tfidf_vec.transform(train_df['comment_text'].values.tolist())
    test_tfidf = tfidf_vec.transform(test_df['comment_text'].values.tolist())
    
    no_topics = 20 
    lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(full_tfidf)
    train_lda = pd.DataFrame(lda.transform(train_tfidf))
    test_lda = pd.DataFrame(lda.transform(test_tfidf))
    
    train_lda.columns = ['lda_'+str(i) for i in range(no_topics)]
    test_lda.columns = ['lda_'+str(i) for i in range(no_topics)]
    train_df = pd.concat([train_df, train_lda], axis=1)
    test_df = pd.concat([test_df, test_lda], axis=1)
    del full_tfidf, train_tfidf, test_tfidf, train_lda, test_lda

    print("LDA finished...")


## Neural Network

### Glove Vector

In [20]:
# load the GloVe vectors in a dictionary:

wv = '../input/glove.6B.100d.txt'

def loadWordVecs():
    embeddings_index = {}
    f = open(wv)
    for line in f:
        print(line)
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Found %s word vectors.' % len(embeddings_index))
    return embeddings_index

def sent2vec(embeddings_index,s): # this function creates a normalized vector for the whole sentence
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if not w in stopwords.words('english')]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(100)
    return v / np.sqrt((v ** 2).sum())

def doGlove(x_train,x_test):
    embeddings_index = loadWordVecs()
    # create sentence vectors using the above function for training and validation set
    xtrain_glove = [sent2vec(embeddings_index,x) for x in tqdm(x_train)]
    xtest_glove = [sent2vec(embeddings_index,x) for x in tqdm(x_test)]
    xtrain_glove = np.array(xtrain_glove)
    xtest_glove = np.array(xtest_glove)
    return xtrain_glove,xtest_glove,embeddings_index

glove_vecs_train,glove_vecs_test,embeddings_index = doGlove(train_df['comment_text'], test_df['comment_text'])
train_df[['sent_vec_'+str(i) for i in range(100)]] = pd.DataFrame(glove_vecs_train.tolist())
test_df[['sent_vec_'+str(i) for i in range(100)]] = pd.DataFrame(glove_vecs_test.tolist())
print("Glove sentence vector finished...")

the -0.038194 -0.24487 0.72812 -0.39961 0.083172 0.043953 -0.39141 0.3344 -0.57545 0.087459 0.28787 -0.06731 0.30906 -0.26384 -0.13231 -0.20757 0.33395 -0.33848 -0.31743 -0.48336 0.1464 -0.37304 0.34577 0.052041 0.44946 -0.46971 0.02628 -0.54155 -0.15518 -0.14107 -0.039722 0.28277 0.14393 0.23464 -0.31021 0.086173 0.20397 0.52624 0.17164 -0.082378 -0.71787 -0.41531 0.20335 -0.12763 0.41367 0.55187 0.57908 -0.33477 -0.36559 -0.54857 -0.062892 0.26584 0.30205 0.99775 -0.80481 -3.0243 0.01254 -0.36942 2.2167 0.72201 -0.24978 0.92136 0.034514 0.46745 1.1079 -0.19358 -0.074575 0.23353 -0.052062 -0.22044 0.057162 -0.15806 -0.30798 -0.41625 0.37972 0.15006 -0.53212 -0.2055 -1.2526 0.071624 0.70565 0.49744 -0.42063 0.26148 -1.538 -0.30223 -0.073438 -0.28312 0.37104 -0.25217 0.016215 -0.017099 -0.38984 0.87424 -0.72569 -0.51058 -0.52028 -0.1459 0.8278 0.27062

, -0.10767 0.11053 0.59812 -0.54361 0.67396 0.10663 0.038867 0.35481 0.06351 -0.094189 0.15786 -0.81665 0.14172 0.21939 0.58505 -0.52158


for -0.14401 0.32554 0.14257 -0.099227 0.72536 0.19321 -0.24188 0.20223 -0.89599 0.15215 0.035963 -0.59513 -0.051635 -0.014428 0.35475 -0.31859 0.76984 -0.087369 -0.24762 0.65059 -0.15138 -0.42703 0.18813 0.091562 0.15192 0.11303 -0.15222 -0.62786 -0.23923 0.096009 -0.46147 0.41526 -0.30475 0.1371 0.16758 0.53301 -0.043658 0.85924 -0.41192 -0.21394 -0.51228 -0.31945 0.12662 -0.3151 0.0031429 0.27129 0.17328 -1.3159 -0.42414 -0.69126 0.019017 -0.13375 -0.096057 1.7069 -0.65291 -2.6111 0.26518 -0.61178 2.095 0.38148 -0.55823 0.2036 -0.33704 0.37354 0.6951 -0.001637 0.81885 0.51793 0.27746 -0.37177 -0.43345 -0.42732 -0.54912 -0.30715 0.18101 0.2709 -0.29266 0.30834 -1.4624 -0.18999 0.92277 -0.099217 -0.25165 0.49197 -1.525 0.15326 0.2827 0.12102 -0.36766 -0.61275 -0.18884 0.10907 0.12315 0.090066 -0.65447 -0.17252 2.6336e-05 0.25398 1.1078 -0.073074

- -1.2557 0.61036 0.56793 -0.96596 -0.45249 -0.071696 0.57122 -0.31292 -0.43814 0.90622 0.06961 -0.053104 0.25029 0.27841 0.77724 0.26329 0


it -0.30664 0.16821 0.98511 -0.33606 -0.2416 0.16186 -0.053496 0.4301 0.57342 -0.071569 0.36101 0.26729 0.27789 -0.072268 0.13838 -0.26714 0.12999 0.22949 -0.18311 0.50163 0.44921 -0.020821 0.42642 -0.068762 0.40337 0.095198 -0.31944 -0.54651 -0.13345 -0.56511 -0.20975 1.1592 -0.194 0.19828 -0.11924 0.41781 0.0068383 -0.20537 -0.53375 -0.52225 -0.38227 -0.0065833 0.14265 -0.42502 -0.3115 0.0027352 0.75093 -0.48218 -0.18595 -0.77104 -0.046406 -0.06914 0.41688 1.3235 -0.81742 -3.3998 -0.11307 -0.34123 2.0775 0.61369 0.14792 0.93753 -0.10138 0.28426 0.97899 -0.32335 0.63697 0.58308 0.2282 -0.31696 0.21061 -0.6506 0.21653 -0.24347 0.55519 -0.34351 -0.095093 -0.14715 -1.2876 0.3931 0.30163 -0.21767 -1.1146 0.51349 -1.341 -0.30381 0.32499 -0.45236 -0.17743 -0.048504 -0.12178 -0.42108 -0.40327 0.038452 -0.36084 0.037738 -0.21885 -0.38775 0.36916 0.54521

by -0.20875 -0.1174 0.26478 -0.28339 0.19584 0.7446 -0.03887 0.028499 -0.44252 -0.30426 0.27133 -0.51907 0.52183 -0.76648 0.28043 -0.48344 


be -0.46953 0.38432 0.54833 -0.63401 0.010133 0.11364 0.10612 0.58529 0.032302 -0.12274 0.030265 0.52662 1.0398 -0.082143 0.19118 -0.83784 0.50763 0.44488 -0.72604 0.036893 0.24211 -0.28878 0.33657 0.13656 0.14579 -0.13221 0.098428 -0.45276 -0.13029 0.015762 -0.010161 0.4967 -0.28461 0.29655 0.92979 0.42447 -0.082773 0.30438 -0.39219 -0.30585 -0.43201 -0.27333 0.24388 -0.58081 0.22679 0.027226 0.53473 -0.37527 -0.16119 -1.1235 0.12768 -0.69898 0.41341 1.2291 -0.41248 -2.5173 -0.15354 -0.043107 1.9111 0.80754 -0.14759 0.9609 -0.84267 0.084422 1.2616 -0.10938 0.54846 0.75255 -0.071289 -0.73987 0.094808 -0.97589 0.0078721 -0.23928 0.2882 -0.41516 0.034366 0.1197 -1.2142 -0.11306 0.52847 -0.42273 -0.93378 -0.046645 -2.122 -0.341 0.64229 -0.10097 -0.22875 -1.0776 -0.68044 -0.26372 -0.18331 -0.051632 -0.30836 0.066537 0.20422 -0.68914 0.4511 0.25125

has 0.093736 0.56152 0.48364 -0.45987 0.56067 -0.1694 0.018687 0.45529 0.065615 0.25181 -0.14251 0.10532 0.77865 0.1428 -0.08114 -0.069555 0.3


had 0.63256 -0.12718 -0.084182 -0.30718 -0.2526 -0.16172 0.47123 0.46553 -0.051526 0.17231 0.42743 0.57854 0.64548 0.31367 0.32752 -0.47608 0.25888 0.035845 -0.95154 -0.20671 0.65171 0.010712 0.3894 0.069552 0.061198 -0.72152 -0.22334 -0.34747 -0.1434 -0.32482 0.79539 0.84708 -0.046052 0.74384 0.18185 0.18666 0.17123 0.93485 -0.1299 0.39219 -1.016 -0.27859 0.79293 -0.40433 0.41505 0.017283 0.2936 -0.72944 0.98233 -0.96504 -0.19016 -0.010012 0.26765 1.3316 -0.19376 -2.4401 -0.39477 -0.36232 0.88315 1.5974 0.31113 0.72201 -0.55023 0.54995 0.6846 -0.40378 0.52677 0.23314 -0.12764 0.12305 -0.12489 -0.25052 0.11632 -0.41648 -0.19738 0.093786 -0.24532 0.29676 -1.6156 0.38933 0.68466 0.048371 -0.4177 0.10663 -2.0738 -0.81679 0.56651 -0.32536 -0.76685 -0.27123 -0.10798 -0.61763 -0.27098 0.077119 -0.89352 0.17811 -0.50156 -0.30966 0.22378 0.038183

i -0.046539 0.61966 0.56647 -0.46584 -1.189 0.44599 0.066035 0.3191 0.14679 -0.22119 0.79239 0.29905 0.16073 0.025324 0.18678 -0.31001 -0.28108 0.6


new -0.043959 0.18936 0.6611 -0.49007 0.32211 -0.34161 -0.06848 0.31364 -0.71142 0.57436 -0.33588 -0.52279 -0.39075 -0.089694 0.46371 -0.3561 0.84576 -0.026188 -0.19328 -0.083846 0.31806 -0.19812 0.30009 0.069189 0.5447 -0.59193 0.54221 -0.62876 -0.53447 0.42334 0.030869 0.97164 -0.56222 0.045752 -0.571 0.080185 -0.081434 -0.6026 0.16466 -0.40281 -0.47701 -0.5195 0.12777 -0.43775 0.26602 0.48752 -0.06022 -0.52622 0.37687 -0.18007 0.030166 -0.094577 0.1633 0.59041 -0.48877 -3.423 0.13113 -0.080386 1.8978 0.18857 -0.573 0.86358 0.0021116 0.3606 0.80475 -0.13954 -0.053935 0.38873 0.30673 -0.31395 0.083238 -0.41737 -1.0998 -0.88005 0.2155 -0.26132 -0.10091 0.079584 -1.2341 -0.65281 0.63363 -0.098491 0.33518 0.26332 -0.96427 -0.01415 0.30849 -0.31418 -0.40793 -0.429 0.085451 -0.20073 0.05505 -0.040922 -0.94015 0.069544 -0.45397 -0.14168 0.92789 0.59058

been -0.12135 0.15341 -0.014315 -0.50695 0.30361 0.080512 0.39152 0.2933 0.035886 -0.15228 0.09502 0.3393 1.2554 0.11321 -0.051129 -0.5700


up 0.21469 0.43367 0.33964 -0.65715 0.15546 0.15318 -0.62081 0.27839 -0.3704 0.0029626 0.37131 0.32756 -0.32802 0.10206 0.52715 -0.33415 -0.012657 0.20382 -0.19846 0.10483 0.72682 0.30136 0.73955 0.2264 0.5213 -0.46339 -0.56209 -0.47684 0.056159 -0.46364 -0.18426 0.15954 0.23868 -0.030124 -0.18315 0.27942 0.031251 -0.16198 -0.18941 0.2571 -0.48811 -0.70303 -0.0055224 -0.63184 -0.17694 0.38916 -0.64778 -0.08909 0.17655 -1.2462 -0.21257 -0.20355 0.11958 1.6196 -0.77112 -2.8367 -0.21148 0.11873 2.1393 0.78805 0.41318 0.97607 -0.67157 0.29821 0.12548 0.10129 0.69104 0.61075 0.58256 0.3346 0.042307 -0.45933 -0.24029 -0.73154 -0.3054 0.19878 -0.34562 0.0035721 -0.57002 0.027172 0.68865 0.4502 -0.62077 0.36449 -1.2001 0.15149 0.58623 0.35867 -0.22877 -0.032302 -0.18218 0.18319 -0.34823 -0.36982 -0.61882 -0.38964 0.0028948 0.046601 0.83004 0.40299

when 0.073242 0.11134 0.62281 -0.35905 -0.70731 0.43756 0.12819 0.13478 0.34282 -0.31661 0.58363 -0.093659 0.5544 -0.038733 0.62641 -0.51071 -0.03


n't 0.15731 0.3953 0.63586 -1.0975 -0.95768 -0.013841 -0.19853 0.25418 0.36731 -0.17486 0.27685 0.31943 0.30078 0.068531 -0.15917 -0.21944 0.064097 0.84745 -0.61989 0.54173 0.27921 0.50383 0.02146 -0.20571 0.077994 0.32229 -0.49183 -1.1411 0.23333 -0.54358 0.092285 0.8686 0.069127 0.19229 0.28374 0.46014 -0.2832 0.45384 0.35209 -0.49173 -0.14771 -0.071767 -0.24355 -0.63089 -0.67797 -0.13164 0.35974 -0.75292 0.038204 -1.7695 0.18893 -0.18872 -0.20268 0.8309 0.07787 -2.6213 0.081941 0.27262 1.6216 0.86166 -0.21582 1.0098 -0.78122 -0.11663 1.0629 0.1583 1.1009 0.70324 -0.60481 -0.45907 0.079862 -0.61794 -0.093896 -0.50363 -0.12217 -0.0017857 -0.032235 -0.1059 -0.69232 0.076485 0.60384 -0.56075 -0.96372 -0.070192 -2.0788 -0.56423 0.17574 -0.024961 -0.45349 -0.39287 -0.080573 -0.37634 0.035083 -0.39662 -0.76165 0.15113 -0.13033 -0.28513 0.19853 0.67464

her 0.3339 -0.52136 0.26848 0.17416 0.15808 0.95567 -0.39404 0.75332 -0.12433 0.64539 -0.12848 0.61024 0.14794 0.56136 -0.11478 -0.23334 0


$ 0.97469 1.2276 0.45377 0.20713 0.54067 0.19638 0.65192 0.96774 -0.61037 0.1568 0.95285 0.20265 -0.68878 -0.51994 0.69407 -1.042 0.24674 0.096724 0.2747 1.1922 0.65161 -0.24445 -0.036481 0.47561 0.15128 -0.056215 -0.19301 -0.77174 -0.37396 -1.1527 0.56227 0.56411 0.11622 -0.20223 -0.20121 1.2015 -0.14958 0.73682 0.45533 0.29635 0.8632 -1.063 -0.58136 -0.67818 -0.39116 0.12867 -0.64317 -0.99771 0.14453 -1.2594 0.27609 -0.38135 0.58074 0.80809 -1.0686 -2.6764 0.25807 -0.7799 2.1359 0.40925 0.40489 0.020735 -0.41791 0.076502 -0.63365 -1.4099 0.49511 0.96748 0.95423 -0.08271 0.11921 0.26844 -0.97633 0.79273 -0.49569 -0.12125 -0.075251 0.61632 -1.9643 0.62209 0.68695 0.40478 0.60071 -0.659 -0.29618 0.64787 0.74807 -0.015422 0.11191 0.039671 0.0087593 0.54317 0.28244 -1.6857 -0.88419 -0.095231 0.70215 0.22091 1.7439 -0.89909

you -0.49886 0.76602 0.89751 -0.78547 -0.6855 0.62609 -0.39655 0.34913 0.33334 -0.45233 0.61223 0.075948 0.22531 0.16365 0.28095 -0.24758 0.0099009 0.71108 -0.75859 0


president -0.064549 -0.13812 0.50017 0.41434 0.45832 -0.048331 -0.21651 0.34987 -0.83235 -0.62282 -0.40099 -0.31978 0.42928 -0.026035 -0.16171 -0.32513 0.77604 -0.39852 -0.67013 -0.55876 -0.39863 -0.28143 0.68614 0.29229 -0.7146 0.36995 -0.45309 -0.23193 0.76916 -0.031589 1.0603 1.5446 -0.13272 0.33701 -0.97561 0.73617 0.60841 0.558 -0.39274 0.1121 -1.0474 -0.03692 0.99908 -0.044925 -0.41186 0.19605 0.093281 -0.59623 -0.60765 -0.4742 -0.30162 -0.70128 -0.20055 0.99519 0.20088 -2.5636 -0.14627 0.77824 1.4913 0.29724 -0.12084 -0.060082 0.079881 -0.38348 0.41371 -0.41284 0.61702 1.1316 -0.047434 0.2848 0.57694 -0.45501 -0.76359 -1.1659 0.068946 -0.22498 0.18691 0.64142 -1.7505 -0.091248 0.85848 -0.52203 -0.38122 -0.43335 -0.73706 -0.51609 -0.13884 0.65128 0.66747 -2.1334 0.93429 0.24036 -0.43398 0.86494 -0.78319 -0.032875 -0.19761 -0.23146 -0.20256 0.1193

only -0.14166 0.62942 0.57058 -0.040109 0.02072 0.45121 0.26174 0.34943 0.1109 -0.36037 0.35116 0.054917 0.35091 -0.16997 0.46228 -0.


so -0.39551 0.5466 0.50315 -0.63682 -0.4547 0.30889 -0.04924 0.27191 0.31562 -0.32879 0.25089 0.14508 0.35136 -0.22793 -0.15894 -0.51527 -0.27978 0.3647 -0.39425 0.33299 0.43051 0.183 0.25095 -0.18547 0.34698 0.055137 -0.45979 -0.82963 -0.018523 -0.36772 0.045566 0.71052 -0.022782 -0.080889 0.20685 0.49855 -0.059794 -0.0080048 -0.23823 -0.33759 -0.24201 -0.23788 -0.0011362 -0.40395 -0.44859 -0.32189 0.48405 -0.027999 0.10148 -0.93585 -0.087522 -0.39959 0.36545 1.3726 -0.30713 -2.594 0.22431 -0.041168 1.7765 0.4001 -0.10996 1.4178 -0.26154 0.18617 0.79328 -0.11709 0.87541 0.43911 0.34711 -0.28515 0.076269 -0.63038 0.16408 -0.37053 0.58485 -0.15472 -0.26382 -0.1859 -0.75228 -0.15752 0.78539 -0.018846 -0.8013 0.15561 -1.8624 -0.16969 0.19419 -0.30683 -0.78067 -0.49689 -0.18256 -0.042016 -0.2629 0.058531 -0.44664 -0.099765 -0.4305 -0.23693 -0.014519 0.31981

them -0.10131 0.10941 0.24065 -0.66767 -0.18687 0.91068 -0.49355 0.22234 -0.00068759 -0.2633 0.35566 0.25153 0.2762 0.25009 0.58282 


while 0.094157 0.46457 0.4535 -0.15074 0.27223 0.4545 -0.14906 0.15345 -0.061775 -0.080787 0.53914 -0.39179 0.083668 -0.10328 0.27425 -0.80995 -0.11588 -0.32288 -0.23434 0.19782 0.47749 0.027463 0.49629 0.41455 0.55198 0.13814 -0.14193 -0.65181 -0.055301 -0.026074 -0.26557 0.16076 -0.32292 -0.10203 0.08234 0.13615 0.27754 0.19405 -0.2348 -0.12201 -0.39889 -0.6782 0.42633 0.21963 -0.20309 0.16836 0.013425 -0.35281 -0.069011 -0.93563 0.16361 -0.13117 0.099808 1.8998 -0.26605 -2.4321 -0.34386 -0.46084 1.3691 0.72702 -0.18504 0.18016 0.085648 0.46807 0.12802 0.28034 0.68951 0.36221 0.66845 0.32295 -0.58005 -0.27069 0.15057 -0.46084 -0.21336 0.36952 -0.23539 0.075712 -0.71302 -0.27551 0.64845 0.10345 -0.64706 0.29101 -1.4154 -0.31586 -0.26086 0.24959 -0.20852 -0.28688 -0.075658 -0.63833 -0.0040848 0.21971 -0.91796 0.271 -0.30677 -0.23741 0.69147 -0.16581

where 0.051044 0.59824 0.31195 -0.066913 -0.29111 0.46091 -0.27781 0.44026 0.044429 -0.22897 0.20414 -0.1458 0.77876 0.28536 0.32293 -0.


just 0.075026 0.39325 0.90314 -0.30451 -0.32768 0.5963 0.22834 0.59028 0.13495 -0.26515 0.77353 0.22579 0.099035 -0.30459 0.66393 -0.33059 -0.23244 0.50205 -0.41178 0.48518 0.81604 0.79918 0.15908 -0.39856 0.30397 0.16379 -0.50475 -0.41057 0.15685 -0.63114 -0.35185 0.65554 0.53268 -0.18448 -0.068132 0.22603 -0.33779 0.17877 0.10681 0.0042481 -0.38168 -0.34329 0.19398 -0.58059 -0.31946 -0.073714 0.48785 -0.18261 0.012377 -1.0071 0.043909 -0.44222 -0.17537 1.3441 -0.64945 -2.9087 -0.35697 -0.013799 1.5677 0.7974 0.099162 1.0972 -0.70742 0.0083953 0.44134 0.085282 0.82608 0.33661 -0.21601 -0.065609 -0.13995 -0.34545 0.045238 -0.29333 0.15719 0.12832 -0.1509 -0.0070848 -0.48914 0.060983 0.40143 0.12505 -0.67478 -0.049925 -1.3447 -0.24093 0.33254 -0.084075 -0.14705 -0.41043 0.181 0.096809 -0.2399 -0.05232 -0.9476 -0.041478 0.02727 -0.18816 0.46636 0.66819

national -0.0033138 0.38946 0.2635 -0.29199 0.38065 0.001211 -0.1323 0.1252 -0.81223 -0.16678 -0.19634 -0.34263 -0.097131 -0.42568 -0.4


any -0.23676 0.15659 0.30243 -0.15578 -0.39025 0.11214 -0.42827 -0.13996 0.5 0.24438 0.078795 0.46397 0.14506 -0.046024 0.37514 0.0081255 -0.36021 0.44383 -0.0091813 0.52886 -0.20502 -0.32799 -0.17999 -0.64124 -0.15088 -0.040234 -0.12477 -0.7817 0.054377 -0.25998 0.030353 0.38581 -0.13027 0.034101 0.35053 0.26333 -0.14078 -0.074158 -0.40511 0.016175 -1.035 0.12603 0.70112 -0.4682 0.10376 -0.075453 0.38597 -0.44495 -0.62957 -1.2194 0.71918 0.27349 0.23366 0.8171 -0.10224 -2.4382 0.40213 -0.34281 2.4308 0.53376 -0.43856 0.65178 -0.63478 0.47431 1.3655 -0.32165 0.88175 0.46126 -0.036273 -0.089717 -0.48678 -0.628 -0.0014089 -0.4462 0.76526 0.18874 -0.28533 -0.71153 -1.5213 0.32432 0.65619 -0.46233 -0.23229 0.52435 -1.6439 0.047064 0.23839 0.41897 0.40535 -0.49672 -0.66878 -0.008549 -0.26385 -0.31069 -0.44691 -0.32531 -0.17917 -0.16095 0.75331 0.29294

through 0.059074 -0.042707 0.2587 -0.18368 0.81468 0.58292 -0.26083 0.82002 -0.23465 -0.33098 0.25085 0.070227 0.24189 0.094269 0.39473 -0.


american 0.38666 0.64827 0.72807 -0.077056 0.1545 -0.19704 0.092145 -1.1485 -0.37113 0.24019 -0.24023 -0.89308 0.12862 0.013445 -0.29047 0.26244 0.87932 -0.18065 -0.44722 0.21253 0.76651 0.17738 0.57841 -0.23391 0.93055 0.16157 0.10065 -1.445 0.58182 -0.1713 -0.42354 0.50944 -0.41707 0.060952 -0.25351 0.08773 -0.078008 0.75344 0.28149 0.43221 -0.95782 -0.48864 -0.43305 0.98063 0.41688 -0.25731 -0.11483 -0.22824 -0.3195 -0.92569 -0.51809 0.11046 0.25846 0.31938 0.031387 -2.1859 -0.18954 -0.3671 2.2003 0.74812 0.36128 0.72895 0.53194 -0.45152 0.55719 -0.91869 -0.0022865 0.8208 0.35648 0.29181 -0.25615 -0.036604 -0.81117 0.16188 -0.046315 0.47251 0.57548 0.17264 -1.2795 -0.54711 0.39105 -0.046598 0.025004 0.93374 -1.113 -0.37478 -0.092246 0.30074 0.22905 -0.91934 -0.26528 -0.26074 0.043642 0.077836 -0.9252 -0.16832 -0.62285 -0.45475 0.39219 0.54088

minister -1.5181 -0.74831 0.26892 0.63476 0.32357 -0.95472 -0.6337 0.13456 0.49422 0.24721 -0.63219 -0.010343 -0.54033 1.1952 0.36358 -0.795


both -0.30687 0.16697 0.0040692 0.016687 0.45926 -0.025039 -0.21391 -0.030337 -0.42455 -0.23867 0.16391 -0.3877 0.4979 0.28546 -0.060268 -0.3068 -0.15175 0.21051 -0.58625 0.36114 0.41502 -0.38151 0.50545 0.24807 0.25556 -0.42775 0.10067 -0.57864 0.11877 0.2288 0.21748 0.75993 -0.22467 -0.087453 0.45775 0.46619 0.4355 0.561 -0.59455 0.21146 -0.53216 -0.40402 0.11441 0.033054 -0.039832 0.11314 0.42284 -0.27464 0.013095 -0.39035 -0.010706 0.28875 0.04262 0.97421 0.067472 -2.3519 0.21347 -0.29792 1.2593 0.65242 -0.29867 0.80374 0.10806 0.41319 0.66098 0.14132 0.57976 0.7575 0.30592 -0.032431 -0.19615 -0.01936 0.18367 -0.40983 0.32454 0.022286 -0.31395 -0.20126 -0.71793 -0.22198 0.822 0.047957 -0.38488 0.17195 -1.6246 -0.080424 -0.33738 0.12957 -0.22077 -0.61825 -0.47566 -0.31097 -0.48041 0.57019 -0.8467 -0.014167 -0.84985 -0.082094 0.70251 0.2309

even -0.15308 0.63194 0.65512 -0.30706 -0.23919 0.137 -0.29819 -0.1408 0.36013 -0.13795 -0.12024 0.006781 0.073785 -0.16167 0.23611 -0.40154 -0


billion 1.3143 1.1977 0.87891 0.086727 0.70483 -0.70591 0.09403 0.10557 -0.13163 0.3701 1.0045 0.86857 -0.96557 -0.059646 -0.052112 -1.2234 -0.0087372 -0.11916 -0.12142 0.45845 0.61894 0.31431 -0.56601 1.3571 0.032826 -0.30287 0.94117 0.63071 -0.70389 -0.79644 0.37281 0.8155 -0.61868 -0.95969 -0.54034 0.77195 0.448 -0.24677 -0.52844 0.71881 0.55858 -1.269 -0.13988 0.71042 -0.10272 0.30446 -0.73999 -0.71953 -1.1492 -1.462 -0.21551 -0.61601 1.6121 0.46553 -0.75174 -2.3178 -0.25878 -1.1629 2.2763 0.33507 0.69034 -0.27817 -0.73657 0.32455 -0.95897 0.47627 -0.77134 0.41595 0.78333 -0.59911 0.42468 0.058197 -1.1176 0.31961 -0.099809 -0.38049 -0.86129 0.86457 -1.6845 1.0779 1.1674 0.2116 -0.37088 0.26164 -0.92719 0.32965 0.3945 -0.57535 -0.090807 -0.15782 0.28291 0.53087 -0.5677 -1.5692 -0.85263 0.51391 0.1043 0.063291 1.1551 -1.1632

work -0.11619 0.45447 -0.69216 0.03458 0.26348 -0.38139 -0.2279 0.37233 -0.20579 0.2902 0.12114 -0.42729 0.55573 -0.094286 -0.49967 -0.29478 0.74109 0.25191 -0


another -0.13669 0.16266 0.32851 -0.23838 0.37632 0.512 0.43825 0.2659 0.10699 0.10075 0.099575 0.23082 0.030345 -0.33396 0.38298 -0.24366 -0.011523 -0.35039 -0.20009 0.46843 0.79308 -0.30159 0.032797 0.28515 0.5024 -0.27093 -0.42409 -0.41382 0.094871 0.072463 -0.55992 0.4625 0.27367 -0.11924 -0.055437 -0.053994 -0.22305 0.19366 0.42554 0.12814 -0.30268 0.11386 0.66691 -0.56826 0.49106 0.35975 0.41939 0.055777 -0.52086 -0.44886 0.058267 -0.082595 0.082818 1.148 -0.44502 -3.0693 -0.38092 -0.22423 1.6572 0.69771 0.057998 0.87766 -0.032617 0.24173 0.72651 -0.24736 0.50189 0.58423 -0.22143 0.18851 -0.46677 -0.022507 -0.21069 -0.086704 0.31232 0.27984 -0.21408 -0.41711 -1.1358 0.018686 0.89265 -0.00067525 -0.26069 -0.14054 -1.1586 -0.32087 0.4694 0.19181 0.1376 -0.55373 0.42719 0.14677 -0.18532 -0.1694 -0.27725 -0.29067 0.17023 -0.1319 0.48349 0.011721

tuesday 0.035314 -0.45705 0.435 0.042954 0.16798 -0.082904 -0.00059971 0.77181 -0.57037 0.042372 -0.4633 0.11997 0.017448 -0.47586 -0.0806


used -0.4713 0.57094 -0.50343 -0.16902 0.207 0.20779 0.087041 0.049987 -0.14483 0.26928 -0.19927 -0.6827 0.27491 0.82178 0.31601 -0.11952 1.043 0.24766 -0.34924 -0.30117 0.40915 -0.34067 0.3716 0.093613 -0.13949 -0.5461 -0.3396 -0.2273 -0.05824 0.11287 0.15304 0.86929 -0.97688 0.22133 0.6068 0.56773 0.0032481 0.19031 0.48249 -0.10228 -0.14569 -0.36808 -0.25728 -0.15778 -0.091743 0.16179 0.2565 -0.57449 -0.22681 -0.89427 -0.22239 0.52927 0.76028 1.3736 -0.17043 -2.0083 -0.51106 -0.26381 1.5273 0.13254 -0.25583 0.92246 0.22393 0.90556 1.0088 -0.16988 0.74343 -0.35974 -0.077833 -0.35517 -0.79085 -0.44397 0.47479 0.096046 0.0029278 -0.30727 0.28722 0.51917 -0.94567 0.35826 0.82026 -0.57482 -1.0386 -0.14005 -1.8042 0.51904 0.6171 -0.1785 -0.18061 0.057552 -0.49046 -0.17216 0.086454 -0.073225 -0.021868 -0.60123 -0.72079 -0.55905 0.7363 -0.069655

much -0.3384 0.6032 0.61412 -0.05686 -0.37309 -0.061981 -0.40583 -0.11304 0.018956 -0.086392 -0.19767 0.30401 -0.17332 -0.58002 0.25126 -0.83477 -


take -0.27064 0.0051896 0.1497 -0.098242 -0.34941 0.053679 -0.49698 0.65251 -0.34078 -0.23466 0.091924 0.4328 -0.05257 0.25661 -0.073174 -0.31834 0.24386 0.52261 -0.64237 0.3446 0.67449 -0.41091 -0.068067 0.11036 -0.31174 -0.1838 -0.32548 -0.56073 0.46353 -0.38417 -0.66699 0.51162 -0.19582 -0.16548 -0.11617 0.40172 -0.27041 0.12839 -0.24684 -0.086713 -0.51182 -0.11955 0.14814 -0.85205 -0.42312 0.27046 -0.19395 -0.1686 -0.022328 -0.79142 -0.13786 0.084995 -0.19315 1.2555 0.0041198 -2.7418 0.083024 -0.20155 1.8789 0.079497 -0.13951 0.83795 -0.28992 0.035695 0.81729 0.25042 0.06956 0.61749 -0.33027 -0.49086 -0.16137 -0.88796 -0.36941 -0.63618 0.11441 0.087835 -0.7473 -0.1224 -0.50284 -0.28315 0.78754 -0.41615 -0.58013 0.0071536 -1.3391 -0.21096 0.51283 0.4465 -0.0473 -0.47531 -0.101 -0.21284 0.26688 -0.43676 -0.5557 -0.0011168 -0.0016866 -0.23098 0.54587 0.49992

very -0.84136 0.30985 0.05817 -0.1282 -0.57563 -0.090958 -0.14138 0.2938 -0.1028 -0.32226 -0.14369 -0.15385 0.27397 -0.41289 -


according -0.068258 -0.047649 0.49786 -0.40922 -0.0051759 0.012998 0.38682 0.16049 0.13969 0.3777 0.12385 -0.32508 0.76584 -0.33999 -0.032049 -0.58342 0.53179 -0.71871 -0.88889 0.095524 0.17448 -0.4459 0.19492 0.71823 -0.37576 -0.61964 0.44474 -0.5949 -0.26213 0.20811 0.13722 0.033131 -0.23305 -0.0027318 -0.46937 0.47767 0.25512 0.53612 0.045807 -0.28083 -0.415 -0.067618 -0.26842 0.36729 -0.081097 -0.52428 -0.2608 -0.85743 -0.76121 -0.56286 0.7719 -0.03485 0.61521 0.67937 0.074163 -2.0818 -0.26734 -0.17753 1.9872 0.52471 -0.23144 0.41671 0.082092 -0.18117 0.56496 0.10529 -0.24687 0.42676 0.90579 0.10628 -0.26557 0.36568 0.19739 -0.34528 0.22259 0.20374 -0.19925 -0.10147 -1.4524 -0.29982 1.029 0.33442 -0.26938 -0.26667 -0.96846 -0.12894 -0.32834 -0.21234 0.13487 -0.081348 0.81305 -0.40582 -0.19017 -0.006391 -0.31896 0.40277 -0.2682 -0.39942 0.18658 -0.47027

several -0.353 0.010869 -0.72551 -0.043148 0.71976 0.46265 -0.069802 0.28948 -0.29837 -0.081974 0.27666 -0.14696 0.39687 0.28386 




UnicodeDecodeError: 'gbk' codec can't decode byte 0x93 in position 5456: illegal multibyte sequence

### NN

In [None]:
# Using Neural Networks and Facebook's Fasttext
earlyStopping=EarlyStopping(monitor='val_loss', patience=0, verbose=0, mode='auto')

# NN
def doAddNN(X_train,X_test,pred_train,pred_test):
    for i in range(6):
        X_train['nn_'+str(i)] = pred_train[:,i]
        X_test['nn_'+str(i)] = pred_test[:,i]
    return X_train,X_test

def initNN(nb_words_cnt,max_len):
    model = Sequential()
    model.add(Embedding(nb_words_cnt,32,input_length=max_len))
    model.add(Dropout(0.3))
    model.add(Conv1D(64,
                     5,
                     padding='valid',
                     activation='relu'))
    model.add(Dropout(0.3))
    model.add(MaxPooling1D())
    model.add(Flatten())
    model.add(Dense(800, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(len(labels), activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
    return model

def doNN(X_train,X_test,Y_train):
    max_len = 70
    nb_words = 10000
    
    print('Processing text dataset')
    texts_1 = []
    for text in X_train['comment_text']:
        texts_1.append(text)

    print('Found %s texts.' % len(texts_1))
    test_texts_1 = []
    for text in X_test['comment_text']:
        test_texts_1.append(text)
    print('Found %s texts.' % len(test_texts_1))
    
    tokenizer = Tokenizer(num_words=nb_words)
    tokenizer.fit_on_texts(texts_1 + test_texts_1)
    sequences_1 = tokenizer.texts_to_sequences(texts_1)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)

    xtrain_pad = pad_sequences(sequences_1, maxlen=max_len)
    xtest_pad = pad_sequences(test_sequences_1, maxlen=max_len)
    del test_sequences_1
    del sequences_1
    nb_words_cnt = min(nb_words, len(word_index)) + 1

    # we need to binarize the labels for the neural net
    
    # ytrain_enc = np_utils.to_categorical(Y_train)
    ytrain_enc = Y_train
        
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros([xtrain_pad.shape[0], len(labels)])
    for dev_index, val_index in kf.split(xtrain_pad):
        dev_X, val_X = xtrain_pad[dev_index], xtrain_pad[val_index]
        dev_y, val_y = ytrain_enc[dev_index], ytrain_enc[val_index]
        model = initNN(nb_words_cnt,max_len)
        model.fit(dev_X, y=dev_y, batch_size=32, epochs=4, verbose=1,validation_data=(val_X, val_y),callbacks=[earlyStopping])
        pred_val_y = model.predict(val_X)
        pred_test_y = model.predict(xtest_pad)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
    return doAddNN(X_train,X_test,pred_train,pred_full_test/5)

train_df,test_df = doNN(train_df,test_df,train_y)
print('NN finished...')

### NN Glove

In [None]:
## NN Glove

def doAddNN_glove(X_train,X_test,pred_train,pred_test):
    for i in range(6):
        X_train['nn_glove_'+str(i)] = pred_train[:,i]
        X_test['nn_glove_'+str(i)] = pred_test[:,i]
    return X_train,X_test

def initNN_glove():
    # create a simple 3 layer sequential neural net
    model = Sequential()

    model.add(Dense(128, input_dim=100, activation='relu'))
    model.add(Dropout(0.3))
    model.add(BatchNormalization())

    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))
    model.add(BatchNormalization())

    model.add(Dense(len(labels)))
    model.add(Activation('softmax'))

    # compile the model
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

def doNN_glove(X_train,X_test,Y_train,xtrain_glove,xtest_glove):
    # scale the data before any neural net:
    scl = preprocessing.StandardScaler()
    #ytrain_enc = np_utils.to_categorical(Y_train)
    ytrain_enc = Y_train
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
    cv_scores = []
    pred_full_test = 0
    xtrain_glove = scl.fit_transform(xtrain_glove)
    xtest_glove = scl.fit_transform(xtest_glove)
    pred_train = np.zeros([xtrain_glove.shape[0], len(labels)])
    
    for dev_index, val_index in kf.split(xtrain_glove):
        dev_X, val_X = xtrain_glove[dev_index], xtrain_glove[val_index]
        dev_y, val_y = ytrain_enc[dev_index], ytrain_enc[val_index]
        model = initNN_glove()
        model.fit(dev_X, y=dev_y, batch_size=32, epochs=10, verbose=1,validation_data=(val_X, val_y),callbacks=[earlyStopping])
        pred_val_y = model.predict(val_X)
        pred_test_y = model.predict(xtest_glove)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
    return doAddNN_glove(X_train,X_test,pred_train,pred_full_test/5)

train_df,test_df = doNN_glove(train_df,test_df,train_y,glove_vecs_train,glove_vecs_test)
print('NN Glove finished...')

### Fast Text

In [None]:
# Fast Text

def doAddFastText(X_train,X_test,pred_train,pred_test):
    for i in range(6):
        X_train['ff_'+str(i)] = pred_train[:,i]
        X_test['ff_'+str(i)] = pred_test[:,i]
    return X_train,X_test


def initFastText(embedding_dims,input_dim):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(len(labels), activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

def preprocessFastText(text):
    text = text.replace("' ", " ' ")
    signs = set(',.:;"?!')
    prods = set(text) & signs
    if not prods:
        return text

    for sign in prods:
        text = text.replace(sign, ' {} '.format(sign) )
    return text

def create_docs(df, n_gram_max=2):
    def add_ngram(q, n_gram_max):
            ngrams = []
            for n in range(2, n_gram_max+1):
                for w_index in range(len(q)-n+1):
                    ngrams.append('--'.join(q[w_index:w_index+n]))
            return q + ngrams
        
    docs = []
    for doc in df['comment_text']:
        doc = preprocessFastText(doc).split()
        docs.append(' '.join(add_ngram(doc, n_gram_max)))
    
    return docs

def doFastText(X_train,X_test,Y_train):
    min_count = 2

    docs = create_docs(X_train)
    tokenizer = Tokenizer(lower=False, filters='')
    tokenizer.fit_on_texts(docs)
    num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])

    tokenizer = Tokenizer(num_words=num_words, lower=False, filters='')
    tokenizer.fit_on_texts(docs)
    docs = tokenizer.texts_to_sequences(docs)

    maxlen = 300

    docs = pad_sequences(sequences=docs, maxlen=maxlen)
    input_dim = np.max(docs) + 1
    embedding_dims = 20

    # we need to binarize the labels for the neural net
    #ytrain_enc = np_utils.to_categorical(Y_train)
    ytrain_enc = Y_train

    docs_test = create_docs(X_test)
    docs_test = tokenizer.texts_to_sequences(docs_test)
    docs_test = pad_sequences(sequences=docs_test, maxlen=maxlen)
    xtrain_pad = docs
    xtest_pad = docs_test
    
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros([xtrain_pad.shape[0], len(labels)])
    for dev_index, val_index in kf.split(xtrain_pad):
        dev_X, val_X = xtrain_pad[dev_index], xtrain_pad[val_index]
        dev_y, val_y = ytrain_enc[dev_index], ytrain_enc[val_index]
        model = initFastText(embedding_dims,input_dim)
        model.fit(dev_X, y=dev_y, batch_size=32, epochs=25, verbose=1,validation_data=(val_X, val_y),callbacks=[earlyStopping])
        pred_val_y = model.predict(val_X)
        pred_test_y = model.predict(docs_test)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
    return doAddFastText(X_train,X_test,pred_train,pred_full_test/5)

train_df,test_df = doFastText(train_df,test_df,train_y)
print('FastText finished...')

In [None]:
cols_to_drop = ['comment_text', 'split']
train_X = train_df.drop(cols_to_drop, axis=1)
test_X = test_df.drop(cols_to_drop, axis=1)

## MNB & SVD

In [None]:
### Fit transform the tfidf vectorizer ###
tfidf_vec = TfidfVectorizer(stop_words='english', ngram_range=(1,3))
full_tfidf = tfidf_vec.fit_transform(train_df['comment_text'].values.tolist() + test_df['comment_text'].values.tolist())
train_tfidf = tfidf_vec.transform(train_df['comment_text'].values.tolist())
test_tfidf = tfidf_vec.transform(test_df['comment_text'].values.tolist())

def runMNB(train_X, train_y, test_X, test_y, test_X2):
    model = naive_bayes.MultinomialNB()
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)
    pred_test_y2 = model.predict_proba(test_X2)

    return pred_test_y, pred_test_y2, model

cv_scores = []
pred_full_test = np.zeros([test_df.shape[0], len(labels)])
pred_train = np.zeros([train_df.shape[0], len(labels)])
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)

for dev_index, val_index in kf.split(train_X):
    dev_X, val_X = train_tfidf[dev_index], train_tfidf[val_index]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    dev_X, dev_y
    for i, j in enumerate(labels):
        pred_val_y, pred_test_y, model = runMNB(dev_X, dev_y[:,i], val_X, val_y[:,i], test_tfidf)
        pred_test_y[:, 0]
        pred_full_test[:, i] = pred_full_test[:,i] + pred_test_y[:,0] # FIXME
        pred_train[val_index,i] = pred_val_y[:,0]
        cv_scores.append(metrics.log_loss(val_y[:,i], pred_val_y[:,0]))
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5.

In [None]:
n_comp = 20
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
svd_obj.fit(full_tfidf)
train_svd = pd.DataFrame(svd_obj.transform(train_tfidf))
test_svd = pd.DataFrame(svd_obj.transform(test_tfidf))
    
train_svd.columns = ['svd_word_'+str(i) for i in range(n_comp)]
test_svd.columns = ['svd_word_'+str(i) for i in range(n_comp)]
train_df = pd.concat([train_df, train_svd], axis=1)
test_df = pd.concat([test_df, test_svd], axis=1)
del full_tfidf, train_tfidf, test_tfidf, train_svd, test_svd

In [None]:
### Fit transform the count vectorizer ###
tfidf_vec = CountVectorizer(stop_words='english', ngram_range=(1,3))
tfidf_vec.fit(train_df['comment_text'].values.tolist() + test_df['comment_text'].values.tolist())
train_tfidf = tfidf_vec.transform(train_df['comment_text'].values.tolist())
test_tfidf = tfidf_vec.transform(test_df['comment_text'].values.tolist())

In [None]:
cv_scores = []
pred_full_test = np.zeros([test_df.shape[0], len(labels)])
pred_train = np.zeros([train_df.shape[0], len(labels)])
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)

for dev_index, val_index in kf.split(train_X):
    dev_X, val_X = train_tfidf[dev_index], train_tfidf[val_index]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    dev_X, dev_y
    for i, j in enumerate(labels):
        pred_val_y, pred_test_y, model = runMNB(dev_X, dev_y[:,i], val_X, val_y[:,i], test_tfidf)
        pred_test_y[:, 0]
        pred_full_test[:, i] = pred_full_test[:,i] + pred_test_y[:,0] # FIXME
        pred_train[val_index,i] = pred_val_y[:,0]
        cv_scores.append(metrics.log_loss(val_y[:,i], pred_val_y[:,0]))
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5.


# add the predictions as new features #
for i in range(6):
    train_df["nb_cvec_"+str(i)] = pred_train[:,i]
    test_df["nb_cvec_"+str(i)] = pred_full_test[:,i]
print("Naive Bayesian Count Vector finished...")

In [None]:
### Fit transform the tfidf vectorizer ###
tfidf_vec = CountVectorizer(ngram_range=(1,7), analyzer='char')
tfidf_vec.fit(train_df['comment_text'].values.tolist() + test_df['comment_text'].values.tolist())
train_tfidf = tfidf_vec.transform(train_df['comment_text'].values.tolist())
test_tfidf = tfidf_vec.transform(test_df['comment_text'].values.tolist())

In [None]:
cv_scores = []
pred_full_test = np.zeros([test_df.shape[0], len(labels)])
pred_train = np.zeros([train_df.shape[0], len(labels)])
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)

for dev_index, val_index in kf.split(train_X):
    dev_X, val_X = train_tfidf[dev_index], train_tfidf[val_index]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    dev_X, dev_y
    for i, j in enumerate(labels):
        pred_val_y, pred_test_y, model = runMNB(dev_X, dev_y[:,i], val_X, val_y[:,i], test_tfidf)
        pred_test_y[:, 0]
        pred_full_test[:, i] = pred_full_test[:,i] + pred_test_y[:,0] # FIXME
        pred_train[val_index,i] = pred_val_y[:,0]
        cv_scores.append(metrics.log_loss(val_y[:,i], pred_val_y[:,0]))
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5.


# add the predictions as new features #
for i in range(6):
    train_df["nb_cvec_char_"+str(i)] = pred_train[:,i]
    test_df["nb_cvec_char_"+str(i)] = pred_full_test[:,i]
print("Naive Bayersian Count Vector Char finished...")

In [None]:
### Fit transform the tfidf vectorizer ###
tfidf_vec = TfidfVectorizer(ngram_range=(1,5), analyzer='char')
full_tfidf = tfidf_vec.fit_transform(train_df['comment_text'].values.tolist() + test_df['comment_text'].values.tolist())
train_tfidf = tfidf_vec.transform(train_df['comment_text'].values.tolist())
test_tfidf = tfidf_vec.transform(test_df['comment_text'].values.tolist())

In [None]:
cv_scores = []
pred_full_test = np.zeros([test_df.shape[0], len(labels)])
pred_train = np.zeros([train_df.shape[0], len(labels)])
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)

for dev_index, val_index in kf.split(train_X):
    dev_X, val_X = train_tfidf[dev_index], train_tfidf[val_index]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    dev_X, dev_y
    for i, j in enumerate(labels):
        pred_val_y, pred_test_y, model = runMNB(dev_X, dev_y[:,i], val_X, val_y[:,i], test_tfidf)
        pred_test_y[:, 0]
        pred_full_test[:, i] = pred_full_test[:,i] + pred_test_y[:,0] # FIXME
        pred_train[val_index,i] = pred_val_y[:,0]
        cv_scores.append(metrics.log_loss(val_y[:,i], pred_val_y[:,0]))
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5.

# add the predictions as new features #
for i in range(6):
    train_df["nb_tfidf_char_"+str(i)] = pred_train[:,i]
    test_df["nb_tfidf_char_"+str(i)] = pred_full_test[:,i]
print("Naive Bayersian TFIDF Vector Char finished...")

In [None]:
n_comp = 20
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
svd_obj.fit(full_tfidf)
train_svd = pd.DataFrame(svd_obj.transform(train_tfidf))
test_svd = pd.DataFrame(svd_obj.transform(test_tfidf))
    
train_svd.columns = ['svd_char_'+str(i) for i in range(n_comp)]
test_svd.columns = ['svd_char_'+str(i) for i in range(n_comp)]
train_df = pd.concat([train_df, train_svd], axis=1)
test_df = pd.concat([test_df, test_svd], axis=1)
del full_tfidf, train_tfidf, test_tfidf, train_svd, test_svd

## XGB

In [None]:
def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None, seed_val=0, child=1, colsample=0.3):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.1
    param['max_depth'] = 3
    param['silent'] = 1
    param['num_class'] = len(labels)
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = child
    param['subsample'] = 0.8
    param['colsample_bytree'] = colsample
    param['seed'] = seed_val
    num_rounds = 2000

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest, ntree_limit = model.best_ntree_limit)
    if test_X2 is not None:
        xgtest2 = xgb.DMatrix(test_X2)
        pred_test_y2 = model.predict(xgtest2, ntree_limit = model.best_ntree_limit)
    return pred_test_y, pred_test_y2, model

In [None]:
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train_df.shape[0], len(labels)])
for dev_index, val_index in kf.split(train_X):
    dev_X, val_X = train_X.loc[dev_index], train_X.loc[val_index]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    pred_val_y, pred_test_y, model = runXGB(dev_X, dev_y, val_X, val_y, test_X, seed_val=0, colsample=0.7)
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index,:] = pred_val_y
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))
print("cv scores : ", cv_scores)

pred_full_test /= 5.0

## To CSV

In [None]:
out_df = pd.DataFrame(pred_full_test)
out_df.columns = labels
out_df.insert(0, 'id', test_id)
out_df.to_csv("../output/result.csv", index=False)

In [None]:
file_name = '../output/train_df.pkl'
train_df.to_pickle(file_name)  
train_df = pd.read_pickle(file_name)
file_name = '../output/test_df.pkl'
test_df.to_pickle(file_name)
test_df = pd.read_pickle(file_name)