In [1]:
import sys, os, re, csv, codecs
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding
from keras.layers import Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.initializers import Constant
import gc
import gensim.models.keyedvectors as word2vec

Using TensorFlow backend.


In [2]:
# Read the labelled training split and the unlabelled test split from disk.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('jigsaw/train.csv', 'jigsaw/test.csv')
)

In [3]:
# Label columns of `train`; their order fixes the column order of `y`.
list_classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
# One row per training comment, one column per label above.
y = train.loc[:, list_classes].values

In [4]:
# Pull the raw comment text column out of each split.
list_sentences_train, list_sentences_test = (
    frame['comment_text'] for frame in (train, test)
)

In [5]:
# Tokenizer: the vocabulary is fitted on the training comments only;
# `max_num_words` is passed to Keras as the vocabulary cap.
max_num_words = 20000
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(list_sentences_train)

In [6]:
# Convert each comment into a list of integer word indices using the fitted tokenizer.
list_tokenized_train, list_tokenized_test = map(
    tokenizer.texts_to_sequences,
    (list_sentences_train, list_sentences_test),
)

In [7]:
# Every sequence is padded/truncated to exactly `maxlen` indices.
maxlen = 200
x_train, x_test = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (list_tokenized_train, list_tokenized_test)
)

In [8]:
def _read_text_vectors(path, wanted, skip_header=False):
    """Parse a whitespace-separated text embedding file into ``{word: vector}``.

    Only words contained in ``wanted`` are parsed and kept, so memory use is
    bounded by the tokenizer vocabulary rather than by the embedding file.
    """
    vectors = {}
    # `with` guarantees the handle is closed even if a line fails to parse.
    # newline='\n' keeps stray '\r' characters from being treated as line breaks.
    with open(path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        if skip_header:
            f.readline()  # first line is a header, not a vector
        for line in f:
            values = line.rstrip().split(' ')
            word = values[0]
            if word in wanted:
                vectors[word] = np.asarray(values[1:], dtype='float32')
    return vectors


def loadEmbeddingMatrix(typeToLoad):
    """Build the embedding matrix for the fitted tokenizer's vocabulary.

    Reads the module-level ``tokenizer`` and ``max_num_words``.

    Parameters
    ----------
    typeToLoad : str
        One of ``'glove'``, ``'word2vec'`` or ``'fasttext'``; selects the
        pre-trained embedding file and its vector size.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(min(len(tokenizer.word_index), max_num_words) + 1,
        embed_size)``. Row ``i`` holds the pre-trained vector of the word with
        tokenizer index ``i``; rows for words missing from the embedding file
        (and row 0) stay all-zero.

    Raises
    ------
    ValueError
        If ``typeToLoad`` is not one of the supported embedding types.
    """
    # (file path, vector size) for every supported embedding type.
    sources = {
        'glove': ('wordEmbedding/glove.twitter.27B.25d.txt', 25),
        'word2vec': ('wordEmbedding/GoogleNews-vectors-negative300.bin', 300),
        'fasttext': ('wordEmbedding/wiki-news-300d-1M.vec', 300),
    }
    if typeToLoad not in sources:
        # Fail early with a clear message instead of an unrelated
        # unbound-name error further down.
        raise ValueError(
            'Unknown embedding type %r; expected one of %s'
            % (typeToLoad, sorted(sources))
        )
    embedding_file, embed_size = sources[typeToLoad]

    # NOTE(review): Keras' Tokenizer(num_words=N) appears to emit only indices
    # below N, so the last row may never be looked up -- confirm before
    # shrinking the matrix (the model's parameter count depends on this shape).
    nb_words = min(len(tokenizer.word_index), max_num_words) + 1

    # Only these words can end up in the matrix, so only their vectors are loaded.
    wanted = {
        word for word, i in tokenizer.word_index.items() if i <= max_num_words
    }

    if typeToLoad == 'word2vec':
        word2vecDict = word2vec.KeyedVectors.load_word2vec_format(embedding_file, binary = True)
        # Membership test and item access on KeyedVectors avoid the
        # `.wv.vocab` attribute and skip copying the full vocabulary.
        embedding_index = {
            word: word2vecDict[word] for word in wanted if word in word2vecDict
        }
        del word2vecDict
    else:
        embedding_index = _read_text_vectors(
            embedding_file,
            wanted,
            skip_header = (typeToLoad == 'fasttext'),
        )

    embedding_matrix = np.zeros((nb_words, embed_size))
    gc.collect()
    for word, i in tokenizer.word_index.items():
        if i > max_num_words:
            continue
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    del embedding_index
    gc.collect()
    return embedding_matrix

In [9]:
# Build the pre-trained embedding matrix from the fastText vectors.
embedding_matrix = loadEmbeddingMatrix(typeToLoad='fasttext')

In [10]:
# Rows = vocabulary slots, columns = embedding dimensionality.
num_words = embedding_matrix.shape[0]
embed_size = embedding_matrix.shape[1]

In [11]:
# Input: one padded sequence of `maxlen` word indices per sample.
inp = Input(shape=(maxlen,))

# Embedding initialised from the pre-trained matrix and kept frozen.
x = Embedding(
    num_words,
    embed_size,
    embeddings_initializer=Constant(embedding_matrix),
    trainable=False,
)(inp)

# Bidirectional LSTM returning the full sequence, max-pooled over time.
x = Bidirectional(
    LSTM(
        60,
        return_sequences=True,
        name='lstm_layer',
        dropout=0.1,
        recurrent_dropout=0.1,
    )
)(x)
x = GlobalMaxPool1D()(x)

# Small dense head with dropout before and after the hidden layer.
x = Dropout(0.1)(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.1)(x)

# Six independent sigmoid outputs.
x = Dense(6, activation='sigmoid')(x)

model = Model(inp, x)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 300)          6000300   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200, 120)          173280    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 120)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 120)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                6050      
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
__________

In [13]:
# Training hyper-parameters; `batch_size` is reused by the prediction cell.
batch_size = 32
epochs = 4
fit_options = dict(batch_size=batch_size, epochs=epochs, validation_split=0.1)
model.fit(x_train, y, **fit_options)

Train on 143613 samples, validate on 15958 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fad5d0e22e8>

In [26]:
# Score the whole test set in a single call. Keras batches internally via
# `batch_size`, so the hand-rolled `while True` slicing loop is unnecessary
# and only added per-call overhead for every 32-row slice.
num_test = x_test.shape[0]
prediction = np.asarray(
    model.predict(x_test, batch_size=batch_size),
    dtype=np.float64,  # same dtype as the np.zeros buffer used previously
)
# One row per test comment, one column per label.
assert prediction.shape == (num_test, 6)

In [33]:
# Hard 0/1 labels: a probability strictly above 0.5 counts as positive.
a = np.greater(prediction, 0.5)
res = a.astype(np.int8)

In [44]:
# Test-set ids as a plain array, used as the submission's `id` column.
docid = np.asarray(test.loc[:, 'id'])

In [50]:
# Assemble the submission table: the id column followed by one probability
# column per label, taken from the matching column of `prediction`.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
submission_columns = {'id': docid}
for column_index, label in enumerate(label_columns):
    submission_columns[label] = prediction[:, column_index]
pre = pd.DataFrame(submission_columns)

In [51]:
# Write the submission as comma-separated values without the row index.
pre.to_csv(
    "submission.csv",
    sep=',',
    index=False,
)