# Toxic Comment Classification Challenge

In [1]:
import numpy as np
import pandas as pd
from keras_tqdm import TQDMNotebookCallback

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import codecs
from tqdm import tqdm

from nltk.tokenize import RegexpTokenizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Conv1D, Conv2D, MaxPooling1D, GlobalMaxPool1D, Bidirectional, GlobalMaxPooling1D
from keras.layers import LSTM, GRU, Dropout , BatchNormalization, Embedding, Flatten, GlobalAveragePooling1D, concatenate, Input


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Read train data
train = pd.read_csv('train.csv')
train.dropna(inplace=True)

list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

y_train = train[list_classes]

In [3]:
# Read test data
test = pd.read_csv('test.csv')
test.dropna(inplace=True)

In [4]:
# Create tools to preprocess data
# we will remove english stop words from text as well as punctuation
porter = PorterStemmer()
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])

In [5]:
# the maximum number of words considered is 100000
MAX_NB_WORDS = 100000
# the size of the sentences will be 250
max_seq_len = 250

raw_docs_train = train['comment_text'].tolist()
raw_docs_test = test['comment_text'].tolist()

num_classes = len(list_classes)

tokenizer = RegexpTokenizer(r'\w+')

print("pre-processing train data...")
processed_docs_train = []
for doc in tqdm(raw_docs_train):
    tokens = tokenizer.tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    processed_docs_train.append(" ".join(filtered))

print("pre-processing test data...")
processed_docs_test = []
for doc in tqdm(raw_docs_test):
    tokens = tokenizer.tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    processed_docs_test.append(" ".join(filtered))


print("tokenizing input data...")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
tokenizer.fit_on_texts(processed_docs_train + processed_docs_test)  #leaky
word_seq_train = tokenizer.texts_to_sequences(processed_docs_train)
word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)
word_index = tokenizer.word_index
print("dictionary size: ", len(word_index))

#pad sequences
word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len)
word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_seq_len)

print("Done !!")

pre-processing train data...


100%|████████████████████████████████████████████████████████████████████| 159571/159571 [00:05<00:00, 31835.40it/s]


pre-processing test data...


100%|████████████████████████████████████████████████████████████████████| 153164/153164 [00:04<00:00, 33923.02it/s]


tokenizing input data...
dictionary size:  348520
Done !!


In [6]:
embed_dim = 300

#load embeddings
print('loading first word embeddings...')
embeddings_index = {}
f = codecs.open('glove.840B.300d.txt', encoding='utf-8')

for line in tqdm(f):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('found %s word vectors' % len(embeddings_index))

#embedding matrix
print('preparing embedding matrix...')
words_not_found = []
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix_glove = np.zeros((nb_words, embed_dim))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros.
        embedding_matrix_glove[i] = embedding_vector
    else:
        words_not_found.append(word)
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix_glove, axis=1) == 0))

loading word embeddings...


2196018it [04:45, 7700.72it/s]


found 2196016 word vectors
preparing embedding matrix...
number of null word embeddings: 23510


In [8]:
print('loading second word embeddings...')
EMBEDDING_FILE = 'crawl-300d-2M.vec'

def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE, encoding="utf8"))

word_index = tokenizer.word_index
embedding_matrix_crawl = np.zeros((nb_words, embed_dim))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix_crawl[i] = embedding_vector


del embeddings_index

In [9]:
# Creating model

inp = Input(shape=(max_seq_len, ))
emb_glove = Embedding(nb_words, 300, 
                weights=[embedding_matrix_glove], input_length = max_seq_len, trainable=False)(inp)

emb_crawl = Embedding(nb_words, 300,
                weights=[embedding_matrix_crawl], input_length = max_seq_len, trainable=False)(inp)

conc1 = concatenate([emb_glove, emb_crawl])
x = Bidirectional(LSTM(400, return_sequences=True))(conc1)
x = Bidirectional(GRU (400, return_sequences=True))(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
out = concatenate([avg_pool, max_pool])

out = Dense(200, activation="relu")(out)
out = Dense(y_train.shape[1], activation="sigmoid")(out)

model = Model(inputs=inp, outputs=out)

model.compile(loss='binary_crossentropy', optimizer = keras.optimizers.Adam(lr=0.001),
              metrics=['accuracy'])

In [10]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 250)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 250, 300)     30000000    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 250, 300)     30000000    input_1[0][0]                    
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 250, 600)     0           embedding_1[0][0]                
                                                                 embedding_2[0][0]                
__________

In [11]:
model.fit(word_seq_train, y_train, epochs=6, batch_size=80, shuffle=True, validation_split=0.1,verbose = 2)

Train on 143613 samples, validate on 15958 samples
Epoch 1/6
 - 2629s - loss: 0.0484 - acc: 0.9816 - val_loss: 0.0485 - val_acc: 0.9807
Epoch 2/6
 - 2547s - loss: 0.0389 - acc: 0.9844 - val_loss: 0.0410 - val_acc: 0.9838
Epoch 3/6
 - 2547s - loss: 0.0350 - acc: 0.9858 - val_loss: 0.0409 - val_acc: 0.9839
Epoch 4/6
 - 2544s - loss: 0.0354 - acc: 0.9858 - val_loss: 0.0453 - val_acc: 0.9826
Epoch 5/6
 - 2543s - loss: 0.0333 - acc: 0.9866 - val_loss: 0.0483 - val_acc: 0.9830
Epoch 6/6
 - 2549s - loss: 0.0307 - acc: 0.9875 - val_loss: 0.0467 - val_acc: 0.9834


<keras.callbacks.History at 0x2510b1d4048>

In [12]:
y_test = model.predict(word_seq_test)

In [13]:
sample_submission = pd.read_csv("sample_submission.csv")

sample_submission[list_classes] = y_test
sample_submission.to_csv("results.csv", index=False)