In [1]:
%matplotlib inline

In [39]:
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import BatchNormalization, Conv1D, Dense, Dropout, Embedding, Flatten, GlobalMaxPool1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences 

from sklearn.model_selection import train_test_split

In [3]:
VOCAB_SIZE = 50000
MAX_TEXT_LEN = 75
EMBEDDING_DIM = 300

In [4]:
train_df = pd.read_csv("train.csv")

In [5]:
len(train_df)

95851

In [6]:
tokenizer = Tokenizer(num_words=VOCAB_SIZE)

In [7]:
train = train_df[:10000]

In [47]:
tokenizer.fit_on_texts(list(train_df["comment_text"]))

word_2_index = tokenizer.word_index
index_2_word = {ix: word for word, ix in word_2_index.items()}

In [48]:
tokens = tokenizer.texts_to_sequences(train_df["comment_text"])

Load fast text weights from downloaded file

In [49]:
fasttext_weights = pd.read_csv("fasttext/fasttext.vec", sep=" ", skiprows=1,
                               quoting=csv.QUOTE_NONE, header=None, index_col=0
                               )

In [51]:
# Reading CSV puts an empty column at the end. Drop that column.
fasttext_weights.drop(labels=len(fasttext_weights.columns), axis=1, inplace=True)

In [52]:
embedding_weights = np.zeros((len(index_2_word) + 1, EMBEDDING_DIM))

for word, index in word_2_index.items():
    if word in fasttext_weights.index:
        embedding_weights[index] = fasttext_weights.loc[word]


In [53]:
X = pad_sequences(tokens, 75)

In [54]:
y = [[train_df.iloc[row]["toxic"], train_df.iloc[row]["severe_toxic"],
                     train_df.iloc[row]["obscene"], train_df.iloc[row]["threat"], 
                     train_df.iloc[row]["insult"], train_df.iloc[row]["identity_hate"]
                     ]for row in range(len(train_df))]

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15)

In [56]:
model = Sequential()

embedding = Embedding(len(word_2_index) + 1, 300, input_length=75, trainable=False)

model.add(embedding)
model.add(Conv1D(32, 5))
model.add(GlobalMaxPool1D())
model.add(Dropout(.3))
model.add(Dense(512, input_shape=(10,)))
model.add(BatchNormalization())
model.add(Dropout(.3))
model.add(Dense(1024, input_shape=(10,)))
model.add(BatchNormalization())
model.add(Dense(6, activation="sigmoid"))

In [57]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 75, 300)           45956700  
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 71, 32)            48032     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 32)                0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 512)               16896     
_________________________________________________________________
batch_normalization_5 (Batch (None, 512)               2048      
_________________________________________________________________
dropout_7 (Dropout)          (None, 512)               0         
__________

In [58]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [59]:
model.fit(x=np.array(X_train), y=np.array(y_train), validation_data=(X_test, y_test), epochs=10)

Train on 81473 samples, validate on 14378 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x19be5d5f8>

In [63]:
X_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,  717,    1, 1039,   56,   16,  517,   15, 2602,    3,
        463,  143,  354,    2,   16, 3018,    1,  239,    8,  332,    1,
       1026,    3,   11,   17,    5, 8606,    8,  366,  243,   26,   41,
          8,   45,  123,    2,  313,   11,   15,  145, 2602], dtype=int32)

In [75]:
submission = pd.read_csv("test.csv", index="id")
submission.set_index("id", inplace=True)

TypeError: parser_f() got an unexpected keyword argument 'index'

In [None]:
submission_tokens = tokenizer.texts_to_sequences(submission["comment_text"].astype(str))
submission_X = pad_sequences(submission_tokens)

In [None]:
submission_preds = model.fit(submission_X[0])