In [3]:
%matplotlib inline

In [51]:
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.models import load_model, Sequential
from keras.layers import BatchNormalization, Conv1D, Dense, Dropout, Embedding, Flatten, GlobalMaxPool1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences 

from sklearn.model_selection import train_test_split

In [6]:
# Vocabulary cap handed to the Keras Tokenizer as `num_words`.
VOCAB_SIZE = 50000
# Length passed to `pad_sequences` for every tokenised comment.
MAX_TEXT_LEN = 75
# Number of columns in the embedding matrix built from the fastText vectors.
EMBEDDING_DIM = 300

In [7]:
# Load the labelled training comments. Later cells read the `comment_text`
# column and the six label columns from this frame.
train_df = pd.read_csv("train.csv")

In [8]:
# Row count of the training frame (sanity check on the load).
len(train_df)

95851

In [9]:
# Keras tokenizer; `num_words=VOCAB_SIZE` caps the vocabulary it uses when
# texts are later converted to integer sequences.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)

In [10]:
# First 10,000 rows of the training frame.
# NOTE(review): `train` is not referenced by any later cell visible here — the
# tokenizer, X and y are all built from the full `train_df`. Confirm whether
# this subset is still needed.
train = train_df[:10000]

In [11]:
# Fit the tokenizer vocabulary on every training comment. Comments are cast to
# str first, exactly as create_padded_tokens does, so a non-string value (e.g.
# a NaN produced by an empty CSV field) cannot crash the tokenizer.
tokenizer.fit_on_texts(train_df["comment_text"].astype(str).tolist())

# word -> integer index as assigned by the tokenizer, plus the reverse lookup.
word_2_index = tokenizer.word_index
index_2_word = {ix: word for word, ix in word_2_index.items()}

In [40]:
def create_padded_tokens(df):
    """Convert the ``comment_text`` column of ``df`` into padded index sequences.

    Args:
        df: DataFrame with a ``comment_text`` column. Values are cast to ``str``
            before tokenisation.

    Returns:
        The result of ``pad_sequences`` applied to the tokenised comments with
        ``maxlen=MAX_TEXT_LEN``.
    """
    texts = df["comment_text"].astype(str)
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=MAX_TEXT_LEN)
    

Load the fastText weights from the downloaded file.

In [13]:
# Parse the fastText .vec text file: space-separated, no header row, first line
# skipped, and the first field (the word) used as the index. QUOTE_NONE stops
# quote characters inside tokens from being interpreted as CSV quoting.
fasttext_weights = pd.read_csv(
    "fasttext/fasttext.vec",
    sep=" ",
    header=None,
    index_col=0,
    skiprows=1,
    quoting=csv.QUOTE_NONE,
)

In [14]:
# Reading the CSV puts an empty column at the end. Keep only the first
# EMBEDDING_DIM value columns instead of dropping "the last column" by label:
# a positional slice is idempotent, so re-running this cell can never remove a
# real embedding dimension (the previous in-place drop removed one more column
# on every re-run).
fasttext_weights = fasttext_weights.iloc[:, :EMBEDDING_DIM]

In [15]:
# One row per tokenizer index (plus row 0, which no word maps to here), one
# column per embedding dimension. Rows stay all-zero for words that have no
# fastText vector.
vocab_rows = len(index_2_word) + 1
embedding_weights = np.zeros((vocab_rows, EMBEDDING_DIM))

for word, index in word_2_index.items():
    if word not in fasttext_weights.index:
        continue
    embedding_weights[index] = fasttext_weights.loc[word]


In [17]:
# Model inputs: padded token-index sequences for every training comment.
X = create_padded_tokens(train_df)

In [18]:
# Multi-label target matrix: one row per comment, one column per label, in the
# same column order as before. Selecting the columns in one vectorised
# operation replaces six `.iloc` row lookups per training row.
y = train_df[
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
].values

In [19]:
# Hold out 5% of the rows for validation. A fixed random_state makes the split
# (and therefore the reported validation metrics) reproducible across kernel
# restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.05, random_state=42
)

In [20]:
# 1D-CNN multi-label classifier on top of frozen, pre-trained fastText
# embeddings.
model = Sequential()

# Initialise the embedding with the fastText matrix built earlier. Previously
# `embedding_weights` was never passed in, so with trainable=False the layer
# kept its random initial values. Sizes come from the shared constants rather
# than repeated literals.
embedding = Embedding(
    len(word_2_index) + 1,  # same row count as embedding_weights
    EMBEDDING_DIM,
    weights=[embedding_weights],
    input_length=MAX_TEXT_LEN,
    trainable=False,
)

model.add(embedding)
# NOTE(review): Conv1D and the two hidden Dense layers are created without an
# activation argument — confirm that this is intended.
model.add(Conv1D(32, 5))
model.add(GlobalMaxPool1D())
model.add(Dropout(.3))
# The former input_shape=(10,) arguments on the Dense layers were dropped: the
# recorded summary shows dense_1 with 16,896 parameters (32 * 512 + 512), i.e.
# the layers were already sized from the preceding layer's output, not from 10.
model.add(Dense(512))
model.add(BatchNormalization())
model.add(Dropout(.3))
model.add(Dense(1024))
model.add(BatchNormalization())
# One sigmoid unit per label (six labels).
model.add(Dense(6, activation="sigmoid"))

In [21]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 75, 300)           45956700  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 71, 32)            48032     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 32)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               16896     
_________________________________________________________________
batch_normalization_1 (Batch (None, 512)               2048      
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
__________

In [22]:
# Binary cross-entropy loss with the Adam optimiser, reporting accuracy; the
# model's final layer has six sigmoid units.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [23]:
# Train for 3 epochs, validating on the held-out split. The validation inputs
# and labels are converted with np.array just like the training ones, so Keras
# always receives ndarrays rather than a nested Python list of labels.
model.fit(
    x=np.array(X_train),
    y=np.array(y_train),
    validation_data=(np.array(X_test), np.array(y_test)),
    epochs=3,
)

Train on 91058 samples, validate on 4793 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x125ce4198>

In [32]:
# Persist the trained model to disk under saved_models/.
model.save("saved_models/fasttext_conv_3ep_20180107")

In [53]:
# Reload the model from the same path it was saved to, replacing the in-memory
# `model`; this lets the prediction cells run without retraining.
model = load_model("saved_models/fasttext_conv_3ep_20180107")

In [34]:
# Load the unlabelled test comments, keyed by their `id` column.
submission = pd.read_csv("test.csv").set_index("id")

In [64]:
# Tokenise and pad the test comments with the same helper used for training.
X_submit = create_padded_tokens(submission)

In [79]:
# Model outputs for every test comment: one value per unit of the final
# 6-unit sigmoid layer.
preds = model.predict(X_submit)

In [83]:
# Label columns of the submission file, in the same order as the columns used
# to build the training targets `y`.
pred_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# Rebind `submission` to a frame holding only the label columns (same `id`
# index), then fill those columns with the model predictions.
# NOTE(review): after this rebinding `submission` no longer has a
# `comment_text` column, so the create_padded_tokens(submission) cell cannot be
# re-run without first reloading test.csv.
submission = submission.reindex(columns=pred_columns)
submission[pred_columns] = preds

In [85]:
# Write the predictions (index plus the six label columns) to the submission CSV.
submission.to_csv("submission_1_20180107.csv")

In [86]:
# Sanity check: (number of test rows, number of label columns).
submission.shape

(226998, 6)