In [1]:
%matplotlib inline

In [16]:
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.models import load_model, Sequential
from keras.layers import BatchNormalization, Conv1D, Dense, Dropout, Embedding, Flatten, GRU, MaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences 

from sklearn.model_selection import train_test_split

In [3]:
VOCAB_SIZE = 25000
MAX_TEXT_LEN = 100
EMBEDDING_DIM = 128

In [4]:
train_df = pd.read_csv("train.csv")

In [5]:
len(train_df)

95851

In [6]:
tokenizer = Tokenizer(num_words=VOCAB_SIZE)

In [7]:
tokenizer.fit_on_texts(list(train_df["comment_text"].fillna("MISSINGVALUE").values))

word_2_index = tokenizer.word_index
index_2_word = {ix: word for word, ix in word_2_index.items()}

In [8]:
def create_padded_tokens(df):
    comment_text = df["comment_text"].astype(str)
    tokens = tokenizer.texts_to_sequences(comment_text)
    padded_tokens = pad_sequences(tokens, MAX_TEXT_LEN)
    return padded_tokens
    

In [11]:
y = [[train_df.iloc[row]["toxic"], train_df.iloc[row]["severe_toxic"],
                     train_df.iloc[row]["obscene"], train_df.iloc[row]["threat"], 
                     train_df.iloc[row]["insult"], train_df.iloc[row]["identity_hate"]
                     ]for row in range(len(train_df))]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)

In [30]:
model = Sequential()

embedding = Embedding(VOCAB_SIZE, EMBEDDING_DIM)

model.add(embedding)
model.add(Dropout(.1))
model.add(Conv1D(filters=32, kernel_size=2, padding="same", activation="relu"))
model.add(BatchNormalization())
model.add(Dropout(.1))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=32, kernel_size=2, padding="same", activation="relu"))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(GRU(32))
model.add(Dropout(.1))
model.add(Dense(256, activation="relu"))
model.add(BatchNormalization())
model.add(Dense(6, activation="sigmoid"))

In [31]:
# fasttext_weights = np.load("fasttext/fasttext_weights.npy")
# model.layers[0].set_weights(fasttext_weights)
# model.layers[0].trainable = False

In [49]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 300)         7500000   
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 300)         0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, None, 32)          19232     
_________________________________________________________________
batch_normalization_3 (Batch (None, None, 32)          128       
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 32)          0         
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, None, 32)          0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, None, 32)          2080      
__________

In [50]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
model.fit(x=np.array(X), y=np.array(y), validation_data=(X, y), epochs=1)

Train on 95851 samples, validate on 95851 samples
Epoch 1/1
18496/95851 [====>.........................] - ETA: 4:28 - loss: 0.0457 - acc: 0.9832

In [35]:
model.fit(x=np.array(X_train), y=np.array(y_train), validation_data=(X_test, y_test), epochs=1)

Train on 86265 samples, validate on 9586 samples
Epoch 1/1


<keras.callbacks.History at 0x131718ef0>

In [152]:
from keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor="val_loss", min_delta=0.001, patience=4, verbose=1)

In [153]:
model.fit(x=np.array(X_train), y=np.array(y_train), validation_data=(X_test, y_test), epochs=15,
          callbacks=[early_stopping])

Train on 91058 samples, validate on 4793 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 00005: early stopping


<keras.callbacks.History at 0x24707e9b0>

In [36]:
preds = model.predict(X_test)

In [37]:
from math import log

def bin_log_loss(pred, actual, eps=.000001):
    pred = eps if pred==0 else pred
    pred = 1 - eps if pred==1 else pred
    return actual * log(pred) + (1 - actual) * log(1 - pred)

def log_loss_6(preds, actual, eps=.000001):
    preds = [pred or eps for pred in preds]
    losses = [bin_log_loss(preds[x], actual[x]) for x in range(len(preds))]
    return -sum(losses) / len(losses)


# [(x, log_loss_6(preds[x], y_test[x])) for x in range(len(preds)) if log_loss_6(preds[x], y_test[x]) > 1]



In [45]:
sum([log_loss_6(preds[x], y_test[x]) for x in range(len(preds))])/len(preds)

0.05307727477144796

In [46]:
model.save("saved_models/conv_GRU_20180109")

In [47]:
submission = pd.read_csv("test.csv")
submission.set_index("id", inplace=True)

In [145]:
X_submit = create_padded_tokens(submission)

In [146]:
preds = model.predict(X_submit)

In [147]:
pred_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
submission = submission.reindex(columns=pred_columns)
submission[pred_columns] = preds

In [148]:
submission.to_csv("submission_2_20180108.csv")

In [149]:
submission.shape

(226998, 6)