In [25]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import ModelCheckpoint
from sklearn.utils import class_weight

In [26]:
# Locations of the competition data and the pre-trained GloVe vectors.
# Both can be overridden through environment variables so the notebook is not
# tied to one machine; the defaults are the original hard-coded paths.
# os.path.join(..., "") guarantees a trailing separator, because other cells
# build paths with `DIR_PATH + "<file name>"`.
DIR_PATH = os.path.join(
    os.environ.get("TOXIC_DATA_DIR", "~/workspace/kaggle_toxic_comment_classification/data1/"),
    "",
)
TRAIN_DATA_FILE = DIR_PATH + 'train.csv'
TEST_DATA_FILE = DIR_PATH + 'test.csv'
EMBEDDING_FILE = os.environ.get("GLOVE_EMBEDDING_FILE", "/Users/rituc/glove/glove.6B.100d.txt")

In [27]:
# Length of each word vector (the embedding file configured above is the 100-d GloVe set).
embed_size = 100

# Vocabulary cap: number of distinct words kept, i.e. rows of the embedding matrix.
max_features = 20000

# Every comment is cut or padded to this many word ids.
maxlen = 100

In [28]:
# Read the labelled training set and the unlabelled test set into DataFrames.
train, test = [pd.read_csv(csv_path) for csv_path in (TRAIN_DATA_FILE, TEST_DATA_FILE)]

In [29]:
# For every column from the third one onwards, show the number of positive
# rows and their share of the data set in percent.
cols = train.columns[2:]
n_rows = train.shape[0]
for col in cols:
    positives = train[col].sum()  # vectorised; builtin sum() walks the Series in Python
    # %-formatting prints the same text under Python 2 and Python 3, unlike
    # the Python 2 only `print a, b` statement.
    print("%s %s => %s" % (col, positives, positives * 100.0 / n_rows))

In [30]:
def tag_other(x):
    """Return 1 when none of the six toxicity labels is set on ``x``, else 0.

    ``x`` is anything indexable by label name (for example one DataFrame row).
    Labels are compared neighbour to neighbour, and the last one against 0,
    stopping at the first comparison that fails.
    """
    label_names = ("toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate")
    previous = x[label_names[0]]
    for label in label_names[1:]:
        current = x[label]
        same = previous == current
        if not same:
            # First mismatch ends the chain; its (false) result is what gets returned.
            return 1 * same
        previous = current
    # All labels share one value; the row is clean only if that value is 0.
    return 1 * (previous == 0)

In [31]:
# "other" = 1 for comments that carry none of the six toxicity labels, else 0.
# Computed column-wise in one vectorised pass rather than calling a Python
# function once per row through DataFrame.apply.
_toxicity_labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train["other"] = (train[_toxicity_labels] == 0).all(axis=1).astype(int)

In [32]:
# Same per-column summary as before; the column slice is re-read from `train`,
# so any column added to the frame since then is included.
cols = train.columns[2:]
n_rows = train.shape[0]
for col in cols:
    positives = train[col].sum()  # vectorised; builtin sum() walks the Series in Python
    # %-formatting prints the same text under Python 2 and Python 3, unlike
    # the Python 2 only `print a, b` statement.
    print("%s %s => %s" % (col, positives, positives * 100.0 / n_rows))

In [None]:
# def tag_class(x):
#     for col in cols:
#         if x[col]:
#             return col
# train["class"] = train.apply(lambda x:tag_class(x), axis=1)

In [33]:
# Target matrix: one column per toxicity label, in this fixed order.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

# Raw comment texts as arrays; missing comments become the placeholder "_na_".
list_sentences_train, list_sentences_test = [
    frame["comment_text"].fillna("_na_").values for frame in (train, test)
]

In [6]:
# toxic_class_weight = class_weight.compute_class_weight('balanced', list_classes, train["class"])

In [7]:
# Number of training comments.
list_sentences_train.shape[0]

159571

In [34]:
# Build the vocabulary from the training comments only, keeping the
# `max_features` most frequent words. `num_words` is the Keras 2 name of the
# deprecated Keras 1 `nb_words` argument.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Turn each comment into a list of integer word ids.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

# Force every sequence to exactly `maxlen` ids (pad_sequences pads and
# truncates at the front by default).
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [35]:
def get_coefs(word, *arr):
    """Pair a token with its coefficients converted to a float32 array.

    ``arr`` holds the remaining fields of one embedding-file line; they may be
    strings, numpy parses them while building the array.
    """
    coefficients = np.asarray(arr, dtype=np.float32)
    return word, coefficients
# Map every word in the embedding file to its vector. Each line is
# "<word> <coef> <coef> ...", split on whitespace. The `with` block closes the
# file as soon as the dictionary is built instead of leaking the handle.
with open(EMBEDDING_FILE) as embedding_file:
    embeddings_index = dict(get_coefs(*line.strip().split()) for line in embedding_file)

In [36]:
# Stack all pre-trained vectors into one 2-D array to get their overall mean
# and standard deviation. list(...) is required because np.stack needs a real
# sequence and dict.values() is only a view on Python 3.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

(0.004452007, 0.40815714)

In [37]:
word_index = tokenizer.word_index

# Tokenizer ids start at 1 (0 is reserved), so a vocabulary of V words needs
# V + 1 rows. The result is still capped at max_features, which leaves the
# matrix unchanged whenever the vocabulary is at least that large.
nb_words = min(max_features, len(word_index) + 1)

# Rows start as random draws with the same mean/std as the pre-trained
# vectors; words found in the embedding file then get their real vector.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Guard on the actual row count so a small vocabulary cannot index past
    # the end of the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [38]:
# Bidirectional LSTM over frozen pre-trained embeddings, max-pooled over the
# sequence, followed by a small dense head with six sigmoid outputs.
inp = Input(shape=(maxlen,))
# Take the vocabulary size from the matrix itself so the layer and its initial
# weights can never disagree on the number of rows.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix], trainable=False)(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.3))(x)
x = GlobalMaxPool1D()(x)
x = Dense(60, activation="relu")(x)
x = Dropout(0.5)(x)
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
# Independent sigmoid outputs trained with binary cross-entropy.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [39]:
# Keep only the weights of the epoch with the best validation accuracy.
filepath = "models/toxic_classifier_model_4.h5"

# Create the target directory up front; otherwise the first checkpoint save
# can fail after a full epoch of training.
model_dir = os.path.dirname(filepath)
if model_dir and not os.path.isdir(model_dir):
    os.makedirs(model_dir)

# NOTE(review): the metric key is 'val_acc' here; some Keras versions report it
# as 'val_accuracy' -- confirm against the installed version.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [40]:
# Send everything printed from here on to a log file.
log_path = "logs/toxic_classifier_lstm_log4.log"

# open() does not create missing directories.
log_dir = os.path.dirname(log_path)
if log_dir and not os.path.isdir(log_dir):
    os.makedirs(log_dir)

try:
    notebook_stdout
except NameError:
    # First run: remember the notebook's own stream so the redirect can be
    # undone later with `sys.stdout = notebook_stdout`.
    notebook_stdout = sys.stdout
else:
    # Re-run: close the previous log handle instead of leaking it.
    if sys.stdout is not notebook_stdout:
        sys.stdout.close()

sys.stdout = open(log_path, "w")

In [41]:
# Print the layer-by-layer summary of the model.
# NOTE(review): the preceding cell reassigns sys.stdout to a log file, so this
# text presumably lands in that log rather than in the notebook -- confirm.
model.summary()

In [42]:
# Train for 10 epochs in batches of 32, validating on a 10% split; the
# callbacks handle checkpointing. The trailing ';' hides the returned object.
model.fit(
    x=X_t,
    y=y,
    batch_size=32,
    epochs=10,
    validation_split=0.1,
    callbacks=callbacks_list,
);

In [None]:
# Scratch check: every label name except the last one.
list_classes[:-1]

In [None]:
# Scratch display of the submission frame.
# NOTE(review): in the visible notebook `sample_submission` is first assigned
# in a later cell, so this raises NameError on Restart & Run All.
sample_submission

In [None]:
# Scratch check of the prediction array's shape.
# NOTE(review): in the visible notebook `y_test` is first assigned in a later
# cell, so this raises NameError on Restart & Run All.
y_test.shape

In [None]:
# Binds a second name to the same object; this is not a copy.
# NOTE(review): in the visible notebook `y_test` is first assigned in a later
# cell, so this raises NameError on Restart & Run All.
y_test1 = y_test

In [None]:

# Scratch check: shape after dropping the last column.
y_test1[:, :-1].shape

In [None]:
# Scratch check (repeated): every label name except the last one.
list_classes[:-1]

In [17]:
# Start from the competition's sample submission.
sample_submission = pd.read_csv(DIR_PATH + "sample_submission.csv")

# Predict the label scores for every test comment.
y_test = model.predict([X_te], batch_size=1024, verbose=1)

# Overwrite the label columns with the predictions and write the file.
# sample_submission[list_classes[0:-1]] = y_test[0:,0:-1]
sample_submission[list_classes] = y_test
sample_submission.to_csv('submission.csv', index=False)

In [None]:
# df = pd.read_csv('submission.csv')
# df1 = pd.read_csv(DIR_PATH + "sample_submission.csv")

In [20]:
# Row-wise total of the six predicted scores, added left to right in label order.
_score_columns = ("toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate")
_score_total = sample_submission[_score_columns[0]]
for _score_column in _score_columns[1:]:
    _score_total = _score_total + sample_submission[_score_column]
sample_submission["sum"] = _score_total

In [24]:
# Rows whose summed scores exceed 2.
sample_submission.loc[sample_submission["sum"] > 2]

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,sum
0,00001cee341fdb12,0.999808,0.449477,0.988491,0.050172,0.937825,0.491149,3.916921
48,0013fed3aeae76b7,0.998910,0.354347,0.950907,0.012260,0.790757,0.625011,3.732191
56,0016b94c8b20ffa6,0.940782,0.034175,0.189198,0.730518,0.108142,0.002698,2.005513
59,0017d4d47894af05,0.999938,0.476098,0.994052,0.004325,0.860354,0.636329,3.971096
70,001c86f5bceccb32,0.999969,0.596098,0.992941,0.043033,0.950553,0.527480,4.110075
81,001eff4007dbb65b,0.986522,0.009016,0.434351,0.000167,0.862959,0.002775,2.295790
92,002261b0415c4f9d,0.995430,0.050868,0.786749,0.000180,0.883689,0.022323,2.739239
99,00260d8dfcc29827,0.999074,0.163873,0.972257,0.000025,0.729575,0.019583,2.884389
109,002a3ebaaa51f17a,0.957766,0.034526,0.766961,0.001449,0.560072,0.001875,2.322648
126,0031db73bf0939c3,0.999983,0.509137,0.998422,0.000315,0.757846,0.013971,3.279674


In [23]:
# Rows whose summed scores stay below 1.
sample_submission.loc[sample_submission["sum"] < 1]

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,sum
1,0000247867823ef7,0.000956,4.857557e-07,0.000241,1.754416e-07,4.863252e-05,1.080271e-06,0.001248
2,00013b17ad220c46,0.000042,3.201635e-08,0.000009,1.336655e-08,3.198604e-06,2.613209e-07,0.000055
3,00017563c3f7919a,0.000043,2.477115e-08,0.000007,7.463559e-08,4.138456e-06,3.407191e-07,0.000055
4,00017695ad8997eb,0.001969,1.945382e-06,0.000175,1.560475e-05,9.318991e-05,2.225541e-05,0.002277
5,0001ea8717f6de06,0.000102,3.017794e-08,0.000012,7.418751e-07,8.263609e-06,7.322390e-07,0.000124
6,00024115d4cbde0f,0.000786,6.612964e-08,0.000038,5.913278e-07,2.624269e-05,1.807119e-06,0.000852
7,000247e83dcc1211,0.398078,1.522309e-04,0.013244,1.523012e-04,2.441965e-02,8.616909e-04,0.436908
8,00025358d4737918,0.512491,6.078945e-03,0.198742,5.654058e-03,2.620987e-01,7.650047e-03,0.992714
9,00026d1092fe71cc,0.000318,1.108683e-07,0.000074,1.109952e-07,2.207709e-05,9.131381e-07,0.000415
10,0002eadc3b301559,0.528343,4.742122e-04,0.345060,8.083379e-05,1.354378e-02,5.202472e-04,0.888022
