In [1]:
import warnings
warnings.filterwarnings("ignore")
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,GRU,CuDNNLSTM,CuDNNGRU
from keras.layers import Bidirectional, GlobalMaxPool1D,SpatialDropout1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
import gc

Using TensorFlow backend.


In [None]:
#!wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/crawl-300d-2M.vec.zip

In [None]:
#!unzip crawl-300d-2M.vec.zip

In [None]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
#!unzip glove.6B.zip

In [2]:
# Size of a single pre-trained word vector.
embed_size = 300

# Number of unique words to keep (i.e. the row count of the embedding matrix).
max_features = 300000

# Maximum number of words of a comment that are used.
maxlen = 1000


In [3]:

# Input CSV locations, relative to the working directory.
# The single-embedding path is kept only as a disabled reference:
#EMBEDDING_FILE='./crawl-300d-2M.vec'
TRAIN_DATA_FILE = './train.csv'
TEST_DATA_FILE = './test.csv'



In [4]:


# Label columns read from the training CSV; their order fixes the column order of y.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)


def _comment_texts(frame):
    """Return frame["comment_text"] as an array, with missing values replaced by "something"."""
    return frame["comment_text"].fillna("something").values


list_sentences_train = _comment_texts(train)
y = train[list_classes].values
list_sentences_test = _comment_texts(test)


In [5]:
%%time
# One tokenizer is fitted on the train and test comments together; its
# vocabulary is capped with num_words=max_features.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts([*list_sentences_train, *list_sentences_test])

# Texts -> lists of word indices (train first, then test).
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (list_sentences_train, list_sentences_test)
)

# Index lists -> fixed-width arrays of maxlen columns.
X_t, X_te = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (list_tokenized_train, list_tokenized_test)
)

CPU times: user 43.3 s, sys: 1.18 s, total: 44.5 s
Wall time: 44.5 s


In [6]:
# Run an explicit garbage-collection pass; the call's return value is the cell output.
gc.collect()

0

In [7]:
%%time
# Locations of the two pre-trained embedding files (relative to the working directory).
EMBEDDING_FILE_FASTTEXT = "./crawl-300d-2M.vec"
# NOTE: despite the variable name, this path points at the glove.6B 200-d file.
EMBEDDING_FILE_TWITTER = "./glove.6B.200d.txt"


def get_coefs(word, *arr):
    """Build a (word, vector) pair from the fields of one embedding line.

    word: the first field, returned unchanged.
    *arr: the remaining fields, converted to a float32 NumPy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
def _read_vectors(path, dim, split_line):
    """Load a text embedding file into a {word: float32 vector} dict.

    path: embedding file with one "<word> <v1> ... <vdim>" record per line.
    dim: expected vector length. Records with any other number of values are
        dropped. This discards the "<count> <dim>" header line of fastText
        .vec files, which would otherwise be stored as a word with a
        1-element vector and later be broadcast across a whole embedding row.
        Blank lines are dropped by the same check.
    split_line: callable that turns one raw line into its list of fields.

    Returns the dict; when a word occurs more than once the last record wins.
    """
    vectors = {}
    # The context manager closes the handle; the explicit encoding removes the
    # dependency on the platform default.
    with open(path, encoding='utf-8') as handle:
        for line in handle:
            fields = split_line(line)
            if len(fields) != dim + 1:
                continue
            word, coefs = get_coefs(*fields)
            vectors[word] = coefs
    return vectors


# 300 and 200 match the column ranges these vectors occupy in the 500-wide embedding matrix.
embeddings_index_ft = _read_vectors(EMBEDDING_FILE_FASTTEXT, 300, lambda o: o.rstrip().rsplit(' '))
embeddings_index_tw = _read_vectors(EMBEDDING_FILE_TWITTER, 200, lambda o: o.strip().split())

CPU times: user 2min 8s, sys: 3.06 s, total: 2min 11s
Wall time: 2min 10s


In [None]:
# Disabled cell: everything below is a string literal and is not executed.
# It holds a single-embedding loader that reads EMBEDDING_FILE (whose
# assignment is itself commented out earlier in the notebook).
'''
%%time
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))

'''

In [8]:
# Word -> integer index mapping built by the tokenizer.
word_index = tokenizer.word_index

# Tokenizer indices start at 1 (index 0 is never assigned to a word), so a
# vocabulary of N words needs N + 1 rows. The previous
# min(max_features, N) was one row short whenever N < max_features.
nb_words = min(max_features, len(word_index) + 1)

# Each row is 500 wide: columns 0-299 hold the fastText vector and
# columns 300-499 hold the GloVe vector.
embedding_matrix = np.zeros((nb_words, 500))

# The vectors of the word "something" form the fallback row.
something_tw = embeddings_index_tw.get("something")
something_ft = embeddings_index_ft.get("something")
if something_ft is None or something_tw is None:
    # Fail loudly here instead of writing a missing (None) vector into the fallback row.
    raise KeyError('"something" is missing from a pre-trained embedding index')

something = np.zeros((500,))
something[:300] = something_ft
something[300:500] = something_tw


In [9]:
%%time
# Fill one row per vocabulary word whose index is below max_features.
for word, i in word_index.items():
    if i >= max_features:
        continue

    if embeddings_index_ft.get(word) is None:
        # No fastText vector for this word: the whole row becomes the "something" vector.
        embedding_matrix[i] = something
        continue

    # fastText part goes into columns 0-299.
    embedding_vector_ft = embeddings_index_ft.get(word)
    embedding_matrix[i, :300] = embedding_vector_ft

    # GloVe part goes into columns 300-499 and stays zero when the word is unknown there.
    embedding_vector_tw = embeddings_index_tw.get(word)
    if embedding_vector_tw is not None:
        embedding_matrix[i, 300:500] = embedding_vector_tw


CPU times: user 1.42 s, sys: 300 ms, total: 1.72 s
Wall time: 1.72 s


In [None]:
# Disabled cell: everything below is a string literal and is not executed.
# It holds a single-embedding variant of the matrix construction that uses
# embed_size columns and an `embeddings_index` dict.
'''
%%time
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

'''

In [10]:
# Take the Embedding dimensions from the weight matrix itself. The layer
# requires weights of shape (input_dim, output_dim), and the matrix has
# nb_words rows, which is not guaranteed to equal max_features; hard-coding
# (max_features, 500) fails whenever the vocabulary is smaller.
vocab_rows, embedding_width = embedding_matrix.shape

# Input: one padded sequence of maxlen word indices per sample.
inp = Input(shape=(maxlen,))
# Embedding initialised from the pre-trained matrix.
x = Embedding(vocab_rows, embedding_width, weights=[embedding_matrix])(inp)
x = SpatialDropout1D(0.5)(x)
# Two stacked bidirectional recurrent layers (40 units per direction), both
# returning the full sequence so it can be pooled over time afterwards.
x = Bidirectional(CuDNNLSTM(40, return_sequences=True))(x)
x = Bidirectional(CuDNNGRU(40, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
# Six independent sigmoid outputs, trained with binary cross-entropy.
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [11]:
# Train on the padded training sequences: 5 epochs, batches of 512, no validation data passed.
model.fit(X_t, y, batch_size=512, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f09be745fd0>

In [12]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 500)         150000000 
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 1000, 500)         0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 1000, 80)          173440    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 1000, 80)          29280     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 80)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 486       
Total para

In [13]:
def _write_text(path, text):
    """Write text to path, replacing any existing file."""
    with open(path, "w") as out:
        out.write(text)


# The architecture is stored as JSON and the weights as HDF5, in two separate files.
model_json = model.to_json()
_write_text("model.json", model_json)
model.save_weights("model.h5")
print("Saved model to disk")

Saved model to disk


In [12]:
# Sigmoid outputs for the padded test sequences: one column per entry of list_classes.
y_test = model.predict([X_te], batch_size=1024, verbose=1)

# Overwrite the label columns of the sample submission with the predictions
# (column j of y_test goes to list_classes[j]) and write the result out.
sample_submission = pd.read_csv('./sample_submission.csv').assign(
    **dict(zip(list_classes, y_test.T))
)
sample_submission.to_csv('sub-lstm-gru-500.csv', index=False)




In [13]:
# Run an explicit garbage-collection pass; the call's return value is the cell output.
gc.collect()

784