In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import TfidfVectorizer
from keras_tqdm import TQDMNotebookCallback

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Any results you write to the current directory are saved as output.

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train = pd.read_csv('train.csv')
train.dropna(inplace=True)

train_text = train['comment_text'].values
y_train = train.values[:,2:8]

In [3]:
test = pd.read_csv('test.csv')
test.dropna(inplace=True)

test_text = test['comment_text'].values

In [4]:
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import os, re, csv, math, codecs
from tqdm import tqdm

porter = PorterStemmer()
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
#table = str.maketrans('', '', string.punctuation)

In [5]:
from nltk.tokenize import RegexpTokenizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

MAX_NB_WORDS = 100000
max_seq_len = 250

raw_docs_train = train['comment_text'].tolist()
raw_docs_test = test['comment_text'].tolist() 

label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y_train = train[label_names].values
num_classes = len(label_names)

tokenizer = RegexpTokenizer(r'\w+')

print("pre-processing train data...")
processed_docs_train = []
for doc in tqdm(raw_docs_train):
    tokens = tokenizer.tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    processed_docs_train.append(" ".join(filtered))
#end for

processed_docs_test = []
for doc in tqdm(raw_docs_test):
    tokens = tokenizer.tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    processed_docs_test.append(" ".join(filtered))
#end for

print("tokenizing input data...")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
tokenizer.fit_on_texts(processed_docs_train + processed_docs_test)  #leaky
word_seq_train = tokenizer.texts_to_sequences(processed_docs_train)
word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)
word_index = tokenizer.word_index
print("dictionary size: ", len(word_index))

#pad sequences
word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len)
word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_seq_len)

print("Done !!")

pre-processing train data...


100%|███████████████████████████████████████████████████████████████████████| 159571/159571 [00:04<00:00, 34755.79it/s]
100%|███████████████████████████████████████████████████████████████████████| 153164/153164 [00:03<00:00, 39584.48it/s]


tokenizing input data...
dictionary size:  348520
Done !!


In [6]:

#model parameters
embed_dim = 300 

len(word_index)

348520

In [7]:
#load embeddings
print('loading word embeddings...')
embeddings_index = {}
f = codecs.open('wiki.simple.vec', encoding='utf-8')

# This dataset comes from : T. Mikolov, E. Grave, P. Bojanowski, C. Puhrsch, A. Joulin. Advances in Pre-Training Distributed Word Representations

for line in tqdm(f):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('found %s word vectors' % len(embeddings_index))

#embedding matrix
print('preparing embedding matrix...')
words_not_found = []
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix_wiki = np.zeros((nb_words, embed_dim))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros.
        embedding_matrix_wiki[i] = embedding_vector
    else:
        words_not_found.append(word)
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix_wiki, axis=1) == 0))

loading word embeddings...


111052it [00:13, 8435.38it/s]


found 111052 word vectors
preparing embedding matrix...
number of null word embeddings: 47196


In [8]:
print("sample words not found: ", np.random.choice(words_not_found, 10))
embedding_matrix_wiki.shape

sample words not found:  ['gestellt' 'unshakable' 'منافع' 'minn' 'theverge' 'cya' 'puter'
 'littlemountain5' 'الخطر' 'doormat']


(100000, 300)

In [9]:
EMBEDDING_FILE = 'crawl-300d-2M.vec'

def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE, encoding="utf8"))

word_index = tokenizer.word_index
embedding_matrix_crawl = np.zeros((nb_words, embed_dim))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix_crawl[i] = embedding_vector


del embeddings_index

In [10]:
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Conv1D, Conv2D, MaxPooling1D, GlobalMaxPool1D, Bidirectional, GlobalMaxPooling1D
from keras.layers import LSTM, GRU, Dropout , BatchNormalization, Embedding, Flatten, GlobalAveragePooling1D, concatenate, Input

inp = Input(shape=(max_seq_len, ))
emb_wiki = Embedding(nb_words, 300, weights=[embedding_matrix_wiki], input_length = max_seq_len, trainable=False)(inp)

emb_crawl = Embedding(nb_words, 300,
          weights=[embedding_matrix_crawl], input_length = max_seq_len, trainable=False)(inp)

conc1 = concatenate([emb_wiki, emb_crawl])
x = Bidirectional(LSTM(400, return_sequences=True))(conc1)
x = Bidirectional(GRU (400, return_sequences=True))(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
out = concatenate([avg_pool, max_pool])

out = Dense(200, activation="relu")(out)
out = Dense(200, activation="relu")(out)
out = Dense(y_train.shape[1], activation="sigmoid")(out)

model = Model(inputs=inp, outputs=out)

model.compile(loss='binary_crossentropy', optimizer = keras.optimizers.Adam(lr=0.001),
              metrics=['accuracy'])

In [11]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 250)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 250, 300)     30000000    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 250, 300)     30000000    input_1[0][0]                    
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 250, 600)     0           embedding_1[0][0]                
                                                                 embedding_2[0][0]                
__________

In [12]:
model.fit(word_seq_train, y_train, epochs=8, batch_size=100, shuffle=True, validation_split=0.1,verbose = 0, callbacks=[TQDMNotebookCallback()])




<keras.callbacks.History at 0x2339a454f60>

In [17]:
y_test = model.predict(word_seq_test)

In [18]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

sample_submission = pd.read_csv("sample_submission.csv")

sample_submission[list_classes] = y_test
sample_submission.to_csv("results.csv", index=False)

153164
153164


In [15]:
# min loss avec LSTM + embedding layer = 0.046 après 3 epochs
# min loss avec conv1d + embedding = 0.0495 apres 4 epochs
# best val_loss: 0.0524 - val_acc: 0.9813 apres 2 epochs

In [16]:
del sample_submission