In [None]:
#no_preprocessing + conv1d -- 0.92716
#preprocessing + conv1d -- 0.92029

#no_preprocessing + LSTM -- 0.97592
#preprocessing + LSTM -- 

#no_preprocessing + Bi-LSTM -- 0.97647
#preprocessing + Bi-LSTM -- 

In [None]:
import os
import sys
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import tensorflow as tf

In [None]:
# Shared NLP helpers used by clean_text().
# NOTE(review): both calls presumably need the NLTK 'wordnet' and 'stopwords' corpora to be
# installed already -- nothing visible in this notebook downloads them; confirm on a fresh kernel.
lemmatizer = WordNetLemmatizer()
# Kept as a set so the per-word membership test in clean_text() is a hash lookup.
stop_words = set(stopwords.words('english'))

In [None]:
def clean_text(text):
    """Normalise a raw comment into a space-separated string of lemmatised tokens.

    Every character outside ``a-z``/``A-Z`` is replaced by a space, the result is
    lower-cased and split on whitespace, tokens found in ``stop_words`` are
    dropped, and the remaining tokens are passed through ``lemmatizer.lemmatize``.

    Args:
        text: The raw comment string.

    Returns:
        The kept tokens joined by single spaces (``''`` when no token survives).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text).lower()

    kept_tokens = []
    for token in letters_only.split():
        # The stop-word check is applied to the raw token, before lemmatisation.
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_tokens)

In [None]:
# --- Hyper-parameters and run configuration ---
max_seq_len = 100        # length every token sequence is padded/truncated to
max_vocab_size = 20000   # vocabulary cap passed to the tokenizer (num_words)
embedding_dim = 100      # width of one embedding vector; must match the GloVe file loaded below
validation_split = 0.2   # fraction of the training data handed to fit() for validation
batch_size = 64
epochs = 50              # upper bound; early stopping may end training sooner

# Fix the random seeds so weight initialisation and batch shuffling start from the same
# state on every Restart & Run All (GPU kernels may still add small non-determinism).
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
# Load the pretrained GloVe vectors into a {word: float32 vector} lookup.
# Each line is parsed as "<word> <v1> <v2> ...": first field is the word, the rest the vector.
glove_path = '../input/glove6b100dtxt/glove.6B.100d.txt'
word_embeddings = {}
# The encoding is pinned to UTF-8 instead of relying on the platform default, which can
# raise UnicodeDecodeError or silently corrupt non-ASCII words on non-UTF-8 locales.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word_key = values[0]
        word_vector = np.asarray(values[1:], dtype='float32')

        word_embeddings[word_key] = word_vector

In [None]:
# Number of distinct words loaded from the GloVe file.
len(word_embeddings)

In [None]:
# Load the labelled training set straight from the zipped CSV.
# Later cells read its `comment_text` column and treat every column from the third onwards as a label.
train_df = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
train_df

In [None]:
# Cleaned training comments as a 1-D string array; missing comments are replaced by the
# placeholder 'DummyValue' before being passed through clean_text().
documents = np.array([
    clean_text(doc)
    for doc in train_df['comment_text'].fillna('DummyValue').values
])
# Label matrix: every column from the third one onwards.
targets = train_df.loc[:, train_df.columns[2:]].values

In [None]:
# Length of every cleaned document. These are character counts (len() of each string),
# not token counts, so they are not directly comparable with max_seq_len.
doc_len_list = list(map(len, documents))
max_doc_len, min_doc_len = max(doc_len_list), min(doc_len_list)
avg_doc_len = sum(doc_len_list) / len(documents)

print(
    f'max document length: {max_doc_len}',
    f'min document length: {min_doc_len}',
    f'Avg document length: {avg_doc_len}',
    sep='\n',
)

In [None]:
# Distribution of cleaned-document lengths (character counts from doc_len_list).
plt.hist(doc_len_list)
plt.xlabel('Document length (characters)')
plt.ylabel('Number of documents')
plt.title('Length of cleaned training comments')
# show() must actually be called; a bare `plt.show` only echoes the function object.
plt.show()

In [None]:
# Fit the vocabulary on the cleaned training comments, then turn each comment into a
# list of integer word indices. num_words caps which words are kept during conversion.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(documents)
sequences = tokenizer.texts_to_sequences(documents)

In [None]:
# word -> integer index mapping learned by the tokenizer.
# NOTE(review): this appears to cover every word seen while fitting, not only the
# max_vocab_size most frequent ones -- later cells cap by max_vocab_size themselves; confirm.
word_index = tokenizer.word_index
len(word_index)

In [None]:
# Bring every index sequence to exactly max_seq_len entries so they stack into one 2-D array
# (which side is padded/truncated is left to the pad_sequences defaults).
seq_data = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_seq_len)
seq_data.shape

In [None]:
# Inspect the padded integer matrix that is fed to the model.
seq_data

In [None]:
# Number of rows in the embedding matrix: the vocabulary cap, or (vocabulary size + 1)
# when fewer words were seen.
# NOTE(review): the +1 presumably leaves room for index 0, which no word is mapped to -- confirm.
num_words = min(max_vocab_size, len(word_index)+1)
num_words

In [None]:
# Row i holds the GloVe vector of the word whose tokenizer index is i.
# Rows stay all-zero for indices no word maps to and for words missing from GloVe.
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, idx in word_index.items():
    # Words ranked beyond the vocabulary cap have no row in the matrix.
    if idx >= max_vocab_size:
        continue

    embedding_vector = word_embeddings.get(word)
    if embedding_vector is None:
        continue

    embedding_matrix[idx] = embedding_vector

In [None]:
# Inspect the assembled embedding matrix.
embedding_matrix

In [None]:
# Sanity check: report the shapes of the model inputs, labels and embedding weights.
print(
    f'shape of the seq_data: {seq_data.shape}',
    f'shape of the targets: {targets.shape}',
    f'shape of the embedding_matrix: {embedding_matrix.shape}',
    sep='\n',
)

In [None]:
# Embedding layer initialised from the GloVe-based matrix; trainable=False keeps the
# pretrained vectors fixed during training.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=num_words,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_seq_len,
    trainable=False,
)

In [None]:
# Model input: one padded sequence of max_seq_len word indices per sample.
input_ = tf.keras.Input(shape=(max_seq_len,))
# `x` carries the running tensor through the following model-building cells.
x = embedding_layer(input_)

In [None]:
##### Conv1D ##########
# Alternative encoder, currently disabled by wrapping it in a string literal:
# three Conv1D(128, 3, relu) layers with MaxPooling1D(3) between them.
# To try it, remove the triple quotes here and comment out the Bi-LSTM cell instead.
'''x = tf.keras.layers.Conv1D(128, 3, activation='relu')(x)
x = tf.keras.layers.MaxPooling1D(3)(x)
x = tf.keras.layers.Conv1D(128, 3, activation='relu')(x)
x = tf.keras.layers.MaxPooling1D(3)(x)
x = tf.keras.layers.Conv1D(128, 3, activation='relu')(x)'''

In [None]:
##### LSTM ##########
#x = tf.keras.layers.LSTM(50, return_sequences=True)(x)

In [None]:
##### Bi-LSTM ##########
# Active encoder: a bidirectional LSTM with 50 units per direction.
# return_sequences=True keeps the per-timestep outputs so the next cell can pool over them.
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True))(x)

In [None]:
# Classification head shared by all encoder variants.
# Pool the encoder's per-timestep outputs into a single vector per sample.
x = tf.keras.layers.GlobalMaxPooling1D()(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
# Six independent sigmoid outputs -- presumably one per label column in `targets`; confirm.
output_ = tf.keras.layers.Dense(6, activation='sigmoid')(x)
model = tf.keras.Model(inputs=input_, outputs=output_)
# Binary cross-entropy scores each of the six outputs as its own yes/no prediction.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Stop once validation loss has failed to improve by at least 0.001 for 5 consecutive
# epochs, then roll the model back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',  # the callback's default, spelled out for readability
    min_delta=0.001,
    patience=5,
    restore_best_weights=True,
)

In [None]:
# Train the model; `validation_split` of the data is held out by Keras for validation and
# early stopping may end the run before `epochs` is reached.
history = model.fit(
    x=seq_data,
    y=targets,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=validation_split,
    callbacks=[early_stopping],
)

In [None]:
# Training vs. validation loss per epoch. The curves are labelled so they can be told
# apart, and the axes/title let the figure stand on its own.
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.xlabel('Epoch')
plt.ylabel('Binary cross-entropy loss')
plt.title('Loss per epoch')
plt.legend()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch. The curves are labelled so they can be told
# apart, and the axes/title let the figure stand on its own.
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='validation')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Accuracy per epoch')
plt.legend()
plt.show()

In [None]:
# Load the test comments and run them through the same cleaning as the training data.
test_df = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')
test_data = np.array([
    clean_text(doc)
    for doc in test_df['comment_text'].fillna('DummyValue').values
])
# Row identifiers, kept for the submission file.
test_ids = test_df['id'].values
test_df

In [None]:
# Load the labels file that accompanies the test set.
# NOTE(review): test_labels_df is only displayed here; no other cell visible in this
# notebook reads it.
test_labels_df = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip')
test_labels_df

In [None]:
# Encode the test comments with the tokenizer fitted on the training data and pad them
# to the same length, so they match what the model was trained on.
test_sequences = tokenizer.texts_to_sequences(test_data)
test_seq_data = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=max_seq_len)
# One row of six sigmoid outputs per test comment.
preds = model.predict(test_seq_data)
preds

In [None]:
# Sanity check: shape of the prediction array.
preds.shape

In [None]:
# Sanity check: number of test ids (compare with the row count of preds).
test_ids.shape

In [None]:
# Assemble the submission table: one prediction column per training label column,
# followed by the `id` column (appended last; the next cell moves it to the front).
sub_df = pd.DataFrame(preds, columns=train_df.columns[2:])
# Assign the ids as a flat 1-D array. np.ravel() accepts both the original 1-D ids and a
# previously reshaped (n, 1) array, so test_ids no longer needs to be rebound to a 2-D
# column vector just to fill a single column.
sub_df['id'] = np.ravel(test_ids)
sub_df

In [None]:
# Put `id` first, followed by the remaining columns in their existing order.
# Selecting `id` by name (instead of rotating whatever column is last to the front) keeps
# this cell idempotent: re-running it does not push a label column ahead of `id`.
cols = ['id'] + [col for col in sub_df.columns if col != 'id']
sub_df = sub_df[cols]
sub_df

In [None]:
# Write the submission file (id column plus one prediction column per label) without the
# DataFrame index. The file name is spelled `submission.csv` (previously misspelled).
sub_df.to_csv('submission.csv', index=False, header=True)

In [None]:
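# NOTE(review): as written, this snippet compares `targets` (the training labels) with
# `preds` (predictions for the test set), so the two arrays do not describe the same rows.
# Score predictions against labels for the same comments before re-enabling it.
#from sklearn.metrics import roc_auc_score
#aucs = []
#for idx in range(6):
#    aucs.append(roc_auc_score(targets[:,idx], preds[:, idx]))
#print(np.mean(aucs))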