Things to try:
  - remove stopwords
  - tfidf for tokenizer

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import os
from os.path import expanduser
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
dataPath = expanduser("~/Datasets/32017/jigsaw-toxic-comments/")
train = pd.read_csv(dataPath + 'tc_train.csv',index_col=0)
train.head()

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


## Get Embeddings

In [4]:
# Create embedding index from file in .txt format. First line contains 
# dictionary size and embedding dim. Fields are space separated
def get_embeddings(file_name):
    embeddings_index = {}
    with open(file_name, encoding="utf8") as f:
        for line in f:
            values = line.rstrip().split(' ')
            if len(values) > 2:
                embeddings_index[values[0]] = np.asarray(values[1:], dtype="float32")
    return embeddings_index
embeddings_path = expanduser('~/Datasets/text_embeddings/')
embeddings_index = get_embeddings(embeddings_path+'crawl-300d-2M.vec')

## Preprocess Text

Remove punctuation and stopwords

In [7]:
import string
trans_table = str.maketrans({key: ' ' for key in string.digits + '\r\n' +
                             string.punctuation.replace("\'",'')})
def preprocess(text):
    return ' '.join(text.lower().translate(trans_table).split(' '))

train['comment_clean'] = train.comment_text.apply(preprocess)
train.head()

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_clean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation why the edits made under my userna...
000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d'aww he matches this background colour i'm s...
000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man i'm really not trying to edit war it...
0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,more i can't make any real suggestions on im...
0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,you sir are my hero any chance you remember...


In [None]:
types = list(train)[1:]
print(types)

## Create Vocabulary

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

UNKNOWN_PROXY = 'unknown'
MIN_WORD_OCCURRENCE = 5

print("Creating the vocabulary of words occurred more than", MIN_WORD_OCCURRENCE)
vectorizer = CountVectorizer(lowercase=False, token_pattern="\S+", 
                             min_df=MIN_WORD_OCCURRENCE)
vectorizer.fit(train.comment_clean)

top_words = set(vectorizer.vocabulary_.keys())
top_words.add(UNKNOWN_PROXY)
print(len(top_words),'top words')

Creating the vocabulary of words occurred more than 5
36366 top words


## Tokenization

Tokenize the text using word index

In [9]:
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [10]:
tokenizer = Tokenizer(filters="")
tokenizer.fit_on_texts(train.comment_clean)
print(tokenizer)

<keras_preprocessing.text.Tokenizer object at 0x12ec81cf8>


In [11]:
word_index = tokenizer.word_index
print(len(word_index))
list(word_index.items())[:16]

195805


[('the', 1),
 ('to', 2),
 ('of', 3),
 ('and', 4),
 ('a', 5),
 ('you', 6),
 ('i', 7),
 ('is', 8),
 ('that', 9),
 ('in', 10),
 ('it', 11),
 ('for', 12),
 ('this', 13),
 ('not', 14),
 ('on', 15),
 ('be', 16)]

## Input data for Network

In [12]:
MAX_SEQUENCE_LENGTH = 50
seq = tokenizer.texts_to_sequences(train.comment_clean)
data = pad_sequences(seq,maxlen=MAX_SEQUENCE_LENGTH,padding='post',
                     truncating='post')
with open(dataPath + 'comments.pkl','wb') as f: pickle.dump(data, f, -1)

print('\nFirst 2 sequences in `seq`: ',seq[:2])
print('\nShape of `data`: ',data.shape)
print('\nFirst prepared text in `data`:',data[0])


First 2 sequences in `seq`:  [[669, 75, 1, 126, 130, 175, 29, 655, 4305, 11421, 1032, 86, 330, 51, 2203, 11038, 50, 6544, 15, 60, 2654, 148, 7, 2829, 34, 117, 1185, 14720, 2713, 4, 45, 59, 241, 1, 363, 31, 1, 38, 27, 143, 73, 3322, 89], [91985, 52, 2528, 13, 551, 3652, 73, 4345, 2603, 21, 94, 38, 960, 180]]

Shape of `data`:  (159571, 50)

First prepared text in `data`: [  669    75     1   126   130   175    29   655  4305 11421  1032    86
   330    51  2203 11038    50  6544    15    60  2654   148     7  2829
    34   117  1185 14720  2713     4    45    59   241     1   363    31
     1    38    27   143    73  3322    89     0     0     0     0     0
     0     0]


In [13]:
embeddings_dim = len(next(iter(embeddings_index.values())))
embeddings_dim

300

In [20]:
# Create embedding matrix
def get_embedding_matrix(word_index,embeddings_index):
    nb_words = len(word_index) + 1 # +1 since min(word_index.values())=1
    embedding_matrix = np.zeros((nb_words,embeddings_dim))
    unknown = 0
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None: unknown += 1
        else: embedding_matrix[i] = embedding_vector
    return embedding_matrix, unknown

In [21]:
# Create embedding_layer and save it.
def make_save_emb_layer(word_index,embeddings_index,layer_file_name):
    embedding_matrix,unknown = get_embedding_matrix(word_index,embeddings_index)
    embedding_layer = Embedding(embedding_matrix.shape[0],embedding_matrix.shape[1],
                                weights=[embedding_matrix],trainable=False)
    with open(layer_file_name,'wb') as f: 
        pickle.dump(embedding_layer, f, -1)
    return unknown

EMBEDDING_LAYER_FILE = dataPath + 'toxic_comments_embed_layer.pkl'
print(make_save_emb_layer(word_index,embeddings_index,EMBEDDING_LAYER_FILE),
      'unknown words')

98379 unknown words


## Split into new train and validation sets

In [45]:
import pickle
from sklearn.model_selection import train_test_split

# convert each vector of labels to the string
labels = train[types].astype(str).apply(lambda x: ''.join(x),axis=1)
print('Labels: \n',labels.head())
# aggregate rare combinations if any
count = labels.value_counts()
rare = count.index[count<=2]
labels[np.isin(labels.values,rare)] = 'rare'
print('\nCounts of labels: \n',labels.value_counts())
train_index, val_index = train_test_split(list(range(data.shape[0])), test_size=0.2, 
                                      stratify = labels, random_state=0)
# save train and validation indices for further calculations
fname = dataPath + 'train_val_split.pkl'
with open(fname, 'wb') as f: pickle.dump([train_index, val_index], f, -1)

Labels: 
 id
0000997932d777bf    000000
000103f0d9cfb60f    000000
000113f07ec002fd    000000
0001b41b1c6bb37e    000000
0001d958c54c6e35    000000
dtype: object

Counts of labels: 
 000000    143346
100000      5666
101010      3800
101000      1758
100010      1215
111010       989
101011       618
001000       317
000010       301
111011       265
001010       181
111000       158
100001       136
100011       134
101110       131
100100       113
111110        64
101111        56
000001        54
110000        41
101001        35
111111        31
000011        28
000100        22
001011        18
100110        16
110010        14
110100        11
101100        11
110011         7
100101         7
rare           6
111001         6
111100         4
000110         3
110001         3
001001         3
100111         3
dtype: int64


## Neural Network

In [64]:
from keras.layers import Dense,Embedding,Input,Dropout,Conv1D
from keras.layers import SpatialDropout1D, Flatten,LSTM,GlobalMaxPooling1D
from keras.models import Model
from keras.optimizers import Adam
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from sklearn.metrics import roc_auc_score
from keras.utils import plot_model

def get_model():
    input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,))
    x = embedding_layer(input_layer)
    x = SpatialDropout1D(0.5)(x)
    x = LSTM(10, return_sequences=True)(x)
    x = Conv1D(5, kernel_size=2, padding="valid")(x)
    x = GlobalMaxPooling1D()(x)
    x = BatchNormalization()(x)
    x = Dropout(.2)(x)
    output_layer = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss="binary_crossentropy", optimizer=Adam())
    return model

In [65]:
with open(EMBEDDING_LAYER_FILE, 'rb') as f: embedding_layer = pickle.load(f)
with open(dataPath + 'comments.pkl', 'rb') as f: data = pickle.load(f)   

X_train = data[train_index]
X_val = data[val_index]

y_train = train.iloc[train_index][['toxic','severe_toxic','obscene','threat','insult','identity_hate']]
y_val = train.iloc[val_index][['toxic','severe_toxic','obscene','threat','insult','identity_hate']]

In [66]:
best_model_path = 'best_model.h5'
BATCH_SIZE = 512

early_stopping = EarlyStopping(patience=2)
model_checkpoint = ModelCheckpoint(best_model_path,
                                   save_best_only=True, save_weights_only=True)

model = get_model()
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 50, 300)           58741800  
_________________________________________________________________
spatial_dropout1d_5 (Spatial (None, 50, 300)           0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 50, 10)            12440     
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 49, 5)             105       
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 5)                 0         
_________________________________________________________________
batch_normalization_4 (Batch (None, 5)                 20        
__________

In [67]:
hist = model.fit(X_train, y_train,validation_data=(X_val, y_val),
                 epochs=28, batch_size=BATCH_SIZE, shuffle=True, verbose=2,
                 callbacks=[model_checkpoint, early_stopping])
model.load_weights(best_model_path)
val_pred = model.predict(X_val, batch_size=BATCH_SIZE, verbose=0)
print('validation AUC',roc_auc_score(y_val, val_pred))

Train on 127656 samples, validate on 31915 samples
Epoch 1/28
 - 110s - loss: 0.5745 - val_loss: 0.3872
Epoch 2/28
 - 121s - loss: 0.2940 - val_loss: 0.1857
Epoch 3/28
 - 131s - loss: 0.1503 - val_loss: 0.0980
Epoch 4/28
 - 133s - loss: 0.1048 - val_loss: 0.0725
Epoch 5/28
 - 121s - loss: 0.0882 - val_loss: 0.0642
Epoch 6/28
 - 105s - loss: 0.0797 - val_loss: 0.0586
Epoch 7/28
 - 116s - loss: 0.0755 - val_loss: 0.0559
Epoch 8/28
 - 123s - loss: 0.0720 - val_loss: 0.0541
Epoch 9/28
 - 114s - loss: 0.0692 - val_loss: 0.0537
Epoch 10/28
 - 124s - loss: 0.0678 - val_loss: 0.0531
Epoch 11/28
 - 149s - loss: 0.0663 - val_loss: 0.0520
Epoch 12/28
 - 112s - loss: 0.0660 - val_loss: 0.0512
Epoch 13/28
 - 112s - loss: 0.0643 - val_loss: 0.0519
Epoch 14/28
 - 113s - loss: 0.0632 - val_loss: 0.0503
Epoch 15/28
 - 113s - loss: 0.0628 - val_loss: 0.0509
Epoch 16/28
 - 117s - loss: 0.0623 - val_loss: 0.0506
validation AUC 0.9742115410944733


## Output file

In [70]:
test = pd.read_csv(dataPath + 'tc_test.csv',index_col=0)
print(test.shape)
test.head()

(89186, 1)


Unnamed: 0_level_0,comment_text
id,Unnamed: 1_level_1
00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
0000247867823ef7,== From RfC == \n\n The title is fine as it is...
00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
00017563c3f7919a,":If you have a look back at the source, the in..."
00017695ad8997eb,I don't anonymously edit articles at all.


In [72]:
test['comment_clean'] = test.comment_text.apply(preprocess)

In [73]:
test_seq = tokenizer.texts_to_sequences(test.comment_clean)
test_data = pad_sequences(test_seq,maxlen=MAX_SEQUENCE_LENGTH,padding='post',
                         truncating='post')
with open(dataPath + 'test_comments.pkl','wb') as f: pickle.dump(data, f, -1)

In [76]:
test_pred = model.predict(test_data, batch_size=BATCH_SIZE, verbose=0)

In [85]:
out = pd.DataFrame(test_pred, index=test.index)
out.columns = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
out.to_csv(dataPath + 'tc_output1.csv')

In [86]:
out.shape

(89186, 6)