# Word2Vec Embedding

In [1]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
dataset = pd.read_csv('train_small.csv')
dataset.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
x_text = np.array(dataset.iloc[1:,1])
y = np.array(dataset.iloc[1:,2:], dtype='float32')

In [4]:
from nltk.corpus import stopwords
import string

def normalize(x_text):
    stop = stopwords.words('english')
    res = [' '.join([t for t in doc.split() if t not in string.punctuation and t not in  stop]) for doc in x_text]
    return res

In [5]:
def tokenize_data(x_text):
    tokenizer = Tokenizer(lower=True)
    tokenizer.fit_on_texts(x_text)
    x_tokenized_doc = []
    for doc in x_text:
        tk_doc = tokenizer.texts_to_sequences(doc)
        x_tk_optimized = [x_doc for x_doc in tk_doc if len(x_doc)>0]
        x_tokenized_doc.append(x_tk_optimized)
    return x_tokenized_doc, tokenizer

In [6]:
x_text = normalize(x_text)
x_tokenized, x_tk= tokenize_data(x_text)
index_to_words = {id: word for word, id in x_tk.word_index.items()}
max_vocab = len(set([word for word, id in x_tk.word_index.items()]))
max_len = max([len(item) for item in x_tokenized])
print(max_len)
print(max_vocab)

2909
9978


In [7]:
x_array = []
for x_doc in x_tokenized:
    x_list = [x_item[0] for x_item in x_doc]
    x_array.append(x_list)

x_all =pad_sequences(x_array, maxlen=2000, dtype='int32', padding='post', truncating='post', value=0.0)
print(x_all)
x = np.array(x_all)

[[1528  812   24 ...,    0    0    0]
 [2452  281 2153 ...,    0    0    0]
 [ 623  927 1001 ...,    0    0    0]
 ..., 
 [  63  147   24 ...,    0    0    0]
 [   1  584 1001 ...,    0    0    0]
 [ 584    1  584 ...,    0    0    0]]


In [13]:
print(x.shape)
print(y.shape[1])

(14,)
6


In [77]:
model = Sequential()
model.add(Embedding(max_vocab, 100, input_length=2000))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(y.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 2000, 100)         997800    
_________________________________________________________________
dense_35 (Dense)             (None, 2000, 256)         25856     
_________________________________________________________________
dropout_23 (Dropout)         (None, 2000, 256)         0         
_________________________________________________________________
dense_36 (Dense)             (None, 2000, 128)         32896     
_________________________________________________________________
dropout_24 (Dropout)         (None, 2000, 128)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 256000)            0         
_________________________________________________________________
dense_37 (Dense)             (None, 6)                 1536006   
Total para

In [79]:
num_epoch=50
checkpoint = ModelCheckpoint(filepath='best_model_1.hdf5', save_best_only=True)
hist = model.fit(x, y, batch_size=10, epochs=num_epoch, validation_split=0.2, callbacks=[checkpoint], shuffle=True, verbose=2)

Train on 799 samples, validate on 200 samples
Epoch 1/200


KeyboardInterrupt: 