In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import os

#### Load GloVe embeddings

In [2]:
embeddings_index = {}

f = open ('glove.6B.100d.txt','r', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word]=coefs

f.close()
print ('Found %s word embeddings'%(len(embeddings_index)))

Found 400000 word embeddings


### Load comments

In [3]:
data = pd.read_csv('./data/toxic-comments/train.csv')

In [4]:
sentences = data['comment_text']
target_values = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
labels = data[target_values].values

print ('Read %d sentences'%(len(sentences)))
print ('Read %d labels'%(len(labels)))

Read 159571 sentences
Read 159571 labels


In [5]:
print ('Longest sentence has %d words'%(max (len (s) for s in sentences)))
print ('Smallest sentence has %d words'%(min (len (s) for s in sentences)))

Longest sentence has 5000 words
Smallest sentence has 6 words


In [6]:
MAX_WORDS=20000
BATCH_SIZE=16
EPOCHS=2
OOV_TOKEN=0
EMBEDDING_DIM = 100
MAX_SEQ_LENGTH=100
VALIDATION_SPLIT_RATIO= 0.3

### Tokenize comments

In [7]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_WORDS, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(sentences)
word2Idx = tokenizer.word_index

print ('Found %d unique words'%(len(word2Idx)))
print (word2Idx['girl'])

sequences = tokenizer.texts_to_sequences(sentences)
print ('Found %d sequences'%(len(sequences)))

padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='pre', maxlen=MAX_SEQ_LENGTH)
print ('Created %d padded sequences'%(len(padded_sequences)))

Found 210338 unique words
1995
Found 159571 sequences
Created 159571 padded sequences


In [8]:
num_words = min (MAX_WORDS, len(word2Idx)+1)
print ('Min words to be considered are %d'%(num_words))

loaded_embeddings_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word2Idx.items():
    if (i<num_words):
        if word in embeddings_index.keys():
            embedding_vector = embeddings_index[word]
            loaded_embeddings_matrix[i] = embedding_vector

print (loaded_embeddings_matrix.shape)

Min words to be considered are 20000
(20000, 100)


### Build Model

In [9]:
l0 = tf.keras.layers.Embedding(input_dim = num_words,
                               output_dim = EMBEDDING_DIM,
                               input_length=MAX_SEQ_LENGTH, 
                               embeddings_initializer=tf.keras.initializers.Constant(loaded_embeddings_matrix),
                               trainable=False)

In [10]:
input = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH))
x = l0(input)
x = tf.keras.layers.Conv1D(filters=5, kernel_size=2, activation='relu')(x)
x = tf.keras.layers.MaxPooling1D(pool_size=2)(x)
#x = tf.keras.layers.Conv1D(filters=5, kernel_size=2, activation='relu')(x)
#x = tf.keras.layers.MaxPooling1D(pool_size=2)(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
x = tf.keras.layers.Dense(units=32, activation='relu')(x)
output = tf.keras.layers.Dense(units=6, activation='sigmoid')(x)

model = tf.keras.models.Model(input, output)
model.compile (optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 100)          2000000   
_________________________________________________________________
conv1d (Conv1D)              (None, 99, 5)             1005      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 49, 5)             0         
_________________________________________________________________
global_average_pooling1d (Gl (None, 5)                 0         
_________________________________________________________________
dense (Dense)                (None, 32)                192       
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 198   

In [11]:
from sklearn.model_selection import train_test_split

train_sentences, test_sentences, train_targets, test_targets = train_test_split (padded_sequences, labels)
print (train_sentences.shape)
print (test_sentences.shape)
print (train_targets.shape)

(119678, 100)
(39893, 100)
(119678, 6)


In [12]:
model.fit (train_sentences, train_targets, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=VALIDATION_SPLIT_RATIO)

W0824 23:39:46.513308  7748 deprecation.py:323] From C:\MachineLearning\anaconda\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 83774 samples, validate on 35904 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1e516e413c8>

In [13]:
predicted_labels = model.predict(test_sentences)
print (predicted_labels.shape)
print (test_targets.shape)

(39893, 6)
(39893, 6)


In [14]:
test_comment = ['You are a asshole']
test_seq = tokenizer.texts_to_sequences(test_comment)
padded_test_seq = tf.keras.preprocessing.sequence.pad_sequences(test_seq, maxlen=MAX_SEQ_LENGTH)
predicted_target = model.predict(padded_test_seq)
print (predicted_target)

[[0.2030863  0.01227964 0.08545672 0.00386927 0.07395536 0.01034679]]
