Import lib

In [1]:
from __future__ import print_function, division
from builtins import range


import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from sklearn.metrics import roc_auc_score


Using TensorFlow backend.


### Define config

In [0]:
MAX_SEQUENCE_LENGTH=100
MAX_VOCAB_SIZE=20000
EMBEDDING_DIM=100
VALIDATION_SPLIT=0.2
BATCH_SIZE=128
EPOCHS=10

In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
downloaded_train = drive.CreateFile({'id':"1UclYAm4IBj8AispSJ6OWRn85l-xzG38z"})
downloaded_train.GetContentFile('glove.6B.100d.txt')


In [0]:
downloaded_train = drive.CreateFile({'id':"1FNgBMYz3eBOXhpfn-CBrlZLg1FyXs3Lu"})
downloaded_train.GetContentFile('train_toxic.csv')

### 1. Load pretrained word vectors

In [0]:
word2vec={}
#with open(os.path.join('C:\Users\Ruchika\Downloads\glove.6B.zip\glove.6B.100d.txt'% EMBEDDING_DIM)) as f:
f=open('glove.6B.100d.txt', encoding='utf8')
for line in f:
    values=line.split()
    word=values[0]
    vec=np.asarray(values[1:],dtype='float32')
    word2vec[word]=vec

### 2. Load train data

In [0]:
train=pd.read_csv('train_toxic.csv')
sentences=train["comment_text"]
labels=["toxic","severe_toxic", "obscene","threat","insult","identity_hate"]
targets=train[labels].values


In [8]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [9]:
train.describe

<bound method NDFrame.describe of                       id  ... identity_hate
0       0000997932d777bf  ...             0
1       000103f0d9cfb60f  ...             0
2       000113f07ec002fd  ...             0
3       0001b41b1c6bb37e  ...             0
4       0001d958c54c6e35  ...             0
...                  ...  ...           ...
159566  ffe987279560d7ff  ...             0
159567  ffea4adeee384e90  ...             0
159568  ffee36eab5c267c9  ...             0
159569  fff125370e4aaaf3  ...             0
159570  fff46fc426af1f9a  ...             0

[159571 rows x 8 columns]>

In [10]:
print('max length',max(len(s) for s in sentences))
print('min length',min(len(s) for s in sentences))

max length 5000
min length 5


### 3. NLP
Tokenization

In [11]:
tokenizer=Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
sequences=tokenizer.texts_to_sequences(sentences)

wordtoindex=tokenizer.word_index
print(len(wordtoindex))


210443


  Padding

In [0]:
data=pad_sequences(sequences,maxlen=MAX_SEQUENCE_LENGTH)


### 3. Embedding Matrix

In [0]:
num_words=min(MAX_VOCAB_SIZE,len(wordtoindex)+1)
embedding_matrix=np.zeros((20000,100))  # num_words =20,000 and embedding dim=100
for word, i in wordtoindex.items():
# fetch vector from pretrained model
  if i<MAX_VOCAB_SIZE:
    embedding_vector=word2vec.get(word)
    if embedding_vector is not None:
      embedding_matrix[i]=embedding_vector

### 4. Load pretrained word embeddings

In [14]:
embedding_layer=Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)




### 5. Build model 

A. CNN

In [15]:
inputlayer=Input(shape=(MAX_SEQUENCE_LENGTH,))
x=embedding_layer(inputlayer)
x=Conv1D(128,3,activation='relu')(x)
x=MaxPooling1D(3)(x)
x=Conv1D(128,3,activation='relu')(x)
x=MaxPooling1D(3)(x)
x=Conv1D(128,3,activation='relu')(x)
x=GlobalMaxPooling1D()(x)
x=Dense(128, activation='relu')(x)
output=Dense(len(labels),activation='sigmoid')(x)


model=Model(inputlayer,output)
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])












Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [16]:
model.fit(data,targets,batch_size=BATCH_SIZE,epochs=EPOCHS,validation_split=VALIDATION_SPLIT)



Train on 127656 samples, validate on 31915 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff37ef612e8>

### CNN AUC 

In [17]:
p=model.predict(data)
aucs=[]
for j in range(6):
    auc= roc_auc_score(targets[:,j],p[:,j])
    aucs.append(auc)

print(np.mean(aucs))


0.9756246910567611


### B. LSTM

In [0]:
from keras.layers import LSTM
inputlayer=Input(shape=(MAX_SEQUENCE_LENGTH,))
x=embedding_layer(inputlayer)
x=LSTM(15,return_sequences=True)(x)
x=GlobalMaxPooling1D()(x)
output=Dense(len(labels),activation='sigmoid')(x)

model=Model(inputlayer,output)
model.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

In [19]:
model.fit(data,targets,batch_size=BATCH_SIZE,epochs=EPOCHS,validation_split=VALIDATION_SPLIT)

Train on 127656 samples, validate on 31915 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff3700949e8>

In [20]:
p=model.predict(data)
aucs=[]
for j in range(6):
    auc= roc_auc_score(targets[:,j],p[:,j])
    aucs.append(auc)

print(np.mean(aucs))

0.9736022437907174
