In [1]:
import pandas as pd
import numpy as np

In [2]:
# Directory that holds the data files; 'train.csv' is later read from here.
# NOTE(review): hard-coded absolute local path -- adjust before running elsewhere.
folder='/home/saloni/Machine-learning/Toxic Comment Classification'

In [3]:
import os

In [4]:
def get_txt_and_tar(folder,filename):
    """Read a labelled comments CSV and return its texts and label vectors.

    Parameters
    ----------
    folder : str
        Directory containing the CSV file.
    filename : str
        Name of the CSV file inside ``folder``.

    Returns
    -------
    texts : list
        Values of the ``comment_text`` column, in file order.
    targets : list of numpy.ndarray
        One float array per row, built from every column from the third
        one onward (positional slice ``2:``).
    """
    data=pd.read_csv(os.path.join(folder,filename))
    texts=data['comment_text'].tolist()
    # Convert all label columns in one vectorised step instead of calling
    # data.iloc[i] once per row, which is extremely slow on large files.
    label_matrix=data.iloc[:,2:].values.astype(float)
    # Keep the original return type: a list with one 1-D array per row.
    targets=list(label_matrix)
    return texts,targets
    
    

In [5]:
# Load the training comments and their per-row label vectors from train.csv.
texts,target=get_txt_and_tar(folder,'train.csv')


In [6]:
# Sanity check: there should be exactly one label vector per comment.
len(texts),len(target)

(159571, 159571)

In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [8]:
# Every comment is padded/truncated to this many tokens.
MAX_SEQUENCE_LENGTH = 200
# Fraction of the rows (taken from the end) held out for validation.
VALIDATION_SPLIT = 0.05
def get_datasets(texts,targets,tokenizer=None):
    """Tokenise ``texts`` and split them into training and validation sets.

    Parameters
    ----------
    texts : sequence of str
        Raw comments.
    targets : sequence
        One label entry per text; must support ``len`` and slicing.
    tokenizer : Tokenizer, optional
        An already fitted tokenizer. When ``None`` a new ``Tokenizer`` is
        created and fitted on ``texts``.

    Returns
    -------
    tuple
        ``(tokenizer, word_index, x_train, y_train, x_val, y_val)``. The
        last ``int(VALIDATION_SPLIT * n)`` rows form the validation set.
        Rows are NOT shuffled, so the split follows the input order.

    Raises
    ------
    ValueError
        If ``targets`` does not have one entry per text.
    """
    if tokenizer is None:
        tokenizer=Tokenizer()
        tokenizer.fit_on_texts(texts)
    sequences=tokenizer.texts_to_sequences(texts)
    word_index=tokenizer.word_index
    data=pad_sequences(sequences,maxlen=MAX_SEQUENCE_LENGTH)
    n_rows=data.shape[0]
    if len(targets)!=n_rows:
        raise ValueError(
            "texts and targets must have the same length "
            "(got %d and %d)" % (n_rows,len(targets)))
    val_size=int(VALIDATION_SPLIT*n_rows)
    # Split on a positive index. Slicing with ``-val_size`` breaks when
    # val_size is 0: ``data[:-0]`` is empty and ``data[-0:]`` is everything,
    # which would swap the training and validation sets on small inputs.
    split=n_rows-val_size
    x_train=data[:split]
    y_train=targets[:split]
    x_val=data[split:]
    y_val=targets[split:]

    return tokenizer,word_index,x_train,y_train,x_val,y_val

In [9]:
# Fit a new tokenizer on the training texts and build the train/validation arrays.
tokenizer, word_index, x_train, y_train, x_val, y_val = get_datasets(texts, target)

In [10]:
from gensim.models import KeyedVectors


In [11]:
def load_glove_model(folder=None, filename='glove.6B.100d.txt'):
    """Load pre-trained word vectors from a word2vec-format text file.

    Parameters
    ----------
    folder : str, optional
        Directory containing the vector file. Defaults to the original
        hard-coded project directory when ``None``.
    filename : str, optional
        Name of the vector file inside ``folder``.

    Returns
    -------
    The object returned by ``KeyedVectors.load_word2vec_format``.

    Notes
    -----
    NOTE(review): the file is read with ``load_word2vec_format`` in text
    mode, so it presumably was converted from raw GloVe to word2vec text
    format (with the header line) beforehand -- confirm.
    """
    WORD2VEC_FOLDER="/home/saloni/Machine-learning/Toxic Comment Classification"
    if folder is None:
        folder = WORD2VEC_FOLDER
    word2vec = KeyedVectors.load_word2vec_format(
        os.path.join(folder, filename),
        binary=False)
    return word2vec

In [12]:
# Load the pre-trained word vectors used to initialise the embedding layer.
word2vec=load_glove_model()

In [34]:
#from gensim.models import word2vec

In [13]:
# Check the embedding dimensionality. Index the KeyedVectors object directly:
# the `.wv` attribute on KeyedVectors is deprecated and absent in gensim 4.
len(word2vec["apple"])

100

In [14]:
from keras.layers import *
from keras.models import Model

In [17]:
def get_embedding_layer(word_index, gensim_model):
    """Build a trainable Keras ``Embedding`` layer seeded with pre-trained vectors.

    Parameters
    ----------
    word_index : dict
        Mapping ``word -> integer index`` (index 0 is left for padding).
    gensim_model
        Either a keyed-vectors object supporting ``word in obj`` and
        ``obj[word]``, or a model exposing such an object as ``.wv``.

    Returns
    -------
    Embedding
        Layer with ``len(word_index) + 1`` rows. Words without a
        pre-trained vector keep an all-zero row.
    """
    # Accept both a full model (vectors under `.wv`) and bare keyed vectors.
    # Going through `.wv.vocab` unconditionally breaks on gensim 4, where
    # KeyedVectors has neither attribute.
    vectors = getattr(gensim_model, 'wv', gensim_model)
    # Prefer the dimensionality reported by the vectors; 100 is the
    # original hard-coded value, kept as a fallback.
    embedding_dim = getattr(vectors, 'vector_size', 100)
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if word in vectors:
            embedding_matrix[i] = vectors[word]
    embedding_layer = Embedding(vocab_size,
            embedding_dim,
            weights=[embedding_matrix],
            input_length=MAX_SEQUENCE_LENGTH,
            trainable=True)
    return embedding_layer

In [18]:
# Build the embedding layer from the tokenizer vocabulary and the pre-trained vectors.
embedding_layer = get_embedding_layer(word_index, word2vec)

In [23]:
# Number of output units of the network (one per predicted label).
N_TARGET_CLASSES = 6
def get_convnet_model(embedding_layer):
    """Assemble the 1-D convolutional classifier on top of ``embedding_layer``.

    The network takes integer sequences of length ``MAX_SEQUENCE_LENGTH``,
    embeds them, applies two Conv1D + MaxPooling1D stages, flattens, runs
    two ReLU dense layers and ends in ``N_TARGET_CLASSES`` sigmoid units.

    Returns the (uncompiled) Keras ``Model``.
    """
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    tensor = embedding_layer(sequence_input)

    # Layers are listed in the order they are applied.
    layer_stack = [
        Conv1D(128, 5, activation='relu'),
        MaxPooling1D(5),
        Conv1D(64, 5, activation='relu'),
        MaxPooling1D(5),
        Flatten(),
        Dense(32, activation='relu'),
        Dense(16, activation='relu'),
        Dense(N_TARGET_CLASSES, activation='sigmoid'),
    ]
    for layer in layer_stack:
        tensor = layer(tensor)

    return Model(sequence_input, tensor)

In [24]:
# Instantiate the convolutional classifier on top of the embedding layer.
model = get_convnet_model(embedding_layer)


In [25]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 100)          21033800  
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 196, 128)          64128     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 35, 64)            41024     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 7, 64)             0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 448)               0         
__________

In [27]:
# Configure training: binary cross-entropy loss with the Adagrad optimizer.
model.compile(
    optimizer='adagrad',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Train for two epochs, evaluating on the held-out validation rows after each.
model.fit(
    np.array(x_train),
    np.array(y_train),
    validation_data=(np.array(x_val), np.array(y_val)),
    batch_size=32,
    epochs=2,
    verbose=1,
)

Train on 151593 samples, validate on 7978 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f212073eb70>

In [48]:
# Train for one additional epoch on the same data, continuing from the
# weights the model currently holds.
model.fit(
    np.array(x_train),
    np.array(y_train),
    validation_data=(np.array(x_val), np.array(y_val)),
    batch_size=32,
    epochs=1,
    verbose=1,
)

Train on 151593 samples, validate on 7978 samples
Epoch 1/1


<keras.callbacks.History at 0x7f21207ad748>

In [28]:
# Load the unlabeled test comments. The path is built from `folder`, like
# the training load, instead of repeating the absolute directory.
test=pd.read_csv(os.path.join(folder,"test.csv"))

In [29]:
# Preview the first rows of the test set.
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [30]:
# List the columns available in the test set.
test.columns

Index(['id', 'comment_text'], dtype='object')

In [31]:
# Keep the comment ids; they become the 'id' column of the output CSV.
test_id=test['id']

In [32]:
# Number of test comments.
len(test_id)

153164

In [33]:
# Raw comment texts to run through the tokenizer and the model.
test_text=test['comment_text']

In [34]:
# Shapes of the training inputs and labels, for reference.
np.array(x_train).shape,np.array(y_train).shape

((151593, 200), (151593, 6))

In [35]:
# Encode the test comments with the tokenizer that was fitted on the training
# texts (it is reused as-is, not refitted), then pad them to
# MAX_SEQUENCE_LENGTH exactly like the training data.
test_sequences=tokenizer.texts_to_sequences(test_text)
data_test=pad_sequences(test_sequences,maxlen=MAX_SEQUENCE_LENGTH)

In [36]:
# Shape of the padded test matrix.
np.array(data_test).shape

(153164, 200)

In [49]:
# Run the trained model on the padded test sequences; each row holds the
# outputs of the N_TARGET_CLASSES sigmoid units.
predictions=model.predict(np.array(data_test),batch_size=64,verbose=0)

In [44]:
# Check that there is one prediction row per test comment.
predictions.shape,len(predictions)

((153164, 6), 153164)

In [50]:
# Inspect the predicted scores for the first test comment.
predictions[0]

array([ 0.99190873,  0.32051873,  0.94017857,  0.08319982,  0.82984388,
        0.18676686], dtype=float32)

In [51]:
# Split the (n_samples, 6) prediction matrix into one vector per label.
# Transposing and unpacking replaces a Python loop that appended one value
# at a time for every test row; column k still maps to the k-th name below.
# NOTE(review): assumes the model's output order matches these label names
# (i.e. the label column order of train.csv) -- confirm.
toxic, severe_toxic, obscene, threat, insult, identity_hate = predictions.T
    

In [52]:
# Assemble the output table. The column order is pinned explicitly so that it
# does not depend on dict ordering (older Python/pandas sort dict keys).
output = pd.DataFrame(
    data={"id":test_id,
          "toxic":toxic,
          "severe_toxic":severe_toxic,
          "obscene":obscene,
          "threat":threat,
          "insult":insult,
          "identity_hate":identity_hate},
    columns=["id","toxic","severe_toxic","obscene","threat","insult","identity_hate"])
# quoting=3 is csv.QUOTE_NONE: fields are written without any quoting.
output.to_csv( "../Toxic Comment Classification/output_toxic1(4ep).csv", index=False, quoting=3 )

<b>Score: 0.9488</b> on the leaderboard.