In [3]:
import os
os.chdir("desktop")

In [192]:
import re
import string
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Input, LSTM, Bidirectional, Conv1D
from keras.layers.embeddings import Embedding

from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D
from keras.models import Model
import matplotlib as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


In [102]:
from sklearn.model_selection import train_test_split

In [26]:
import nltk

from nltk.collocations import *
from nltk import FreqDist,WordNetLemmatizer

In [167]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [170]:
train= train.dropna()
train = train[train.comment_text.apply(lambda x: x !="")]


In [171]:
test= test.dropna()
test = test[test.comment_text.apply(lambda x: x !="")]

In [172]:

CATEGORIES = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y = train[CATEGORIES].values

In [173]:
def dataClean(tweets_raw):
    text = tweets_raw.translate(string.punctuation)
    text = text.lower().split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)
    text = (text.encode('ascii', 'ignore')).decode("utf-8")
    text = re.sub(r'[<>!#@$:.,%\?-]+', r'', text)
    text = re.sub(r'@\w+', r'', text)
    return text

In [174]:
train['comment_text'] = train['comment_text'].map(lambda x: dataClean(x))
test['comment_text'] = test['comment_text'].map(lambda x: dataClean(x))

In [178]:

test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,yo bitch ja rule is more succes then you'll ev...
1,0000247867823ef7,== from rfc == ++ the titl is fine as it is imo
2,00013b17ad220c46,""" ++ == sourc == ++ * zaw ashton on lapland / """
3,00017563c3f7919a,if you have a look back at the source the info...
4,00017695ad8997eb,i don't anonym edit articl at all


In [180]:
X_train = train["comment_text"].str.lower()
X_test = test["comment_text"].str.lower()

In [181]:
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train))
tokenized_train = tokenizer.texts_to_sequences(X_train)
tokenized_test = tokenizer.texts_to_sequences(X_test)
maxlen = 300
X_tr = pad_sequences(tokenized_train, maxlen=maxlen)
X_te = pad_sequences(tokenized_test, maxlen=maxlen)

In [182]:
print(X_t.shape,X_te.shape)



(19886, 50) (153164, 300)


In [183]:
embeddings_index = dict()
f = open('I:\\glove.840B.300d.txt\\glove.840B.300d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = ''.join(values[:-300])
    coefs = np.asarray(values[-300:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 2195893 word vectors.


In [184]:
embedding_matrix = np.zeros((vocabulary_size, 300))
for word, index in tokenizer.word_index.items():
    if index > vocabulary_size - 1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

In [200]:
inp = Input(shape=(maxlen,))

x = Embedding(max_features, 300, weights=[embedding_matrix], trainable=True)(inp)
x = Conv1D(kernel_size=3, filters=30, padding='same',activation='tanh', strides=1)(x)

x = Dropout(0.5)(x)
x = Bidirectional(LSTM(200, return_sequences=True, dropout=0.5, recurrent_dropout=0.25))(x)


avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])

out = Dense(6, activation='sigmoid')(x)

model = Model(inp, out)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


batch_size = 50
epochs = 1

In [201]:
print("start fitting...")

model.fit(X_tr,y, batch_size=batch_size, epochs=epochs, validation_split=0.4)
model.summary()




start fitting...
Train on 95742 samples, validate on 63829 samples
Epoch 1/1
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_20 (Embedding)        (None, 300, 300)     6000000     input_9[0][0]                    
__________________________________________________________________________________________________
conv1d_19 (Conv1D)              (None, 300, 30)      27030       embedding_20[0][0]               
__________________________________________________________________________________________________
dropout_14 (Dropout)            (None, 300, 30)      0           conv1d_19[0][0]                  
________________________________

In [202]:
y_pred = model.predict(X_te,batch_size=50,verbose=1)
submission = pd.read_csv('sample_submission.csv')
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('submission_glove.csv', index=False)

