In [1]:
from keras.preprocessing import sequence
from keras.models import Model, Input
from keras.layers import Dense, Embedding, GlobalMaxPooling1D, Dropout, LSTM
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Apply matplotlib's "ggplot" style sheet to any figures drawn in this notebook.
plt.style.use("ggplot")
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

In [3]:
# Load the labelled comments. The path is relative, so it is resolved against
# the kernel's current working directory.
rawdata = pd.read_csv(filepath_or_buffer="CUTe_data.csv")

In [4]:
# Preview the first rows to confirm the file parsed into the expected columns
# (id, text, and the three label columns used further down).
rawdata.head()

Unnamed: 0,id,text,hate_speech,obscene,insulting
0,0,"""\n\nWikipedia an interesting yet frustrating ...",1,0,0
1,1,"""\n\nThanks\nThanks for reverting the vandalis...",0,0,0
2,2,(UTC)\n\nAnd fundamental Christains are not as...,1,1,1
3,3,"Why don't you go fuck your mom 05:52, 26 Jun ...",1,1,0
4,4,"Not to mention, he's a nobody. He's his bigges...",0,0,0


In [5]:
# --- Text vectorisation ---
# Vocabulary cap handed to the Tokenizer and used as the Embedding input size
# (intended to keep only the 20,000 most frequent words).
max_features = 20000
# Every comment is padded or truncated to this many tokens.
maxlen = 300

# --- Model ---
# Width of each learned word vector.
embedding_dims = 60

# --- Evaluation ---
# Fraction of rows held out from training.
VALIDATION_SPLIT = 0.2

In [6]:
# Build the vocabulary from the comment text, capped at `max_features`.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(rawdata["text"])

# word_index holds every token seen during fitting, not just the capped set,
# so the count printed here can exceed max_features.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Encode each comment as a list of integer word ids, then pad/truncate the
# lists into a 2D integer array with `maxlen` columns.
sequences = tokenizer.texts_to_sequences(rawdata["text"])
data = pad_sequences(sequences, maxlen=maxlen)

Found 67252 unique tokens.


In [7]:
# Number of rows to hold out: VALIDATION_SPLIT of all rows, truncated to an int.
num_validation_samples = int(len(data) * VALIDATION_SPLIT)

In [8]:
# Training portion: every row except the last `num_validation_samples`.
# An explicit end index is used because `data[:-0]` is an empty slice, which
# would silently yield no training rows if the validation count were zero.
x_train = data[:len(data) - num_validation_samples]

In [9]:
# Sanity check: (rows kept for training, padded sequence length).
x_train.shape

(25308, 300)

In [10]:
# Target frame: the three label columns, in the same row order as `rawdata`
# (and therefore as `data`, which was built from rawdata's text column).
y = rawdata.loc[:, ["hate_speech", "obscene", "insulting"]]

In [11]:
# Preview the label frame: one column per label, several of which can be set
# on the same row (multi-label targets).
y.head()

Unnamed: 0,hate_speech,obscene,insulting
0,1,0,0
1,0,0,0
2,1,1,1
3,1,1,0
4,0,0,0


In [11]:
# Hold out the last `num_validation_samples` rows. Nothing is shuffled, so the
# split follows the row order of the source file.
#
# The boundary is computed once and used as a positive index. With negative
# slicing, a validation count of zero would make `[:-0]` an empty training set
# and `[-0:]` a held-out set containing every row.
split_at = len(data) - num_validation_samples

x_train = data[:split_at]
x_test = data[split_at:]
y_train = y[:split_at]
y_test = y[split_at:]

In [12]:
# Report the shape of each split; features and labels of the same split should
# agree on their row count.
for split in (x_train, x_test, y_train, y_test):
    print(split.shape)

(25308, 300)
(6326, 300)
(25308, 3)
(6326, 3)


In [13]:
# Multi-label comment classifier:
#   token ids -> Embedding -> LSTM (per-timestep outputs) -> global max pool
#   -> Dropout -> Dense(70, relu) -> Dropout -> 3 sigmoid outputs
comment_input = Input((maxlen,))

# Map each vocabulary index to a trainable vector of size `embedding_dims`.
comment_emb = Embedding(max_features, embedding_dims, input_length=maxlen)(comment_input)

# return_sequences=True keeps one 90-dim vector per timestep, so the pooling
# layer below has a time axis to reduce over.
lstm_out = LSTM(90, return_sequences=True, name='lstm_layer')(comment_emb)

# Collapse the time axis by taking the maximum of each LSTM feature.
h = GlobalMaxPooling1D()(lstm_out)

# Fully connected head. It has to be chained onto the pooled tensor `h`.
# It was previously applied to the un-pooled LSTM output while the output layer
# read from `h`, so these three layers never became part of the model.
h = Dropout(0.1)(h)
h = Dense(70, activation="relu")(h)
h = Dropout(0.1)(h)

# Three-unit output, one sigmoid per label column (hate_speech, obscene,
# insulting), so each label is predicted independently of the others.
output = Dense(3, activation='sigmoid')(h)

model = Model(inputs=comment_input, outputs=output)

# binary_crossentropy pairs with the per-label sigmoid outputs.
# NOTE(review): Adam(0.01) is ten times the Keras default step size; confirm
# that this is intentional.
model.compile(loss='binary_crossentropy',
              optimizer=Adam(0.01),
              metrics=['accuracy'])

In [19]:
    # Print the layer-by-layer architecture and parameter counts.
    # NOTE(review): the output saved under this cell reports an embedding width
    # of 200 and a 60-unit LSTM, which does not match the model cell above
    # (embedding_dims = 60, LSTM(90)). The cell's execution count is also out
    # of sequence, so re-run it after rebuilding the model.
    # NOTE(review): the leading indentation of this cell is not needed.
    model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 300, 200)          4000000   
_________________________________________________________________
lstm_layer (LSTM)            (None, 300, 60)           62640     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 60)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 183       
Total params: 4,062,823
Trainable params: 4,062,823
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Configuration tag for this run: dense 70, lstm 90.
# Train for up to 10 epochs in mini-batches of 32, evaluating on the held-out
# split (x_test, y_test) after each epoch. Only the `.history` attribute of the
# object returned by fit() is kept.
model_hist = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_test, y_test),
).history

Train on 25308 samples, validate on 6326 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

KeyboardInterrupt: 