In [1]:
from keras.preprocessing import sequence
from keras.models import Model, Input
from keras.layers import Dense, Embedding, GlobalMaxPooling1D, Dropout, BatchNormalization
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Apply the "ggplot" style sheet to matplotlib figures created from here on.
plt.style.use("ggplot")
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline

In [5]:
# Read the labelled comments from a CSV located relative to the working directory.
rawdata = pd.read_csv(filepath_or_buffer="CUTe_data.csv")

In [6]:
# Preview the first five rows to check the column layout.
rawdata.head(n=5)

Unnamed: 0,id,text,hate_speech,obscene,insulting
0,0,"""\n\nWikipedia an interesting yet frustrating ...",1,0,0
1,1,"""\n\nThanks\nThanks for reverting the vandalis...",0,0,0
2,2,(UTC)\n\nAnd fundamental Christains are not as...,1,1,1
3,3,"Why don't you go fuck your mom 05:52, 26 Jun ...",1,1,0
4,4,"Not to mention, he's a nobody. He's his bigges...",0,0,0


In [7]:
# Preprocessing / model settings shared by the cells below.

# Every comment is padded or truncated to this many word ids.
maxlen = 300
# Passed to the tokenizer as `num_words` and used as the embedding table size.
max_features = 20000
# Length of each word-embedding vector.
embedding_dims = 200
# Fraction of the rows held out for validation.
VALIDATION_SPLIT = 0.2

In [8]:
# Turn every comment into a fixed-length row of integer word ids.
#
# Missing comments (NaN after read_csv) are replaced by empty strings and every
# entry is coerced to str: the tokenizer lower-cases its input and would raise
# on a float NaN.
texts = rawdata["text"].fillna("").astype(str)

# NOTE(review): the vocabulary is fit on all rows, including the ones that are
# later held out for validation.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# `word_index` holds every token seen during fitting; it is not capped at
# `max_features`.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Pad (or truncate) each sequence to exactly `maxlen` ids -> 2D integer array.
data = pad_sequences(sequences, maxlen=maxlen)

Found 67252 unique tokens.


In [9]:
# Number of rows held out for validation; int() truncates the fractional part.
num_validation_samples = int(len(data) * VALIDATION_SPLIT)

In [10]:
# Training rows: everything before the held-out validation tail.
# An explicit non-negative split index is used because `data[:-n]` silently
# yields an empty array when n == 0 (since -0 == 0).
split_at = max(data.shape[0] - num_validation_samples, 0)
x_train = data[:split_at]

In [11]:
# Sanity check: dimensions of the training input array.
x_train.shape

(25308, 300)

In [12]:
# Multi-label target frame: one column per label.
label_columns = ["hate_speech", "obscene", "insulting"]
y = rawdata[label_columns]

In [13]:
# Preview the first five label rows.
y.head(n=5)

Unnamed: 0,hate_speech,obscene,insulting
0,1,0,0
1,0,0,0
2,1,1,1
3,1,1,0
4,0,0,0


In [14]:
# Ordered (unshuffled) hold-out split: the last `num_validation_samples` rows
# become the validation set, everything before them the training set.
#
# A single non-negative split index replaces the `[:-n]` / `[-n:]` slices,
# which misbehave when n == 0 (`-0 == 0`: the training set comes out empty and
# the "test" set is the whole dataset).
split_at = max(data.shape[0] - num_validation_samples, 0)

x_train = data[:split_at]
x_test = data[split_at:]
# `.iloc` makes the positional row slicing of the label frame explicit.
y_train = y.iloc[:split_at]
y_test = y.iloc[split_at:]

In [15]:
# Print the shape of each split: inputs first, then labels.
for split in (x_train, x_test, y_train, y_test):
    print(split.shape)

(25308, 300)
(6326, 300)
(25308, 3)
(6326, 3)


In [14]:
# Baseline classifier: embedding -> global max pooling -> 3 sigmoid outputs.
comment_input = Input(shape=(maxlen,))

# Map every word id of the padded comment to an `embedding_dims`-dimensional
# vector.
comment_emb = Embedding(
    input_dim=max_features,
    output_dim=embedding_dims,
    input_length=maxlen,
    embeddings_initializer="uniform",
)(comment_input)

# Collapse the sequence axis by keeping, for each embedding dimension, its
# maximum over all positions in the comment.
h = GlobalMaxPooling1D()(comment_emb)

# Project onto a three-unit output layer (one unit per label column) and
# squash each unit with a sigmoid.
output = Dense(units=3, activation="sigmoid")(h)

model = Model(inputs=comment_input, outputs=output)

# Binary cross-entropy loss, Adam optimizer constructed with 0.01.
model.compile(
    optimizer=Adam(0.01),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [23]:
    # Print the layer-by-layer summary of `model`.
    # NOTE(review): this statement is indented although no enclosing block is
    # visible in this cell — confirm the indentation is intentional.
    model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 300, 200)          4000000   
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 603       
Total params: 4,000,603
Trainable params: 4,000,603
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Train the baseline for 3 epochs, evaluating on the held-out split as
# validation data. Only the `.history` attribute of the fit result is kept.
training_run = model.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=3,
    validation_data=(x_test, y_test),
)
model_hist = training_run.history

Train on 25308 samples, validate on 6326 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [19]:
# Variant classifier: embedding -> global max pooling -> dropout -> 3 sigmoid
# outputs.
comment_input = Input(shape=(maxlen,))

# Map every word id of the padded comment to an `embedding_dims`-dimensional
# vector.
comment_emb = Embedding(
    input_dim=max_features,
    output_dim=embedding_dims,
    input_length=maxlen,
    embeddings_initializer="uniform",
)(comment_input)

# Collapse the sequence axis by keeping, for each embedding dimension, its
# maximum over all positions in the comment.
h = GlobalMaxPooling1D()(comment_emb)

# Dropout with rate 0.2 on the pooled features.
h = Dropout(rate=0.2)(h)

# Project onto a three-unit output layer (one unit per label column) and
# squash each unit with a sigmoid.
output = Dense(units=3, activation="sigmoid")(h)

model1 = Model(inputs=comment_input, outputs=output)

# Binary cross-entropy loss, Adam optimizer constructed with 0.01.
model1.compile(
    optimizer=Adam(0.01),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [20]:
# Print the layer-by-layer summary of `model1`.
model1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 300, 200)          4000000   
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 200)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 603       
Total params: 4,000,603
Trainable params: 4,000,603
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Train the dropout variant for 2 epochs, evaluating on the held-out split as
# validation data.
# NOTE(review): this rebinds `model_hist`, so the history stored by the fit of
# `model` is no longer reachable under that name.
training_run = model1.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=2,
    validation_data=(x_test, y_test),
)
model_hist = training_run.history

Train on 25308 samples, validate on 6326 samples
Epoch 1/2
Epoch 2/2
