In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [63]:
from keras.models import Model
from keras.layers import Dense, Embedding, Input, GRU
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPool1D, Dropout, concatenate
from keras.preprocessing import text, sequence

from keras.callbacks import EarlyStopping, ModelCheckpoint

In [39]:
max_features = 20000
maxlen= 100

In [40]:
train = pd.read_csv("Data/train.csv")
#test = pd.read_csv("Data/test.csv")

print (train.head(10))

list_sentences_train = train["comment_text"].fillna("unknown").values

list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

y = train[list_classes].values

#list_sentences_test = test["comment_text"].fillna("unknown").values

          id                                       comment_text  toxic  \
0   22256635  Nonsense?  kiss off, geek. what I said is true...      1   
1   27450690  "\n\n Please do not vandalize pages, as you di...      0   
2   54037174  "\n\n ""Points of interest"" \n\nI removed the...      0   
3   77493077  Asking some his nationality is a Racial offenc...      0   
4   79357270  The reader here is not going by my say so for ...      0   
5   82428052      Fried chickens \n\nIs dat sum fried chickens?      0   
6   87311443  Why can you put English for example on some pl...      0   
7  114749757  Guy Fawkes \n\nim a resident in bridgwater and...      0   
8  138560519  as far as nicknames go this article is embarra...      0   
9  139353149  Woodland Meadows\nGood to hear that you correc...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2          

In [41]:
print (list_sentences_train[0])
y[0]

Nonsense?  kiss off, geek. what I said is true.  I'll have your account terminated.


array([1, 0, 0, 0, 0, 0])

In [42]:
# Convert Each word in comment into a token
tokenizer = text.Tokenizer(num_words= max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

#### Train Data
#tranform the text into sequences
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)

#transform a list of samples sequences (lists of scalars) into a 2D Numpy array
X_t = sequence.pad_sequences(list_tokenized_train, maxlen= maxlen)

#### Test Data
#list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
#X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)


## Model Architecture

In [61]:
def cnn_rnn():
    embed_size = 256
    inp = Input(shape = (maxlen, ))
    main = Embedding(max_features, embed_size)(inp)
    main = Dropout(0.2)(main)
    main = Conv1D(filters=32, kernel_size=2, padding='same', activation='relu')(main)
    main = MaxPooling1D(pool_size=2) (main)
    main = Conv1D(filters=32, kernel_size=2, padding='same', activation='relu')(main)
    main = MaxPooling1D(pool_size=2)(main)
    main= GRU(32)(main)
    
    main= Dense(16, activation='relu')(main)
    main = Dense(6, activation='sigmoid')(main)
    model = Model(inputs=inp, outputs=main)
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metric = ['accuracy'])
    model.summary()
    
    return model
    
    

In [62]:
model = cnn_rnn()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_9 (Embedding)      (None, 100, 256)          5120000   
_________________________________________________________________
dropout_9 (Dropout)          (None, 100, 256)          0         
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 100, 32)           16416     
_________________________________________________________________
max_pooling1d_17 (MaxPooling (None, 50, 32)            0         
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 50, 32)            2080      
_________________________________________________________________
max_pooling1d_18 (MaxPooling (None, 25, 32)            0         
__________

In [45]:
from sklearn.model_selection import train_test_split

print ("positive labels")
any_category_positive = np.sum(y,1)
print(pd.value_counts(any_category_positive))
X_t_train, X_t_test, y_train, y_test = train_test_split(X_t, y, 
                                                        test_size = 0.10, 
                                                        )
print('Training:', X_t_train.shape)
print('Testing:', X_t_test.shape)

positive labels
0    86061
1     3833
3     2523
2     2107
4     1076
5      231
6       20
dtype: int64
Training: (86265, 100)
Testing: (9586, 100)


In [65]:
batch_size = 128 
epochs = 3

file_path="model_best.h5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
#early = EarlyStopping(monitor="val_loss", mode="min", patience=20)

callbacks_list = [checkpoint]#, early] #early

model.fit(X_t_train, y_train, 
          validation_data=(X_t_test, y_test),
          batch_size=batch_size, 
          epochs=epochs, 
          shuffle = True,
          callbacks=callbacks_list)
model.save('Whole_model.h5')



Train on 86265 samples, validate on 9586 samples
Epoch 1/3


TypeError: run() got an unexpected keyword argument 'metric'

In [66]:
batch_size = 128 
epochs = 3

file_path="model_best.h5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
#early = EarlyStopping(monitor="val_loss", mode="min", patience=20)

callbacks_list = [checkpoint]#, early] #early

model.fit(X_t, y, 
          validation_split =0.1,
          batch_size=batch_size, 
          epochs=epochs, 
          shuffle = True,
          callbacks=callbacks_list)
model.save('Whole_model.h5')

Train on 86265 samples, validate on 9586 samples
Epoch 1/3


TypeError: run() got an unexpected keyword argument 'metric'