In [None]:
import numpy as np 
import pandas as pd
from keras.models import Model
from keras.layers import Dense, Embedding, Input, GRU
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPool1D, Dropout, concatenate
from keras.preprocessing import text, sequence

from keras.callbacks import EarlyStopping, ModelCheckpoint
import pandas as pd
import numpy as np

from keras.models import Sequential
from keras.layers import Flatten
from keras.optimizers import Adam
import keras

In [2]:
path = '/axp/rim/imsadsml/warehouse/sagra39/Kaggle/toxic/'
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')
print(train.shape, test.shape)

(95851, 8) (226998, 2)


In [3]:
list_sentences_train = train["comment_text"].fillna("unknown").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("unknown").values

In [4]:
max_features = 20000
maxlen = 100

In [5]:
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

In [6]:
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)

list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

In [7]:
from sklearn.model_selection import train_test_split

print('Positive Labels ')
any_category_positive = np.sum(y,1)
print(pd.value_counts(any_category_positive))

X_t_train, X_t_test, y_train, y_test = train_test_split(X_t, y,test_size = 0.20)
print('Training:', X_t_train.shape)
print('Testing:', X_t_test.shape)

Positive Labels 
0    86061
1     3833
3     2523
2     2107
4     1076
5      231
6       20
dtype: int64
Training: (76680, 100)
Testing: (19171, 100)


In [26]:
model = Sequential()
model.add(Embedding(max_features, 256, input_length=maxlen))
model.add(Dropout(0.2))
model.add(Conv1D(filters=32, kernel_size=2, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(Conv1D(filters=32, kernel_size=2, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(GRU(32))
model.add(Dense(16, activation="relu"))
model.add(Dense(6, activation='sigmoid'))

In [27]:
Adam = keras.optimizers.Adam(lr=0.002)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [28]:
file_path="model_best.h5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)

callbacks_list = [checkpoint, early] #early

In [30]:
model.fit(X_t_train, y_train,
          validation_data=(X_t_test, y_test),
          batch_size=50,
          epochs=2,
          shuffle = True,
          verbose=2)
model.save('Whole_model.h5')

Train on 76680 samples, validate on 19171 samples
Epoch 1/2
364s - loss: 0.0425 - acc: 0.9839 - val_loss: 0.0533 - val_acc: 0.9804
Epoch 2/2
361s - loss: 0.0379 - acc: 0.9853 - val_loss: 0.0559 - val_acc: 0.9812


In [32]:
y_test = model.predict(X_te)
sample_submission = pd.read_csv(path + "sample_submission.csv")
sample_submission[list_classes] = y_test
sample_submission.to_csv(path + "Keras_embedding_layer4.csv", index=False)

## Ensemble

In [42]:
path_new = '/axp/rim/imsadsml/warehouse/sagra39/Kaggle/toxic/'
import pandas as pd
import numpy as np

file1 = pd.read_csv(path_new + 'Keras_embedding_layer1.csv')
file2 = pd.read_csv(path_new + 'Keras_embedding_layer2.csv')
# file3 = pd.read_csv(path_new + 'sub01445.csv')
# file4 = pd.read_csv(path_new + 'vgg16.csv')


file1.columns = ['id','toxic_x','severe_toxic_x','obscene_x','threat_x','insult_x','identity_hate_x']
file2.columns = ['id','toxic_y','severe_toxic_y','obscene_y','threat_y','insult_y','identity_hate_y']
# file3.columns = ['id','target_z']
# file4.columns = ['id','target_w']

In [43]:
concat_sub = file1.merge(file2, on='id', how='inner')

In [50]:
concat_sub.head(2)

Unnamed: 0,id,toxic_x,severe_toxic_x,obscene_x,threat_x,insult_x,identity_hate_x,toxic_y,severe_toxic_y,obscene_y,...,identity_hate_max,identity_hate_min,identity_hate_mean,identity_hate_median,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,1.056884e-07,5.296963e-06,2.05277e-06,2.785412e-07,2.059524e-07,1.182304e-07,0.008709,0.000177,0.002443,...,0.001094,0.001094,0.001094,0.001094,0.008709,0.000177,0.002443,0.000854,0.001551,0.001094
1,6102620,2.670524e-08,8.424707e-07,3.014423e-08,2.420733e-07,4.095454e-08,1.403654e-07,0.000811,7e-06,0.000186,...,4.8e-05,4.8e-05,4.8e-05,4.8e-05,0.000811,7e-06,0.000186,2.1e-05,9.6e-05,4.8e-05


In [45]:
toxic_set=[]
severe_toxic_set=[]
obscene_set=[]
threat_set=[]
insult_set=[]
identity_hate_set=[]
for i in concat_sub.columns:
    if i[:5]=='toxic':
        toxic_set.append(i)
    if i[:12]=='severe_toxic':
        severe_toxic_set.append(i)
    if i[:7]=='obscene':
        obscene_set.append(i)
    if i[:6]=='threat':
        threat_set.append(i)
    if i[:6]=='insult':
        insult_set.append(i)
    if i[:13]=='identity_hate':
        identity_hate_set.append(i)
        
# print(toxic_set, severe_toxic_set, obscene_set, threat_set, insult_set, identity_hate_set)

In [46]:
models = 3
set = (toxic_set, severe_toxic_set, obscene_set, threat_set, insult_set, identity_hate_set)
for i in set:
    concat_sub[i[0][:-2]+'_max'] = concat_sub[i].iloc[:, 1:models].max(axis=1)
    concat_sub[i[0][:-2]+'_min'] = concat_sub[i].iloc[:, 1:models].min(axis=1)
    concat_sub[i[0][:-2]+'_mean'] = concat_sub[i].iloc[:, 1:models].mean(axis=1)
    concat_sub[i[0][:-2]+'_median'] = concat_sub[i].iloc[:, 1:models].median(axis=1)

In [48]:
cutoff_lo = 0.8
cutoff_hi = 0.2

In [49]:
for i in set:
    concat_sub[i[0][:-2]] = np.where(np.all(concat_sub[i].iloc[:,1:models] > cutoff_lo, axis=1), 
                                   concat_sub[i[0][:-2]+'_max'],
                                   np.where(np.all(concat_sub[i].iloc[:,1:models] < cutoff_hi, axis=1),
                                            concat_sub[i[0][:-2]+'_min'],
                                            concat_sub[i[0][:-2]+'_median']))

In [51]:
final_columns = ['id','toxic','severe_toxic','obscene','threat','insult','identity_hate']
concat_sub[final_columns].to_csv(path_new + 'layer1_layer2_stacking.csv', index=False)