In [3]:
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
from keras.layers.advanced_activations import PReLU
from keras.preprocessing import sequence, text
from keras.layers import SpatialDropout1D
from keras import metrics


# Load the train/test splits. The code below reads the `label`, `link_name`
# and `textdata` columns from each file.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

y_train = train.label.values
y_test = test.label.values

tk = text.Tokenizer(num_words=200000)

# Use item assignment to create the string-typed columns. Attribute assignment
# (`train.link = ...`) on a name that is not already a column does not add a
# column: pandas emits a UserWarning and only sets a plain Python attribute,
# which is lost on any copy or slice of the frame. Attribute *reads* such as
# `train.link` keep working once the column exists.
train['link'] = train['link_name'].astype(str)
test['link'] = test['link_name'].astype(str)
train['text'] = train['textdata'].astype(str)
test['text'] = test['textdata'].astype(str)

# Every tokenised sequence is padded/truncated to this many tokens.
max_len = 80

# NOTE(review): the vocabulary is fit on the test texts as well as the train
# texts. Kept as-is to preserve the existing word indices; confirm this is
# intended before using the test split for a final evaluation.
tk.fit_on_texts(
    list(train['link'].values)
    + list(train['text'].values)
    + list(test['link'].values)
    + list(test['text'].values)
)

x_train_title = tk.texts_to_sequences(train['link'].values)
x_train_title = sequence.pad_sequences(x_train_title, maxlen=max_len)

x_train_textdata_01 = tk.texts_to_sequences(train['text'].values)
x_train_textdata_01 = sequence.pad_sequences(x_train_textdata_01, maxlen=max_len)

x_test_title_01 = tk.texts_to_sequences(test['link'].values)
x_test_title_01 = sequence.pad_sequences(x_test_title_01, maxlen=max_len)

x_test_textdata_02 = tk.texts_to_sequences(test['text'].values)
x_test_textdata_02 = sequence.pad_sequences(x_test_textdata_02, maxlen=max_len)

word_index = tk.word_index
# One-hot encode the integer training labels for categorical cross-entropy.
ytrain_enc = np_utils.to_categorical(y_train)

# The Tokenizer never assigns index 0 (it is reserved, and is what
# pad_sequences fills with), so word indices run from 1 to len(word_index).
# The embedding table therefore needs len(word_index) + 1 rows; using
# len(word_index) leaves the highest word index out of range.
vocab_size = len(word_index) + 1
# Width of the one-hot label matrix, so the output layer always matches it.
num_classes = ytrain_enc.shape[1]

classifier = Sequential()
classifier.add(Embedding(vocab_size, 300, input_length=max_len))
# Keras 2 `Embedding` no longer supports a `dropout` argument (it is dropped
# with a warning, or rejected outright in later releases). The documented
# replacement is a SpatialDropout1D layer directly after the embedding.
classifier.add(SpatialDropout1D(0.2))
# return_sequences=True keeps one output vector per timestep for the
# per-timestep Dense blocks that follow.
classifier.add(LSTM(300, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))

classifier.add(Dense(200))
classifier.add(PReLU())
classifier.add(SpatialDropout1D(0.2))
classifier.add(BatchNormalization())

classifier.add(Dense(200))
classifier.add(PReLU())
classifier.add(SpatialDropout1D(0.2))
classifier.add(BatchNormalization())

# Collapse the (timesteps, features) output into a single vector per sample
# before the classification head.
classifier.add(Flatten())

classifier.add(Dense(num_classes))
classifier.add(Activation('softmax'))


classifier.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])


# Write the weights to disk only on epochs where the validation accuracy
# reaches a new best value.
checkpoint = ModelCheckpoint(
    'data/weights.h5',
    monitor='val_acc',
    verbose=2,
    save_best_only=True,
)

# Train on the padded title sequences against the one-hot labels, with a
# 0.1 validation split that provides the `val_acc` the checkpoint monitors.
classifier.fit(
    x_train_title,
    ytrain_enc,
    epochs=20,
    batch_size=128,
    validation_split=0.1,
    shuffle=True,
    verbose=2,
    callbacks=[checkpoint],
)



Train on 3523 samples, validate on 392 samples
Epoch 1/20
 - 60s - loss: 1.0906 - acc: 0.6267 - val_loss: 0.6054 - val_acc: 0.7347

Epoch 00001: val_acc improved from -inf to 0.73469, saving model to data/weights.h5
Epoch 2/20
 - 54s - loss: 0.3200 - acc: 0.8788 - val_loss: 0.2306 - val_acc: 0.9464

Epoch 00002: val_acc improved from 0.73469 to 0.94643, saving model to data/weights.h5
Epoch 3/20
 - 54s - loss: 0.0432 - acc: 0.9830 - val_loss: 0.2904 - val_acc: 0.9388

Epoch 00003: val_acc did not improve from 0.94643
Epoch 4/20
 - 55s - loss: 0.0139 - acc: 0.9952 - val_loss: 0.2862 - val_acc: 0.9566

Epoch 00004: val_acc improved from 0.94643 to 0.95663, saving model to data/weights.h5
Epoch 5/20
 - 53s - loss: 0.0071 - acc: 0.9972 - val_loss: 0.2533 - val_acc: 0.9592

Epoch 00005: val_acc improved from 0.95663 to 0.95918, saving model to data/weights.h5
Epoch 6/20
 - 54s - loss: 0.0044 - acc: 0.9983 - val_loss: 0.2917 - val_acc: 0.9541

Epoch 00006: val_acc did not improve from 0.95918

<keras.callbacks.History at 0x16c91d5d6d8>