In [1]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv1D, Embedding, Activation, Flatten
from keras.optimizers import Adam, SGD
import numpy as np
import re
import preprocessing as preproc
import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import helpers
import models
from tqdm import tqdm
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score, train_test_split

Using TensorFlow backend.


In [2]:
start_time = time.time()

In [3]:
train_data, y, test_data = helpers.get_processed_data(full_dataset=True)

In [4]:
np.unique(y)

array([0., 1.])

In [5]:
vectorizer = TfidfVectorizer(stop_words=None, ngram_range=(1,2), sublinear_tf=True, max_features=500)
X = vectorizer.fit_transform(train_data).toarray()

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [52]:
optimizer = Adam(lr=0.03, beta_1=0.9, beta_2=0.99, epsilon=None, decay=0.001, amsgrad=False)

In [15]:
# optimizer = SGD(lr=0.04, momentum=0.08, decay=0.001, nesterov=True)

In [53]:
X.shape

(2500000, 500)

In [54]:
def create_model():
    model = Sequential()
    model.add(Dense(units=64, input_dim=X.shape[1], activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(units=32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    return model
model = create_model()
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['acc'])

In [21]:
result = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=32, verbose=2, 
                  callbacks=helpers.checkpointing())

Train on 2125000 samples, validate on 375000 samples
Epoch 1/3
 - 148s - loss: 0.4608 - acc: 0.7722 - val_loss: 0.4503 - val_acc: 0.7788

Epoch 00001: val_acc improved from -inf to 0.77877, saving model to ../data/intermediate/weights-improvement-01-0.78.hdf5

Epoch 00001: val_loss improved from inf to 0.45028, saving model to ../data/intermediate/weights-improvement-01-0.45.hdf5
Epoch 2/3
 - 148s - loss: 0.4490 - acc: 0.7810 - val_loss: 0.4472 - val_acc: 0.7813

Epoch 00002: val_acc improved from 0.77877 to 0.78125, saving model to ../data/intermediate/weights-improvement-02-0.78.hdf5

Epoch 00002: val_loss improved from 0.45028 to 0.44721, saving model to ../data/intermediate/weights-improvement-02-0.45.hdf5
Epoch 3/3
 - 148s - loss: 0.4465 - acc: 0.7826 - val_loss: 0.4456 - val_acc: 0.7822

Epoch 00003: val_acc improved from 0.78125 to 0.78218, saving model to ../data/intermediate/weights-improvement-03-0.78.hdf5

Epoch 00003: val_loss improved from 0.44721 to 0.44562, saving model 

In [55]:
from sklearn.model_selection import StratifiedKFold
kfold = KFold(n_splits=4, shuffle=True, random_state=42)

In [None]:
for i in range(1,10):
    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        result = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=1, batch_size=32, verbose=2, 
                  callbacks=helpers.checkpointing())

Train on 1875000 samples, validate on 625000 samples
Epoch 1/1
 - 138s - loss: 0.4624 - acc: 0.7708 - val_loss: 0.4540 - val_acc: 0.7761

Epoch 00001: val_acc improved from -inf to 0.77607, saving model to ../data/intermediate/weights-improvement-01-0.78.hdf5

Epoch 00001: val_loss improved from inf to 0.45396, saving model to ../data/intermediate/weights-improvement-01-0.45.hdf5
Train on 1875000 samples, validate on 625000 samples
Epoch 1/1
 - 137s - loss: 0.4538 - acc: 0.7765 - val_loss: 0.4503 - val_acc: 0.7793

Epoch 00001: val_acc improved from -inf to 0.77934, saving model to ../data/intermediate/weights-improvement-01-0.78.hdf5

Epoch 00001: val_loss improved from inf to 0.45032, saving model to ../data/intermediate/weights-improvement-01-0.45.hdf5
Train on 1875000 samples, validate on 625000 samples
Epoch 1/1
 - 138s - loss: 0.4523 - acc: 0.7776 - val_loss: 0.4485 - val_acc: 0.7809

Epoch 00001: val_acc improved from -inf to 0.78094, saving model to ../data/intermediate/weights

In [14]:
# result = model.fit_generator(generator=helpers.batch_generator(X_train, y_train, 32),
#                    epochs=10, validation_data=(X_test,y_test),
#                     steps_per_epoch=int(X_train.shape[0]/32), verbose=1)

In [22]:
np.mean(result.history['val_acc'])

0.780731555556827

In [24]:
model.load_weights('../data/intermediate/weights-improvement-03-0.78.hdf5')

In [27]:
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['acc'])

In [None]:
model.

In [None]:
# a = batch_generator(X,y,32)

In [None]:
# next(a)

# best model so far
adam = Adam(lr=0.02, beta_1=0.9, beta_2=0.99, epsilon=None, decay=0.001, amsgrad=False)
model = Sequential()
model.add(Dense(units=64, input_dim=X.shape[1], activation='relu'))
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.summary()