In [1]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv1D, Embedding, Activation, Flatten
from keras.optimizers import Adam, SGD
import numpy as np
import re
import preprocessing as preproc
import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import helpers
import models
from tqdm import tqdm
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score, train_test_split

Using TensorFlow backend.


In [2]:
start_time = time.time()

In [3]:
train_data, y, test_data = helpers.get_processed_data(full_dataset=False)

In [4]:
np.unique(y)

array([0., 1.])

In [None]:
# model, X_test = helpers.transform_and_fit(train_data, y, test_data, text_representation='word2vec',
#                                         ml_algorithm='LR', cross_val=True, predefined=False)

In [5]:
vectorizer = TfidfVectorizer(stop_words=None, ngram_range=(1,2), sublinear_tf=True, max_features=100)
X = vectorizer.fit_transform(train_data)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [7]:
optimizer = Adam(lr=0.03, beta_1=0.9, beta_2=0.99, epsilon=None, decay=0.001, amsgrad=False)

In [9]:
# optimizer = SGD(lr=0.04, momentum=0.08, decay=0.001, nesterov=True)

In [8]:
X.shape

(200000, 100)

In [9]:
def create_model():
    model = Sequential()
    model.add(Dense(units=64, input_dim=X.shape[1], activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(units=32, activation='relu'))
    model.add(Dense(1, activation='tanh'))
    return model
model = create_model()
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['acc'])

In [10]:
result = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=32, verbose=2)

Train on 170000 samples, validate on 30000 samples
Epoch 1/3
 - 18s - loss: 0.5532 - acc: 0.7073 - val_loss: 0.5387 - val_acc: 0.7176
Epoch 2/3
 - 17s - loss: 0.5337 - acc: 0.7224 - val_loss: 0.5331 - val_acc: 0.7204
Epoch 3/3
 - 18s - loss: 0.5271 - acc: 0.7269 - val_loss: 0.5304 - val_acc: 0.7236


In [15]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=4, shuffle=True, random_state=42)

In [16]:
for i in range(1,10):
    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        result = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=1, batch_size=32, verbose=2, 
                  callbacks=helpers.checkpointing())

Train on 150000 samples, validate on 50000 samples
Epoch 1/1
 - 16s - loss: 0.5530 - acc: 0.7080 - val_loss: 0.5422 - val_acc: 0.7151

Epoch 00001: saving model to ../data/intermediate/weights-improvement-01-0.5422--0.7151.hdf5
Train on 150000 samples, validate on 50000 samples
Epoch 1/1
 - 16s - loss: 0.5357 - acc: 0.7207 - val_loss: 0.5309 - val_acc: 0.7219

Epoch 00001: saving model to ../data/intermediate/weights-improvement-01-0.5309--0.7219.hdf5
Train on 150000 samples, validate on 50000 samples
Epoch 1/1
 - 19s - loss: 0.5294 - acc: 0.7244 - val_loss: 0.5244 - val_acc: 0.7321

Epoch 00001: saving model to ../data/intermediate/weights-improvement-01-0.5244--0.7321.hdf5
Train on 150000 samples, validate on 50000 samples
Epoch 1/1
 - 16s - loss: 0.5278 - acc: 0.7259 - val_loss: 0.5166 - val_acc: 0.7325

Epoch 00001: saving model to ../data/intermediate/weights-improvement-01-0.5166--0.7325.hdf5
Train on 150000 samples, validate on 50000 samples
Epoch 1/1
 - 16s - loss: 0.5228 - acc

KeyboardInterrupt: 

In [14]:
# result = model.fit_generator(generator=helpers.batch_generator(X_train, y_train, 32),
#                    epochs=10, validation_data=(X_test,y_test),
#                     steps_per_epoch=int(X_train.shape[0]/32), verbose=1)

In [22]:
np.mean(result.history['val_acc'])

0.780731555556827

In [24]:
model.load_weights('../data/intermediate/weights-improvement-03-0.78.hdf5')

In [27]:
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['acc'])

In [None]:
model.

In [None]:
# a = batch_generator(X,y,32)

In [None]:
# next(a)

# best model so far
adam = Adam(lr=0.02, beta_1=0.9, beta_2=0.99, epsilon=None, decay=0.001, amsgrad=False)
model = Sequential()
model.add(Dense(units=64, input_dim=X.shape[1], activation='relu'))
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.summary()