In [1]:
import numpy as np
import pandas as pd 
import math
from sklearn import feature_extraction
from sklearn import svm
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Sequential
from keras.layers import Dense, Dropout
import tensorflow as tf
from keras.utils import np_utils

from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

from nltk.stem import WordNetLemmatizer
lemmer=WordNetLemmatizer()

def load_file(filename):
    file = open(filename, 'r', encoding='UTF-8', newline='')
    X = []
    y = []
    for line in file:
        split = line.split('\t')
        label = split[1]
        review = tokenizer.tokenize(split[0])
        review = [lemmer.lemmatize(x.lower()) for x in review]
        review = " ".join(review)
        X.append(review)
        y.append(label)
    return X,y

Using TensorFlow backend.


In [6]:
X_train, y_train = load_file("IMDB-train.txt")
X_dev, y_dev = load_file("IMDB-valid.txt")
X_test, y_test = load_file("IMDB-test.txt")

y_final_train = np.array(y_train + y_dev)
X_final_train = np.array([row for row in X_train] + [row for row in X_dev])

vectorizer = feature_extraction.text.TfidfVectorizer(ngram_range = (1,2), binary = True)
#vectorizer = feature_extraction.text.CountVectorizer(ngram_range = (1,2), binary = True)
vectorizer.fit(X_train + X_dev + X_test)
X_train = vectorizer.transform(X_train)
X_dev = vectorizer.transform(X_dev)
X_test = vectorizer.transform(X_test)
X_final_train = vectorizer.transform(X_final_train)

y_train = np_utils.to_categorical(y_train, 2)
y_dev = np_utils.to_categorical(y_dev, 2)
y_test = np_utils.to_categorical(y_test, 2)

In [7]:
np.random.seed(0)

In [8]:
batch_size_list = [400, 500]
neuron_list = [100, 75] #hidden neurons
dropout_list = [0.3, 0.4, 0.5]

best_score = 0
best_batch = 0
best_neuron = 0
best_dropout = 0

for batch in batch_size_list:
    for neuron in neuron_list:
        for dropout in dropout_list:
            print('---------------------------------------------------------------------')
            print('batch: ' + str(batch) + ' neuron:' + str(neuron) + ' dropout: ' + str(dropout))
            input_dim = len(vectorizer.vocabulary_)
            model = Sequential()
            model.add(Dense(neuron, input_dim=input_dim, activation='relu'))
            model.add(Dense(neuron, activation='relu'))
            model.add(Dropout(dropout))
            model.add(Dense(2, activation='softmax'))
            model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
            model.fit(X_train, y_train, epochs=2, batch_size=batch)
            scores = model.evaluate(X_dev, y_dev)
            print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
            test_scores = model.evaluate(X_test, y_test)
            print("\n%s: %.2f%%" % (model.metrics_names[1], test_scores[1]*100))

---------------------------------------------------------------------
batch: 400 neuron:100 dropout: 0.3
Epoch 1/2
Epoch 2/2

acc: 91.08%

acc: 90.22%
---------------------------------------------------------------------
batch: 400 neuron:100 dropout: 0.4
Epoch 1/2
Epoch 2/2

acc: 90.95%

acc: 90.31%
---------------------------------------------------------------------
batch: 400 neuron:100 dropout: 0.5
Epoch 1/2
Epoch 2/2

acc: 91.13%

acc: 90.23%
---------------------------------------------------------------------
batch: 400 neuron:75 dropout: 0.3
Epoch 1/2
Epoch 2/2

acc: 90.82%

acc: 89.87%
---------------------------------------------------------------------
batch: 400 neuron:75 dropout: 0.4
Epoch 1/2
Epoch 2/2

acc: 90.98%

acc: 90.16%
---------------------------------------------------------------------
batch: 400 neuron:75 dropout: 0.5
Epoch 1/2
Epoch 2/2

acc: 90.90%

acc: 90.16%
---------------------------------------------------------------------
batch: 500 neuron:100 dropo

KeyboardInterrupt: 