In [16]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.linear_model as model
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
import sklearn
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
import tensorflow as tf 
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.layers import BatchNormalization
from tflearn.data_utils import to_categorical, pad_sequences

Define a helper function to calculate classification error:

In [5]:
def class_error(y, y_pred):
    """Return the fraction of positions where `y` and `y_pred` disagree.

    Args:
        y: Indexable sequence of labels; its length sets how many positions
            are compared and is the denominator of the result.
        y_pred: Indexable sequence compared element-wise against `y`.

    Returns:
        Number of mismatching positions divided by `len(y)`, as a float.
    """
    total = len(y)
    wrong = sum(1 for idx in range(total) if y[idx] != y_pred[idx])
    return float(wrong) / total

A helper function to apply TF-IDF to the data: 

In [9]:
def TFIDF(X, labels):
    """Turn rows of word counts into a TF-IDF matrix.

    Every row of `X` is expanded into a pseudo-document in which the word
    `labels[i + 1]` is repeated `row[i]` times (space separated). The
    documents are then vectorized with a newly created `TfidfVectorizer`.

    Args:
        X: Iterable of rows of integer counts.
        labels: Sequence of words, offset by one relative to the rows:
            `labels[i + 1]` names the count stored at `row[i]`.

    Returns:
        Whatever `TfidfVectorizer.fit_transform` returns for the generated
        documents.
    """
    # NOTE(review): the range stops at len(row) - 1, so the final count of
    # every row never contributes a word -- confirm this is intended.
    documents = [
        ''.join((labels[pos + 1] + ' ') * row[pos] for pos in range(len(row) - 1))
        for row in X
    ]

    def tokenize(doc):
        # Every repeated word carries a trailing space, so splitting on " "
        # also yields an empty-string token for non-empty documents.
        return doc.lower().split(" ")

    sklearn_tfidf = TfidfVectorizer(
        norm='l2',
        min_df=0,
        use_idf=True,
        smooth_idf=False,
        sublinear_tf=True,
        tokenizer=tokenize,
    )
    return sklearn_tfidf.fit_transform(documents)

A helper function for writing the test prediction to a text file in csv format:

In [10]:
def writeToText(predictions, filename):
    """Write predictions to `filename` as a two-column CSV file.

    The first row is the header ``Id,Prediction``. Each following row holds a
    1-based running id and the matching prediction converted with ``int()``.

    Args:
        predictions: Iterable of values accepted by ``int()``.
        filename: Path of the file to create or overwrite.
    """
    rows = [["Id", "Prediction"]]
    rows.extend(
        [row_id, int(value)] for row_id, value in enumerate(predictions, start=1)
    )

    # The context manager closes (and therefore flushes) the file even when
    # writing raises; the handle was previously left open.
    with open(filename, 'w') as f:
        writer = csv.writer(f, delimiter=',', quotechar='|')
        writer.writerows(rows)

A helper function to build our neural network on train_X2 and train_y2. It then outputs the predictions for testX.

In [20]:
def neuralNetwork(train_X2, train_y2, testX, n_features=1000, n_epochs=5):
    """Train a small feed-forward classifier and predict on `testX`.

    The network has three tanh hidden layers (600, 300 and 100 units), each
    followed by dropout and batch normalization, and a 2-unit softmax output.
    Only the first `n_features` columns of both matrices are used.

    Args:
        train_X2: 2-D training feature matrix supporting `[:, :k]` slicing.
        train_y2: Training labels; one-hot encoded into 2 classes before
            fitting.
        testX: 2-D feature matrix to predict on, sliced like `train_X2`.
        n_features: Number of leading feature columns fed to the network
            (also the input dimension of the first layer).
        n_epochs: Number of training epochs.

    Returns:
        The output of `model.predict` for `testX`: the softmax activations of
        the final 2-unit layer, one row per sample.
    """
    train_y = to_categorical(train_y2, 2)

    # Named `net` rather than `model` so the module-level alias
    # `sklearn.linear_model as model` is not shadowed inside this function.
    net = Sequential()

    # A stand-alone BatchNormalization(...) object used to be constructed
    # here and immediately discarded; it was never added to the network, so
    # it has been removed.

    net.add(Dense(600, input_dim=n_features))
    net.add(Activation('tanh'))
    net.add(Dropout(0.30))
    net.add(BatchNormalization())

    net.add(Dense(300))
    net.add(Activation('tanh'))
    net.add(Dropout(0.20))
    net.add(BatchNormalization())

    net.add(Dense(100))
    net.add(Activation('tanh'))
    net.add(Dropout(0.10))
    net.add(BatchNormalization())

    net.add(Dense(2))
    net.add(Activation('softmax'))

    net.summary()

    net.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])

    # `epochs` is the Keras 2 name of the deprecated Keras 1 `nb_epoch` keyword.
    net.fit(train_X2[:, :n_features], train_y, batch_size=128, epochs=n_epochs, verbose=1)

    return net.predict(testX[:, :n_features])

A helper function to use the predictions from the 4 models to reach a consensus. Assuming pred2 is the prediction of the logistic model, we weight it twice as much as the other models since it performed the best individually.

In [14]:
def TieBreak(neural, pred1, pred2, pred3):
    """Combine four 0/1 votes into one label by weighted majority.

    `pred2` counts twice and every other vote counts once, for a total
    weight of 5. The result is 1 when the votes equal to 1 carry a combined
    weight greater than 2, and 0 otherwise.

    Args:
        neural: Vote of the neural network.
        pred1: Vote of the first model.
        pred2: Vote of the double-weighted model.
        pred3: Vote of the third model.

    Returns:
        1 or 0.
    """
    weighted_votes = ((neural, 1), (pred1, 1), (pred2, 2), (pred3, 1))
    score = sum(weight for vote, weight in weighted_votes if vote == 1)
    return 1 if score > 2 else 0

Import the data:

In [4]:
# Read both files as 2-D arrays of strings.
train_data, test_data = (
    np.genfromtxt(path, dtype='str')
    for path in ('Data/training_data.txt', 'Data/test_data.txt')
)

# Row 0 of each file is kept as the label row.
train_labels, test_labels = train_data[0, :], test_data[0, :]

# Training rows: column 0 is the target, the remaining columns are the features.
trainY = train_stars = train_data[1:, 0]
trainX = train_reviews = train_data[1:, 1:]

# Test rows: every column is kept as a feature.
testX = test_reviews = test_data[1:, 0:]

Split the data into 80% training set and 20% validation set:

In [6]:
# Fraction of the rows used for training; the remainder forms the validation set.
TRAIN_FRACTION = 0.8
# Fixed seed so that Restart & Run All reproduces the same split.
SPLIT_SEED = 42

num_rows = list(range(len(trainY)))

# A dedicated RandomState keeps the shuffle reproducible without touching
# NumPy's global random state.
np.random.RandomState(SPLIT_SEED).shuffle(num_rows)

# Derive the cut-off from the data size instead of hard-coding 16000.
n_train = int(round(TRAIN_FRACTION * len(num_rows)))
train_indices = num_rows[:n_train]
valid_indices = num_rows[n_train:]

train_X = [trainX[index] for index in train_indices]
train_y = [trainY[index] for index in train_indices]
valid_X = [trainX[index] for index in valid_indices]
valid_y = [trainY[index] for index in valid_indices]

Convert the contents of the arrays to integers instead of strings:

In [8]:
# Feature matrices: parse every entry with int() and pack the rows into a NumPy array.
train_X2 = np.array([[int(element) for element in row] for row in train_X])
valid_X2 = np.array([[int(element) for element in row] for row in valid_X])
test_X2 = np.array([[int(element) for element in row] for row in testX])

# Label vectors: same int() parsing, one value per row.
train_y2 = np.array([int(label) for label in train_y])
valid_y2 = np.array([int(label) for label in valid_y])

Generate an array that re-combines the training and validation set. Also, run the TFIDF function on all the data:

In [11]:
# Stack the training and validation splits back together (training rows first).
all_X = np.concatenate([train_X2, valid_X2], axis=0)
all_y = np.concatenate([train_y2, valid_y2], axis=0)

# NOTE(review): every TFIDF call builds and fits its own TfidfVectorizer, so
# the vocabulary and IDF weights are learned separately for each matrix --
# confirm the resulting columns line up between training and evaluation data.
valid_TFIDF, train_TFIDF, all_TFIDF, test_TFIDF = (
    TFIDF(counts, train_labels)
    for counts in (valid_X2, train_X2, all_X, test_X2)
)

Train our ensemble model on the training split and evaluate it on the validation split. The ensemble is a mix of logistic regression, SGD, naive Bayes, and a neural network, each trained on the training data. We then look at the confidence scores that the logistic and SGD models assign to their predictions. If both scores have magnitude above 1 and the same sign, we use that shared prediction directly. Otherwise, we call TieBreak to reach a consensus between the four models.

In [21]:
# Fit the three scikit-learn base models on the training split.
logistic = model.LogisticRegression().fit(train_TFIDF, train_y2)

sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-4, random_state=42)
sgd.fit(train_TFIDF, train_y2)

nb = MultinomialNB(alpha=50)
nb.fit(train_TFIDF, train_y2)

# Signed decision scores of the two linear models on the validation split.
sgd_confidence = sgd.decision_function(valid_TFIDF)
log_confidence = logistic.decision_function(valid_TFIDF)

# Hard predictions: pred1 = SGD, pred2 = logistic, pred3 = naive Bayes.
pred1, pred2, pred3 = (clf.predict(valid_TFIDF) for clf in (sgd, logistic, nb))

pred_n = neuralNetwork(train_TFIDF, train_y2, valid_TFIDF)

# Pick class 0 only when the first network output is strictly closer to 1
# than the second one; ties fall to class 1.
pred_neural = [0 if abs(out[0] - 1) < abs(out[1] - 1) else 1 for out in pred_n]

pred_blend = []
pred_blend_uncertain = []

for log_c, sgd_c, vote_n, vote1, vote2, vote3 in zip(
        log_confidence, sgd_confidence, pred_neural, pred1, pred2, pred3):
    both_positive = log_c > 1 and sgd_c > 1
    both_negative = log_c < -1 and sgd_c < -1
    if both_positive or both_negative:
        # Both linear models are confident in the same direction: keep the SGD label.
        pred_blend.append(vote1)
    else:
        pred_blend.append(TieBreak(vote_n, vote1, vote2, vote3))
    

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 600)               600600    
_________________________________________________________________
activation_1 (Activation)    (None, 600)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 600)               0         
_________________________________________________________________
batch_normalization_4 (Batch (None, 600)               2400      
_________________________________________________________________
dense_4 (Dense)              (None, 300)               180300    
_________________________________________________________________
activation_2 (Activation)    (None, 300)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 300)               0         
__________



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Print the classification error on the validation set:

In [22]:
# class_error is declared as class_error(y, y_pred): reference labels first,
# predictions second.
class_error(valid_y2, pred_blend)

0.146

Now, repeat this, training on all the data, and generating predictions for the test set.

In [23]:
# Refit the three scikit-learn base models on training + validation data.
logistic = model.LogisticRegression().fit(all_TFIDF, all_y)

sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-4, random_state=42)
sgd.fit(all_TFIDF, all_y)

nb = MultinomialNB(alpha=50)
nb.fit(all_TFIDF, all_y)

# Signed decision scores of the two linear models on the test set.
sgd_confidence = sgd.decision_function(test_TFIDF)
log_confidence = logistic.decision_function(test_TFIDF)

# Hard predictions: pred1 = SGD, pred2 = logistic, pred3 = naive Bayes.
pred1, pred2, pred3 = (clf.predict(test_TFIDF) for clf in (sgd, logistic, nb))

pred_n = neuralNetwork(all_TFIDF, all_y, test_TFIDF)

# Pick class 0 only when the first network output is strictly closer to 1
# than the second one; ties fall to class 1.
pred_neural = [0 if abs(out[0] - 1) < abs(out[1] - 1) else 1 for out in pred_n]

pred_blend = []
pred_blend_uncertain = []

for log_c, sgd_c, vote_n, vote1, vote2, vote3 in zip(
        log_confidence, sgd_confidence, pred_neural, pred1, pred2, pred3):
    both_positive = log_c > 1 and sgd_c > 1
    both_negative = log_c < -1 and sgd_c < -1
    if both_positive or both_negative:
        # Both linear models are confident in the same direction: keep the SGD label.
        pred_blend.append(vote1)
    else:
        pred_blend.append(TieBreak(vote_n, vote1, vote2, vote3))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 600)               600600    
_________________________________________________________________
activation_5 (Activation)    (None, 600)               0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 600)               0         
_________________________________________________________________
batch_normalization_8 (Batch (None, 600)               2400      
_________________________________________________________________
dense_8 (Dense)              (None, 300)               180300    
_________________________________________________________________
activation_6 (Activation)    (None, 300)               0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 300)               0         
__________



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Write the results to a text file.

In [24]:
# Save the current contents of pred_blend as an "Id,Prediction" CSV file.
# NOTE(review): pred_blend holds whichever blend was computed last -- run the
# test-set cell immediately before this one.
writeToText(pred_blend, "Blended_log_deeper_neural.txt")