In [1]:
from sklearn.datasets import load_svmlight_file
import numpy as np
import gensim

## Load Database

In [2]:
train_dataset, train_labels = load_svmlight_file('aclImdb/train/labeledBow.feat', 89527)
test_dataset, test_labels = load_svmlight_file('aclImdb/test/labeledBow.feat', 89527)

for i in range(len(train_labels)):
    train_labels[i] = 1 if train_labels[i] > 5 else -1

for i in range(len(test_labels)):
    test_labels[i] = 1 if test_labels[i] > 5 else -1
    
num_train = train_dataset.shape[0]
num_test = test_dataset.shape[0]

In [3]:
def saveOutput(filename, filedata):
    with open('output/'+filename, 'w') as f:
        for c in filedata:
            print(c, file=f)

## Load pre-trained model

In [4]:
# from gensim.scripts import glove2word2vec

# glove2word2vec('glove.6B/glove.6B.300d.txt', 'glove.6B/glove.6B.300d.txt.word2vec')

In [5]:
model = gensim.models.KeyedVectors.load_word2vec_format('glove.6B/glove.6B.300d.txt.word2vec', binary=False)

### GLoVE without TF-IDF

In [6]:
VocabFile  = 'aclImdb/imdb.vocab'
vocab = open(VocabFile,'r') 
dicti = vocab.readlines()
dicti = [item.strip() for item in dicti]
num_model = model.vector_size

xtr ,ytr = train_dataset.nonzero()
train_dataset_tmp = np.zeros(shape=(num_train, num_model))
rowindex = train_dataset.indptr

for i in range(num_train):
    for j in range(rowindex[i],rowindex[i+1]-1):
        word = dicti[ytr[j]]
        if word in model.vocab:
            train_dataset_tmp[i] += model.wv[word]
            
    train_dataset_tmp[i] /= rowindex[i+1] - rowindex[i]

xts, yts = test_dataset.nonzero()
test_dataset_tmp = np.zeros(shape=(num_test, num_model))
rowindex = test_dataset.indptr

for i in range(num_test):
    for j in range(rowindex[i],rowindex[i+1]-1):
        word = dicti[yts[j]]
        if word in model.vocab:
            test_dataset_tmp[i] += model.wv[word]
    test_dataset_tmp[i] /= rowindex[i+1] - rowindex[i]

train_dataset = train_dataset_tmp
test_dataset = test_dataset_tmp  

## Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression().fit(train_dataset, train_labels)
# print(dir(clf))
predicted = clf.predict(test_dataset)
# print(predicted)
hit = 0
for i in range(len(test_labels)):
    hit += 1 if predicted[i] == test_labels[i] else 0

print("Logistic Regression Accuracy: " + str(hit*100/len(test_labels)) + "%")
saveOutput('GLoVE(without_tfidf)_LR.txt', predicted)

Logistic Regression Accuracy: 83.532%


## SVM

In [8]:
from sklearn.svm import LinearSVC

clf = LinearSVC()
clf.fit(train_dataset, train_labels)
predicted = clf.predict(test_dataset)
# print(predicted)
hit = 0
for i in range(len(test_labels)):
    hit += 1 if predicted[i] == test_labels[i] else 0

print("SVM Accuracy: " + str(hit*100/len(test_labels)) + "%")
saveOutput('GLoVE(without_tfidf)_SVM.txt', predicted)

SVM Accuracy: 83.98%


## Feed Forward Neural Network

In [9]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(10,), random_state=1)
clf.fit(train_dataset, train_labels)
predicted = clf.predict(test_dataset)
# print(predicted)
hit = 0
for i in range(len(test_labels)):
    hit += 1 if predicted[i] == test_labels[i] else 0

print("Feed Forward Neural Network Accuracy: " + str(hit*100/len(test_labels)) + "%")
saveOutput('GLoVE(without_tfidf)_FFNN.txt', predicted)

Feed Forward Neural Network Accuracy: 83.836%
