In [2]:
from sklearn.datasets import load_svmlight_file

## Load Dataset

In [3]:
# Number of feature columns in the labeledBow.feat files. It is passed
# explicitly so the train and test matrices are built with the same width.
N_FEATURES = 89527


def _load_binarized(path):
    """Load an svmlight-format file as binary features and +1/-1 labels.

    Parameters
    ----------
    path : str
        Location of the svmlight/libsvm formatted file.

    Returns
    -------
    (features, labels)
        ``features`` is the sparse matrix returned by ``load_svmlight_file``
        with every positive entry replaced by 1; ``labels`` is the label
        array with values greater than 5 mapped to 1 and all others to -1.
    """
    # n_features must be given by keyword: recent scikit-learn releases
    # declare it keyword-only and reject the positional form.
    features, labels = load_svmlight_file(path, n_features=N_FEATURES)

    # Collapse stored values to presence/absence indicators.
    features[features > 0] = 1

    # Build the mask once, before any write, so the second assignment is not
    # influenced by the first. Writing in place keeps the original dtype.
    positive = labels > 5
    labels[positive] = 1
    labels[~positive] = -1
    return features, labels


train_dataset, train_labels = _load_binarized('aclImdb/train/labeledBow.feat')
test_dataset, test_labels = _load_binarized('aclImdb/test/labeledBow.feat')
train_dataset

<25000x89527 sparse matrix of type '<class 'numpy.float64'>'
	with 3456685 stored elements in Compressed Sparse Row format>

In [3]:
def saveOutput(filename, filedata):
    """Write each item of ``filedata`` on its own line to ``output/<filename>``.

    Parameters
    ----------
    filename : str
        Name of the file to create (or overwrite) below the ``output/``
        directory, relative to the current working directory.
    filedata : iterable
        Items to write; each one is rendered the way ``print`` renders it.

    The destination directory is created when it does not exist yet, so the
    function also works on a fresh checkout with no ``output/`` folder.
    """
    import os  # local import keeps this cell runnable on its own

    target = 'output/' + filename
    # Without this, open() raises FileNotFoundError when the folder is missing.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with open(target, 'w') as f:
        for c in filedata:
            print(c, file=f)

## Naive Bayes

In [4]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier and label the test set.
clf = MultinomialNB()
clf.fit(train_dataset, train_labels)
predicted = clf.predict(test_dataset)

# Count the test samples whose predicted label equals the true label.
hit = sum(1 for guess, truth in zip(predicted, test_labels) if guess == truth)
accuracy = hit * 100 / len(test_labels)

print("Naive Bayes Accuracy: " + str(accuracy) + "%")
saveOutput('BBoW_NaiveBayes.txt', predicted)

Naive Bayes Accuracy: 82.992%


## Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression classifier and label the test set.
clf = LogisticRegression()
clf.fit(train_dataset, train_labels)
predicted = clf.predict(test_dataset)

# Count the test samples whose predicted label equals the true label.
hit = sum(1 for guess, truth in zip(predicted, test_labels) if guess == truth)
accuracy = hit * 100 / len(test_labels)

print("Logistic Regression Accuracy: " + str(accuracy) + "%")
saveOutput('BBoW_LR.txt', predicted)

Logistic Regression Accuracy: 86.972%


## SVM

In [None]:
from sklearn.svm import LinearSVC

# Train a linear support vector classifier and label the test set.
clf = LinearSVC().fit(train_dataset, train_labels)
predicted = clf.predict(test_dataset)

# Count the test samples whose predicted label equals the true label.
hit = sum(1 for guess, truth in zip(predicted, test_labels) if guess == truth)
accuracy = hit * 100 / len(test_labels)

print("SVM Accuracy: " + str(accuracy) + "%")
saveOutput('BBoW_SVM.txt', predicted)

SVM Accuracy: 85.272%


## Feed Forward Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

# Train a multi-layer perceptron with one hidden layer of 10 units, using
# the L-BFGS solver and a fixed random_state, then label the test set.
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(10,),
    random_state=1,
).fit(train_dataset, train_labels)
predicted = clf.predict(test_dataset)

# Count the test samples whose predicted label equals the true label.
hit = sum(1 for guess, truth in zip(predicted, test_labels) if guess == truth)
accuracy = hit * 100 / len(test_labels)

print("Feed Forward Neural Network Accuracy: " + str(accuracy) + "%")
saveOutput('BBoW_FFNN.txt', predicted)