## a. Download File And Load The Contents

In [236]:
def readFile(filename):
    data = []
    with open(filename,"r") as text_file:
        lines = text_file.read().split('\n')
        for line in lines:
            if line!='' and line.split('\t') and line.split("\t")!='' and len(line.split('\t'))==2:
                data.append(line.split('\t'))
    return data

imdbFilename = "./sentiment labelled sentences/imdb_labelled.txt"
imdbData = readFile(imdbFilename)
yelpFilename = "./sentiment labelled sentences/yelp_labelled.txt"
yelpData = readFile(yelpFilename)
amazonFilename = "./sentiment labelled sentences/amazon_cells_labelled.txt"
amazonData = readFile(amazonFilename)


## Check if Balanced

In [237]:
imdbData.sort(key=lambda x: x[1])
yelpData.sort(key=lambda x: x[1])
amazonData.sort(key=lambda x: x[1])

def check_balanced(zeros, ones):
    if(zeros == ones):
        print("Balanced")
    else:
        print("Zeros:", zeros, ",Ones:", ones)

imdbTrainingData = [data[0] for data in imdbData]
imdbLabels = [int(data[1]) for data in imdbData]
zerosImdb = imdbLabels.count(0)
onesImdb = imdbLabels.count(1)
check_balanced(zerosImdb, onesImdb)

yelpTrainingData = [data[0] for data in yelpData]
yelpLabels = [int(data[1]) for data in yelpData]
zerosYelp = yelpLabels.count(0)
oneYelp = yelpLabels.count(1)
check_balanced(zerosYelp, oneYelp)

amazonTrainingData = [data[0] for data in amazonData]
amazonLabels = [int(data[1]) for data in amazonData]
zerosAmazon = amazonLabels.count(0)
onesAmazon = amazonLabels.count(1)
check_balanced(zerosAmazon, onesAmazon)


Balanced
Balanced
Balanced


## b. Pre-Processing

In [238]:
#Pre-Processing
import nltk
import string
from nltk import word_tokenize,pos_tag
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer


def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

def clean_data(sentence):
    tempWord  = []
    sentence = sentence.lower()
    sentence = sentence.translate(str.maketrans('','',string.punctuation))
    tokens = nltk.wordpunct_tokenize(sentence)
    lmtzr = WordNetLemmatizer()
    for i,j in pos_tag(tokens):
        if i.isalpha():
            posValue = get_wordnet_pos(j[0])
            if posValue:
                word = "".join(lmtzr.lemmatize(i,pos=posValue))
            else:
                word = "".join(lmtzr.lemmatize(i))
            tempWord.append(word)
    tempWord = list(filter(lambda word: word not in stopwords.words('english'), tempWord))
    return " ".join(tempWord)
   

newImdbTrainingData = []
for lines in imdbTrainingData:
    newLine = clean_data(lines)
    newImdbTrainingData.append(newLine)

newYelpTrainingData = []
for lines in yelpTrainingData:
    newLine = clean_data(lines)
    newYelpTrainingData.append(newLine)

newAmazonTrainingData = []
for lines in amazonTrainingData:
    newLine = clean_data(lines)
    newAmazonTrainingData.append(newLine)

    

## c. Split training and test set

In [239]:
import numpy as np
import pandas as pd
trainingData = newImdbTrainingData[0:400] + newImdbTrainingData[500:900] + newYelpTrainingData[0:400] + newYelpTrainingData[500:900] + newAmazonTrainingData[0:400] + newAmazonTrainingData[500:900] 
trainingLabels = imdbLabels[0:400] + imdbLabels[500:900] + yelpLabels[0:400] + yelpLabels[500:900] + amazonLabels[0:400] + amazonLabels[500:900]
testData = newImdbTrainingData[400:500] + newImdbTrainingData[900:1000] + newYelpTrainingData[400:500] + newYelpTrainingData[900:1000] + newAmazonTrainingData[400:500] + newAmazonTrainingData[900:1000] 
testLabels = imdbLabels[400:500] + imdbLabels[900:1000] + yelpLabels[400:500] + yelpLabels[900:1000] + amazonLabels[400:500] + amazonLabels[900:1000]
fullData = trainingData + testData


## d. Bag of words

In [240]:
def wordTokenizer(text):
    return text.split(" ")

def uniqueWordDict(trainingData):
    words = set()
    for word in trainingData:
        for i in wordTokenizer(word):
            if i:
                words.add(i)
    words = list(words)
    return words
def bagOfWords(fullData):
    uniqueWords = uniqueWordDict(trainingData)
    X = np.zeros((len(fullData),len(uniqueWords)))
    for j in range(0, len(fullData)):
        review = wordTokenizer(fullData[j])
        
        for word in review:
            if word in uniqueWords:
                index = list(uniqueWords).index(word)
                X[j,index]+= 1
    return X
        
X_uniqueWords = bagOfWords(fullData)

In [241]:
X_train = X_uniqueWords[0:2400]
X_test = X_uniqueWords[2400:3000]
np.set_printoptions(threshold=np.nan)
print(X_train[0:2])

[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0. 

## e. Post Processing Strategy

In [242]:
## LOG NORMALIZATION
import math
X_normalized_train = X_train + 1
X_normalized_test = X_test + 1
X_normalized_train = np.log(X_normalized_train)
X_normalized_test = np.log(X_normalized_test)
print(X_normalized_test.shape)

(600, 3832)


In [156]:
## L2 NORMALIZATION
# from sklearn import preprocessing
# X_normalized_train = preprocessing.normalize(X_train, norm='l2')
# X_normalized_test = preprocessing.normalize(X_test, norm='l2')

## L1 NORMALIZATION
#X_normalized_train = preprocessing.normalize(X_train, norm='l1')
#X_normalized_test = preprocessing.normalize(X_test, norm='l1')

## f. Sentiment Prediction - Logistic Regression

In [243]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
#Perform Logistis Regression
logReg = LogisticRegression(class_weight="balanced")
logReg.fit(X_normalized_train,trainingLabels)
y_test = logReg.predict(X_normalized_test)
print("Accuracy Score of Logistic Regression on bag of words: ",accuracy_score(testLabels, y_test))
print("Confusion Matrix of Logistic Regression on bag of words: \n",confusion_matrix(testLabels, y_test))

Accuracy Score of Logistic Regression on bag of words:  0.821666666667
Confusion Matrix of Logistic Regression on bag of words: 
 [[258  42]
 [ 65 235]]


In [244]:
## Weight Vector, Most Important Words
coef = np.argsort(abs(logReg.coef_[0]))
keys=list(uniqueWordDict(trainingData))
impWords = []
for i in range(0, len(coef)):
    impWords.append(keys[coef[i]])
print("Important words from Logistic Regression on Bag of Words: \n",impWords[-20:])

Important words from Logistic Regression on Bag of Words: 
 ['disappointed', 'didnt', 'beautiful', 'awful', 'suck', 'happy', 'good', 'waste', 'terrible', 'awesome', 'amaze', 'best', 'fantastic', 'nice', 'poor', 'excellent', 'delicious', 'love', 'bad', 'great']


## f. Sentiment Prediction - Naive Bayes

In [245]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB
clf = MultinomialNB().fit(X_normalized_train,trainingLabels)
y_test = clf.predict(X_normalized_test)
print("Accuracy Score of Logistic Regression on bag of words: ",accuracy_score(testLabels, y_test))
print("Confusion Matrix of Logistic Regression on bag of words:\n ",confusion_matrix(testLabels, y_test))

coef = np.argsort(abs(clf.coef_[0]))
keys=list(uniqueWordDict(trainingData))
impWords = []
for i in range(0, len(coef)):
    impWords.append(keys[coef[i]])
print("Important words from NB on Bag of Words: \n",impWords[-20:])

Accuracy Score of Logistic Regression on bag of words:  0.806666666667
Confusion Matrix of Logistic Regression on bag of words:
  [[253  47]
 [ 69 231]]
Important words from NB on Bag of Words: 
 ['foot', 'mechanism', 'medical', 'clichã', 'ironside', 'misleading', 'borrow', 'bug', 'predictably', 'lame', 'um', 'babble', 'pen', 'earpad', 'wave', 'annoy', 'inexplicable', 'james', 'pandering', 'facial']


## g. N-gram Model

In [246]:
def uniqueWordDictNgram(trainingData):
    words = set()
    for word in trainingData:
        ngram = find_ngrams(wordTokenizer(word),2)
        for z in ngram:
            words.add(z)
    words = list(words)
    return words

def find_ngrams(uniqueWords, n):
    ngrams = zip(*[uniqueWords[i:] for i in range(n)])
    return [' '.join(x) for x in ngrams]
               

def bagOfWords(fullData):
    uniqueNGrams = uniqueWordDictNgram(trainingData)
    X = np.zeros((len(fullData),len(uniqueNGrams)))
    for j in range(0, len(fullData)):
        review = wordTokenizer(fullData[j])
        review_ngram=find_ngrams(review,2)
        for word in review_ngram:
            if word in uniqueNGrams:
                index = list(uniqueNGrams).index(word)
                #print(index)
                X[j,index]+= 1
    return X,uniqueNGrams
        
X_uniqueWords_ngram, uniqueNGrams = bagOfWords(fullData)

In [247]:
print(uniqueNGrams)



## g. N-Gram - Logistic, Naive

In [248]:
X_train = X_uniqueWords_ngram[0:2400]
X_test = X_uniqueWords_ngram[2400:3000]
X_normalized_train = X_train + 1
X_normalized_test = X_test + 1
X_normalized_train = np.log(X_normalized_train)
X_normalized_test = np.log(X_normalized_test)

logReg = LogisticRegression(class_weight="balanced")
logReg.fit(X_normalized_train,trainingLabels)
y_test = logReg.predict(X_normalized_test)
print("Accuracy of N-Gram using Logistic Regression: ",accuracy_score(testLabels, y_test))
print("Confusion Matrix:\n ",confusion_matrix(testLabels, y_test))


coef = np.argsort(abs(logReg.coef_[0]))
keys=list(uniqueNGrams)
impWords = []
for i in range(0, len(coef)):
    impWords.append(keys[coef[i]])
print("Important words from Logistic Regression on Bag of Words: \n",impWords[-20:])

clf = MultinomialNB().fit(X_normalized_train,trainingLabels)
y_test = clf.predict(X_normalized_test)
print("Accuracy of N-Gram using NB: ",accuracy_score(testLabels, y_test))
print("Confusion Matrix:\n",confusion_matrix(testLabels, y_test))

coef = np.argsort(abs(clf.coef_[0]))
keys=list(uniqueNGrams)
impWords = []
for i in range(0, len(coef)):
    impWords.append(keys[coef[i]])
print("Important words from NB on Bag of Words: \n",impWords[-20:])

Accuracy of N-Gram using Logistic Regression:  0.635
Confusion Matrix:
  [[269  31]
 [188 112]]
Important words from Logistic Regression on Bag of Words: 
 ['love place', 'wont disappoint', 'dont think', 'customer service', 'food delicious', 'great service', 'good price', 'dont buy', 'great food', 'easy use', 'waste money', 'really good', 'dont waste', 'food good', 'great product', 'great phone', 'one best', 'waste time', 'highly recommend', 'work great']
Accuracy of N-Gram using NB:  0.635
Confusion Matrix:
 [[271  29]
 [190 110]]
Important words from NB on Bag of Words: 
 ['oconnor energetic', 'purpose bluetooth', 'total letdown', 'could flush', 'exaggerate every', 'wifi notice', 'masculinity pledge', 'friend like', 'need water', 'vegetable overcook', 'mollusk see', 'movie avoid', 'tool communicate', 'minute finally', 'please stay', 'overall tremendously', 'note plug', 'charger car', 'issue methe', 'hour operation']


## h. PCA for bag of words

In [249]:
X_train = X_uniqueWords[0:2400]
X_test = X_uniqueWords[2400:3000]
X_normalized_train = X_train + 1
X_normalized_test = X_test + 1
X_normalized_train = np.log(X_normalized_train)
X_normalized_test = np.log(X_normalized_test)


def PCA(X,Xtest,r):
    mean = np.mean(X, axis=0)
    meanSubtractionTrain = np.asarray(X-mean)
    meanTest = np.asarray(Xtest-mean)
    U,D,Vt = np.linalg.svd(meanSubtractionTrain)
    def features(r):
        Vtr = Vt[:r,:]
        F = meanSubtractionTrain@Vtr.T
        F_test = meanTest@Vtr.T
        return F,F_test
    return features(r)

F,F_test = PCA(X_normalized_train,X_normalized_test,10)
F50,F50_test = PCA(X_normalized_train,X_normalized_test,50)
F100,F100_test = PCA(X_normalized_train,X_normalized_test,100)


In [164]:
# # Comparison with actual PCA
# from sklearn.decomposition import PCA
# pca = PCA(n_components=10)
# pca.fit(X_normalized_train)
# F = pca.transform(X_normalized_train)
# F_test = pca.transform(X_normalized_test)
# logReg = LogisticRegression(class_weight="balanced")
# logReg.fit(F,trainingLabels)
# y_test = logReg.predict(F_test)
# print(accuracy_score(testLabels, y_test))
# print(confusion_matrix(testLabels, y_test))

In [250]:
#Repeat with f. Logistic
logReg = LogisticRegression(class_weight="balanced")
logReg.fit(F,trainingLabels)
y_test = logReg.predict(F_test)
print("Accuracy PCA 10: ",accuracy_score(testLabels, y_test))
print("Confusion Matrix PCA 10:\n",confusion_matrix(testLabels, y_test))

coef = np.argsort(abs(logReg.coef_[0]))
keys=list(uniqueWordDict(trainingData))
impWords = []
for i in range(0, len(coef)):
    impWords.append(keys[coef[i]])
print("Important words from Logistic Regression PCA 10: \n",impWords[-20:])


#R=50
logReg = LogisticRegression(class_weight="balanced")
logReg.fit(F50,trainingLabels)
y_test = logReg.predict(F50_test)
print("Accuracy PCA 50: ",accuracy_score(testLabels, y_test))
print("Confusion Matrix PCA 50:\n",confusion_matrix(testLabels, y_test))

coef = np.argsort(abs(logReg.coef_[0]))
keys=list(uniqueWordDict(trainingData))
impWords = []
for i in range(0, len(coef)):
    impWords.append(keys[coef[i]])
print("Important words from Logistic Regression PCA 50: \n",impWords[-20:])

#R=100
logReg = LogisticRegression(class_weight="balanced")
logReg.fit(F100,trainingLabels)
y_test = logReg.predict(F100_test)
print("Accuracy PCA 100: ",accuracy_score(testLabels, y_test))
print("Confusion Matrix PCA 100:\n",confusion_matrix(testLabels, y_test))

coef = np.argsort(abs(logReg.coef_[0]))
keys=list(uniqueWordDict(trainingData))
impWords = []
for i in range(0, len(coef)):
    impWords.append(keys[coef[i]])
print("Important words from Logistic Regression PCA 100: \n",impWords[-20:])


print(confusion_matrix(testLabels, y_test))

Accuracy PCA 10:  0.641666666667
Confusion Matrix PCA 10:
 [[247  53]
 [162 138]]
Important words from Logistic Regression PCA 10: 
 ['beateous', 'evoke', 'road', 'installed', 'turn', 'spy', 'possibly', 'incorrectness', 'fond', 'china']
Accuracy PCA 50:  0.695
Confusion Matrix PCA 50:
 [[254  46]
 [137 163]]
Important words from Logistic Regression PCA 50: 
 ['funny', 'man', 'motorolas', 'english', 'turn', 'row', 'sauce', 'interesting', 'spy', 'splendid', 'scar', 'redeeming', 'possibly', 'charisma', 'max', 'preferably', 'typical', 'incorrectness', 'fond', 'china']
Accuracy PCA 100:  0.686666666667
Confusion Matrix PCA 100:
 [[242  58]
 [130 170]]
Important words from Logistic Regression PCA 100: 
 ['turn', 'difficult', 'unaccompanied', 'row', 'exploit', 'interesting', 'sauce', 'spy', 'number', 'splendid', 'scar', 'possibly', 'max', 'redeeming', 'typical', 'charisma', 'preferably', 'incorrectness', 'fond', 'china']
[[242  58]
 [130 170]]


In [251]:
#Repeat with f. Naive Bayes
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB().fit(F,trainingLabels)
y_test = clf.predict(F_test)
print("Accuracy PCA 10: ",accuracy_score(testLabels, y_test))
print("Confusion Matrix PCA 10:\n",confusion_matrix(testLabels, y_test))

coef = np.argsort(abs(clf.theta_[0]))
keys=list(uniqueWordDict(trainingData))
impWords = []
for i in range(0, len(coef)):
    impWords.append(keys[coef[i]])
print("Important words from NB PCA 10: \n",impWords[-20:])

clf = GaussianNB().fit(F50,trainingLabels)
y_test = clf.predict(F50_test)
print("Accuracy PCA 50: ",accuracy_score(testLabels, y_test))
print("Confusion Matrix PCA 50:\n",confusion_matrix(testLabels, y_test))

coef = np.argsort(abs(clf.theta_[0]))
keys=list(uniqueWordDict(trainingData))
impWords = []
for i in range(0, len(coef)):
    impWords.append(keys[coef[i]])
print("Important words from NB PCA 50: \n",impWords[-20:])

clf = GaussianNB().fit(F100,trainingLabels)
y_test = clf.predict(F100_test)
print("Accuracy PCA 100: ",accuracy_score(testLabels, y_test))
print("Confusion Matrix PCA 100:\n",confusion_matrix(testLabels, y_test))

coef = np.argsort(abs(clf.theta_[0]))
keys=list(uniqueWordDict(trainingData))
impWords = []
for i in range(0, len(coef)):
    impWords.append(keys[coef[i]])
print("Important words from NB PCA 100: \n",impWords[-20:])



Accuracy PCA 10:  0.598333333333
Confusion Matrix PCA 10:
 [[246  54]
 [187 113]]
Important words from NB PCA 10: 
 ['beateous', 'evoke', 'road', 'installed', 'spy', 'possibly', 'incorrectness', 'turn', 'fond', 'china']
Accuracy PCA 50:  0.663333333333
Confusion Matrix PCA 50:
 [[250  50]
 [152 148]]
Important words from NB PCA 50: 
 ['motorolas', 'installed', 'transfer', 'english', 'elias', 'max', 'row', 'redeeming', 'sauce', 'charisma', 'preferably', 'typical', 'interesting', 'scar', 'spy', 'possibly', 'incorrectness', 'turn', 'fond', 'china']
Accuracy PCA 100:  0.661666666667
Confusion Matrix PCA 100:
 [[250  50]
 [153 147]]
Important words from NB PCA 100: 
 ['motorolas', 'installed', 'transfer', 'english', 'elias', 'max', 'row', 'redeeming', 'sauce', 'charisma', 'preferably', 'typical', 'interesting', 'scar', 'spy', 'possibly', 'incorrectness', 'turn', 'fond', 'china']


In [252]:
#PCA with N-gram
X_train = X_uniqueWords_ngram[0:2400]
X_test = X_uniqueWords_ngram[2400:3000]
X_normalized_train = X_train + 1
X_normalized_test = X_test + 1
X_normalized_train = np.log(X_normalized_train)
X_normalized_test = np.log(X_normalized_test)
F_ngram,F_test_ngram = PCA(X_normalized_train,X_normalized_test,10)
F50_ngram,F50_test_ngram = PCA(X_normalized_train,X_normalized_test,50)
F100_ngram,F100_test_ngram = PCA(X_normalized_train,X_normalized_test,100)

#Repeat with f. Logistic
logReg = LogisticRegression(class_weight="balanced")
logReg.fit(F_ngram,trainingLabels)
y_test = logReg.predict(F_test_ngram)
print("Accuracy for N-gram PCA 10: ",accuracy_score(testLabels, y_test))
print("Confusion Matrix for N-gram PCA 10:\n",confusion_matrix(testLabels, y_test))

coef = np.argsort(abs(logReg.coef_[0]))
keys=list(uniqueNGrams)
impWords = []
for i in range(0, len(coef)):
    impWords.append(keys[coef[i]])
print("Important words from Log Reg on N-grams PCA 10: \n",impWords[-20:])

#R=50
logReg = LogisticRegression(class_weight="balanced")
logReg.fit(F50_ngram,trainingLabels)
y_test = logReg.predict(F50_test_ngram)
print("Accuracy for N-gram PCA 50: ",accuracy_score(testLabels, y_test))
print("Confusion Matrix for N-gram PCA 50:\n",confusion_matrix(testLabels, y_test))

coef = np.argsort(abs(logReg.coef_[0]))
keys=list(uniqueNGrams)
impWords = []
for i in range(0, len(coef)):
    impWords.append(keys[coef[i]])
print("Important words from Log Reg on N-grams PCA 50: \n",impWords[-20:])


#R=100
logReg = LogisticRegression(class_weight="balanced")
logReg.fit(F100_ngram,trainingLabels)
y_test = logReg.predict(F100_test_ngram)
print("Accuracy for N-gram PCA 100: ",accuracy_score(testLabels, y_test))
print("Confusion Matrix for N-gram PCA 100:\n",confusion_matrix(testLabels, y_test))

coef = np.argsort(abs(logReg.coef_[0]))
keys=list(uniqueNGrams)
impWords = []
for i in range(0, len(coef)):
    impWords.append(keys[coef[i]])
print("Important words from Log Reg on N-grams PCA 100: \n",impWords[-20:])

print(confusion_matrix(testLabels, y_test))

Accuracy for N-gram PCA 10:  0.52
Confusion Matrix for N-gram PCA 10:
 [[296   4]
 [284  16]]
Important words from Log Reg on N-grams PCA 10: 
 ['century fox', 'month pro', 'ups end', 'theatrical feel', 'like morgan', 'snow possibly', 'today sample', 'role gently', 'pull plug', 'earbud headset']
Accuracy for N-gram PCA 50:  0.53
Confusion Matrix for N-gram PCA 50:
 [[291   9]
 [273  27]]
Important words from Log Reg on N-grams PCA 50: 
 ['unhappy immediately', 'snow possibly', 'killer next', 'bachi burger', 'love closeup', 'flawlessly moto', 'freak sandwich', 'today sample', 'intention make', 'part conversation', 'eyed pea', 'good others', 'role gently', 'pull plug', 'plantronics bluetooth', 'imitation type', 'mention huge', 'one simply', 'potentially fry', 'earbud headset']
Accuracy for N-gram PCA 100:  0.536666666667
Confusion Matrix for N-gram PCA 100:
 [[284  16]
 [262  38]]
Important words from Log Reg on N-grams PCA 100: 
 ['sure never', 'part conversation', 'never treat', 'eyed 

In [254]:
#Repeat with f. Naive Bayes
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB().fit(F_ngram,trainingLabels)
y_test = clf.predict(F_test_ngram)
print("Accuracy for N-gram PCA 10: ",accuracy_score(testLabels, y_test))
print("Confusion Matrix for N-gram PCA 10:\n",confusion_matrix(testLabels, y_test))

coef = np.argsort(abs(clf.theta_[0]))
keys=list(uniqueNGrams)
impWords = []
for i in range(0, len(coef)):
    impWords.append(keys[coef[i]])
print("Important words from NB on N-grams PCA 10: \n",impWords[-20:])


clf = GaussianNB().fit(F50_ngram,trainingLabels)
y_test = clf.predict(F50_test_ngram)
print("Accuracy for N-gram PCA 50: ",accuracy_score(testLabels, y_test))
print("Confusion Matrix for N-gram PCA 50:\n",confusion_matrix(testLabels, y_test))

coef = np.argsort(abs(clf.theta_[0]))
keys=list(uniqueNGrams)
impWords = []
for i in range(0, len(coef)):
    impWords.append(keys[coef[i]])
print("Important words from NB on N-grams PCA 50: \n",impWords[-20:])


clf = GaussianNB().fit(F100_ngram,trainingLabels)
y_test = clf.predict(F100_test_ngram)
print("Accuracy for N-gram PCA 100: ",accuracy_score(testLabels, y_test))
print("Confusion Matrix for N-gram PCA 100:\n",confusion_matrix(testLabels, y_test))

coef = np.argsort(abs(clf.theta_[0]))
keys=list(uniqueNGrams)
impWords = []
for i in range(0, len(coef)):
    impWords.append(keys[coef[i]])
print("Important words from NB on N-grams PCA 100: \n",impWords[-20:])


Accuracy for N-gram PCA 10:  0.505
Confusion Matrix for N-gram PCA 10:
 [[300   0]
 [297   3]]
Important words from NB on N-grams PCA 10: 
 ['century fox', 'month pro', 'theatrical feel', 'ups end', 'like morgan', 'snow possibly', 'pull plug', 'role gently', 'today sample', 'earbud headset']
Accuracy for N-gram PCA 50:  0.513333333333
Confusion Matrix for N-gram PCA 50:
 [[299   1]
 [291   9]]
Important words from NB on N-grams PCA 50: 
 ['killer next', 'freak sandwich', 'intention make', 'ups end', 'bachi burger', 'like morgan', 'flawlessly moto', 'part conversation', 'snow possibly', 'good others', 'eyed pea', 'pull plug', 'role gently', 'plantronics bluetooth', 'today sample', 'imitation type', 'mention huge', 'one simply', 'potentially fry', 'earbud headset']
Accuracy for N-gram PCA 100:  0.521666666667
Confusion Matrix for N-gram PCA 100:
 [[300   0]
 [287  13]]
Important words from NB on N-grams PCA 100: 
 ['use antena', 'low quality', 'flawlessly moto', 'part conversation', 'sno

In [169]:
#Logistic Regression on simple bag of words performs the best, followed by Naive Bayes on Bag of Words
#Bag of Words > PCA > N-Gram