In [52]:
import re
import numpy as np
from matplotlib import pylab as plt

stop_words = {'the', 'and', 'or', 'with', 'its', "it's"}

def get_data(filename):
    data_train = []
    data_test = []
    labels_train = []
    labels_test = []
    num_p_labels = 0
    num_n_labels = 0
    for line in open(filename):
        l = line[-2]
        d = process_line(line[:len(line)-3])
        is_positive = l == '1'
        is_negative = l == '0'
        if is_positive:
            num_p_labels += 1
            if num_p_labels <= 400:
                data_train.append(d)
                labels_train.append(l)
            else:
                data_test.append(d)
                labels_test.append(l)
        elif is_negative:
            num_n_labels += 1
            if num_n_labels <= 400:
                data_train.append(d)
                labels_train.append(l)
            else:
                data_test.append(d)
                labels_test.append(l)
        else:
            raise Exception('Bad label found')

    print('Positive labels: {0}'.format(num_p_labels))
    print('Negative labels: {0}'.format(num_n_labels))
    return np.array(data_train), np.array(labels_train),\
           np.array(data_test), np.array(labels_test)

In [53]:
import string
def process_line(line):
    l = line.lower()
    l = l.translate(None, string.punctuation)
    
    # get all of the words
    word_tokens = l.split(' ')
    ret_l = ''
    for word in word_tokens:
        if word not in stop_words: # get rid of stopwords
            ret_l += word + ' '

    ret_l = ret_l[:len(ret_l)-1] # get rid of the trailing space
    return ret_l

In [54]:
print('Loading data...')
am_trd, am_trl, am_ted, am_tld = get_data('./sentiment labelled sentences/amazon_cells_labelled.txt')
im_trd, im_trl, im_ted, im_tld = get_data('./sentiment labelled sentences/imdb_labelled.txt')
ye_trd, ye_trl, ye_ted, ye_tld = get_data('./sentiment labelled sentences/yelp_labelled.txt')

print('')
print('Training data: {0}'.format(len(am_trd) + len(im_trd) + len(ye_trd)))
print('Testing data: {0}'.format(len(am_ted) + len(im_ted) + len(ye_ted)))

Loading data...
Positive labels: 500
Negative labels: 500
Positive labels: 500
Negative labels: 500
Positive labels: 500
Negative labels: 500

Training data: 2400
Testing data: 600


In [55]:
index_to_word = {}
def gather_features(feature_map, d1, d2, d3):
    for ds in (d1, d2, d3):
        for d in ds:
            sl = d.split(' ')
            for word in sl:
                if word not in feature_map:
                    ind = len(features)
                    feature_map[word] = ind
                    index_to_word[ind] = word
                
features = {}
gather_features(features, am_trd, im_trd, ye_trd)
print('Number of features: {0}'.format(len(features)))

Number of features: 4706


In [56]:
def convert_bag_of_words(feature_set, data):
    ret = []
    for d in data:
        bow = np.zeros(len(feature_set))
        line = d.split(' ')
        for word in line:
            if word in features:
                bow[feature_set[word]] += 1
        ret.append(bow)
    return np.array(ret)

amc_trd = convert_bag_of_words(features, am_trd)
amc_ted = convert_bag_of_words(features, am_ted)
imc_trd = convert_bag_of_words(features, im_trd)
imc_ted = convert_bag_of_words(features, im_ted)
yec_trd = convert_bag_of_words(features, ye_trd)
yec_ted = convert_bag_of_words(features, ye_ted)

In [57]:
"""
Part 1.E
"""
def postprocess(train1, train2, train3, test1, test2, test3):
    return train1, train2, train3, test1, test2, test3

amp_trd, imp_trd, yep_trd, amp_ted, imp_ted, yep_ted = postprocess(amc_trd, imc_trd, yec_trd, amc_ted, imc_ted, yec_ted)

In [58]:
"""
Part 1.F
"""
from sklearn.linear_model import LogisticRegression
def classify_logistic_regression(training_set, training_labels, testing_set, testing_labels, ind_to_words=None, print_top_words=True):
    tp = 0.0
    tn = 0.0
    fp = 0.0
    fn = 0.0
    
    lr = LogisticRegression().fit(training_set, training_labels)
    guesses = lr.predict(testing_set)
    for ind in xrange(len(guesses)):
        label = guesses[ind]
        tl = testing_labels[ind]
        correct = tl == label
        if label == '1':
            if correct: tp += 1.0
            else: fp += 1.0
        else:
            if correct: tn += 1.0
            else: fn += 1.0
    
    if print_top_words:
        weight_vector = np.abs(lr.coef_)
        sorted_index = np.argsort(weight_vector)[::-1]
        print('Top 5 weighted words: ')
        for i in range(5):
            print ind_to_words[sorted_index[0][i]]
    
    print("True Positive: {0}".format(tp))
    print("False Positive: {0}".format(fp))    
    print("True Negative: {0}".format(tn))    
    print("False Negative: {0}".format(fn))    
    print("Accuracy: {0}".format((tp + tn)/(tp + tn + fp + fn)))

classify_logistic_regression(np.concatenate((amp_trd, imp_trd, yep_trd)), 
                             np.concatenate((am_trl, im_trl, ye_trl)),
                             np.concatenate((amp_ted, imp_ted, yep_ted)),
                             np.concatenate((am_tld, im_tld, ye_tld)), index_to_word)

Top 5 weighted words: 
reviewing
reasons
liking
2mp
pics
True Positive: 238.0
False Positive: 45.0
True Negative: 255.0
False Negative: 62.0
Accuracy: 0.821666666667


In [59]:
"""
Part 1.F
"""
from sklearn.naive_bayes import GaussianNB
def classify_naive_bayes(training_set, training_labels, testing_set, testing_labels):
    tp = 0.0
    tn = 0.0
    fp = 0.0
    fn = 0.0
    
    nb = GaussianNB().fit(training_set, training_labels)
    guesses = nb.predict(testing_set)
    for ind in xrange(len(guesses)):
        label = guesses[ind]
        tl = testing_labels[ind]
        correct = tl == label
        if label == '1':
            if correct: tp += 1.0
            else: fp += 1.0
        else:
            if correct: tn += 1.0
            else: fn += 1.0
            
    print("True Positive: {0}".format(tp))
    print("False Positive: {0}".format(fp))    
    print("True Negative: {0}".format(tn))    
    print("False Negative: {0}".format(fn))    
    print("Accuracy: {0}".format((tp + tn)/(tp + tn + fp + fn)))

classify_naive_bayes(np.concatenate((amp_trd, imp_trd, yep_trd)), 
                     np.concatenate((am_trl, im_trl, ye_trl)),
                     np.concatenate((amp_ted, imp_ted, yep_ted)),
                     np.concatenate((am_tld, im_tld, ye_tld)))

True Positive: 154.0
False Positive: 39.0
True Negative: 261.0
False Negative: 146.0
Accuracy: 0.691666666667


In [60]:
"""
Part 1.G
"""
def get_2_grams(line):
    ret = []
    words = line.split(' ')
    if len(words) == 0: return []
    if len(words) == 1: return words
    
    w1 = words[0]
    w2 = words[1]
    for i in xrange(3, len(words)):
        new_word = w1 + ' ' + w2
        ret.append(new_word)
        w1 = words[i-2]
        w2 = words[i-1]
    return ret

index_to_grams = {}
def get_2_gram_features(feature_set, data):
    for d in data:
        for p in get_2_grams(d):
            if p not in feature_set:
                ind = len(feature_set)
                feature_set[p] = ind
                index_to_grams[ind] = p
                
    
def convert_2_gram(feature_set, data):
    ret = []
    for d in data:
        gram = np.zeros(len(feature_set))
        for p in get_2_grams(d):
            if p in feature_set:
                gram[feature_set[p]] += 1
        ret.append(gram)
    return np.array(ret)

two_gram_features = {}
get_2_gram_features(two_gram_features, np.concatenate((am_trd, im_trd, ye_trd)))
print(len(two_gram_features))

amg_trd = convert_2_gram(two_gram_features, am_trd)
amg_ted = convert_2_gram(two_gram_features, am_ted)
img_trd = convert_2_gram(two_gram_features, im_trd)
img_ted = convert_2_gram(two_gram_features, im_ted)
yeg_trd = convert_2_gram(two_gram_features, ye_trd)
yeg_ted = convert_2_gram(two_gram_features, ye_ted)

print('\r\nLogistic Regression:')
classify_logistic_regression(np.concatenate((amg_trd, img_trd, yeg_trd)), 
                             np.concatenate((am_trl, im_trl, ye_trl)),
                             np.concatenate((amg_ted, img_ted, yeg_ted)),
                             np.concatenate((am_tld, im_tld, ye_tld)), index_to_grams)

print('\r\nNaive Bayes:')
classify_naive_bayes(np.concatenate((amg_trd, img_trd, yeg_trd)), 
                     np.concatenate((am_trl, im_trl, ye_trl)),
                     np.concatenate((amg_ted, img_ted, yeg_ted)),
                     np.concatenate((am_tld, im_tld, ye_tld)))

14438

Logistic Regression:
Top 5 weighted words: 
tell you
not having
his most
who is
he came
True Positive: 220.0
False Positive: 123.0
True Negative: 177.0
False Negative: 80.0
Accuracy: 0.661666666667

Naive Bayes:
True Positive: 247.0
False Positive: 166.0
True Negative: 134.0
False Negative: 53.0
Accuracy: 0.635


In [61]:
"""
Part 1.H
"""
from numpy.linalg import svd

def get_Vt(data):
    _,_,vt = np.linalg.svd(data, full_matrices=True)
    return vt

print("Getting Vts")
vt_1 = get_Vt(np.concatenate((amp_trd, imp_trd, yep_trd)))
vt_2 = get_Vt(np.concatenate((amg_trd, img_trd, yeg_trd)))

Getting Vts


In [63]:
def pca(rank, Vt, data):
    return data.dot(Vt[:,:rank])

d1 = [amp_trd, imp_trd, yep_trd, amp_ted, imp_ted, yep_ted]
d2 = [amg_trd, img_trd, yeg_trd, amg_ted, img_ted, yeg_ted]
for Vt, message, ds in [(vt_1, '\r\nFor standard data', d1),(vt_2, '\r\nFor N-gram data', d2)]:
    print(message)
    for rank in (10, 50, 100, 600):
        print('PCA rank {0}:'.format(rank))
        t1r = pca(rank, Vt, ds[0])
        t2r = pca(rank, Vt, ds[1])
        t3r = pca(rank, Vt, ds[2])
        te1r = pca(rank, Vt, ds[3])
        te2r = pca(rank, Vt, ds[4])
        te3r = pca(rank, Vt, ds[5])

        classify_logistic_regression(np.concatenate((t1r, t2r, t3r)), 
                                     np.concatenate((am_trl, im_trl, ye_trl)),
                                     np.concatenate((te1r, te2r, te3r)),
                                     np.concatenate((am_tld, im_tld, ye_tld)), None, False)


For standard data
PCA rank 10:
True Positive: 143.0
False Positive: 125.0
True Negative: 175.0
False Negative: 157.0
Accuracy: 0.53
PCA rank 50:
True Positive: 154.0
False Positive: 108.0
True Negative: 192.0
False Negative: 146.0
Accuracy: 0.576666666667
PCA rank 100:
True Positive: 166.0
False Positive: 86.0
True Negative: 214.0
False Negative: 134.0
Accuracy: 0.633333333333
PCA rank 600:
True Positive: 222.0
False Positive: 60.0
True Negative: 240.0
False Negative: 78.0
Accuracy: 0.77

For N-gram data
PCA rank 10:
True Positive: 109.0
False Positive: 97.0
True Negative: 203.0
False Negative: 191.0
Accuracy: 0.52
PCA rank 50:
True Positive: 120.0
False Positive: 116.0
True Negative: 184.0
False Negative: 180.0
Accuracy: 0.506666666667
PCA rank 100:
True Positive: 132.0
False Positive: 107.0
True Negative: 193.0
False Negative: 168.0
Accuracy: 0.541666666667
PCA rank 600:
True Positive: 212.0
False Positive: 151.0
True Negative: 149.0
False Negative: 88.0
Accuracy: 0.601666666667
