# Sentiment Analysis

In [30]:
import os
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer # we won't be doing stemming and rather perform lemmatizing
from nltk.stem import WordNetLemmatizer


from sklearn.metrics import accuracy_score

In [31]:

# the sentiments we want to analyze
sentiments = ['pos', 'neg']
# the raw data we will store in an object for now
data = {}

# set of stopwords, we want to remove
stop_words = set(stopwords.words('english'))

# our lemmatizer
lemmatizer = WordNetLemmatizer()

for i, sentiment in enumerate(sentiments):
    files = os.listdir('./movie_review/txt_sentoken/' + sentiment)
    data[sentiment] = []
    for i, filename in enumerate(files):
        data[sentiment].append([])
        with open('./movie_review/txt_sentoken/' + sentiment + '/' + filename) as f:
            review = f.read()
            words = word_tokenize(review)

            for word in words:
                if word not in stop_words and word.isalpha():
                    token = str(lemmatizer.lemmatize(word))
                    data[sentiment][i].append(token)

# -> cleaned data - removed stop words and punctuation

In [32]:
# read the positive and negative vocab
with open('./opinion-lexicon/negative-words.txt') as f:
    negative_words = f.read()
    negative_words = negative_words.split("\n")
    neg_vocab = negative_words[:-1] # we do not want the last empty line
    
with open('./opinion-lexicon/positive-words.txt') as f:
    positive_words = f.read()
    positive_words = positive_words.split("\n")
    pos_vocab = positive_words[:-1] # we do not want the last empty line

In [33]:
# compute the bag of words
statistics = {}
for i, sentiment in enumerate(sentiments):
    statistics[sentiment] = []
    for j, review in enumerate(data[sentiment]):
        bag_pos = np.zeros(len(pos_vocab))
        bag_neg = np.zeros(len(neg_vocab))
        for k, word in enumerate(data[sentiment][j]):
            for l, vocab in enumerate(neg_vocab):
                if word == vocab:
                    bag_neg[l] += 1
            for m, vocab in enumerate(pos_vocab):
                if word == vocab:
                    bag_pos[m] += 1
        
        statistics[sentiment].append({
            'bag_of_words_positive': bag_pos,
            'bag_of_words_negative': bag_neg
        })

In [34]:
def classify_movie(bag_pos, bag_neg, actual):
    positive_rate = np.sum(bag_pos)
    negative_rate = np.sum(bag_neg)

    if positive_rate >= negative_rate:
        correct = 1 if 'pos' == actual else 0
        guess = 1
        target = 1 if 'pos' == actual else -1
        return np.concatenate((bag_pos, bag_neg, np.array([target, guess, correct])))
    elif positive_rate < negative_rate:
        correct = 1 if 'neg' == actual else 0
        guess = -1
        target = 1 if 'pos' == actual else -1
        return np.concatenate((bag_pos, bag_neg, np.array([target, guess, correct])))

    raise Exception("Ui, error!")    

In [35]:
d = pd.DataFrame()
correct_column = len(pos_vocab) + len(neg_vocab) + 2
target_column = len(pos_vocab) + len(neg_vocab)
for i, sentiment in enumerate(sentiments):
    for j, review in enumerate(statistics[sentiment]):
        v = classify_movie(statistics[sentiment][j]['bag_of_words_positive'], statistics[sentiment][j]['bag_of_words_negative'], sentiment)
        d = d.append(pd.DataFrame(v.reshape(-1, len(v))))

In [36]:
correct_guesses = d[d[correct_column] == 1].shape[0]
print("Accuracy: %.4f" % (correct_guesses/d.shape[0]))

Accuracy: 0.7015


In [37]:
true_positives = d[(d[correct_column] == 1) & (d[target_column] == 1)].shape[0]
false_positive = d[(d[correct_column] == 0) & (d[target_column] == -1)].shape[0]
print("Precision: %.4f" % (true_positives/(true_positives + false_positive)))

Precision: 0.7251


In [38]:
true_positives = d[(d[correct_column] == 1) & (d[target_column] == 1)].shape[0]
false_negatives = d[(d[correct_column] == 0) & (d[target_column] == 1)].shape[0]
print("Recall: %.4f" % (true_positives/(true_positives + false_negatives)))
print("F-Measure: %.4f" % (2. / (1./(true_positive/(true_positive + false_positive)) + 1. /(true_positive/(true_positive + false_negative)))) ) 

Recall: 0.6490
F-Measure: 0.5229


## Discriminative Classifier

In [39]:
from sklearn.linear_model import LogisticRegression

In [40]:
shuffled_d = d.sample(frac=1)
test_bench = shuffled_d.iloc[:400]

In [41]:
training_bench = shuffled_d.iloc[400:]

In [42]:
y = training_bench.filter(items=[target_column])
X = training_bench.drop(d.columns[-3:],axis=1)

In [43]:
discriminative_clf = LogisticRegression().fit(X,y[target_column])

In [44]:
X_prime = test_bench.drop(d.columns[-3:],axis=1)
y_prime = test_bench.filter(items=[target_column])
discriminative_clf.predict(X_prime)

array([ 1., -1., -1., -1., -1.,  1., -1., -1.,  1.,  1., -1.,  1., -1.,
        1., -1.,  1.,  1., -1.,  1.,  1., -1.,  1., -1.,  1.,  1., -1.,
       -1., -1., -1.,  1.,  1., -1.,  1.,  1.,  1., -1.,  1., -1.,  1.,
        1., -1.,  1.,  1.,  1.,  1., -1., -1., -1., -1.,  1., -1., -1.,
       -1., -1., -1., -1.,  1.,  1., -1., -1.,  1.,  1., -1., -1.,  1.,
       -1.,  1., -1.,  1., -1.,  1., -1., -1.,  1., -1.,  1., -1.,  1.,
        1.,  1., -1.,  1.,  1.,  1., -1.,  1.,  1., -1.,  1.,  1., -1.,
       -1.,  1., -1., -1., -1.,  1., -1.,  1., -1., -1.,  1., -1.,  1.,
       -1., -1.,  1., -1., -1., -1.,  1., -1.,  1., -1.,  1., -1., -1.,
       -1., -1.,  1., -1., -1., -1., -1.,  1.,  1.,  1.,  1.,  1., -1.,
       -1.,  1., -1.,  1., -1., -1.,  1., -1., -1.,  1.,  1.,  1.,  1.,
       -1.,  1.,  1., -1., -1., -1.,  1.,  1., -1., -1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1., -1.,  1.,  1., -1., -1.,  1.,
        1.,  1.,  1.,  1., -1., -1., -1., -1.,  1., -1.,  1., -1

In [45]:
discriminative_clf.score(X_prime, y_prime)

0.86

In [46]:
y_prime = y_prime.reset_index().filter(items=[target_column])

In [47]:
s = pd.DataFrame(discriminative_clf.predict(X_prime)).join(y_prime)

In [48]:
true_positive = s[(s[0] == s[target_column]) & (s[target_column] == 1)].shape[0]
true_negative = s[(s[0] == s[target_column]) & (s[target_column] == -1)].shape[0]
false_positive = s[~(s[0] == s[target_column]) & (s[target_column] == -1)].shape[0]
false_negative = s[~(s[0] == s[target_column]) & (s[target_column] == 1)].shape[0]


In [49]:
print("Accuracy %.4f" % accuracy_score(discriminative_clf.predict(X_prime), y_prime[target_column]))
print("Precision: %.4f" % (true_positive/(true_positive + false_positive)))
print("Recall: %.4f" % (true_positive/(true_positive + false_negative)))
print("F-Measure: %.4f" % (2. / (1./(true_positive/(true_positive + false_positive)) + 1. /(true_positive/(true_positive + false_negative)))) ) 

Accuracy 0.8600
Precision: 0.8571
Recall: 0.8482
F-Measure: 0.8526
