### Import Libraries

In [150]:
import pandas as pd
import numpy as np
import os
import string, re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.stem.porter import PorterStemmer

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import Binarizer
from sklearn.metrics import accuracy_score

### Read in Data

In [227]:
def read_data(filepath, review_type):
    """Read every review in a directory, clean it, and split it into the
    appropriate train and test sets.

    Each file is read as bytes and decoded as UTF-8 (undecodable bytes are
    preserved via 'surrogateescape'). Double hyphens are dropped, underscores
    and single hyphens become spaces, every remaining character in
    string.punctuation is removed, and runs of whitespace are collapsed to a
    single space. Any review whose filename begins with 'cv9' is considered
    to be part of the test set; every other review is part of the training
    set.

    Files are visited in sorted filename order so the returned lists are
    reproducible (os.listdir makes no ordering guarantee). Directory entries
    that are not regular files are skipped.

    Args:
        filepath: path to the directory holding the corpus, with or without
            a trailing path separator
        review_type: type of review included in the data within this filepath;
            used as the label of every review that is read
    Returns:
        Returns four separate lists. The test set corpus, the test set labels,
        the training set corpus, and the training set labels.
    """
    test, test_labels = [], []
    train, train_labels = [], []

    # the translation table is the same for every file, so build it once
    translator = str.maketrans('', '', string.punctuation)

    for filename in sorted(os.listdir(filepath)):
        full_path = os.path.join(filepath, filename)
        # skip sub-directories (e.g. notebook checkpoint folders)
        if not os.path.isfile(full_path):
            continue
        with open(full_path, 'rb') as review:
            txt = review.read().decode('utf8', 'surrogateescape')
        # hyphens and underscores are replaced before punctuation is stripped
        # so that joined words are separated instead of being fused together
        txt = txt.replace("--", "").replace("_", " ").replace("-", " ")
        txt = ' '.join(txt.translate(translator).split())
        if filename.startswith('cv9'):
            test.append(txt)
            test_labels.append(review_type)
        else:
            train.append(txt)
            train_labels.append(review_type)
    return test, test_labels, train, train_labels

In [228]:
# Load the negative reviews, then the positive ones. Each call returns the
# test corpus, test labels, training corpus and training labels for one class.
(neg_test, neg_test_labels,
 neg_train, neg_train_labels) = read_data('review_polarity.v2/txt_sentoken/neg/', 'Negative')
(pos_test, pos_test_labels,
 pos_train, pos_train_labels) = read_data('review_polarity.v2/txt_sentoken/pos/', 'Positive')

# Training set: negative reviews followed by positive reviews. The labels are
# concatenated in the same order so they stay aligned with the corpus.
train = neg_train + pos_train
train_labels = neg_train_labels + pos_train_labels

# Test set: same ordering as the training set.
test = neg_test + pos_test
test_labels = neg_test_labels + pos_test_labels

## Problem 1

In [229]:
def show_most_informative_features(vectorizer, classifier, n=10):
    """This function takes a fitted vectorizer and a fitted classifier and
    prints the top n features for each of the two classes.

    Features are ranked by classifier.feature_count_, i.e. by how much weight
    each feature accumulated in that class during fitting. Features that are
    frequent in both classes therefore rank highly for both; the ranking does
    not measure how well a feature separates the classes.

    Args:
        vectorizer: fitted scikit-learn vectorizer exposing
            get_feature_names_out() (or the legacy get_feature_names())
        classifier: fitted classifier exposing classes_ and feature_count_
            (e.g. MultinomialNB). Row 0 is printed under the negative
            heading and row 1 under the positive heading.
        n: number of features to print per class
    Returns:
        None
    """
    class_labels = classifier.classes_
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back for older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # sort (count, name) pairs so ties on count are broken by feature name
    topn_pos_class = sorted(zip(classifier.feature_count_[1], feature_names), reverse=True)[:n]
    topn_neg_class = sorted(zip(classifier.feature_count_[0], feature_names), reverse=True)[:n]

    print("Important words in positive reviews")
    for coef, feature in topn_pos_class:
        print(class_labels[1], coef, feature)
    print("-----------------------------------------")
    print("Important words in negative reviews")
    for coef, feature in topn_neg_class:
        print(class_labels[0], coef, feature)

In [230]:
def adv_adj_only(corpus):
    """Reduce a movie review to its adjectives and adverbs.

    The review is tokenized with word_tokenize and tagged with pos_tag; only
    tokens tagged JJ, JJR, JJS, RB, RBR or RBS are kept, in their original
    order.

    Args:
        corpus: a preprocessed review
    Returns:
        Returns a space-joined string containing only the kept tokens.
    """
    wanted_tags = ('JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS')
    kept = []
    for token, tag in pos_tag(word_tokenize(corpus)):
        if tag in wanted_tags:
            kept.append(token)
    return ' '.join(kept)

### M1
Unigrams, absence/presence.

In [231]:
# Binary bag of words: each unigram is recorded as present (1) or absent (0)
# in a review, regardless of how often it occurs.
vectorizer = CountVectorizer(binary=True)
train_features = vectorizer.fit_transform(train)

# make naive bayes classifier
nb_clf = MultinomialNB()
nb_clf.fit(train_features, train_labels)

# prep test set using the vocabulary learned from the training set
test_features = vectorizer.transform(test)

# make predictions
predictions = nb_clf.predict(test_features)

# evaluate -- accuracy_score takes (y_true, y_pred)
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

# print the highest-count words for each class
show_most_informative_features(vectorizer, nb_clf, 5)

Accuracy: 0.87
Important words in positive reviews
Positive 900.0 the
Positive 900.0 of
Positive 899.0 to
Positive 899.0 and
Positive 898.0 is
-----------------------------------------
Important words in negative reviews
Negative 899.0 the
Negative 899.0 of
Negative 899.0 and
Negative 898.0 to
Negative 898.0 is


### M2
Unigrams with frequency count.

In [232]:
# Bag of words with raw frequency counts for each unigram.
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train)

# make naive bayes classifier
nb_clf = MultinomialNB()
nb_clf.fit(train_features, train_labels)

# prep test set using the vocabulary learned from the training set
test_features = vectorizer.transform(test)

# make predictions
predictions = nb_clf.predict(test_features)

# evaluate -- accuracy_score takes (y_true, y_pred)
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

# print the highest-count words for each class
show_most_informative_features(vectorizer, nb_clf, 5)

Accuracy: 0.84
Important words in positive reviews
Positive 37122.0 the
Positive 17771.0 and
Positive 16692.0 of
Positive 14798.0 to
Positive 12557.0 is
-----------------------------------------
Important words in negative reviews
Negative 31470.0 the
Negative 14011.0 and
Negative 13857.0 to
Negative 13857.0 of
Negative 9961.0 is


### M3
Unigrams, only adjectives and adverbs.

In [233]:
# Frequency counts computed only over the adjectives and adverbs of each
# review (every review is POS-tagged by adv_adj_only first).
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform([adv_adj_only(doc) for doc in train])

# make naive bayes classifier
nb_clf = MultinomialNB()
nb_clf.fit(train_features, train_labels)

# prep test set with the same filtering and the training vocabulary
test_features = vectorizer.transform([adv_adj_only(doc) for doc in test])

# make predictions
predictions = nb_clf.predict(test_features)

# evaluate -- accuracy_score takes (y_true, y_pred)
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

# print the highest-count words for each class
show_most_informative_features(vectorizer, nb_clf, 5)

Accuracy: 0.85
Important words in positive reviews
Positive 2666.0 not
Positive 1633.0 more
Positive 1334.0 so
Positive 1247.0 most
Positive 1200.0 just
-----------------------------------------
Important words in negative reviews
Negative 2430.0 not
Negative 1549.0 so
Negative 1391.0 just
Negative 1362.0 more
Negative 1222.0 even


### M4
Unigrams, sublinear TF-IDF

In [234]:
# TF-IDF weights with sublinear term-frequency scaling. Terms that appear in
# fewer than 5 reviews or in more than 80% of reviews are dropped, as are
# scikit-learn's built-in English stop words.
vectorizer = TfidfVectorizer(min_df=5, max_df=0.8, stop_words='english', sublinear_tf=True)
train_features = vectorizer.fit_transform(train)

# make naive bayes classifier
nb_clf = MultinomialNB()
nb_clf.fit(train_features, train_labels)

# prep test set using the vocabulary and IDF weights learned from training
test_features = vectorizer.transform(test)

# make predictions
predictions = nb_clf.predict(test_features)

# evaluate -- accuracy_score takes (y_true, y_pred)
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

# print the highest-weighted words for each class
show_most_informative_features(vectorizer, nb_clf, 5)

Accuracy: 0.85
Important words in positive reviews
Positive 21.954856077538892 movie
Positive 18.705007035879404 like
Positive 16.785996727777352 story
Positive 16.780804954144276 life
Positive 16.69821652018229 just
-----------------------------------------
Important words in negative reviews
Negative 26.6824696024802 movie
Negative 21.118291464844972 like
Negative 19.702164811842977 just
Negative 18.611681285269174 bad
Negative 16.958553323083297 good


### M5
Bigrams, absence/presence.

In [235]:
# get absence/presence indicators for bigrams (ngram_range=(2, 2) keeps
# bigrams only, with no unigrams)
vectorizer = CountVectorizer(ngram_range=(2, 2), binary=True)
train_features = vectorizer.fit_transform(train)

# make naive bayes classifier
nb_clf = MultinomialNB()
nb_clf.fit(train_features, train_labels)

# prep test set using the vocabulary learned from the training set
test_features = vectorizer.transform(test)

# make predictions
predictions = nb_clf.predict(test_features)

# evaluate -- accuracy_score takes (y_true, y_pred)
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

# print the highest-count bigrams for each class
show_most_informative_features(vectorizer, nb_clf, 5)

Accuracy: 0.86
Important words in positive reviews
Positive 845.0 of the
Positive 787.0 in the
Positive 657.0 to the
Positive 649.0 the film
Positive 634.0 and the
-----------------------------------------
Important words in negative reviews
Negative 828.0 of the
Negative 796.0 in the
Negative 624.0 to be
Negative 606.0 the film
Negative 577.0 to the


### Analysis

The best performing model is the M1 model (unigram absence/presence), with <b>87% accuracy</b>. The worst performing model is M2 (unigram frequency counts), with <b>84% accuracy</b>. Interestingly enough, both of these models have the same top five most influential words, which are all stop words. Binarizing the counts for the M1 model likely removed some of the negative effects of stop words.

In general, when looking at the most influential words for all models, each model except M4 is dominated by stop words. For this reason, while the accuracy for M4 is not the best (85%), qualitatively this model appears to be the best because its most influential words appear to capture sentiment better than those of the other models. Removing stop words and further tuning the parameters used for creating custom stop words would likely help improve all models.

## Problem 2
Using Porter Stemmer

In [151]:
def my_PorterStemmer(review):
    """Stem every token of a review with the Porter stemmer.

    Args:
        review: a preprocessed review string
    Returns:
        Returns the review with each token produced by word_tokenize replaced
        by its Porter stem, joined back together with single spaces.
    """
    stemmer = PorterStemmer()
    return ' '.join(map(stemmer.stem, word_tokenize(review)))

In [240]:
# build stemmed copies of the train and test corpus; the unstemmed lists are
# left untouched
stem_train = list(map(my_PorterStemmer, train))
stem_test = list(map(my_PorterStemmer, test))

### M1

In [241]:
# Binary bag of words over the stemmed corpus: each stemmed unigram is
# recorded as present (1) or absent (0) in a review.
vectorizer = CountVectorizer(binary=True)
train_features = vectorizer.fit_transform(stem_train)

# make naive bayes classifier
nb_clf = MultinomialNB()
nb_clf.fit(train_features, train_labels)

# prep test set using the vocabulary learned from the training set
test_features = vectorizer.transform(stem_test)

# make predictions
predictions = nb_clf.predict(test_features)

# evaluate -- accuracy_score takes (y_true, y_pred)
accuracy = accuracy_score(test_labels, predictions)
print(accuracy)

# print the highest-count stems for each class
show_most_informative_features(vectorizer, nb_clf, 5)

0.855
Important words in positive reviews
Positive 900.0 the
Positive 900.0 of
Positive 899.0 to
Positive 899.0 and
Positive 898.0 is
-----------------------------------------
Important words in negative reviews
Negative 899.0 the
Negative 899.0 of
Negative 899.0 and
Negative 898.0 to
Negative 898.0 is


### M2

In [242]:
# Bag of words with raw frequency counts over the stemmed corpus.
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(stem_train)

# make naive bayes classifier
nb_clf = MultinomialNB()
nb_clf.fit(train_features, train_labels)

# prep test set using the vocabulary learned from the training set
test_features = vectorizer.transform(stem_test)

# make predictions
predictions = nb_clf.predict(test_features)

# evaluate -- accuracy_score takes (y_true, y_pred)
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

# print the highest-count stems for each class
show_most_informative_features(vectorizer, nb_clf, 5)

Accuracy: 0.84
Important words in positive reviews
Positive 37122.0 the
Positive 17771.0 and
Positive 16696.0 of
Positive 14799.0 to
Positive 12557.0 is
-----------------------------------------
Important words in negative reviews
Negative 31470.0 the
Negative 14011.0 and
Negative 13858.0 of
Negative 13857.0 to
Negative 9961.0 is


### M3

In [243]:
# Frequency counts computed only over the adjectives and adverbs of each
# stemmed review.
# NOTE(review): the POS tagger is run on text that has already been stemmed,
# so stems such as "hi" can be tagged as adjectives/adverbs. Filtering the
# unstemmed review first and stemming the result afterwards is presumably
# what is intended -- confirm before changing, since it alters the results.
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform([adv_adj_only(doc) for doc in stem_train])

# make naive bayes classifier
nb_clf = MultinomialNB()
nb_clf.fit(train_features, train_labels)

# prep test set with the same filtering and the training vocabulary
test_features = vectorizer.transform([adv_adj_only(doc) for doc in stem_test])

# make predictions
predictions = nb_clf.predict(test_features)

# evaluate -- accuracy_score takes (y_true, y_pred)
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

# print the highest-count stems for each class
show_most_informative_features(vectorizer, nb_clf, 5)

Accuracy: 0.835
Important words in positive reviews
Positive 2679.0 not
Positive 1873.0 hi
Positive 1633.0 more
Positive 1304.0 so
Positive 1248.0 most
-----------------------------------------
Important words in negative reviews
Negative 2434.0 not
Negative 1513.0 so
Negative 1391.0 just
Negative 1361.0 more
Negative 1360.0 hi


### M4

In [244]:
# TF-IDF weights with sublinear term-frequency scaling over the stemmed
# corpus. Terms that appear in fewer than 5 reviews or in more than 80% of
# reviews are dropped, as are scikit-learn's built-in English stop words.
# NOTE(review): the stop-word list holds unstemmed words, so stemmed forms
# such as "wa" are presumably not filtered -- confirm whether stop words
# should be removed before stemming instead.
vectorizer = TfidfVectorizer(min_df=5, max_df=0.8, stop_words='english', sublinear_tf=True)
train_features = vectorizer.fit_transform(stem_train)

# make naive bayes classifier
nb_clf = MultinomialNB()
nb_clf.fit(train_features, train_labels)

# prep test set using the vocabulary and IDF weights learned from training
test_features = vectorizer.transform(stem_test)

# make predictions
predictions = nb_clf.predict(test_features)

# evaluate -- accuracy_score takes (y_true, y_pred)
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

# print the highest-weighted stems for each class
show_most_informative_features(vectorizer, nb_clf, 5)

Accuracy: 0.855
Important words in positive reviews
Positive 22.794618033004646 wa
Positive 21.081929361327802 charact
Positive 20.976549597038304 like
Positive 19.910406575685826 make
Positive 19.68015088753407 time
-----------------------------------------
Important words in negative reviews
Negative 24.860276120908456 wa
Negative 23.568359090059325 like
Negative 21.52631859516604 charact
Negative 20.95806019401285 just
Negative 19.897462497896765 make


### M5 

In [245]:
# get absence/presence indicators for bigrams of the stemmed corpus
# (ngram_range=(2, 2) keeps bigrams only, with no unigrams)
vectorizer = CountVectorizer(ngram_range=(2, 2), binary=True)
train_features = vectorizer.fit_transform(stem_train)

# make naive bayes classifier
nb_clf = MultinomialNB()
nb_clf.fit(train_features, train_labels)

# prep test set using the vocabulary learned from the training set
test_features = vectorizer.transform(stem_test)

# make predictions
predictions = nb_clf.predict(test_features)

# evaluate -- accuracy_score takes (y_true, y_pred)
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

# print the highest-count bigrams for each class
show_most_informative_features(vectorizer, nb_clf, 5)

Accuracy: 0.85
Important words in positive reviews
Positive 845.0 of the
Positive 787.0 in the
Positive 684.0 the film
Positive 657.0 to the
Positive 634.0 and the
-----------------------------------------
Important words in negative reviews
Negative 828.0 of the
Negative 796.0 in the
Negative 644.0 the film
Negative 626.0 to be
Negative 577.0 to the


### Analysis
After applying the Porter Stemmer to our corpus, the only model that improved was the M4 model, which improved from 85% accuracy to 85.5% accuracy. All other models either stayed constant or dropped in accuracy. Due to such a minor increase on a non-optimal model, and overall a decrease in accuracy for other models, I would argue that it is not worth doing in this scenario.

## Problem 3

In [180]:
# Load the NRC emotion lexicon as a tab-separated table with one
# (word, affect category, association flag) triple per row.
NRC_emotion = pd.read_csv(
    "NRC_Emotion.txt",
    names=["TargetWord", "AffectCategory", "AssociationFlag"],
    skiprows=22,  # presumably the file's preamble -- confirm against the lexicon file
    sep="\t",
)

In [188]:
# Keep only rows whose word is actually associated with its category
# (AssociationFlag == 1) and whose category is one of the two sentiment
# polarities; all other affect categories are not needed here.
NRC_emotion = NRC_emotion.loc[
    (NRC_emotion.AssociationFlag == 1)
    & NRC_emotion.AffectCategory.isin(['negative', 'positive'])
]

In [194]:
# preview a few rows; a fixed random_state makes the preview reproducible
# under Restart & Run All
NRC_emotion.sample(5, random_state=0)

Unnamed: 0,TargetWord,AffectCategory,AssociationFlag
109843,sarcoma,negative,1
132073,unattractive,negative,1
3,abandon,negative,1
28274,cosmopolitan,positive,1
33613,departure,negative,1


In [211]:
def pos_neg_ratio(review, pos_words, neg_words):
    """This function takes a review and collections of positive and negative
    associated words and returns the ratio of positive words to negative
    words in the review.

    Every token occurrence is counted, so a word that appears three times
    contributes three to its count. A token that is in both collections is
    counted as both positive and negative.

    Args:
        review: a preprocessed review string
        pos_words: an iterable of positive words (list, set, array, ...)
        neg_words: an iterable of negative words (list, set, array, ...)
    Returns:
        Returns a float representing the ratio of positive words in the
        review to negative words in the review. When the review has no
        negative words the positive count itself is returned (as a float),
        which avoids dividing by zero.
    """
    # Sets give constant-time membership tests; checking `word in <array>`
    # for every token would scan the whole lexicon each time.
    pos_set = pos_words if isinstance(pos_words, (set, frozenset)) else set(pos_words)
    neg_set = neg_words if isinstance(neg_words, (set, frozenset)) else set(neg_words)

    pos, neg = 0, 0
    for word in word_tokenize(review):
        if word in pos_set:
            pos += 1
        if word in neg_set:
            neg += 1

    if neg != 0:
        return pos / neg
    # no negative words: fall back to the positive count, kept as a float so
    # the return type is the same on both paths
    return float(pos)

In [210]:
# get the negative and positive words from NRC Lexicon; sets are used so
# that each membership test inside pos_neg_ratio is constant-time instead of
# a scan over a NumPy array
neg = set(NRC_emotion[NRC_emotion.AffectCategory == 'negative'].TargetWord.values)
pos = set(NRC_emotion[NRC_emotion.AffectCategory == 'positive'].TargetWord.values)

# calculate the ratios for train and test set
ratio_train = [pos_neg_ratio(review, pos, neg) for review in train]
ratio_test = [pos_neg_ratio(review, pos, neg) for review in test]

# make naive bayes classifier; the single ratio feature is reshaped into a
# column because scikit-learn expects a 2-D (n_samples, n_features) array
nb_clf = MultinomialNB()
nb_clf.fit(np.array(ratio_train).reshape(-1, 1), train_labels)

# make predictions
predictions = nb_clf.predict(np.array(ratio_test).reshape(-1, 1))

# evaluate -- accuracy_score takes (y_true, y_pred)
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

### Analysis
The NRC Emotion model is the worst performing model that we have seen thus far. This is likely due to the limited vocabulary included in the NRC emotion lexicon and the fact that the vocabulary is not tailored to movie reviews (the topic of our corpus). Additionally, the reviews that we are working with are relatively lengthy and oftentimes describe the movie, not just the person's feelings about the movie. Consequently, some of the adjectives used to describe the movie may confuse the calculated sentiment ratio, resulting in a poor model.

## Problem 4

As discussed throughout this notebook, in order to improve the performance of these models I would begin by removing standard stop words for all model types. Then, I would add to this by removing a list of custom stop words that pertain to this movie reviews corpus and are not helpful when determining sentiment. For example, words such as "characters", "movie", and "plot" are all within the top 100 words used in the training corpus but do not contribute to the sentiment analysis. Tuning the parameters used to find these custom stop words would also be an important step in improving the sentiment analysis.

Aside from customizing stop words, I would like to experiment with bigram and trigram frequencies in the corpus because I believe phrases like "see again" and "not recommend" could be very telling in determining sentiment. Lastly, some kind of normalization is likely important for this kind of analysis. Normalization could be done in a variety of ways, such as term frequency normalization (to make sure that important words receive higher weighting), review length normalization (to make sure that the length of the review does not impact the sentiment rating), and normalizing by reviewer (some reviewers naturally have a more positive vocabulary than others).