In [72]:
from Corpora import MovieReviewCorpus
from Lexicon import SentimentLexicon
from Statistics import SignTest
from Classifiers import NaiveBayesText, SVMText
from Extensions import SVMDoc2Vec
from nltk.corpus import stopwords

In [147]:
import os, codecs, sys
from nltk.stem.porter import PorterStemmer

class MovieReviewCorpus():
    def __init__(self,stemming,pos):
        """
        initialisation of movie review corpus.

        @param stemming: use porter's stemming?
        @type stemming: boolean

        @param pos: use pos tagging?
        @type pos: boolean
        """
        # raw movie reviews
        self.reviews=[]
        # held-out train/test set
        self.train=[]
        self.test=[]
        # folds for cross-validation
        self.folds={}
        # porter stemmer (only instantiated when stemming is requested)
        self.stemmer=PorterStemmer() if stemming else None
        # part-of-speech tags
        self.pos=pos
        # import movie reviews
        self.get_reviews()

    def get_reviews(self):
        """
        parse the tagged reviews under data/reviews and (re)build the corpus.

        every sub-directory of data/reviews is scanned for .tag files; a
        directory whose path contains "POS" is labelled "POS", any other one
        "NEG". each file becomes one review of the form

            ("POS"/"NEG", [token, ...])                 when self.pos is false
            ("POS"/"NEG", [(token, pos-tag), ...])      when self.pos is true

        tokens are stemmed with self.stemmer when stemming is enabled.

        the review is stored in
          - self.reviews (all reviews),
          - self.test if the file name starts with "cv9", otherwise self.train,
          - self.folds[n], where n is the third character of the file name
            converted to int (a ValueError is raised if it is not a digit).

        all four containers are reset first, so calling this method again
        rebuilds the corpus instead of appending every review a second time.

        @return: the list stored in self.reviews
        @rtype: list of (string, list) tuples
        """
        directory = 'data/reviews/'

        # start from a clean slate: __init__ already calls this method once,
        # and a second call must not duplicate the corpus
        self.reviews = []
        self.train = []
        self.test = []
        self.folds = {}

        # os.listdir returns entries in arbitrary order; sort them so the
        # review order (and anything compared index-by-index) is reproducible
        for item in sorted(os.listdir(directory)):
            item_path = os.path.join(directory, item)
            if not os.path.isdir(item_path):
                continue

            # item_path is now e.g. data/reviews/POS or data/reviews/NEG
            label = "POS" if "POS" in item_path else "NEG"

            for filename in sorted(os.listdir(item_path)):
                if not filename.endswith(".tag"):
                    continue

                tokens = self._read_tag_file(os.path.join(item_path, filename))
                review = (label, tokens)
                self.reviews.append(review)

                # held-out split
                if filename.startswith('cv9'):
                    self.test.append(review)
                else:
                    self.train.append(review)

                # cross-validation fold: the character right after "cv"
                fold_num = int(filename[2])
                self.folds.setdefault(fold_num, []).append(review)

        return self.reviews

    def _read_tag_file(self, file_path):
        """
        read one .tag file and return its tokens.

        only lines that split into exactly two whitespace-separated fields
        (token, pos-tag) are used; any other line (e.g. a blank line) is
        skipped rather than re-using the token of the previous line.

        @param file_path: path of the .tag file
        @type file_path: string

        @return: tokens, or (token, pos-tag) tuples when self.pos is true
        @rtype: list
        """
        review_tokens = []
        with open(file_path, 'r', encoding='utf-8') as tag_file:
            for line in tag_file:
                parts = line.split()
                if len(parts) != 2:
                    continue
                token, pos_tag = parts

                # stem the token if stemming is enabled
                if self.stemmer:
                    token = self.stemmer.stem(token)

                review_tokens.append((token, pos_tag) if self.pos else token)
        return review_tokens



    
"""
Notes: 
- PorterStemmer 
    - stemmers remove morphological affixes from words, leaving only the word stem (running -> run)
- POS (part-of-speech) tags - represent the grammatical category of the word they are attached to.

"""


# --------------------------------------------------------------------------------------------------------------------- #

from Analysis import Evaluation
from Analysis import Evaluation

class SentimentLexicon(Evaluation):
    # contribution of one lexicon hit when magnitude information is used,
    # keyed by the subjectivity strength stored in the lexicon
    _MAGNITUDE_WEIGHTS = {'strongsubj': 4, 'weaksubj': 0.5}

    def __init__(self):
        """
        read in lexicon database and store in self.lexicon
        """
        # if multiple entries take last entry by default
        self.lexicon = self.get_lexicon_dict()

    def get_lexicon_dict(self):
        """
        parse data/sent_lexicon into a dictionary.

        every line is split on whitespace into "key=value" fields; the value
        of field 2 is the word, of field 0 the subjectivity strength and of
        field 5 the polarity. lines with fewer than six fields (e.g. blank
        lines) are skipped. when a word occurs more than once the last
        entry wins.

        @return: mapping word -> [magnitude, polarity],
                 e.g. "abase": ["strongsubj", "negative"]
        @rtype: dictionary
        """
        lexicon_dict = {}
        with open('data/sent_lexicon', 'r') as f:
            for line in f:
                # split once and reuse the fields
                fields = line.split()
                if len(fields) < 6:
                    continue
                word = fields[2].split("=")[1]
                polarity = fields[5].split("=")[1]
                magnitude = fields[0].split("=")[1]
                lexicon_dict[word] = [magnitude, polarity]
        return lexicon_dict

    def classify(self,reviews,threshold,magnitude):
        """
        classify movie reviews using self.lexicon.
        self.lexicon is a dictionary of word: [magnitude_info, polarity_info], e.g. "abase": ["strongsubj","negative"].
        explore data/sent_lexicon to get a better understanding of the sentiment lexicon.
        store the predictions in self.predictions as a list of strings where "+" and "-" are correct/incorrect classifications respectively e.g. ["+","-","+",...]

        each review gets a score: every token found in the lexicon adds its weight when its polarity is
        "positive" and subtracts it when "negative"; other polarities leave the score unchanged.
        the weight is 1 without magnitude; with magnitude it is 4 for "strongsubj", 0.5 for "weaksubj"
        and 1 for any other strength. a review is predicted "POS" when its score is strictly greater
        than threshold, otherwise "NEG".

        @param reviews: movie reviews; the content may hold plain tokens or (token, pos-tag) tuples,
                        in which case only the token is looked up
        @type reviews: list of (string, list) tuples corresponding to (label, content)

        @param threshold: threshold to center decisions on. instead of using 0, there may be a bias in the reviews themselves which could be accounted for.
                          experiment for good threshold values.
        @type threshold: integer

        @param magnitude: use magnitude information from self.lexicon?
        @type magnitude: boolean
        """
        # reset predictions
        self.predictions = []

        for review_ in reviews:
            true_label = review_[0]
            score = 0

            for entry in review_[1]:
                # a pos-tagged corpus stores (token, pos-tag) tuples
                word = entry[0] if isinstance(entry, tuple) else entry

                if word not in self.lexicon:
                    continue
                strength, polarity = self.lexicon[word]

                # unknown strengths fall back to the unweighted count of 1
                # instead of re-using the weight of an earlier word
                weight = self._MAGNITUDE_WEIGHTS.get(strength, 1) if magnitude else 1

                if polarity == 'positive':
                    score += weight
                elif polarity == 'negative':
                    score -= weight

            pred_label = 'POS' if score > threshold else 'NEG'
            self.predictions.append('+' if true_label == pred_label else '-')

    

In [54]:
# the constructor already parses every review file into corpus.reviews
corpus=MovieReviewCorpus(stemming=False,pos=False)
# reuse that list (it is what get_reviews() returns) instead of calling
# get_reviews() a second time, which would re-read every file from disk
test = corpus.reviews

In [50]:
# scratch copy of the lexicon parser: word -> [magnitude, polarity]
lexicon_dict = {}


with open('data/sent_lexicon', 'r') as f:
    for line in f:
        # split once; skip lines that do not carry all six key=value fields
        # (e.g. blank lines) instead of failing with an IndexError
        fields = line.split()
        if len(fields) < 6:
            continue
        word = fields[2].split("=")[1]
        polarity = fields[5].split("=")[1]
        magnitude = fields[0].split("=")[1]
        lexicon_dict[word] = [magnitude, polarity]

In [141]:
# distinct values of each lexicon field, kept in first-seen order
# (dict.fromkeys drops repeats while preserving insertion order)
moreinfo = list(dict.fromkeys(entry[0] for entry in lexicon_dict.values()))
test = list(dict.fromkeys(entry[1] for entry in lexicon_dict.values()))


print(test)

['negative', 'positive', 'neutral', 'both']


In [142]:
# bare expression: the notebook echoes the parsed lexicon as the cell output
lexicon_dict

{'abandoned': ['weaksubj', 'negative'],
 'abandonment': ['weaksubj', 'negative'],
 'abandon': ['weaksubj', 'negative'],
 'abase': ['strongsubj', 'negative'],
 'abasement': ['strongsubj', 'negative'],
 'abash': ['strongsubj', 'negative'],
 'abate': ['weaksubj', 'negative'],
 'abdicate': ['weaksubj', 'negative'],
 'aberration': ['strongsubj', 'negative'],
 'abhor': ['strongsubj', 'negative'],
 'abhorred': ['strongsubj', 'negative'],
 'abhorrence': ['strongsubj', 'negative'],
 'abhorrent': ['strongsubj', 'negative'],
 'abhorrently': ['strongsubj', 'negative'],
 'abhors': ['strongsubj', 'negative'],
 'abidance': ['strongsubj', 'positive'],
 'abide': ['strongsubj', 'positive'],
 'abject': ['strongsubj', 'negative'],
 'abjectly': ['strongsubj', 'negative'],
 'abjure': ['weaksubj', 'negative'],
 'abilities': ['weaksubj', 'positive'],
 'ability': ['weaksubj', 'positive'],
 'able': ['weaksubj', 'positive'],
 'abnormal': ['weaksubj', 'negative'],
 'abolish': ['weaksubj', 'negative'],
 'abominabl

In [148]:
# retrieve corpus (plain tokens: no stemming, no pos tags)
corpus=MovieReviewCorpus(stemming=False,pos=False)

# use sign test for all significance testing
signTest=SignTest()

print("--- classifying reviews using sentiment lexicon  ---")

# read in lexicon
lexicon=SentimentLexicon()

# on average there are more positive than negative words per review (~7.13 more positive than negative per review)
# to take this bias into account will use threshold (roughly the bias itself) to make it harder to classify as positive
# NOTE(review): the same threshold is used for both runs below, although the magnitude run
# weights lexicon hits by 4 / 0.5 instead of 1 - confirm this is intended
threshold=8

# question 0.1
# token-only run: every lexicon hit counts +1 / -1
lexicon.classify(corpus.reviews, threshold, magnitude=False)
# classify() stores a fresh list in lexicon.predictions on every call, so keep a reference to this run
token_preds=lexicon.predictions
# print(token_preds)
print(f"token-only results: {lexicon.getAccuracy():.2f}")

# magnitude run: hits are weighted by their subjectivity strength
lexicon.classify(corpus.reviews,threshold,magnitude=True)
magnitude_preds=lexicon.predictions
print(f"magnitude results:{lexicon.getAccuracy():.2f}")

# question 0.2 is currently disabled (sign test between the two runs above)
# # question 0.2
# p_value=signTest.getSignificance(token_preds,magnitude_preds)
# significance = "significant" if p_value < 0.05 else "not significant"
# print(f"magnitude lexicon results are {significance} with respect to token-only")

--- classifying reviews using sentiment lexicon  ---
token-only results: 0.68
magnitude results:0.69


In [None]:
# question 1.0
print("--- classifying reviews using Naive Bayes on held-out test set ---")
# baseline configuration: smoothing, bigrams, trigrams and closed-class filtering are all switched off
NB=NaiveBayesText(smoothing=False,bigrams=False,trigrams=False,discard_closed_class=False)
# fit on corpus.train and evaluate on corpus.test
# (MovieReviewCorpus puts files whose name starts with "cv9" in test, all others in train)
NB.train(corpus.train)
NB.test(corpus.test)
# store predictions from classifier (compared against the smoothed run in question 2.1)
non_smoothed_preds=NB.predictions
print(f"Accuracy without smoothing: {NB.getAccuracy():.2f}")

In [None]:
# question 2.0
# use smoothing (the only option switched on); NB is rebound to this new classifier
NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False)
NB.train(corpus.train)
NB.test(corpus.test)
smoothed_preds=NB.predictions
# saving this for use later (vocabulary size of the model trained on the unstemmed corpus)
num_non_stemmed_features=len(NB.vocabulary)
print(f"Accuracy using smoothing: {NB.getAccuracy():.2f}")


# question 2.1
# see if smoothing significantly improves results
# non_smoothed_preds comes from the question 1.0 cell, which must have been run first
p_value=signTest.getSignificance(non_smoothed_preds,smoothed_preds)
# p-values below 0.05 are reported as significant
significance = "significant" if p_value < 0.05 else "not significant"
print(f"results using smoothing are {significance} with respect to no smoothing")

In [None]:
# question 3.0
print("--- classifying reviews using 10-fold cross-evaluation ---")
# using previous instantiated object
# NOTE(review): NB is whatever the last executed cell bound to that name - confirm it is the smoothed model
NB.crossValidate(corpus)
# using cross-eval for smoothed predictions from now on
# (this overwrites the held-out smoothed_preds from question 2.0)
smoothed_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.3f}")
# NOTE(review): printed without a format spec, unlike the std. dev. lines in the later cells
print(f"Std. Dev: {NB.getStdDeviation()}")

In [None]:
# question 4.0
print("--- stemming corpus ---")
# retrieve corpus with tokenized text and stemming (using porter)
# this parses all review files again, now passing every token through the stemmer
stemmed_corpus=MovieReviewCorpus(stemming=True,pos=False)
print("--- cross-validating NB using stemming ---")
# NOTE(review): NB is reused from an earlier cell, so its settings depend on which cell ran last
NB.crossValidate(stemmed_corpus)
stemmed_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.3f}")
print(f"Std. Dev: {NB.getStdDeviation():.3f}")

# TODO Q4.1
# see if stemming significantly improves results on smoothed NB
# (not implemented yet: no significance test is run in this cell)

# TODO Q4.2
# (not implemented yet: only the banner below is printed; num_non_stemmed_features
# from question 2.0 holds the unstemmed feature count)
print("--- determining the number of features before/after stemming ---")

In [None]:
# question Q5.0
# cross-validate model using smoothing and bigrams
print("--- cross-validating naive bayes using smoothing and bigrams ---")
NB=NaiveBayesText(smoothing=True,bigrams=True,trigrams=False,discard_closed_class=False)
NB.crossValidate(corpus)
smoothed_and_bigram_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.2f}")
print(f"Std. Dev: {NB.getStdDeviation():.2f}")


# see if bigrams significantly improves results on smoothed NB only
# smoothed_preds holds the cross-validated predictions stored by the question 3.0 cell
p_value=signTest.getSignificance(smoothed_preds,smoothed_and_bigram_preds)
# p-values below 0.05 are reported as significant
# (variable spelled "significance", matching the question 2.1 cell)
significance = "significant" if p_value < 0.05 else "not significant"
print(f"results using smoothing and bigrams are {significance} with respect to smoothing only")


# TODO Q5.1

In [None]:
# TODO Q6 and 6.1
# placeholder cell: only the banner is printed; SVMText is imported in the first cell but not used here yet
print("--- classifying reviews using SVM 10-fold cross-eval ---")

In [None]:
# TODO Q7
# placeholder cell: only the three banners are printed; no corpus is built and no SVM is trained here yet
print("--- adding in POS information to corpus ---")
print("--- training svm on word+pos features ----")
print("--- training svm discarding closed-class words ---")

In [None]:
# question 8.0
# placeholder cell: only the banner is printed; SVMDoc2Vec is imported in the first cell but not used here yet
print("--- using document embeddings ---")