In [2]:
# import statements
import numpy as np
import csv
import math
import random

from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB

In [3]:
# helper method to preprocess a certain word. ie change upper to lower case and remove punctuation
def prepocessword(word):
    """Normalise a single token for vocabulary lookup.

    The token is lower-cased and every character outside the ASCII range
    'a'..'z' (code points 97-122) is dropped, so digits, punctuation,
    whitespace and accented letters all disappear.

    Returns the filtered string, which may be empty.
    """
    lowered = word.lower()
    return "".join(ch for ch in lowered if 97 <= ord(ch) <= 122)


In [4]:
# this method takes as input the path to a data set file and creates a vocabulary of the 10000 most frequent
# words. It then stores it in the desired format at the 'outputpath'

def createVocab(inputpath, outputpath, vocabSize=10000):
    """Build a frequency-ranked vocabulary from a data set file.

    Every line of `inputpath` is split on single spaces, each token is
    normalised with `prepocessword`, and the non-empty results are counted.
    The `vocabSize` most frequent words are written to `outputpath`, one per
    line, as ``<word> <rank> <count>`` where rank 0 is the most frequent word.

    Parameters:
        inputpath:  path of the text file to read.
        outputpath: path of the vocabulary file to (over)write.
        vocabSize:  maximum number of words kept (defaults to 10000).
    """
    word_freq = {}
    # `with` guarantees the handle is released even if reading fails
    with open(inputpath, 'r') as file:
        for line in file:
            for word in line.split(' '):
                # prepocessword lower-cases and drops everything that is not a-z,
                # which also removes surrounding whitespace and newlines
                processedWord = prepocessword(word)
                if processedWord != '':
                    word_freq[processedWord] = word_freq.get(processedWord, 0) + 1

    # ascending stable sort followed by a reverse: words with equal counts end up
    # in reverse order of first appearance (kept so existing vocab files stay identical)
    sorted_word_freq = sorted(word_freq.items(), key=lambda x: x[1])
    sorted_word_freq.reverse()
    sorted_word_freq = sorted_word_freq[:vocabSize]

    with open(outputpath, 'w') as file:
        for rank, (word, count) in enumerate(sorted_word_freq):
            file.write(word + " " + str(rank) + " " + str(count) + '\n')

In [5]:
# this method converts a dataset to the binary bag of words representation

def convertToBinaryBagOfWords(pathToWords, pathToDataset, output, vocabSize=10000):
    """Convert a data set to the binary bag-of-words representation.

    Parameters:
        pathToWords:   vocabulary file whose lines start with ``<word> <id>``.
        pathToDataset: text file with one review per line; the token that
                       contains a tab is split into the last word and the
                       class label that follows the tab.
        output:        CSV file to (over)write, one row per review.
        vocabSize:     number of feature columns (defaults to 10000).

    Each output row has `vocabSize` + 1 values: a 1.0 at every column whose
    word id occurs in the review, 0.0 elsewhere, and the class label in the
    last column (0.0 when the line carries no tab-separated label).
    """
    sortedWords = {}
    with open(pathToWords, 'r') as file:
        for line in file:
            temp = line.split(" ")
            if len(temp) > 1:
                # blank / malformed lines are skipped instead of raising IndexError
                sortedWords[temp[0]] = int(temp[1].strip())

    # newline='' is what the csv module requires to avoid blank rows on Windows
    with open(pathToDataset, 'r') as file, open(output, 'w', newline='') as file2:
        writer = csv.writer(file2)
        for review in file:
            vectorRepresentation = np.zeros(vocabSize + 1)
            for word in review.split(' '):
                temp = word.split('\t')
                if len(temp) > 1:
                    # this token holds "<last word>\t<label>": store the label in the
                    # last column and keep only the word part
                    vectorRepresentation[-1] = float(temp[1].strip())
                    word = temp[0].strip()
                index = sortedWords.get(prepocessword(word))
                # words outside the vocabulary are ignored; the range check keeps an
                # unexpected id from overwriting the label column
                if index is not None and 0 <= index < vocabSize:
                    vectorRepresentation[index] = 1
            writer.writerow(vectorRepresentation)
    

# and this method converts the dataset to the frequency bag of words representation
def convertToFrequencyBagOfWords(pathToWords, pathToDataset, output, vocabSize=10000):
    """Convert a data set to the frequency bag-of-words representation.

    Parameters:
        pathToWords:   vocabulary file whose lines start with ``<word> <id>``.
        pathToDataset: text file with one review per line; the token that
                       contains a tab is split into the last word and the
                       class label that follows the tab.
        output:        CSV file to (over)write, one row per review.
        vocabSize:     number of feature columns (defaults to 10000).

    Each output row has `vocabSize` + 1 values: for every vocabulary word the
    number of times it occurs in the review divided by the total number of
    in-vocabulary tokens of that review, followed by the class label in the
    last column (0.0 when the line carries no tab-separated label).
    A review without any in-vocabulary token yields an all-zero feature row
    instead of NaNs.
    """
    sortedWords = {}
    with open(pathToWords, 'r') as file:
        for line in file:
            temp = line.split(" ")
            if len(temp) > 1:
                # blank / malformed lines are skipped instead of raising IndexError
                sortedWords[temp[0]] = int(temp[1].strip())

    # newline='' is what the csv module requires to avoid blank rows on Windows
    with open(pathToDataset, 'r') as file, open(output, 'w', newline='') as file2:
        writer = csv.writer(file2)
        for review in file:
            category = 0
            vectorRepresentation = np.zeros(vocabSize + 1)
            occurence = 0
            for word in review.split(' '):
                temp = word.split('\t')
                if len(temp) > 1:
                    # this token holds "<last word>\t<label>"
                    category = float(temp[1].strip())
                    word = temp[0].strip()
                index = sortedWords.get(prepocessword(word))
                # words outside the vocabulary are ignored
                if index is not None and 0 <= index < vocabSize:
                    vectorRepresentation[index] += 1
                    occurence += 1

            # turn counts into relative frequencies; guard against reviews that
            # contain no known word (dividing by zero would fill the row with NaN)
            if occurence > 0:
                vectorRepresentation /= occurence

            # the label goes in last, after normalisation
            vectorRepresentation[-1] = category
            writer.writerow(vectorRepresentation)

In [6]:
# this method takes a path to the vocabulary, and a path to reviews and changes every review to a sequence
# of ID numbers representing specific words
def reviewToIdFormat(pathToDictionary, pathToReviews, output):
    """Rewrite every review as a sequence of vocabulary ids.

    Parameters:
        pathToDictionary: vocabulary file whose lines start with ``<word> <id>``.
        pathToReviews:    text file with one review per line; the token that
                          contains a tab is split into the last word and the
                          class label that follows the tab.
        output:           text file to (over)write.

    Each output line is the space-separated ids of the review's in-vocabulary
    words (in order of appearance), a tab, and the class label (empty when
    the input line carries no tab-separated label).
    """
    sortedWords = {}
    with open(pathToDictionary, 'r') as file:
        for line in file:
            temp = line.split(" ")
            if len(temp) > 1:
                # blank / malformed lines are skipped instead of raising IndexError
                sortedWords[temp[0]] = temp[1].strip()

    with open(pathToReviews, 'r') as file, open(output, 'w') as file2:
        for review in file:
            ids = []
            category = ""
            for word in review.split(' '):
                temp = word.split('\t')
                if len(temp) > 1:
                    # this token holds "<last word>\t<label>"
                    category = temp[1].strip()
                    word = temp[0].strip()
                word = prepocessword(word)
                # words outside the vocabulary are dropped
                if word in sortedWords:
                    ids.append(str(sortedWords[word]))
            file2.write(' '.join(ids) + '\t' + str(category))
            file2.write('\n')

In [7]:
# this is a helper method to load reviews and classes from a csv file
# used to load the BBoW and FBoW representations
def loadReviewAsVectors(path):
    """Load feature vectors and class labels from a CSV file.

    Every non-empty row is parsed as floats; all values but the last form the
    feature vector and the last value is the class label. Empty rows (such as
    the blank lines a CSV written without ``newline=''`` contains on Windows)
    are skipped.

    Returns:
        (reviews, classes): a list of float lists and a parallel list of
        float labels.

    Raises:
        ValueError: if a field cannot be converted to float.
    """
    reviews = []
    classes = []
    # newline='' lets the csv module handle line endings itself
    with open(path, 'r', newline='') as file:
        for row in csv.reader(file):
            if not row:
                continue
            reviews.append([float(value) for value in row[:-1]])
            classes.append(float(row[-1]))
    return reviews, classes

In [8]:
# Question 2, a) 
# a random classifier and a majority class classifier

def randomClassifier_YELP(pathToTestData):
    """Baseline: predict a uniformly random class in 1..5 for every review.

    Loads the labelled vectors from `pathToTestData`, draws one
    `random.randint(1, 5)` per review and returns the macro-averaged
    F1 score of those guesses against the true labels (cast to int).
    """
    _, labels = loadReviewAsVectors(pathToTestData)
    guessed = []
    actual = []
    for label in labels:
        guessed.append(random.randint(1, 5))
        actual.append(int(label))
    return metrics.f1_score(actual, guessed, average='macro')


def majorityClassifier_YELP(pathToTrainingData, pathToTestData):
    """Baseline: always predict the most frequent training class.

    Counts how often each of the classes 1..5 occurs in the training file,
    prints the winning class and the counts, then predicts that class for
    every review of the test file and returns the macro-averaged F1 score.
    """
    # tally the five classes; class c is stored at position c - 1
    _, trainLabels = loadReviewAsVectors(pathToTrainingData)
    tally = [0] * 5
    for label in trainLabels:
        tally[int(label) - 1] += 1
    majority = tally.index(max(tally)) + 1
    print("the most common class was ")
    print(majority)
    print(tally)

    # predict the majority class for every test review
    _, testLabels = loadReviewAsVectors(pathToTestData)
    truth = list(testLabels)
    guesses = [majority] * len(truth)
    return metrics.f1_score(truth, guesses, average='macro')


In [9]:
# this method is used to train a Bernoulli Naive Bayes classifier
# it takes as input the paths to the training, validation and test data, and a filename used to name the results file
# it fits classifiers with different hyperparameters and selects the one that has the best validation f-measure
# it then calculates and returns the training, validation and test f-measure for that specific hyperparameter

def fitBernoulliNaiveBayes(pathToTraining,  pathToValdidation, pathToTest, filename):
    """Tune `alpha` of a Bernoulli Naive Bayes classifier on the validation set.

    Parameters:
        pathToTraining, pathToValdidation, pathToTest:
            CSV files readable by `loadReviewAsVectors`.
        filename: label used to build the results file name
            ``NaiveBayesFitting_<filename>.csv``.

    For every candidate alpha a classifier is fitted on the training data and
    its macro F1 on the validation data is logged as ``alpha, fmeasure``. The
    alpha with the highest validation score is then refitted and a final row
    ``alpha, train F1, validation F1, test F1`` is appended.

    Returns:
        (bestAlphaValue, training F1, validation F1, test F1)
    """
    # candidate grid: fine steps near 0, coarser steps up to 2
    # NOTE(review): the grid starts at alpha = 0 (no smoothing) - confirm that
    # the installed scikit-learn handles this the way you expect
    alphavalues = np.concatenate((
        np.arange(0, 0.01, 0.001),
        np.arange(0.01, 0.5, 0.05),
        np.arange(0.5, 2, 0.1),
    ))
    training_reviews, training_classes = loadReviewAsVectors(pathToTraining)
    test_reviews, test_classes = loadReviewAsVectors(pathToTest)
    validation_reviews, validation_classes = loadReviewAsVectors(pathToValdidation)

    bestAlphaValue = 0
    bestfmeasure = 0
    # store the results; `with` closes the file even if fitting raises and
    # newline='' is what the csv module requires for writer objects
    with open("NaiveBayesFitting_" + str(filename) + ".csv", 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["alphaValue", "fmeasure"])
        for alpha in alphavalues:
            print(alpha)
            # binarize=None: the features are used as given, without thresholding
            clf = BernoulliNB(alpha=alpha, binarize=None)
            clf.fit(training_reviews, training_classes)
            predictions = clf.predict(validation_reviews)
            fmeasure = metrics.f1_score(validation_classes, predictions, average='macro')
            writer.writerow([alpha, fmeasure])
            if fmeasure > bestfmeasure:
                bestAlphaValue = alpha
                bestfmeasure = fmeasure

        # refit with the best alpha and score it on all three splits
        clf = BernoulliNB(alpha=bestAlphaValue, binarize=None)
        clf.fit(training_reviews, training_classes)
        predictions = clf.predict(test_reviews)
        fmeasure3 = metrics.f1_score(test_classes, predictions, average='macro')

        predictions = clf.predict(training_reviews)
        fmeasure = metrics.f1_score(training_classes, predictions, average='macro')

        predictions = clf.predict(validation_reviews)
        fmeasure2 = metrics.f1_score(validation_classes, predictions, average='macro')
        writer.writerow([bestAlphaValue, fmeasure, fmeasure2, fmeasure3])

    return bestAlphaValue, fmeasure, fmeasure2, fmeasure3

In [10]:
# this is similar method as above, but here we are training Gaussian Naive Bayes
# and the hyper parameter to tune is now the 'smoothing' parameter
def fitGaussianNaiveBayes(pathToTraining,  pathToValdidation, pathToTest, filename):
    """Tune `var_smoothing` of a Gaussian Naive Bayes classifier on the validation set.

    Parameters:
        pathToTraining, pathToValdidation, pathToTest:
            CSV files readable by `loadReviewAsVectors`.
        filename: label used to build the results file name
            ``NaiveBayesFitting_<filename>.csv``.

    Starting at 1e-10 the smoothing value is multiplied by 10 while it stays
    below 1e-1. Each candidate is fitted on the training data and its macro F1
    on the validation data is printed and logged. The candidate with the
    highest validation score is refitted and a final row
    ``smoothing, train F1, validation F1, test F1`` is appended.

    Returns:
        (bestsmoothingvalue, training F1, validation F1, test F1)
    """
    smoothing = math.pow(10,-10)
    training_reviews, training_classes = loadReviewAsVectors(pathToTraining)
    test_reviews, test_classes = loadReviewAsVectors(pathToTest)
    validation_reviews, validation_classes = loadReviewAsVectors(pathToValdidation)

    bestsmoothingvalue = 0
    bestfmeasure = 0
    # store the results; `with` closes the file even if fitting raises and
    # newline='' is what the csv module requires for writer objects
    with open("NaiveBayesFitting_" + str(filename) + ".csv", 'w', newline='') as file:
        writer = csv.writer(file)
        # NOTE(review): the first column holds the var_smoothing value; the header
        # text is kept as-is so existing result files keep the same layout
        writer.writerow(["alphaValue", "fmeasure"])
        while smoothing < math.pow(10,-1):
            print(smoothing)
            clf = GaussianNB(var_smoothing=smoothing)
            clf.fit(training_reviews, training_classes)
            predictions = clf.predict(validation_reviews)
            fmeasure = metrics.f1_score(validation_classes, predictions, average='macro')
            print(fmeasure)
            writer.writerow([smoothing, fmeasure])
            if fmeasure > bestfmeasure:
                bestsmoothingvalue = smoothing
                bestfmeasure = fmeasure
            smoothing = smoothing * 10

        # refit with the best smoothing value and score it on all three splits
        clf = GaussianNB(var_smoothing=bestsmoothingvalue)
        clf.fit(training_reviews, training_classes)
        predictions = clf.predict(test_reviews)
        fmeasure3 = metrics.f1_score(test_classes, predictions, average='macro')

        predictions = clf.predict(training_reviews)
        fmeasure1 = metrics.f1_score(training_classes, predictions, average='macro')

        predictions = clf.predict(validation_reviews)
        fmeasure2 = metrics.f1_score(validation_classes, predictions, average='macro')
        # the summary row now starts with the selected value, matching the return
        # value and the summary rows of the other fit* functions
        writer.writerow([bestsmoothingvalue, fmeasure1, fmeasure2, fmeasure3])

    return bestsmoothingvalue, fmeasure1, fmeasure2, fmeasure3

In [11]:
# this method fits a decision tree classifier to the data provided as parameter
# it tunes different hyper parameters: Criterion, Splitter, max_depth and min_sample_leaf

def fitDecisionTree(pathToTraining,  pathToValdidation, pathToTest, filename):
    """Grid-search a decision tree classifier on the validation set.

    Parameters:
        pathToTraining, pathToValdidation, pathToTest:
            CSV files readable by `loadReviewAsVectors`.
        filename: label used to build the results file name
            ``DecisionTreeFitting_<filename>.csv``.

    Every combination of max depth, min samples per leaf, criterion and
    splitter is fitted on the training data; its macro F1 on the validation
    data is printed and logged. The best combination is refitted, then its
    row and a row ``train F1, validation F1, test F1`` are appended.

    Returns:
        (training F1, validation F1, test F1) of the refitted best tree.
    """
    training_reviews, training_classes = loadReviewAsVectors(pathToTraining)
    test_reviews, test_classes = loadReviewAsVectors(pathToTest)
    validation_reviews, validation_classes = loadReviewAsVectors(pathToValdidation)

    criterions = ["gini", "entropy"]
    splitters = ["best", "random"]
    # a depth of 0 in the grid stands for "no depth limit" (max_depth=None)
    max_depths = np.arange(0,20, 5)
    max_depths = np.concatenate((max_depths, np.arange(21, 45,2)))
    min_sample_leaves = np.arange(1, 5, 1)
    bestHyperParameters = None
    bestFmeasure = 0

    # `with` closes the file even if fitting raises and newline='' is what the
    # csv module requires for writer objects
    with open("DecisionTreeFitting_" + str(filename) + ".csv", 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["criterion", "splitter", "maxdepth", "minSampleLeaf", "fmeasure"])
        for maxdepth in max_depths:
            # translate the sentinel once, without overwriting the loop variable
            depth = None if maxdepth == 0 else maxdepth
            for minsampleleaf in min_sample_leaves:
                for criterion in criterions:
                    for splitter in splitters:
                        hyperparameters = [criterion, splitter, depth, minsampleleaf]
                        print(hyperparameters)
                        clf = DecisionTreeClassifier(criterion=criterion, splitter=splitter,
                                                     max_depth=depth,
                                                     min_samples_leaf=minsampleleaf)
                        clf.fit(training_reviews, training_classes)
                        prediction = clf.predict(validation_reviews)
                        fmeasure = metrics.f1_score(validation_classes, prediction, average='macro')
                        print(fmeasure)
                        hyperparameters.append(fmeasure)
                        writer.writerow(hyperparameters)
                        # the first candidate is always accepted so a best setting
                        # exists even if every validation score is 0
                        if bestHyperParameters is None or fmeasure > bestFmeasure:
                            bestFmeasure = fmeasure
                            bestHyperParameters = hyperparameters

        # refit the best setting and score it on all three splits
        # NOTE(review): no random_state is set, so the refitted tree may differ
        # from the one scored during the search
        clf = DecisionTreeClassifier(criterion=bestHyperParameters[0], splitter=bestHyperParameters[1],
                                     max_depth=bestHyperParameters[2],
                                     min_samples_leaf=bestHyperParameters[3])
        clf.fit(training_reviews, training_classes)
        predictions = clf.predict(test_reviews)
        fmeasure3 = metrics.f1_score(test_classes, predictions, average='macro')

        predictions = clf.predict(training_reviews)
        fmeasure1 = metrics.f1_score(training_classes, predictions, average='macro')

        predictions = clf.predict(validation_reviews)
        fmeasure2 = metrics.f1_score(validation_classes, predictions, average='macro')
        writer.writerow(bestHyperParameters)
        writer.writerow([fmeasure1, fmeasure2, fmeasure3])

    return fmeasure1, fmeasure2, fmeasure3

In [12]:
# this method fits a linear kernel SVM to the data provided
# it tunes the C 'penalty' parameter

def fitLinearSVM(pathToTraining,  pathToValdidation, pathToTest, filename):
    """Tune the penalty parameter C of a linear SVM on the validation set.

    Parameters:
        pathToTraining, pathToValdidation, pathToTest:
            CSV files readable by `loadReviewAsVectors`.
        filename: label used to build the results file name
            ``LinearSVMFitting_<filename>.csv``.

    For every candidate C a `LinearSVC` is fitted on the training data and its
    macro F1 on the validation data is printed and logged as ``C, fmeasure``.
    The C with the highest validation score is refitted and a final row
    ``C, train F1, validation F1, test F1`` is appended.

    Returns:
        (training F1, validation F1, test F1) of the refitted best model.
    """
    training_reviews, training_classes = loadReviewAsVectors(pathToTraining)
    test_reviews, test_classes = loadReviewAsVectors(pathToTest)
    validation_reviews, validation_classes = loadReviewAsVectors(pathToValdidation)

    Cs = [0.001, 0.01, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 10, 50, 100]
    bestfmeasure = 0
    besthyperparameter = 0
    # store the results; `with` closes the file even if fitting raises and
    # newline='' is what the csv module requires for writer objects
    with open("LinearSVMFitting_" + str(filename) + ".csv", 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["C", "validationFmeasure"])
        for c in Cs:
            print(c)
            clf = LinearSVC(C=c)
            clf.fit(training_reviews, training_classes)
            predictions = clf.predict(validation_reviews)
            fmeasure = metrics.f1_score(validation_classes, predictions, average='macro')
            print(fmeasure)
            if fmeasure > bestfmeasure:
                bestfmeasure = fmeasure
                besthyperparameter = c
            # log the C that produced this score (not the best C seen so far)
            writer.writerow([c, fmeasure])

        # refit with the best C and score it on all three splits
        clf = LinearSVC(C=besthyperparameter)
        clf.fit(training_reviews, training_classes)
        predictions = clf.predict(training_reviews)
        fmeasure1 = metrics.f1_score(training_classes, predictions, average='macro')

        predictions = clf.predict(validation_reviews)
        fmeasure2 = metrics.f1_score(validation_classes, predictions, average='macro')

        predictions = clf.predict(test_reviews)
        fmeasure3 = metrics.f1_score(test_classes, predictions, average='macro')

        writer.writerow([besthyperparameter,fmeasure1, fmeasure2, fmeasure3])

    return fmeasure1, fmeasure2, fmeasure3

In [13]:
# to generate the vocab files run this cell

# The vocabulary files are written under the names that the conversion cells
# below read ("IMDB-vocab.txt" / "YELP-vocab.txt"), and the YELP training file
# uses the same lower-case spelling as every other cell in this notebook.
createVocab("./hwk3_datasets/IMDB-train.txt", "IMDB-vocab.txt")
createVocab("./hwk3_datasets/yelp-train.txt", "YELP-vocab.txt")

In [None]:
# to generate the files containing IDs run this cell 

#IMDB
reviewToIdFormat("IMDB-vocab.txt", "./hwk3_datasets/IMDB-train.txt", "IMDB_train.txt")
reviewToIdFormat("IMDB-vocab.txt", "./hwk3_datasets/IMDB-valid.txt", "IMDB_valid.txt")
reviewToIdFormat("IMDB-vocab.txt", "./hwk3_datasets/IMDB-test.txt", "IMDB_test.txt")

#YELP
# this cell produces the ID-sequence files, so YELP goes through reviewToIdFormat
# as well (the bag-of-words conversion belongs to the cells below)
reviewToIdFormat("YELP-vocab.txt", "./hwk3_datasets/yelp-train.txt", "YELP_train.txt")
reviewToIdFormat("YELP-vocab.txt", "./hwk3_datasets/yelp-valid.txt", "YELP_valid.txt")
reviewToIdFormat("YELP-vocab.txt", "./hwk3_datasets/yelp-test.txt", "YELP_test.txt")

In [None]:
# to generate the BBoW data files run this cell
#IMDB
# output names carry the "_BBoW" suffix that the training cell reads, which also
# keeps them distinct from the frequency (FBoW) files
convertToBinaryBagOfWords("IMDB-vocab.txt", "./hwk3_datasets/IMDB-train.txt", "IMDB_train_BBoW.csv")
convertToBinaryBagOfWords("IMDB-vocab.txt", "./hwk3_datasets/IMDB-valid.txt", "IMDB_valid_BBoW.csv")
convertToBinaryBagOfWords("IMDB-vocab.txt", "./hwk3_datasets/IMDB-test.txt", "IMDB_test_BBoW.csv")

#YELP
convertToBinaryBagOfWords("YELP-vocab.txt", "./hwk3_datasets/yelp-train.txt", "YELP_train_BBoW.csv")
convertToBinaryBagOfWords("YELP-vocab.txt", "./hwk3_datasets/yelp-valid.txt", "YELP_valid_BBoW.csv")
convertToBinaryBagOfWords("YELP-vocab.txt", "./hwk3_datasets/yelp-test.txt", "YELP_test_BBoW.csv")

In [None]:
# to generate the FBoW data files run this cell

#IMDB
# output names carry the "_FBoW" suffix that the training cell reads, which also
# keeps them distinct from the binary (BBoW) files
convertToFrequencyBagOfWords("IMDB-vocab.txt", "./hwk3_datasets/IMDB-train.txt", "IMDB_train_FBoW.csv")
convertToFrequencyBagOfWords("IMDB-vocab.txt", "./hwk3_datasets/IMDB-valid.txt", "IMDB_valid_FBoW.csv")
convertToFrequencyBagOfWords("IMDB-vocab.txt", "./hwk3_datasets/IMDB-test.txt", "IMDB_test_FBoW.csv")

#YELP
convertToFrequencyBagOfWords("YELP-vocab.txt", "./hwk3_datasets/yelp-train.txt", "YELP_train_FBoW.csv")
convertToFrequencyBagOfWords("YELP-vocab.txt", "./hwk3_datasets/yelp-valid.txt", "YELP_valid_FBoW.csv")
convertToFrequencyBagOfWords("YELP-vocab.txt", "./hwk3_datasets/yelp-test.txt", "YELP_test_FBoW.csv")

In [None]:
# to train the classifiers run this cell 

#Naive Bayes:
# Bernoulli NB is run on the binary (BBoW) files and Gaussian NB on the frequency
# (FBoW) files. Both write to "NaiveBayesFitting_<label>.csv"; the labels passed
# here differ, so the result files do not overwrite each other.
# The tuples returned by these calls are not stored in a variable.
fitBernoulliNaiveBayes("YELP_train_BBoW.csv", "YELP_valid_BBoW.csv", "YELP_test_BBoW.csv", "YELP_BBoW")
fitGaussianNaiveBayes("YELP_train_FBoW.csv", "YELP_valid_FBoW.csv", "YELP_test_FBoW.csv", "YELP_FBoW")
fitBernoulliNaiveBayes("IMDB_train_BBoW.csv", "IMDB_valid_BBoW.csv", "IMDB_test_BBoW.csv", "IMDB_BBoW")
fitGaussianNaiveBayes("IMDB_train_FBoW.csv", "IMDB_valid_FBoW.csv", "IMDB_test_FBoW.csv", "IMDB_FBoW")


# Decision Tree: both representations of both data sets;
# results go to "DecisionTreeFitting_<label>.csv"
fitDecisionTree("YELP_train_BBoW.csv", "YELP_valid_BBoW.csv", "YELP_test_BBoW.csv", "YELP_BBoW")
fitDecisionTree("YELP_train_FBoW.csv", "YELP_valid_FBoW.csv", "YELP_test_FBoW.csv", "YELP_FBoW")
fitDecisionTree("IMDB_train_BBoW.csv", "IMDB_valid_BBoW.csv", "IMDB_test_BBoW.csv", "IMDB_BBoW")

fitDecisionTree("IMDB_train_FBoW.csv", "IMDB_valid_FBoW.csv", "IMDB_test_FBoW.csv", "IMDB_FBoW")

# Linear SVM: both representations of both data sets;
# results go to "LinearSVMFitting_<label>.csv"

fitLinearSVM("YELP_train_BBoW.csv", "YELP_valid_BBoW.csv", "YELP_test_BBoW.csv", "YELP_BBoW")
fitLinearSVM("IMDB_train_BBoW.csv", "IMDB_valid_BBoW.csv", "IMDB_test_BBoW.csv", "IMDB_BBoW")
fitLinearSVM("YELP_train_FBoW.csv", "YELP_valid_FBoW.csv", "YELP_test_FBoW.csv", "YELP_FBoW")
fitLinearSVM("IMDB_train_FBoW.csv", "IMDB_valid_FBoW.csv", "IMDB_test_FBoW.csv", "IMDB_FBoW")