In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from random import sample
from collections import Counter
import string
from sklearn.dummy import DummyClassifier
from numpy.random import random_integers
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB
from sklearn import tree
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

In [3]:
def readTxt(fileName, baseDir=None):
    """Read a tab-separated, header-less text file into a DataFrame.

    Parameters
    ----------
    fileName : str
        File name *including its leading path separator*; it is appended to
        the directory by plain string concatenation, not joined.
    baseDir : str, optional
        Directory to read from. When omitted, the notebook's original
        hard-coded ``Datasets`` folder is used, so existing calls behave
        exactly as before. Pass another directory to run the notebook on a
        different machine.

    Returns
    -------
    pandas.DataFrame
        The file contents; columns are labelled 0, 1, ... because the file is
        read with ``header=None``.
    """
    if baseDir is None:
        # Original location, kept as the default for backward compatibility.
        baseDir = r'C:\Users\Owner\McGill\4thYear\COMP551\Assignments\Assignment3\Datasets'

    fullFileName = baseDir + fileName
    return pd.read_csv(fullFileName, encoding='utf-8', header=None, sep='\t')

In [4]:
def readCsv(fileName, baseDir=None):
    """Read a comma-separated, header-less file into a DataFrame.

    Parameters
    ----------
    fileName : str
        File name *including its leading path separator*; it is appended to
        the directory by plain string concatenation, not joined.
    baseDir : str, optional
        Directory to read from. When omitted, the notebook's original
        hard-coded assignment folder is used, so existing calls behave
        exactly as before.

    Returns
    -------
    pandas.DataFrame
        The file contents; columns are labelled 0, 1, ... because the file is
        read with ``header=None``.
    """
    if baseDir is None:
        # Original location, kept as the default for backward compatibility.
        baseDir = r'C:\Users\Owner\McGill\4thYear\COMP551\Assignments\Assignment3'

    fullFileName = baseDir + fileName
    return pd.read_csv(fullFileName, encoding='utf-8', header=None, sep=',')

In [5]:
def toCsvDf(fileName, df, baseDir=None):
    """Write a DataFrame as a tab-separated file without header or index.

    Parameters
    ----------
    fileName : str
        File name *including its leading path separator*; it is appended to
        the directory by plain string concatenation, not joined.
    df : pandas.DataFrame
        Frame to write.
    baseDir : str, optional
        Directory to write into. When omitted, the notebook's original
        hard-coded ``Datasets`` folder is used, so existing calls behave
        exactly as before.
    """
    if baseDir is None:
        # Original location, kept as the default for backward compatibility.
        baseDir = r'C:\Users\Owner\McGill\4thYear\COMP551\Assignments\Assignment3\Datasets'

    fullFileName = baseDir + fileName
    df.to_csv(fullFileName, header=False, sep='\t', index=False)

In [6]:
def toCsvNp(fileName, npArray, baseDir=None):
    """Write a NumPy array to a comma-delimited text file.

    Parameters
    ----------
    fileName : str
        File name *including its leading path separator*; it is appended to
        the directory by plain string concatenation, not joined.
    npArray : array_like
        Data handed to ``np.savetxt``.
    baseDir : str, optional
        Directory to write into. When omitted, the notebook's original
        hard-coded ``Datasets`` folder is used, so existing calls behave
        exactly as before.
    """
    if baseDir is None:
        # Original location, kept as the default for backward compatibility.
        baseDir = r'C:\Users\Owner\McGill\4thYear\COMP551\Assignments\Assignment3\Datasets'

    fullFileName = baseDir + fileName
    np.savetxt(fullFileName, npArray, delimiter=',')

In [7]:
def createDictionary(trainingSet):
    """Build the word -> column-index vocabulary from a training set.

    The review text (column 0 of the training file) is lower-cased, stripped
    of punctuation and split on single spaces; the (up to) 10000 most
    frequent tokens are numbered 0, 1, 2, ... in order of decreasing
    frequency.

    Parameters
    ----------
    trainingSet : str
        Either ``'IMDB'`` or ``'yelp'``; selects which training file is
        loaded through ``readTxt``.

    Returns
    -------
    dict
        Maps each vocabulary word to its index. It holds fewer than 10000
        entries when the corpus has fewer distinct tokens.

    Raises
    ------
    ValueError
        If ``trainingSet`` is not one of the two supported names.
    """
    if trainingSet == 'IMDB':
        trainingDF = readTxt('\\IMDB-train.txt')
    elif trainingSet == 'yelp':
        trainingDF = readTxt('\\yelp-train.txt')
    else:
        raise ValueError("trainingSet must be 'IMDB' or 'yelp', got %r" % (trainingSet,))

    # Every punctuation character except the apostrophe is replaced by a
    # space; the apostrophe itself is deleted (third maketrans argument), so
    # "don't" becomes "dont".
    punctuation = string.punctuation.replace('\'', '')
    translator = str.maketrans(punctuation, len(punctuation) * ' ', '\'')

    # Count per review instead of first materialising one list of all tokens.
    cnt = Counter()
    for review in trainingDF.iloc[:, 0]:
        cnt.update(review.translate(translator).lower().split(" "))

    # Splitting on a single space produces '' for every run of consecutive
    # spaces. Drop that pseudo-token explicitly rather than assuming it is
    # the single most frequent entry and slicing it off the ranking.
    cnt.pop('', None)

    return {word: index for index, (word, _) in enumerate(cnt.most_common(10000))}

In [8]:
def convertWordsToVector(trainingRow, dictionary, BOWType):
    """Turn one review into a bag-of-words feature vector.

    Parameters
    ----------
    trainingRow : str
        Raw review text.
    dictionary : dict
        Maps vocabulary words to column indices; the indices are expected to
        be ``0 .. len(dictionary) - 1``.
    BOWType : str
        ``'BagOfWords'`` for a binary presence vector, ``'Frequency'`` for
        word counts normalised so that they sum to 1.

    Returns
    -------
    numpy.ndarray
        Vector of length ``len(dictionary)``: ``int8`` for ``'BagOfWords'``,
        ``float64`` for ``'Frequency'``.

    Raises
    ------
    ValueError
        If ``BOWType`` is not one of the two supported values.
    """
    if BOWType not in ('BagOfWords', 'Frequency'):
        raise ValueError("BOWType must be 'BagOfWords' or 'Frequency', got %r" % (BOWType,))

    # Punctuation (except the apostrophe) becomes a space; the apostrophe is deleted.
    punctuation = string.punctuation.replace('\'', '')
    translator = str.maketrans(punctuation, len(punctuation) * ' ', '\'')
    words = trainingRow.translate(translator).lower().split(" ")

    # Size the vector from the vocabulary so it always matches the matrix
    # row it is written into, even for vocabularies smaller than 10000.
    size = len(dictionary)

    if BOWType == 'BagOfWords':
        vector = np.zeros(size, dtype=np.int8)
        for word in words:
            if word in dictionary:
                vector[dictionary[word]] = 1
        return vector

    # Count in int64: an int8 counter wraps around once a word occurs more
    # than 127 times in a single review.
    counts = np.zeros(size, dtype=np.int64)
    for word in words:
        if word in dictionary:
            counts[dictionary[word]] += 1

    # A review with no in-vocabulary word has a total of zero; return zeros
    # instead of dividing by zero and producing NaNs.
    total = counts.sum()
    if total > 0:
        return counts / total
    return np.zeros(size)

In [9]:
def createBagOfWordsMatrix(df, dictionary, BOWType):
    """Build the feature matrix for a whole data set.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set whose first column (position 0) holds the review text.
    dictionary : dict
        Vocabulary mapping words to column indices.
    BOWType : str
        ``'BagOfWords'`` (binary ``int8`` matrix) or ``'Frequency'``
        (``float64`` matrix); forwarded to ``convertWordsToVector``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), len(dictionary))`` with one row per review.

    Raises
    ------
    ValueError
        If ``BOWType`` is not one of the two supported values.
    """
    shape = (len(df), len(dictionary))
    if BOWType == 'BagOfWords':
        matrix = np.zeros(shape, dtype=np.int8)
    elif BOWType == 'Frequency':
        matrix = np.zeros(shape)
    else:
        raise ValueError("BOWType must be 'BagOfWords' or 'Frequency', got %r" % (BOWType,))

    for rowIndex, text in enumerate(df.iloc[:, 0]):
        matrix[rowIndex] = convertWordsToVector(text, dictionary, BOWType)

    return matrix

# print(createBagOfWordsMatrix(readTxt('\yelp-train.txt'), createDictionary('yelp'), 'Frequency'))

In [10]:
def createReviewsTxt(df, dictionary, outputFile='\\submit\\yelp-valid.txt'):
    """Encode every review as a string of vocabulary indices and save it.

    Each review (column 0) is tokenised the same way as when the vocabulary
    is built; every in-vocabulary word is replaced by its index followed by
    a single space (so non-empty rows end with a trailing space), and
    out-of-vocabulary words are dropped. The encoded reviews are paired with
    the targets (column 1) and written through ``toCsvDf``.

    Parameters
    ----------
    df : pandas.DataFrame
        Column 0 holds the review text, column 1 the target.
    dictionary : dict
        Vocabulary mapping words to indices.
    outputFile : str, optional
        File name (with leading path separator) handed to ``toCsvDf``.
        Defaults to the yelp validation submission file that was previously
        hard-coded, so existing calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        Two columns: encoded review string and target.
    """
    # Punctuation (except the apostrophe) becomes a space; the apostrophe is deleted.
    punctuation = string.punctuation.replace('\'', '')
    translator = str.maketrans(punctuation, len(punctuation) * ' ', '\'')

    reviewsList = []
    for review in df.iloc[:, 0]:
        words = review.translate(translator).lower().split(" ")
        reviewsList.append(
            ''.join(str(dictionary[word]) + ' ' for word in words if word in dictionary)
        )

    outputDF = pd.DataFrame(list(zip(reviewsList, df.iloc[:, 1])))
    toCsvDf(outputFile, outputDF)

    return outputDF

In [13]:
def testScores(predicted, actual):
    f1Score = f1_score(actual, predicted, average='macro')
    confusionMatrix = confusion_matrix(actual, predicted)
    print('F1 Score:', f1Score)
    print('Confusion Matrix:\n', confusionMatrix)
    return f1Score

In [25]:
def bernoulliNB(dataset, BOWType):
    """Fit a Bernoulli naive Bayes classifier and report its scores.

    The vocabulary and the train/validation/test splits of ``dataset`` are
    loaded, the model is fitted on the training split, and the macro F1
    score and confusion matrix are printed for each of the three splits.

    Parameters
    ----------
    dataset : str
        ``'yelp'`` or ``'IMDB'``.
    BOWType : str
        ``'BagOfWords'`` or ``'Frequency'``; forwarded to
        ``createBagOfWordsMatrix``.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the two supported names.
    """
    print(dataset)
    if dataset not in ('yelp', 'IMDB'):
        raise ValueError("dataset must be 'yelp' or 'IMDB', got %r" % (dataset,))

    dictionary = createDictionary(dataset)
    # readTxt concatenates directory and name, hence the leading backslash.
    trainingDF = readTxt('\\' + dataset + '-train.txt')
    validDF = readTxt('\\' + dataset + '-valid.txt')
    testDF = readTxt('\\' + dataset + '-test.txt')

    # NOTE(review): alpha looks like the outcome of a hyper-parameter search
    # that is not part of this notebook section - confirm its provenance.
    # The printed parameters show binarize=0.0, so the features are
    # thresholded at zero before fitting.
    clf = BernoulliNB(alpha = 0.0256419)
    print(clf.get_params())

    # Build the training matrix once and reuse it for fitting and scoring.
    trainingMatrix = createBagOfWordsMatrix(trainingDF, dictionary, BOWType)
    clf.fit(trainingMatrix, trainingDF[1])

    print('Training F1-Measure')
    testScores(clf.predict(trainingMatrix), trainingDF[1])

    for caption, splitDF in (('Validation F1-Measure', validDF), ('Test F1-Measure', testDF)):
        print(caption)
        predictionsArray = clf.predict(createBagOfWordsMatrix(splitDF, dictionary, BOWType))
        testScores(predictionsArray, splitDF[1])

bernoulliNB('IMDB', 'Frequency')

IMDB
{'alpha': 0.0256419, 'binarize': 0.0, 'class_prior': None, 'fit_prior': True}
Training F1-Measure
F1 Score: 0.873838532099
Confusion Matrix:
 [[6666  834]
 [1058 6442]]
Validation F1-Measure
F1 Score: 0.845474285376
Confusion Matrix:
 [[4292  708]
 [ 837 4163]]
Test F1-Measure
F1 Score: 0.834173134176
Confusion Matrix:
 [[10801  1699]
 [ 2443 10057]]


In [26]:
def linearSVC(dataset, BOWType):
    """Fit a linear support-vector classifier and report its scores.

    The vocabulary and the train/validation/test splits of ``dataset`` are
    loaded, the model is fitted on the training split, and the macro F1
    score and confusion matrix are printed for each of the three splits.

    Parameters
    ----------
    dataset : str
        ``'yelp'`` or ``'IMDB'``.
    BOWType : str
        ``'BagOfWords'`` or ``'Frequency'``; forwarded to
        ``createBagOfWordsMatrix``.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the two supported names.
    """
    print(dataset)
    if dataset not in ('yelp', 'IMDB'):
        raise ValueError("dataset must be 'yelp' or 'IMDB', got %r" % (dataset,))

    dictionary = createDictionary(dataset)
    # readTxt concatenates directory and name, hence the leading backslash.
    trainingDF = readTxt('\\' + dataset + '-train.txt')
    validDF = readTxt('\\' + dataset + '-valid.txt')
    testDF = readTxt('\\' + dataset + '-test.txt')

    # NOTE(review): these hyper-parameters look like the outcome of a search
    # that is not part of this notebook section - confirm their provenance.
    clf = LinearSVC(C=100, dual = False, max_iter = 5758, tol = 0.01)
    print(clf.get_params())

    # Build the training matrix once and reuse it for fitting and scoring.
    trainingMatrix = createBagOfWordsMatrix(trainingDF, dictionary, BOWType)
    clf.fit(trainingMatrix, trainingDF[1])

    print('Training F1-Measure')
    testScores(clf.predict(trainingMatrix), trainingDF[1])

    for caption, splitDF in (('Validation F1-Measure', validDF), ('Test F1-Measure', testDF)):
        print(caption)
        predictionsArray = clf.predict(createBagOfWordsMatrix(splitDF, dictionary, BOWType))
        testScores(predictionsArray, splitDF[1])

linearSVC('IMDB', 'Frequency')

IMDB
{'C': 100, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'squared_hinge', 'max_iter': 5758, 'multi_class': 'ovr', 'penalty': 'l2', 'random_state': None, 'tol': 0.01, 'verbose': 0}
Training F1-Measure
F1 Score: 0.948599537396
Confusion Matrix:
 [[7092  408]
 [ 363 7137]]
Validation F1-Measure
F1 Score: 0.882395479282
Confusion Matrix:
 [[4381  619]
 [ 557 4443]]
Test F1-Measure
F1 Score: 0.877359038495
Confusion Matrix:
 [[10932  1568]
 [ 1498 11002]]


In [27]:
def decisionTree(dataset, BOWType):
    """Fit a decision-tree classifier and report its scores.

    The vocabulary and the train/validation/test splits of ``dataset`` are
    loaded, the model is fitted on the training split, and the macro F1
    score and confusion matrix are printed for each of the three splits.

    Parameters
    ----------
    dataset : str
        ``'yelp'`` or ``'IMDB'``.
    BOWType : str
        ``'BagOfWords'`` or ``'Frequency'``; forwarded to
        ``createBagOfWordsMatrix``.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the two supported names.
    """
    print(dataset)
    if dataset not in ('yelp', 'IMDB'):
        raise ValueError("dataset must be 'yelp' or 'IMDB', got %r" % (dataset,))

    dictionary = createDictionary(dataset)
    # readTxt concatenates directory and name, hence the leading backslash.
    trainingDF = readTxt('\\' + dataset + '-train.txt')
    validDF = readTxt('\\' + dataset + '-valid.txt')
    testDF = readTxt('\\' + dataset + '-test.txt')

    # NOTE(review): these hyper-parameters look like the outcome of a search
    # that is not part of this notebook section - confirm their provenance.
    clf = tree.DecisionTreeClassifier(max_depth = 23,
                                      max_leaf_nodes= 351, min_impurity_decrease = 0.00036711903528788959,
                                      min_samples_leaf = 16, min_samples_split = 195)
    print(clf.get_params())

    # Build the training matrix once and reuse it for fitting and scoring.
    trainingMatrix = createBagOfWordsMatrix(trainingDF, dictionary, BOWType)
    clf.fit(trainingMatrix, trainingDF[1])

    print('Training F1-Measure')
    testScores(clf.predict(trainingMatrix), trainingDF[1])

    for caption, splitDF in (('Validation F1-Measure', validDF), ('Test F1-Measure', testDF)):
        print(caption)
        predictionsArray = clf.predict(createBagOfWordsMatrix(splitDF, dictionary, BOWType))
        testScores(predictionsArray, splitDF[1])

decisionTree('IMDB', 'Frequency')

IMDB
{'class_weight': None, 'criterion': 'gini', 'max_depth': 23, 'max_features': None, 'max_leaf_nodes': 351, 'min_impurity_decrease': 0.0003671190352878896, 'min_impurity_split': None, 'min_samples_leaf': 16, 'min_samples_split': 195, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': None, 'splitter': 'best'}
Training F1-Measure
F1 Score: 0.791918478527
Confusion Matrix:
 [[5700 1800]
 [1318 6182]]
Validation F1-Measure
F1 Score: 0.726264209885
Confusion Matrix:
 [[3434 1566]
 [1167 3833]]
Test F1-Measure
F1 Score: 0.740742472922
Confusion Matrix:
 [[8787 3713]
 [2759 9741]]


In [28]:
def randomClassifier(dataset, BOWType, randomState=None):
    """Score a uniformly random baseline classifier.

    The vocabulary and the train/validation/test splits of ``dataset`` are
    loaded, a ``DummyClassifier`` with the ``'uniform'`` strategy is fitted
    on the training split, and the macro F1 score and confusion matrix are
    printed for each of the three splits.

    Parameters
    ----------
    dataset : str
        ``'yelp'`` or ``'IMDB'``.
    BOWType : str
        ``'BagOfWords'`` or ``'Frequency'``; forwarded to
        ``createBagOfWordsMatrix``.
    randomState : int, optional
        Seed forwarded to ``DummyClassifier`` so the baseline can be
        reproduced. The default ``None`` keeps the previous unseeded
        behaviour, where every run prints different numbers.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the two supported names.
    """
    print(dataset, 'Random Classifier')
    if dataset not in ('yelp', 'IMDB'):
        raise ValueError("dataset must be 'yelp' or 'IMDB', got %r" % (dataset,))

    dictionary = createDictionary(dataset)
    # readTxt concatenates directory and name, hence the leading backslash.
    trainingDF = readTxt('\\' + dataset + '-train.txt')
    validDF = readTxt('\\' + dataset + '-valid.txt')
    testDF = readTxt('\\' + dataset + '-test.txt')

    # Named clf (not randomClassifier) so the local does not shadow this function.
    clf = DummyClassifier(strategy='uniform', random_state=randomState)

    # Build the training matrix once and reuse it for fitting and scoring.
    trainingMatrix = createBagOfWordsMatrix(trainingDF, dictionary, BOWType)
    clf.fit(trainingMatrix, trainingDF[1])

    print('Training F1-Measure')
    testScores(clf.predict(trainingMatrix), trainingDF[1])

    for caption, splitDF in (('Validation F1-Measure', validDF), ('Test F1-Measure', testDF)):
        print(caption)
        predictionsArray = clf.predict(createBagOfWordsMatrix(splitDF, dictionary, BOWType))
        testScores(predictionsArray, splitDF[1])

randomClassifier('IMDB', 'BagOfWords')

IMDB Random Classifier
Training F1-Measure
F1 Score: 0.505866666667
Confusion Matrix:
 [[3794 3706]
 [3706 3794]]
Validation F1-Measure
F1 Score: 0.49864043347
Confusion Matrix:
 [[2439 2561]
 [2452 2548]]
Test F1-Measure
F1 Score: 0.500239996802
Confusion Matrix:
 [[6254 6246]
 [6248 6252]]
