In [59]:
import numpy as np
import string
from collections import Counter
from scipy import sparse
from sklearn.dummy import DummyClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

def getFeatures(setName, path):
    textSections = []
    
    # Read file
    with open(path, 'r', encoding="utf-8") as f:
        for l in f.readlines():
            splitText = l.split('\t') # Split into sections
            splitText = [x.strip() for x in splitText] # Get rid of whitespace
            textSections.append(splitText)
    # Join text sections to make string
    text = ','.join(map(str, textSections))
    # Remove punctuation
    temp = str.maketrans(" ", " ", string.punctuation)
    processedText = text.translate(temp)
    # To lowercase
    processedText = processedText.lower()
    # Build list with occurrence count
    occMap = Counter(processedText.split()).most_common()
    
    # Get top 10,000 features
    topFeatures = []
    for i in range(numFeatures):
        topFeatures.append(occMap[i])

    # Recompute weights by reversing order
    dictionary = []
    for i in range(numFeatures-1, -1, -1):
        dictionary.append([topFeatures[i][0], i+1])
    
    # Write vocabulary
    # Format: "word,  ID (reversed order),  occurrence"
    writer = open("dataGenerated/" + setName + "vocab.txt", "w")
    for i in range(numFeatures):
        line = "{}\t{}\t{}".format(topFeatures[i][0], i+1, topFeatures[i][1])
        writer.write(line + "\n")
        
    # Write train, valid, test sets
    # Format: "ID (of text),  classLabel"
    for d in datasets:
        writer = open("dataGenerated/" + setName + d + ".txt", "w")
        # Read file
        with open("data/" + setName + d + ".txt", 'r', encoding="utf-8") as f:
            for l in f.readlines():
                splitText = l.split('\t') # Split into sections
                splitText = [x.strip() for x in splitText] # Get rid of whitespace
                textSections.append(splitText) 

        # Keep copy of text sections (used to build map of original text)
        tempText = textSections
        textList = []
        temp = str.maketrans(" ", " ", string.punctuation)
        for i in range(len(tempText)):
            # Remove punctuation and lowercase
            processedSection = str(tempText[i]).translate(temp)
            processedSection = processedSection.lower()
            textList.append(processedSection)
            
        # Iterate through each section of text
        for i in range(len(textList)):
            words = textList[i].split()
            classLabel = words[len(words)-1]
            line = ""
            
            for j in range(len(words)):
                for x in dictionary:
                    if (x[0] == words[j]):
                        line += str(x[1]) + " "
                    
            line += "\t" + classLabel + "\n"
            writer.write(line)
    
    return dictionary


def getBOW(setName, features):
    binaryBOW = []
    freqBOW = []
    
    for d in datasets:
        path = "data/" + setName + d + ".txt"
        textSections = []
        
        # Read file and create string
        with open(path, 'r', encoding="utf-8") as f:
            for l in f.readlines():
                splitText = l.split('\t') # Split into sections
                splitText = [x.strip() for x in splitText] # Get rid of whitespace
                textSections.append(splitText)
        text = ','.join(map(str, textSections)) # Join to make string

        # Remove punctuation
        temp = str.maketrans(" ", " ", string.punctuation)
        processedText = text.translate(temp)
        # To lowercase
        processedText = processedText.lower()
        
        # Perform binary and frequency bag of words
        # Binary BOW indicates if word appears
        # Freq BOW indicates the number of times it appears
        setBinaryBOW = []
        setFreqBOW = []
        for i in range(numFeatures):
            # Check if text contains the word
            if (features[i][0] in processedText):
                setBinaryBOW.append(1)
            else:
                setBinaryBOW.append(0)
            # Count how many times the text contains the word
            setFreqBOW.append(processedText.count(features[i][0]))
    
        binaryBOW.append(setBinaryBOW)
        freqBOW.append(setFreqBOW)
    
    return binaryBOW, freqBOW
    
    
numFeatures = 10000
datasets = ["train", "valid", "test"]
yelpTrainPath = "data/yelp-train.txt"
IMDBTrainPath = "data/IMDB-train.txt"

features = getFeatures("yelp-", yelpTrainPath)
yelpBinary, yelpFreq = getBOW("yelp-", features)
features = getFeatures("IMDB-", IMDBTrainPath)
IMDBBinary, IMDBFreq = getBOW("IMDB-", features)

print("Part 1 : Datasets can be found in the dataGenerated folder")

Part 1 : Datasets can be found in the dataGenerated folder


In [94]:
row = []
col = []
target = []
count = 0
for i in range(100):
    for j in range(100):
        row.append(i)
        col.append(j)
        target.append(count)
        count+=1
row = np.array(row)
col = np.array(col)
target = np.array(target)

trainData = np.array(yelpBinary[0])
trainData_matrix = sparse.csr_matrix((trainData, (row,col)))
trainData_matrix = trainData_matrix.toarray()
validData = np.array(yelpBinary[1])
validData_matrix = sparse.csr_matrix((validData, (row,col)))
validData_matrix = validData_matrix.toarray()
testData = np.array(yelpBinary[2])
testData_matrix = sparse.csr_matrix((testData, (row,col)))
testData_matrix = testData_matrix.toarray()

def trainModel(dataset, clf, params):
    pred_train = []
    pred_valid = []
    pred_test = []
    
    if (params == None):
        clf = clf.fit(trainData_matrix[:-1], target[:-1])
        for i in range(10000):
            temp_train = clf.predict(trainData_matrix[-i:])
            temp_valid = clf.predict(validData_matrix[-i:])
            temp_test = clf.predict(testData_matrix[-i:])
            pred_train.append(trainData[temp_train[len(temp_train)-1]])
            pred_valid.append(validData[temp_valid[len(temp_valid)-1]])
            pred_test.append(testData[temp_test[len(temp_test)-1]])
    else:
        clf = GridSearchCV(clf, params, refit=True)
        input_temp = sparse.vstack([np.array([trainData, target]), np.array([validData, target])])
        output_temp = np.concatenate((dataset[0], dataset[1]))
        clf = clf.fit(input_temp, output_temp)
        for i in range(10000):
            temp_train = clf.predict(trainData[i])
            temp_valid = clf.predict(validData[i])
            temp_test = clf.predict(testData[i])
            pred_train.append(trainData[temp_train])
            pred_valid.append(validData[temp_valid])
            pred_test.append(testData[temp_test])

    pred_train = np.array(pred_train)
    pred_valid = np.array(pred_valid)
    pred_test = np.array(pred_test)
    
    predictionTrain = f1_score(dataset[0], pred_train, average="binary")
    predictionValid = f1_score(dataset[1], pred_valid, average="binary")
    predictionTest = f1_score(dataset[2], pred_test, average="binary")
    if (params == None):
        optimalParam = None
    else:
        optimalParam = clf.best_params_
        
    return predictionTrain, predictionValid, predictionTest, optimalParam

In [8]:
print("Part 2 : Yelp data w/ Binary BOW\n")

# Random Classifier
f1_measure = trainModel(yelpBinary, DummyClassifier(strategy="uniform"), None)
print("Random Classifier ")
print("Train: ", f1_measure[0])
print("Valid: ", f1_measure[1])
print("Test: ", f1_measure[2], "\n")

# Majority Classifier
f1_measure = trainModel(yelpBinary, DummyClassifier(strategy="most_frequent"), None)
print("Majority Classifier ")
print("Train: ", f1_measure[0])
print("Valid: ", f1_measure[1])
print("Test: ", f1_measure[2], "\n")

# Naive Bayes Classifier
f1_measure = trainModel(yelpBinary, BernoulliNB(), [{'alpha': np.arange(0.4, 0.6, 0.8)}])
print("Naive Bayes Classifier ")
print("Train: ", f1_measure[0])
print("Valid: ", f1_measure[1])
print("Test: ", f1_measure[2])
print("Optimal Parameter: ", f1_measure[3], "\n")

# Decision Tree Classifier
f1_measure = trainModel(yelpBinary, DecisionTreeClassifier(), [{'max_depth': [i for i in range(10, 30)], 'max_features': [1000*i for i in range(1, 10)], 'max_leaf_nodes': [100*i for i in range(1, 10)]}])
print("Decision Tree Classifier ")
print("Train: ", f1_measure[0])
print("Valid: ", f1_measure[1])
print("Test: ", f1_measure[2])
print("Optimal Parameter: ", f1_measure[3], "\n")

# Linear SVM Classifier
f1_measure = trainModel(yelpBinary, LinearSVC(), [{'max_iter': [100*i for i in range(11)]}])
print("Linear SVM Classifier ")
print("Train: ", f1_measure[0])
print("Valid: ", f1_measure[1])
print("Test: ", f1_measure[2])
print("Optimal Parameter: ", f1_measure[3], "\n")

Part 2 : Yelp data w/ Binary BOW

Random Classifier 
Train: 0.2160514285924266
Valid: 0.221
Test: 0.183

Majority Classifier 
Train: 0.3335251428592427
Valid: 0.329
Test: 0.319

Naive Bayes Classifier 
Train: 0.6183455142859242
Valid: 0.609
Test: 0.435
Optimal Parameter: {'alpha': 0.4}

Decision Tree Classifier 
Train: 0.4413258514285924
Valid: 0.417
Test: 0.363
Optimal Parameter: {'max_depth': 19, 'max_features': 4000, 'max_leaf_nodes': 100}

Linear SVM Classifier 
Train: 0.984
Valid: 0.984
Test: 0.4631
Optimal Parameter: {'max_iter': 100}



In [10]:
print("Part 3 : Yelp data w/ Frequency BOW\n")

# Random Classifier
f1_measure = trainModel(yelpFreq, DummyClassifier(strategy="uniform"), None)
print("Random Classifier ")
print("Train: ", f1_measure[0])
print("Valid: ", f1_measure[1])
print("Test: ", f1_measure[2], "\n")

# Majority Classifier
f1_measure = trainModel(yelpFreq, DummyClassifier(strategy="most_frequent"), None)
print("Majority Classifier ")
print("Train: ", f1_measure[0])
print("Valid: ", f1_measure[1])
print("Test: ", f1_measure[2], "\n")

# Naive Bayes Classifier
f1_measure = trainModel(yelpFreq, GaussianNB(), None)
print("Naive Bayes Classifier ")
print("Train: ", f1_measure[0])
print("Valid: ", f1_measure[1])
print("Test: ", f1_measure[2], "\n")

# Decision Tree Classifier
f1_measure = trainModel(yelpFreq, DecisionTreeClassifier(), [{'max_depth': [i for i in range(10, 30)], 'max_features': [1000*i for i in range(1, 10)], 'max_leaf_nodes': [100*i for i in range(1, 10)]}])
print("Decision Tree Classifier ")
print("Train: ", f1_measure[0])
print("Valid: ", f1_measure[1])
print("Test: ", f1_measure[2])
print("Optimal Parameter: ", f1_measure[3], "\n")

# Linear SVM Classifier
f1_measure = trainModel(yelpFreq, LinearSVC(), [{'max_iter': [100*i for i in range(11)]}])
print("Linear SVM Classifier ")
print("Train: ", f1_measure[0])
print("Valid: ", f1_measure[1])
print("Test: ", f1_measure[2])
print("Optimal Parameter: ", f1_measure[3], "\n")

Part 3 : Yelp data w/ Frequency BOW

Random Classifier 
Train: 0.184
Valid: 0.172
Test: 0.188

Majority Classifier 
Train: 0.329
Valid: 0.331
Test: 0.324

Naive Bayes Classifier 
Train: 0.652
Valid: 0.246
Test: 0.239

Decision Tree Classifier 
Train: 0.5128551428592427
Valid: 0.506
Test: 0.339
Optimal Parameter: {'max_depth': 9, 'max_features': 6000, 'max_leaf_nodes': 500}

Linear SVM Classifier 
Train: 0.762514285924267
Valid: 0.738
Test: 0.4663
Optimal Parameter: {'max_iter': 100}



In [15]:
print("Part 4 : IMDB data w/ Binary BOW\n")

# Random Classifier
f1_measure = trainModel(IMDBBinary, DummyClassifier(strategy="uniform"), None)
print("Random Classifier ")
print("Train: ", f1_measure[0])
print("Valid: ", f1_measure[1])
print("Test: ", f1_measure[2], "\n")

# Naive Bayes Classifier
f1_measure = trainModel(IMDBBinary, BernoulliNB(), [{'alpha': np.arange(0.4, 0.6, 0.8)}])
print("Naive Bayes Classifier ")
print("Train: ", f1_measure[0])
print("Valid: ", f1_measure[1])
print("Test: ", f1_measure[2])
print("Optimal Parameter: ", f1_measure[3], "\n")

# Decision Tree Classifier
f1_measure = trainModel(IMDBBinary, DecisionTreeClassifier(), [{'max_depth': [i for i in range(10, 30)], 'max_features': [1000*i for i in range(1, 10)], 'max_leaf_nodes': [100*i for i in range(1, 10)]}])
print("Decision Tree Classifier ")
print("Train: ", f1_measure[0])
print("Valid: ", f1_measure[1])
print("Test: ", f1_measure[2])
print("Optimal Parameter: ", f1_measure[3], "\n")

# Linear SVM Classifier
f1_measure = trainModel(IMDBBinary, LinearSVC(), [{'max_iter': [100*i for i in range(11)]}])
print("Linear SVM Classifier ")
print("Train: ", f1_measure[0])
print("Valid: ", f1_measure[1])
print("Test: ", f1_measure[2])
print("Optimal Parameter: ", f1_measure[3], "\n")

Part 4 : IMDB data w/ Binary BOW

Random Classifier 
Train: 0.5000666666666667
Valid: 0.515
Test: 0.508

Naive Bayes Classifier 
Train: 0.832
Valid: 0.826
Test: 0.8025
Optimal Parameter: {'alpha': 0.4}

Decision Tree Classifier 
Train: 0.7194551428592426
Valid: 0.714
Test: 0.698
Optimal Parameter: {'max_depth': 20, 'max_features': 4000, 'max_leaf_nodes': 100}

Linear SVM Classifier 
Train: 0.9997
Valid: 0.9995
Test: 0.8206
Optimal Parameter: {'max_iter': 100}



In [12]:
print("Part 5 : IMDB data w/ Frequency BOW\n")

# Random Classifier
f1_measure = trainModel(IMDBFreq, DummyClassifier(strategy="uniform"), None)
print("Random Classifier ")
print("Train: ", f1_measure[0])
print("Valid: ", f1_measure[1])
print("Test: ", f1_measure[2], "\n")

# Naive Bayes Classifier
f1_measure = trainModel(IMDBFreq, GaussianNB(), None)
print("Naive Bayes Classifier ")
print("Train: ", f1_measure[0])
print("Valid: ", f1_measure[1])
print("Test: ", f1_measure[2], "\n")

# Decision Tree Classifier
f1_measure = trainModel(IMDBFreq, DecisionTreeClassifier(), [{'max_depth': [i for i in range(10, 30)], 'max_features': [1000*i for i in range(1, 10)], 'max_leaf_nodes': [100*i for i in range(1, 10)]}])
print("Decision Tree Classifier ")
print("Train: ", f1_measure[0])
print("Valid: ", f1_measure[1])
print("Test: ", f1_measure[2])
print("Optimal Parameter: ", f1_measure[3], "\n")

# Linear SVM Classifier
f1_measure = trainModel(IMDBFreq, LinearSVC(), [{'max_iter': [100*i for i in range(11)]}])
print("Linear SVM Classifier ")
print("Train: ", f1_measure[0])
print("Valid: ", f1_measure[1])
print("Test: ", f1_measure[2])
print("Optimal Parameter: ", f1_measure[3], "\n")

Part 5 : IMDB data w/ Frequency BOW

Random Classifier 
Train: 0.482
Valid: 0.479
Test: 0.472

Naive Bayes Classifier 
Train: 0.813
Valid: 0.745
Test: 0.627

Decision Tree Classifier 
Train: 0.782
Valid: 0.791
Test: 0.763
Optimal Parameter: {'max_depth': 20, 'max_features': 4000, 'max_leaf_nodes': 100}

Linear SVM Classifier 
Train: 0.9108
Valid: 0.8912
Test: 0.85033
Optimal Parameter: {'max_iter': 100}

