In [59]:
import numpy as np
import string
from collections import Counter

def getFeatures(setName, path):
    textSections = []
    
    # Read file
    with open(path, 'r', encoding="utf-8") as f:
        for l in f.readlines():
            splitText = l.split('\t') # Split into sections
            splitText = [x.strip() for x in splitText] # Get rid of whitespace
            textSections.append(splitText)
    # Join text sections to make string
    text = ','.join(map(str, textSections))
    # Remove punctuation
    temp = str.maketrans(" ", " ", string.punctuation)
    processedText = text.translate(temp)
    # To lowercase
    processedText = processedText.lower()
    # Build list with occurrence count
    occMap = Counter(processedText.split()).most_common()
    
    # Get top 10,000 features
    topFeatures = []
    for i in range(numFeatures):
        topFeatures.append(occMap[i])

    # Recompute weights by reversing order
    dictionary = []
    for i in range(numFeatures-1, -1, -1):
        dictionary.append([topFeatures[i][0], i+1])
    
    # Write vocabulary
    # Format: "word,  ID (reversed order),  occurrence"
    writer = open("dataGenerated/" + setName + "vocab.txt", "w")
    for i in range(numFeatures):
        line = "{}\t{}\t{}".format(topFeatures[i][0], i+1, topFeatures[i][1])
        writer.write(line + "\n")
        
    # Write train, valid, test sets
    # Format: "ID (of text),  classLabel"
    for d in datasets:
        writer = open("dataGenerated/" + setName + d + ".txt", "w")
        # Read file
        with open("data/" + setName + d + ".txt", 'r', encoding="utf-8") as f:
            for l in f.readlines():
                splitText = l.split('\t') # Split into sections
                splitText = [x.strip() for x in splitText] # Get rid of whitespace
                textSections.append(splitText) 

        # Keep copy of text sections (used to build map of original text)
        tempText = textSections
        textList = []
        temp = str.maketrans(" ", " ", string.punctuation)
        for i in range(len(tempText)):
            # Remove punctuation and lowercase
            processedSection = str(tempText[i]).translate(temp)
            processedSection = processedSection.lower()
            textList.append(processedSection)
            
        # Iterate through each section of text
        for i in range(len(textList)):
            words = textList[i].split()
            classLabel = words[len(words)-1]
            line = ""
            
            for j in range(len(words)):
                for x in dictionary:
                    if (x[0] == words[j]):
                        line += str(x[1]) + " "
                    
            line += "\t" + classLabel + "\n"
            writer.write(line)
    
    return dictionary


def getBOW(setName, features):
    binaryBOW = []
    freqBOW = []
    
    for d in datasets:
        path = "data/" + setName + d + ".txt"
        textSections = []
        
        # Read file and create string
        with open(path, 'r', encoding="utf-8") as f:
            for l in f.readlines():
                splitText = l.split('\t') # Split into sections
                splitText = [x.strip() for x in splitText] # Get rid of whitespace
                textSections.append(splitText)
        text = ','.join(map(str, textSections)) # Join to make string

        # Remove punctuation
        temp = str.maketrans(" ", " ", string.punctuation)
        processedText = text.translate(temp)
        # To lowercase
        processedText = processedText.lower()
        
        # Perform binary and frequency bag of words
        # Binary BOW indicates if word appears
        # Freq BOW indicates the number of times it appears
        setBinaryBOW = []
        setFreqBOW = []
        for i in range(numFeatures):
            # Check if text contains the word
            if (features[i][0] in processedText):
                setBinaryBOW.append(1)
            else:
                setBinaryBOW.append(0)
            # Count how many times the text contains the word
            setFreqBOW.append(processedText.count(features[i][0]))
    
        binaryBOW.append(setBinaryBOW)
        freqBOW.append(setFreqBOW)
    
    return binaryBOW, freqBOW
    
    
numFeatures = 10000
datasets = ["train", "valid", "test"]
yelpTrainPath = "data/yelp-train.txt"
IMDBTrainPath = "data/IMDB-train.txt"

features = getFeatures("yelp-", yelpTrainPath)
yelpBinary, yelpFreq = getBOW("yelp-", features)
features = getFeatures("IMDB-", IMDBTrainPath)
IMDBBinary, IMDBFreq = getBOW("IMDB-", features)

print("Part 1 : Datasets can be found in the dataGenerated folder")


Part 1 : Datasets can be found in the dataGenerated folder


In [61]:
import pickle

with open("dataTemp/yelpBinary.txt", "wb") as fp:
    pickle.dump(yelpBinary, fp)
    
with open("dataTemp/yelpFreq.txt", "wb") as fp:
    pickle.dump(yelpFreq, fp)
    
with open("dataTemp/IMDBBinary.txt", "wb") as fp:
    pickle.dump(IMDBBinary, fp)
    
with open("dataTemp/IMDBFreq.txt", "wb") as fp:
    pickle.dump(IMDBFreq, fp)
    
print("All variables stored")

All variables stored


In [1]:
import pickle

with open("dataTemp/yelpBinary.txt", "rb") as fp:
    yelpBinary = pickle.load(fp)
    
with open("dataTemp/yelpFreq.txt", "rb") as fp:
    yelpFreq = pickle.load(fp)
    
with open("dataTemp/IMDBBinary.txt", "rb") as fp:
    IMDBBinary = pickle.load(fp)
    
with open("dataTemp/IMDBFreq.txt", "rb") as fp:
    IMDBFreq = pickle.load(fp)

print("All variables loaded")

All variables loaded


In [2]:
import numpy as np
from scipy import sparse
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.metrics import f1_score

row = []
col = []
target = []
count = 0
for i in range(100):
    for j in range(100):
        row.append(i)
        col.append(j)
        target.append(count)
        count+=1
row = np.array(row)
col = np.array(col)
target = np.array(target)

trainData = np.array(yelpBinary[0])
trainData_matrix = sparse.csr_matrix((trainData, (row,col)))
trainData_matrix = trainData_matrix.toarray()

validData = np.array(yelpBinary[1])
validData_matrix = sparse.csr_matrix((validData, (row,col)))
validData_matrix = validData_matrix.toarray()

testData = np.array(yelpBinary[2])
testData_matrix = sparse.csr_matrix((testData, (row,col)))
testData_matrix = testData_matrix.toarray()


def trainModel(dataset, clf):
    clf = clf.fit(trainData_matrix[:-1], target[:-1])

    pred_train = []
    pred_valid = []
    pred_test = []
    for i in range(10000):
        # Predict last values, gets index
        temp_train = clf.predict(trainData_matrix[-i:])
        temp_valid = clf.predict(trainData_matrix[-i:])
        temp_test = clf.predict(trainData_matrix[-i:])
        # Get predicted element from original dataset
        pred_train.append(trainData[temp_train[len(temp_train)-1]])
        pred_valid.append(validData[temp_valid[len(temp_valid)-1]])
        pred_test.append(validData[temp_test[len(temp_test)-1]])
    pred_train = np.array(pred_train)
    pred_valid = np.array(pred_valid)
    pred_test = np.array(pred_test)
    
    predictionTrain = f1_score(dataset[0], pred_train, average="binary")
    predictionValid = f1_score(dataset[1], pred_valid, average="binary")
    predictionTest = f1_score(dataset[2], pred_test, average="binary")

    return predictionTrain, predictionValid, predictionTest
        


# Random Classifier
f1_measure = trainModel(yelpBinary, DummyClassifier(strategy="uniform"))
print("\nRandom Classifier: ")
print("Train, Valid, Test: {}\n".format(f1_measure[:3]))

# Majority Classifier
f1_measure = trainModel(yelpBinary, DummyClassifier(strategy="most_frequent"))
print("Majority Classifier: ")
print("Train, Valid, Test: {}\n".format(f1_measure[:3]))


Random Classifier: 
Train, Valid, Test: (1.0, 0.74756113858078321, 0.80126613704071503)

Majority Classifier: 
Train, Valid, Test: (1.0, 0.85642403796672195, 0.92397912519502878)

