In [59]:
import numpy as np
import pandas as pd
import math, string, re
from collections import Counter

def getFeatures(setName, path):
    textSections = []
    
    # Read file
    with open(path, 'r', encoding="utf-8") as f:
        for l in f.readlines():
            splitText = l.split('\t') # Split into sections
            splitText = [x.strip() for x in splitText] # Get rid of whitespace
            textSections.append(splitText) 
    # Join text sections to make string
    text = ','.join(map(str, textSections))
    # Remove punctuation
    temp = str.maketrans(" ", " ", string.punctuation)
    processedText = text.translate(temp)
    # To lowercase
    processedText = processedText.lower()
    # Build list with occurrence count
    occMap = Counter(processedText.split()).most_common()
    
    # Get top 10,000 features
    topFeatures = []
    for i in range(numFeatures):
        topFeatures.append(occMap[i])

    # Recompute weights by reversing order
    dictionary = []
    for i in range(numFeatures-1, -1, -1):
        dictionary.append([topFeatures[i][0], i+1])
    
    # Write vocabulary
    # Format: "word,  ID (reversed order),  occurrence"
    writer = open("dataGenerated/" + setName + "vocab.txt", "w")
    for i in range(numFeatures):
        line = "{}\t{}\t{}".format(topFeatures[i][0], i+1, topFeatures[i][1])
        writer.write(line + "\n")
        
    # Write train, valid, test sets
    # Format: "ID (of text),  classLabel"
    for d in datasets:
        writer = open("dataGenerated/" + setName + d + ".txt", "w")
        # Read file
        with open("data/" + setName + d + ".txt", 'r', encoding="utf-8") as f:
            for l in f.readlines():
                splitText = l.split('\t') # Split into sections
                splitText = [x.strip() for x in splitText] # Get rid of whitespace
                textSections.append(splitText) 

        # Keep copy of text sections (used to build map of original text)
        tempText = textSections
        textList = []
        temp = str.maketrans(" ", " ", string.punctuation)
        for i in range(len(tempText)):
            # Remove punctuation and lowercase
            processedSection = str(tempText[i]).translate(temp)
            processedSection = processedSection.lower()
            textList.append(processedSection)
            
        # Iterate through each section of text
        for i in range(len(textList)):
            words = textList[i].split()
            classLabel = words[len(words)-1]
            line = ""
            
            for j in range(len(words)):
                for x in dictionary:
                    if (x[0] == words[j]):
                        line += str(x[1]) + " "
                    
            line += "\t" + classLabel + "\n"
            writer.write(line)
    
    return dictionary


def getBOW(setName, features):
    binaryBOW = []
    freqBOW = []
    
    for d in datasets:
        path = "data/" + setName + d + ".txt"
        textSections = []
        
        # Read file and create string
        with open(path, 'r', encoding="utf-8") as f:
            for l in f.readlines():
                splitText = l.split('\t') # Split into sections
                splitText = [x.strip() for x in splitText] # Get rid of whitespace
                textSections.append(splitText)
        text = ','.join(map(str, textSections)) # Join to make string

        # Remove punctuation
        temp = str.maketrans(" ", " ", string.punctuation)
        processedText = text.translate(temp)
        # To lowercase
        processedText = processedText.lower()
        
        # Perform binary and frequency bag of words
        # Binary BOW indicates if word appears
        # Freq BOW indicates the number of times it appears
        setBinaryBOW = []
        setFreqBOW = []
        for i in range(numFeatures):
            # Check if text contains the word
            if (features[i][0] in processedText):
                setBinaryBOW.append(1)
            else:
                setBinaryBOW.append(0)
            # Count how many times the text contains the word
            setFreqBOW.append(processedText.count(features[i][0]))
    
        binaryBOW.append(setBinaryBOW)
        freqBOW.append(setFreqBOW)
    
    return binaryBOW, freqBOW
    
    
numFeatures = 10000
datasets = ["train", "valid", "test"]
yelpTrainPath = "data/yelp-train.txt"
IMDBTrainPath = "data/IMDB-train.txt"

features = getFeatures("yelp-", yelpTrainPath)
yelpBinary, yelpFreq = getBOW("yelp-", features)
features = getFeatures("IMDB-", IMDBTrainPath)
IMDBBinary, IMDBFreq = getBOW("IMDB-", features)

print("Part 1 : Datasets can be found in the dataGenerated folder")


Part 1 : Datasets can be found in the dataGenerated folder


In [61]:
import numpy as np
import pandas as pd
import pickle

with open("dataTemp/yelpBinary.txt", "wb") as fp:
    pickle.dump(yelpBinary, fp)
    
with open("dataTemp/yelpFreq.txt", "wb") as fp:
    pickle.dump(yelpFreq, fp)
    
with open("dataTemp/IMDBBinary.txt", "wb") as fp:
    pickle.dump(IMDBBinary, fp)
    
with open("dataTemp/IMDBFreq.txt", "wb") as fp:
    pickle.dump(IMDBFreq, fp)
    
print("All variables stored")

All variables stored


In [62]:
import pickle

with open("dataTemp/yelpBinary.txt", "rb") as fp:
    yelpBinaryTemp = pickle.load(fp)
    
with open("dataTemp/yelpFreq.txt", "rb") as fp:
    yelpFreqTemp = pickle.load(fp)
    
with open("dataTemp/IMDBBinary.txt", "rb") as fp:
    IMDBBinaryTemp = pickle.load(fp)
    
with open("dataTemp/IMDBFreq.txt", "rb") as fp:
    IMDBFreq = pickle.load(fp)
    
print(type(yelpBinaryTemp))
print(yelpBinaryTemp)
print(type(yelpFreqTemp))
print(yelpFreqTemp)
print(type(IMDBBinaryTemp))
print(IMDBBinaryTemp)
print(type(IMDBFreq))
print(IMDBFreq)

<class 'list'>
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,