In [112]:
import numpy as np
import pandas as pd

In [113]:
# Number of category tags encoded per business: each Business feature vector
# holds 1 + N values (the star rating followed by N tag values).
N = 4

# A user's rows are written to the training file only when the user has at
# least MIN_USER_ACTION + 1 consecutive rows; every training line then holds
# MIN_USER_ACTION + 1 businesses.
MIN_USER_ACTION = 7

In [114]:
class CsvReader:
    """Thin wrapper around a pandas DataFrame loaded from a CSV file."""

    def __init__(self, directory, include):
        """Read the CSV at ``directory``, keeping only the columns in ``include``."""
        self.csv_reader = pd.read_csv(directory, usecols=include)

    def getColumn(self, name):
        """Return the column called ``name`` as a pandas Series."""
        return self.csv_reader[name]

    def findRows(self, column, value):
        """Return the rows whose ``column`` entry equals ``value``."""
        return self.csv_reader[self.csv_reader[column] == value]

    def length(self):
        """Return the number of rows in the table."""
        return len(self.csv_reader)

    def merge(self, other_csv, col, how):
        """Return this table merged with DataFrame ``other_csv`` on ``col``.

        ``how`` is passed straight to ``DataFrame.merge``. The wrapped table
        itself is not modified.
        """
        return self.csv_reader.merge(other_csv, on=col, how=how)

    def getReader(self):
        """Return the underlying DataFrame."""
        return self.csv_reader

    def sort(self, col):
        """Sort the wrapped table by ``col`` (ascending), replacing it in place.

        ``DataFrame.sort`` no longer exists in pandas and never sorted in
        place, so the previous implementation either raised or discarded the
        sorted copy. The index is renumbered after sorting so that row labels
        and row positions stay identical.
        """
        self.csv_reader = self.csv_reader.sort_values(col).reset_index(drop=True)
    
class Util:
    """Class-level registry that assigns a number to every distinct tag.

    ``tagMap`` maps tag -> number and ``latestNumber`` is the highest number
    handed out so far (-1 while the registry is empty).
    """

    tagMap = dict()
    latestNumber = -1

    @staticmethod
    def collectTags(key):
        """Register ``key`` with the next free number; known keys are ignored."""
        if key in Util.tagMap:
            return
        Util.latestNumber += 1
        Util.tagMap[key] = Util.latestNumber

    @staticmethod
    def writeToFile():
        """Write the registry to ``taglist.txt`` in the working directory.

        The first line is ``latestNumber``; each following line is
        ``tag:number``.
        """
        # ``with`` closes the file even if a write fails part-way through.
        with open('taglist.txt', 'w') as f:
            f.write(str(Util.latestNumber) + "\n")
            for key, value in Util.tagMap.items():
                f.write(str(key) + ":" + str(value) + "\n")

    @staticmethod
    def clear():
        """Reset the registry to its empty state."""
        Util.tagMap = dict()
        Util.latestNumber = -1

    @staticmethod
    def load(directory):
        """Load a registry file in the format produced by ``writeToFile``.

        Entries are added to the existing ``tagMap`` (call ``clear`` first for
        a fresh registry). Loaded numbers are stored as strings, exactly as
        read from the file.

        Raises:
            ValueError: if the first line is not an integer or an entry line
                contains no ``:`` separator.
        """
        with open(directory, 'r') as f:
            Util.latestNumber = int(f.readline())

            for line in f:
                # Strip only the line terminator: the final line may have
                # none, and blindly dropping the last character would then
                # truncate the number.
                line = line.rstrip("\r\n")
                if not line:
                    continue
                # Split on the LAST colon so tags that contain ':' survive.
                key, separator, value = line.rpartition(':')
                if not separator:
                    raise ValueError("malformed tag line: %r" % line)
                Util.tagMap[key] = value
    
class Business:
    """A business described by its star rating and up to N category tags.

    ``feature_vector`` is a (1, 1 + N) float32 array: slot 0 holds the star
    rating, slots 1..N are filled by ``calculateFeatureVector``.
    """

    def __init__(self, star, tags):
        """Store the rating and the sorted, padded tag list.

        ``tags`` is a ';'-separated string. Any non-string value (for example
        the NaN pandas yields for an empty CSV cell) is treated as "no tags"
        instead of raising AttributeError. Lists shorter than N are padded
        with the placeholder tag "None".
        """
        self.feature_vector = np.zeros(shape=(1, 1 + N), dtype=np.float32)

        self.tags = sorted(tags.split(';')) if isinstance(tags, str) else []
        # A non-positive repeat count yields an empty list, so longer tag
        # lists are left untouched.
        self.tags.extend(["None"] * (N - len(self.tags)))

        self.feature_vector[0][0] = star

    def calculateFeatureVector(self):
        """Fill slots 1..N with each tag's number divided by Util.latestNumber.

        Only the first N tags (in sorted order) are used. Every one of them,
        including the "None" padding tag, must be present in ``Util.tagMap``.

        Raises:
            KeyError: if a tag is missing from ``Util.tagMap``.
            ZeroDivisionError: if ``Util.latestNumber`` is 0.
        """
        # float() keeps this a true division even under Python 2 semantics.
        scale = float(Util.latestNumber)
        for slot, tag in enumerate(self.tags[:N], start=1):
            self.feature_vector[0][slot] = int(Util.tagMap[tag]) / scale

    def __str__(self):
        """Return the feature vector formatted as "(v0,v1,...,vN)"."""
        return "(" + ",".join(str(value) for value in self.feature_vector[0]) + ")"

In [115]:
# Load the combined business/review CSV, restricted to the listed columns.
business_review_combined_csv = CsvReader(
    "C:\\Users\\Steven\\Desktop\\yelp_updated.csv",
    [
        "business_id",
        "restaurant_rating",
        "categories",
        "date",
        "user_id",
    ],
)

In [116]:
# Populate Util.tagMap and Util.latestNumber from a previously saved tag list.
Util.load("C:\\Users\\Steven\\Desktop\\taglist.txt")

In [117]:
# Map every business_id to a Business built from that row's rating and
# categories. A business_id that occurs on several rows keeps the object
# created from its last row.
businesses = {}

for _, review in business_review_combined_csv.getReader().iterrows():
    businesses[review["business_id"]] = Business(
        review["restaurant_rating"], review["categories"]
    )

print("finish loading in businesses dictionary")

finish loading in businesses dictionary


In [118]:
# Write the training file: for every run of consecutive rows that share a
# user_id and is at least MIN_USER_ACTION + 1 rows long, emit one line per
# sliding window of MIN_USER_ACTION + 1 businesses, then a blank line.
# The writer stays open here; it is closed by a later cell.
training_data_writer = open("training.txt", "w")


def _write_user_windows(writer, business_ids):
    """Write all sliding windows over one user's businesses, then a blank line.

    Users with fewer than MIN_USER_ACTION + 1 rows produce no output at all.
    """
    window_size = MIN_USER_ACTION + 1
    if len(business_ids) < window_size:
        return

    window = []
    for business_id in business_ids:
        business = businesses[business_id]
        business.calculateFeatureVector()
        window.append(business)
        if len(window) == window_size:
            writer.write(",".join(str(entry) for entry in window) + "\n")
            # Slide the window forward by one business.
            window.pop(0)

    writer.write("\n")


# Read both columns by name: positional access (series[0]) depends on the
# column order of the CSV file and is deprecated in current pandas.
_reviews = business_review_combined_csv.getReader()
_user_ids = _reviews["user_id"].tolist()
_business_ids = _reviews["business_id"].tolist()

startIndex = 0

for index in range(1, len(_user_ids)):
    if _user_ids[index] != _user_ids[startIndex]:
        _write_user_windows(training_data_writer, _business_ids[startIndex:index])
        startIndex = index

# The last user has no following row to trigger the flush above, so write
# that final group explicitly.
if _user_ids:
    _write_user_windows(training_data_writer, _business_ids[startIndex:])



In [119]:
# Flush and close the training-data file opened by the previous cell.
training_data_writer.close()