# CSE6242 Project
## Main Idea:
1. Given the **business X attributes** and **business X categories** matrices, assign each business to a more general category using **k means**.

2. Use users' reviews of businesses to determine their preferences by:
    $\rightarrow$ Calculating each user's average normalized rating for each category;
    $\rightarrow$ Creating a final **user X categories** matrix with ratings.

In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import csv
from tqdm import tqdm

In [2]:
def sparse_matrix (base_type=float):
    """Return an empty two-level mapping that creates entries on first access.

    ``m[row][col]`` never raises ``KeyError``: a missing row becomes a new
    inner ``defaultdict`` and a missing cell becomes ``base_type()`` (``0.0``
    for the default ``float``).

    Args:
        base_type: zero-argument callable that produces the default cell value.

    Returns:
        ``defaultdict`` of ``defaultdict(base_type)``.
    """
    from collections import defaultdict

    def new_row():
        # Each row is created lazily with its own per-cell default.
        return defaultdict(base_type)

    return defaultdict(new_row)

## Part1. Data Cleaning

### 1.Clean business_attributes data

In [3]:
b_attr_org = pd.read_csv('~/desktop/gatech/cse6242/project/data_sorted/business_attributes.csv', low_memory = False)

In [4]:
#rearrange the format of DataFrame
#drop the 'Unnamed: 0' column (presumably a row index saved by an earlier export)
#axis is passed by keyword: pandas 2.x no longer accepts it positionally
b_attr_all = b_attr_org.drop('Unnamed: 0', axis=1)
#use the business id as the row label
b_attr_all.set_index('BusinessID', inplace = True)
#clear the index label; `del index.name` raises AttributeError on pandas >= 1.0
b_attr_all.index.name = None

In [5]:
#try to find valid attributes
#b_attr.notnull().sum().to_csv(path = '~/desktop/gatech/cse6242/project/data_sorted/business_attrs_valid.csv')
#attrs_valid = b_attr.notnull().sum().to_frame(name = 'notnull_freq')
#attrs_valid.sort_values(by = 'notnull_freq', ascending = False)
#attrs_valid.loc[attrs_valid['notnull_freq'] >= 10000].index.tolist()

In [6]:
#the nine ambience flags used as attribute columns
ambience_columns = [
    'Ambience.casual', 'Ambience.romantic', 'Ambience.trendy',
    'Ambience.touristy', 'Ambience.classy', 'Ambience.intimate',
    'Ambience.hipster', 'Ambience.upscale', 'Ambience.divey',
]
b_Ambience = b_attr_all[ambience_columns]

#discard businesses with no ambience value at all, fill the remaining gaps
#with 0.5, then cast every flag to float
b_Ambience_valid = (
    b_Ambience
    .dropna(how='all')
    .copy()
    .fillna(0.5)
    .astype(float)
)

#the three ambience columns every other ambience column is generalized to
large_attr = ['Ambience.casual','Ambience.romantic', 'Ambience.trendy']

### 2.Clean business_categories data

In [7]:
b_cat_org = pd.read_csv ('~/desktop/gatech/cse6242/project/data_sorted/business_category.csv',low_memory = False)

In [8]:
#rearrange the format of DataFrame business_cat
#drop the 'Unnamed: 0' column plus the 'Restaurants' and 'Food' columns
#axis is passed by keyword: pandas 2.x no longer accepts it positionally
b_cat_all = b_cat_org.drop(['Unnamed: 0','Restaurants','Food'], axis=1)
#use the business id as the row label
b_cat_all.set_index('business_id', inplace = True)
#clear the index label; `del index.name` raises AttributeError on pandas >= 1.0
b_cat_all.index.name = None

In [9]:
len(b_cat_org)

26729

In [10]:
#drop businesses without any category value and cast the flags to float
b_cat_valid = b_cat_all.dropna(how='all').copy().astype(float)

#frequency table: number of businesses flagged (== 1) per category
b_cat_freq = b_cat_valid.eq(1).sum()

#the largest groups are the categories flagged on more than 500 businesses
large_cat = b_cat_freq[b_cat_freq > 500].index.tolist()

## Part2. Generalize Categories By K means(???)

### 1.Define **kmeans** function to generalize categories to specific large category.

In [11]:
from collections import defaultdict
def kmeans(df,generalized_cat):
    """Fold every column of ``df`` into its most similar generalized column.

    Despite the name, no iterative clustering is performed. Each column is
    compared with each generalized column by dot product and assigned to the
    one with the largest strictly positive product (the first one wins a tie).
    Columns with no positive product keep their own name, and are then
    discarded unless that name is itself in ``generalized_cat``. Finally the
    non-zero cells of the surviving columns are summed per row under the
    generalized name.

    Args:
        df: numeric DataFrame, rows are businesses and columns are
            attributes/categories. Column labels are assumed to be unique.
        generalized_cat: list of column labels to generalize to. Labels that
            are not columns of ``df`` are ignored.

    Returns:
        Nested ``defaultdict`` ``{row label: {generalized label: value}}``,
        the same shape ``sparse_matrix()`` produces. Rows whose surviving
        cells are all zero do not appear.
    """
    columns = list(df.columns)
    values = np.asarray(df.values, dtype=float)
    position = {name: pos for pos, name in enumerate(columns)}

    # A generalized label without a matching column has nothing to compare
    # against, so it can never attract another column.
    anchors = [name for name in generalized_cat if name in position]

    # Default: a column is "assigned" to itself, i.e. it found no group.
    assigned = {name: name for name in columns}
    if anchors and columns:
        anchor_pos = [position[name] for name in anchors]
        # similarity[a, c] = <anchor column a, column c>
        similarity = np.dot(values[:, anchor_pos].T, values)
        # NaN never wins a comparison; rank it below every real value.
        similarity = np.where(np.isnan(similarity), -np.inf, similarity)
        best = similarity.argmax(axis=0)  # first maximum wins ties
        for pos, name in enumerate(columns):
            if similarity[best[pos], pos] > 0.0:
                assigned[name] = anchors[best[pos]]

    # Keep only the columns that ended up in one of the requested groups.
    wanted = set(generalized_cat)
    kept = [(pos, assigned[name]) for pos, name in enumerate(columns)
            if assigned[name] in wanted]

    # Built inline (same structure as sparse_matrix()) so the function does
    # not depend on another notebook cell having been run.
    spm_gnr = defaultdict(lambda: defaultdict(float))
    if not kept:
        return spm_gnr

    kept_values = values[:, [pos for pos, _ in kept]]
    row_labels = df.index
    # np.nonzero walks the cells row by row, so businesses are inserted in
    # the order they appear in df.
    for row, k in zip(*np.nonzero(kept_values != 0)):
        # Accumulate: several source columns may share one generalized label.
        spm_gnr[row_labels[row]][kept[k][1]] += kept_values[row, k]

    return spm_gnr

### 2.Create the **business X category_attr** sparse matrix for further steps.

#### Create the **business X general_attr** sparse matrix for further steps.

In [12]:
b_attr_gnr = kmeans(b_Ambience_valid,large_attr)

#### Create the **business X category_attr** sparse matrix for further steps.

In [13]:
B_cat_spm = sparse_matrix()
nrow = b_cat_valid.shape[0]
ncol = b_cat_valid.shape[1]
b_cat_copy = b_cat_valid.copy()

#for each business, record every category whose flag equals 1.0
#np.nonzero walks the flag matrix row by row, so businesses are inserted in
#DataFrame order without a per-cell .iloc lookup
business_ids = b_cat_copy.index
category_names = b_cat_copy.columns
for i, j in zip(*np.nonzero(b_cat_copy.values == 1.0)):
    #plain assignment: the former `=+ 1.0` also just stored +1.0
    B_cat_spm[business_ids[i]][category_names[j]] = 1.0

In [14]:
# Combine business X general_attr and business_category sparse matrix to
# create the **business X category_attr** sparse matrix.
# Only businesses present in both b_attr_gnr and B_cat_spm are combined.
busicat_attr_T = sparse_matrix()
for ambience_k,ambience_v in b_attr_gnr.items():
    # Direct lookup instead of scanning every B_cat_spm entry for an equal key.
    # `in` does not create a missing entry in the defaultdict.
    if ambience_k not in B_cat_spm:
        continue
    # Same inner dict object as in B_cat_spm: it is modified in place below.
    category_v = B_cat_spm[ambience_k]
    # Iterate over snapshots because category_v is re-keyed inside the loop.
    for x in list(category_v):
        for a in list(ambience_v):
            # NOTE(review): x is popped after the first ambience, so for later
            # ambiences category_v[x] restarts at the default 0.0. Only the
            # first "<category>_<ambience>" entry carries the category's
            # original value; confirm this asymmetry is intended.
            category_v[x] +=  ambience_v[a]
            category_v[x +"_"+ a] =  category_v.pop(x)
            busicat_attr_T[ambience_k] = category_v

In [15]:
busicat_attr = pd.DataFrame(busicat_attr_T).T

In [16]:
#discard rows that contain no value at all, then replace remaining NaN with 0
busicat_attr_copy = (
    busicat_attr
    .dropna(how="all")
    .copy()
    .fillna(0)
)

### 3.Create the business X generalized category_attr sparse matrix for further steps

In [17]:
#Generalized targets: every large category paired with each of the three
#main ambience labels, in the order casual, romantic, trendy
large_cat_attr = [
    category + suffix
    for category in large_cat
    for suffix in ("_Ambience.casual", "_Ambience.romantic", "_Ambience.trendy")
]

In [18]:
busicat_attr_gnr = pd.DataFrame(kmeans(busicat_attr_copy,large_cat_attr)).T

In [19]:
#keep only businesses that have at least one generalized category value
busicat_attr_gnr = busicat_attr_gnr.dropna(thresh = 1)
#turn the values into membership flags: 1 where a value exists, 0 otherwise
Business_catattr = busicat_attr_gnr.notnull().astype('int')
#mark non-membership as missing instead of 0
#np.nan is used because the np.NaN alias was removed in NumPy 2.0
Business_catattr = Business_catattr.replace(0, np.nan)

## Part3. Integrate "Reviews" with Big Categories
Given **"reviews.csv"** file and business X category_attr matrix derived above, determine each user's preference by calculating the average normalized ratings for each category.

In [20]:
#reviews = pd.read_csv('~/desktop/gatech/cse6242/project/data_sorted/merged_reviews_users.csv', low_memory = False)
all_reviews = pd.read_csv('~/desktop/gatech/cse6242/project/data_sorted/merged_all_reviews.csv', low_memory = False)

In [21]:
#rearrange the format of DataFrame reviews
#axis is passed by keyword: pandas 2.x no longer accepts it positionally
reviews_all = all_reviews.drop('Unnamed: 0', axis=1)
#retrieve columns that we need only
reviews_sub = reviews_all[['BusinessID','UserID','UserName',
                      'Avg_user_rating','rating']]

In [22]:
reviews_sub.head()

Unnamed: 0,BusinessID,UserID,UserName,Avg_user_rating,rating
0,__etvGuL2dh_a1LOT0gNYQ,FczdrDVX94FdHJ7Y4JZrWA,Christopher_102,3.15,4
1,__etvGuL2dh_a1LOT0gNYQ,8EyiYlKUMdJsQD3sA-fAjA,Patricia_79,4.07,5
2,__etvGuL2dh_a1LOT0gNYQ,nP2HH7Qe5UtSsjeEH3NZuQ,Elena_9,4.06,4
3,__etvGuL2dh_a1LOT0gNYQ,NqcLDh324mINcicHKspQaA,pj_1,3.1,5
4,__etvGuL2dh_a1LOT0gNYQ,tFyQbNbBQEyEc9oCr1pJUg,Marshall_9,4.3,5


In [43]:
Business_catattr.to_csv(path_or_buf = '~/desktop/gatech/cse6242/project/Business_catattr.csv')

In [24]:
len(reviews_sub["BusinessID"].unique())

8975

In [25]:
reviews_cat_joined= reviews_sub.join(Business_catattr,on ='BusinessID', how='left')
len(reviews_cat_joined)

526280

In [26]:
#keep only rows with at least 6 non-NA values
#reviews_sub contributes 5 columns, so when those are all populated a row
#survives only if at least one joined category column is non-NA
reviews_cat_joined_copy = reviews_cat_joined.dropna(thresh=6).copy()
len(reviews_cat_joined_copy)

516526

In [27]:
col = reviews_cat_joined_copy.columns.values.tolist()
cat = col[5:] #get the columns name for all categories

In [28]:
#Spread each review's rating over the category columns: the rating is
#multiplied by the category column, so NaN entries stay NaN
#(no normalization happens in this cell)
review_ratings = reviews_cat_joined_copy['rating']
for category in cat:
    reviews_cat_joined_copy[category] = review_ratings * reviews_cat_joined_copy[category]

In [29]:
reviews_cat_joined_copy.head()

Unnamed: 0,BusinessID,UserID,UserName,Avg_user_rating,rating,American (New)_Ambience.casual,American (New)_Ambience.romantic,American (New)_Ambience.trendy,American (Traditional)_Ambience.casual,American (Traditional)_Ambience.romantic,...,Seafood_Ambience.trendy,Steakhouses_Ambience.casual,Steakhouses_Ambience.romantic,Steakhouses_Ambience.trendy,Sushi Bars_Ambience.casual,Sushi Bars_Ambience.romantic,Sushi Bars_Ambience.trendy,Thai_Ambience.casual,Thai_Ambience.romantic,Thai_Ambience.trendy
0,__etvGuL2dh_a1LOT0gNYQ,FczdrDVX94FdHJ7Y4JZrWA,Christopher_102,3.15,4,,,,4.0,,...,,,,,,,,,,
1,__etvGuL2dh_a1LOT0gNYQ,8EyiYlKUMdJsQD3sA-fAjA,Patricia_79,4.07,5,,,,5.0,,...,,,,,,,,,,
2,__etvGuL2dh_a1LOT0gNYQ,nP2HH7Qe5UtSsjeEH3NZuQ,Elena_9,4.06,4,,,,4.0,,...,,,,,,,,,,
3,__etvGuL2dh_a1LOT0gNYQ,NqcLDh324mINcicHKspQaA,pj_1,3.1,5,,,,5.0,,...,,,,,,,,,,
4,__etvGuL2dh_a1LOT0gNYQ,tFyQbNbBQEyEc9oCr1pJUg,Marshall_9,4.3,5,,,,5.0,,...,,,,,,,,,,


In [30]:
#Keep the columns from position 2 onward (UserName and everything after it),
#which leaves out BusinessID and UserID.
#NOTE(review): this relies on UserName identifying a user as uniquely as
#UserID does - confirm against the data.
#reset_index returns a new frame here; calling it in place on a
#column-selected frame can trigger pandas' chained-assignment warning
user_rating = reviews_cat_joined_copy[reviews_cat_joined_copy.columns[2:]].reset_index(drop=True)

In [31]:
user_rating_copy = user_rating.copy()

In [32]:
#Now calculate each user's average rating for each category
#columns from position 3 onward are the category columns
#(positions 0-2 are UserName, Avg_user_rating and rating)
col_list = user_rating_copy.columns[3:].tolist()
#mean per user name over that user's reviews
User_Avg_cat = user_rating_copy.groupby('UserName')[col_list].mean()

In [33]:
len(User_Avg_cat)

18772

In [34]:
User_avg_inter = user_rating_copy[user_rating_copy.columns[:2]]
User_Avg= User_avg_inter.join(User_Avg_cat,on ='UserName', how='left')

In [35]:
User_Avg.drop_duplicates(subset = 'UserName', inplace = True)

In [36]:
User_Avg.loc[User_Avg['UserName'] =='Christopher_102']

Unnamed: 0,UserName,Avg_user_rating,American (New)_Ambience.casual,American (New)_Ambience.romantic,American (New)_Ambience.trendy,American (Traditional)_Ambience.casual,American (Traditional)_Ambience.romantic,American (Traditional)_Ambience.trendy,Asian Fusion_Ambience.casual,Asian Fusion_Ambience.romantic,...,Seafood_Ambience.trendy,Steakhouses_Ambience.casual,Steakhouses_Ambience.romantic,Steakhouses_Ambience.trendy,Sushi Bars_Ambience.casual,Sushi Bars_Ambience.romantic,Sushi Bars_Ambience.trendy,Thai_Ambience.casual,Thai_Ambience.romantic,Thai_Ambience.trendy
0,Christopher_102,3.15,1.0,,,3.5,,2.0,,,...,,,,,,,,,,


In [37]:
#Now, for each category's rating, normalize by subtracting that user's average rating
user_avg = User_Avg.copy()
#columns from position 2 onward are the per-category averages
avg = user_avg.columns[2:]
for i in avg:
    user_avg[i] = User_Avg[i]-User_Avg['Avg_user_rating']
#the overall average is no longer needed once it has been subtracted
#axis is passed by keyword: pandas 2.x no longer accepts it positionally
user_avg.drop('Avg_user_rating', axis=1, inplace=True)
user_avg = user_avg.round(2)

In [38]:
#index the normalized per-category ratings by user name
user_avg_busi = user_avg.set_index('UserName', drop=True, inplace=False)
#clear the index label; `del index.name` raises AttributeError on pandas >= 1.0
user_avg_busi.index.name = None

In [44]:
#Busi_X_User = sparse_matrix()

In [45]:
#for i in tqdm(Business_catattr.index.tolist()):
#    for j in user_avg_busi.T.columns.tolist():
#        Busi_X_User[i][j] = user_avg_busi.T[j].multiply(Business_catattr.loc[i]).mean()

## Part4. Predict User's Rating On Categories w/o Rating Info