### Recommender system: Day in the city

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, pairwise_distances, calinski_harabaz_score
from sklearn.neighbors import DistanceMetric
import ast
import requests
import time
%matplotlib inline

### 2. EDA -  business.json

In [3]:
# reading in file
df = pd.read_json('yelp_dataset/business.json', lines = True) 
df.describe()

Unnamed: 0,is_open,latitude,longitude,review_count,stars
count,192609.0,192609.0,192609.0,192609.0,192609.0
mean,0.82304,38.541803,-97.594785,33.538962,3.585627
std,0.381635,4.941964,16.697725,110.135224,1.018458
min,0.0,33.204642,-115.493471,3.0,1.0
25%,1.0,33.637408,-112.274677,4.0,3.0
50%,1.0,36.144815,-111.759323,9.0,3.5
75%,1.0,43.602989,-79.983614,25.0,4.5
max,1.0,51.299943,-72.911982,8348.0,5.0


In [4]:
# Mark non-US (e.g. Canadian) businesses: a postal code that is not exactly
# 5 characters long becomes NaN. No rows are removed by this statement.

df.postal_code = [code if len(code) == 5 else np.nan for code in df.postal_code]

In [5]:
# Drop the rows (not columns) that have no postal code or no attributes
df = df.dropna(subset=['postal_code', 'attributes'])

In [6]:
# Keep only cities with more than 500 businesses; rarer cities are dropped.
# value_counts() is already sorted by descending frequency, so filtering it keeps
# that order and does not rely on the column name that reset_index() would
# generate (which differs between pandas versions).

city_counts = df['city'].value_counts()
keep_city = city_counts[city_counts > 500].index.tolist()
print('Cities to keep:', keep_city)
df.drop(df[~df.city.isin(keep_city)].index, inplace=True)

df.drop(df[df.is_open == 0].index, inplace = True) # dropping permanently closed businesses
df.drop(['is_open'], axis = 1, inplace = True) # dropping because are all open
df.head()


Cities to keep: ['Las Vegas', 'Phoenix', 'Charlotte', 'Scottsdale', 'Pittsburgh', 'Mesa', 'Henderson', 'Tempe', 'Chandler', 'Madison', 'Cleveland', 'Glendale', 'Gilbert', 'Peoria', 'North Las Vegas', 'Champaign', 'Surprise', 'Concord', 'Goodyear', 'Matthews', 'Avondale', 'Huntersville', 'Fort Mill']


Unnamed: 0,address,attributes,business_id,categories,city,hours,latitude,longitude,name,postal_code,review_count,stars,state
2,"10110 Johnston Rd, Ste 15","{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...",gnKjwL_1w79qoiV3IC_xQQ,"Sushi Bars, Restaurants, Japanese",Charlotte,"{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",35.092564,-80.859132,Musashi Japanese Restaurant,28210,170,4.0,NC
4,"4209 Stuart Andrew Blvd, Ste F","{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...",HhyxOkGAM07SRYtlQ4wMFQ,"Plumbing, Shopping, Local Services, Home Servi...",Charlotte,"{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",35.190012,-80.887223,Queen City Plumbing,28217,4,4.0,NC
7,"4545 E Tropicana Rd Ste 8, Tropicana","{'RestaurantsPriceRange2': '3', 'GoodForKids':...",gbQN7vr_caG_A1ugSmGhWg,"Hair Salons, Hair Stylists, Barbers, Men's Hai...",Las Vegas,"{'Monday': '10:0-19:0', 'Tuesday': '10:0-19:0'...",36.099872,-115.074574,Supercuts,89121,3,3.5,NV
11,2450 E Indian School Rd,"{'RestaurantsTakeOut': 'True', 'BusinessParkin...",1Dfx3zM-rW4n-31KeC8sJg,"Restaurants, Breakfast & Brunch, Mexican, Taco...",Phoenix,"{'Monday': '7:0-0:0', 'Tuesday': '7:0-0:0', 'W...",33.495194,-112.028588,Taco Bell,85016,18,3.0,AZ
16,"4848 E Cactus Rd, Ste 100","{'BusinessAcceptsCreditCards': 'True', 'Busine...",giC3pVVFxCRR89rApqklyw,"Hair Stylists, Beauty & Spas, Hair Salons, Men...",Scottsdale,"{'Monday': '0:0-0:0', 'Tuesday': '9:0-19:0', '...",33.600071,-111.977371,Knot Salon,85254,5,5.0,AZ


In [7]:
df.columns

Index(['address', 'attributes', 'business_id', 'categories', 'city', 'hours',
       'latitude', 'longitude', 'name', 'postal_code', 'review_count', 'stars',
       'state'],
      dtype='object')

Distance between businesses

In [20]:
!conda install -y geopy

Collecting package metadata: done
Solving environment: | 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/osx-64::nbconvert==5.4.0=py37_1
  - defaults/osx-64::jupyterlab==0.35.3=py37_0
  - defaults/osx-64::jupyter==1.0.0=py37_7
  - defaults/osx-64::ipywidgets==7.4.2=py37_0
  - defaults/osx-64::notebook==5.7.4=py37_0
  - defaults/osx-64::spyder==3.3.2=py37_0
  - defaults/osx-64::jupyterlab_server==0.2.0=py37_0
  - defaults/osx-64::widgetsnbextension==3.4.2=py37_0
  - defaults/osx-64::_ipyw_jlab_nb_ext_conf==0.1.0=py37_0
  - defaults/osx-64::xlwings==0.15.1=py37_0
failed

PackagesNotFoundError: The following packages are not available from current channels:

  - geopy

Current channels:

  - https://repo.anaconda.com/pkgs/main/osx-64
  - https://repo.anaconda.com/pkgs/main/noarch
  - https://repo.anaconda.com/pkgs/free/osx-64
  - https://repo.anaconda.com/pkgs/free/noarch
  - https://repo.anacond

In [26]:
!version

/bin/sh: version: command not found


In [24]:
# from geopy.distance import geodesic
# from geopy.geocoders import Nominatim
# geolocator = Nominatim(user_agent="specify_your_app_name_here")
# location = geolocator.geocode("175 5th Avenue NYC")
# print(location.address)
from geopy.distance import great_circle

ModuleNotFoundError: No module named 'geographiclib'

In [16]:


# Location-only view of the businesses; reset_index() keeps the previous index
# as an 'index' column and renumbers the rows from 0.
location_cols = ['business_id','name','city','address','state','latitude','longitude']
df_location = df[location_cols].reset_index()

# Text form "(lat,long)" of every coordinate pair
lat_text = df_location['latitude'].map(str)
long_text = df_location['longitude'].map(str)
df_location['lat_long'] = "(" + lat_text + ',' + long_text + ")"


In [17]:
df_location.longitude.isna().sum()

0

In [18]:
df_location['lat_long'].isna().sum()

0

In [19]:
df_location['lat_long'] = df_location['lat_long'].apply(ast.literal_eval)

In [None]:
# Pairwise great-circle distance, in miles, between all businesses.
# Column "distance_<i>" holds the distance from every row to the business at
# position i of df_location.
#
# The haversine formula is evaluated with NumPy one row at a time, so no
# per-pair Python call is needed and the cell does not depend on geopy (whose
# import fails earlier in this notebook).
EARTH_RADIUS_MILES = 6371.009 / 1.609344  # mean Earth radius, km -> miles

# lat_long holds (latitude, longitude) tuples; reshape keeps an empty frame 2-D
coords_rad = np.radians(np.array(df_location.lat_long.tolist(), dtype=float).reshape(-1, 2))
lat_rad, lon_rad = coords_rad[:, 0], coords_rad[:, 1]
cos_lat = np.cos(lat_rad)
n_places = len(coords_rad)

distances = np.empty((n_places, n_places))
for num in range(n_places):
    hav = (np.sin((lat_rad - lat_rad[num]) / 2) ** 2
           + cos_lat[num] * cos_lat * np.sin((lon_rad - lon_rad[num]) / 2) ** 2)
    # clip guards against rounding pushing the value marginally outside [0, 1]
    distances[num] = 2 * EARTH_RADIUS_MILES * np.arcsin(np.sqrt(np.clip(hav, 0.0, 1.0)))

# The matrix is symmetric, so row i doubles as column "distance_i". All columns
# are attached with a single concat instead of being inserted one at a time.
distance_cols = pd.DataFrame(distances, index=df_location.index,
                             columns=[f"distance_{num}" for num in range(n_places)])
df_test = pd.concat([df_location, distance_cols], axis=1)

# Preview only: the full frame has one column per business
df_test.head()

Cleaning Features

In [None]:
# Categories df

cat = pd.read_json('categories.json')

In [None]:
cat.head(2)

In [None]:
# Remove the spaces inside each comma-separated category string. Missing
# (non-string) entries are passed through unchanged instead of raising
# AttributeError; a later cell handles them explicitly.
df.categories = df.categories.apply(lambda x: x.replace(" ","") if isinstance(x, str) else x)

In [None]:
df.categories = df.categories.str.split(",")

In [None]:
df = df.reset_index().drop(['index'],axis = 1)

In [None]:
# For every business, look up the parent of its FIRST listed category in `cat`
# and store the result as text in df["parents"] (NaN when categories is None).
ls = []
# df.categories[i] is a label lookup, so this relies on df having a 0..n-1 index
for i in range(df.shape[0]):
    if df.categories[i] == None:
        ls.append(np.nan)
    else:
        # Rows of `cat` whose title equals the first category, rendered as the
        # Series' printed text
        x = str(cat.parents[cat.title == df.categories[i][0]])
        # Keep the text between the first "[" and the first "]" of that printout
        # NOTE(review): category names had their spaces removed in an earlier
        # cell -- confirm that multi-word titles in `cat` can still match.
        ls.append(x[x.index("[")+1:x.index("]")])
df["parents"] = ls

In [None]:
# Split the businesses into restaurants (kept in df) and everything else
is_restaurant = df.parents == 'restaurants'
df_non_restaurants = df[~is_restaurant]
df = df[is_restaurant]
df.head()

In [None]:
# Converting Attributes dictionary in dataframe

attributes_df = df.attributes.apply(pd.Series)
attributes_df = pd.concat([df, attributes_df], axis = 1)

#Dropping non-attribute columns
attributes_df = attributes_df.drop(['address', 'attributes','hours',
                                    'latitude', 'longitude', 'name', 'postal_code'], axis = 1)

In [None]:
#Cleaning Noise Level: drop the stray u prefix from labels such as u'average'

ch = {"u'average'": "'average'","u'quiet'": "'quiet'","u'loud'": "'loud'","u'very_loud'": "'very_loud'"}
# The mapping alone describes the replacement. `value=None` is deliberately not
# passed: pandas versions that honour an explicit None treat it as a replacement
# value and reject it together with a dict.
attributes_df.NoiseLevel = attributes_df.NoiseLevel.replace(ch)

In [None]:
attributes_df.RestaurantsPriceRange2.value_counts()

In [None]:
attributes_df[(attributes_df.review_count > 50) & (attributes_df.GoodForMeal.isnull() == True)].business_id.count()

In [None]:
attributes_df = attributes_df[attributes_df.GoodForMeal.isnull() == False]

In [None]:
attributes_df = attributes_df.reset_index().drop(['index'], axis = 1)

In [None]:
goodformeal_df = attributes_df.GoodForMeal.apply(ast.literal_eval)
goodformeal_df = goodformeal_df.apply(pd.Series)
goodformeal_df.head(2)

In [None]:
attributes_df = pd.concat([attributes_df, goodformeal_df], axis = 1)

In [None]:
cat_df = attributes_df.categories.apply(lambda x: dict(zip(x, [True]*len(x)))).apply(pd.Series)

In [None]:
attributes_df = pd.concat([attributes_df, cat_df], axis = 1)

In [None]:
attributes_df.shape

In [None]:
#Categories to drop : low frequency, i.e. present on at most 5 businesses

cat_drop = [col for col in cat_df.columns if cat_df[col].count() <= 5]
len(cat_drop)

In [None]:
# Remove every low-frequency category column from attributes_df in a single call

attributes_df.drop(cat_drop, axis =1, inplace=True)

In [None]:
attributes_df.shape

In [None]:
attributes_df = attributes_df[attributes_df.RestaurantsPriceRange2 != 'None']
attributes_df.drop(['categories'],axis = 1)

In [None]:
clean_df = attributes_df.drop(['categories','city','state','parents','RestaurantsTakeOut',
                   'RestaurantsDelivery','DriveThru','GoodForMeal','WiFi',
                   'Caters','RestaurantsTableService','BYOBCorkage','Corkage',
                   'BYOB','CoatCheck','Smoking','GoodForDancing','DogsAllowed',
                   'BusinessAcceptsBitcoin','ByAppointmentOnly','AgesAllowed',
                   'Open24Hours','DietaryRestrictions','RestaurantsCounterService',
                   'AcceptsInsurance','Music','BestNights', 'HappyHour'],axis = 1)

In [None]:
# Number of Ambience null values to drop
sum(clean_df.Ambience.isnull())

In [None]:
# Unpacking Ambience dictionary & dropping null values

clean_df = clean_df[clean_df.Ambience.isnull() == False]
ambience_df = clean_df.Ambience.apply(ast.literal_eval)
ambience_df = ambience_df.apply(pd.Series)
clean_df = pd.concat([clean_df, ambience_df], axis = 1)
clean_df.drop(['Ambience'],axis = 1,inplace = True)
clean_df.head(2)

In [None]:
clean_df[clean_df.BusinessParking.isnull() == True]

In [None]:
clean_df[clean_df.BusinessParking.isnull() == True]

In [None]:
# Unpacking Parking column - replacing null values by no street parking

parking_df = clean_df.BusinessParking.replace(to_replace = np.nan, value = "{'street':False}")
parking_df = parking_df.apply(ast.literal_eval)
parking_df = parking_df.apply(pd.Series)

In [None]:
# Assigning Boolean value to parking: True if garage, validated, lot, valet parking available; False otherwise.

parking_df['parking'] = parking_df['garage']|parking_df['validated']|parking_df['lot']|parking_df['valet']
parking_df.drop(['garage','street','validated','lot','valet'], axis = 1, inplace=True)
clean_df = pd.concat([clean_df, parking_df], axis = 1)
clean_df.drop(['BusinessParking'],axis = 1,inplace = True)

In [None]:
#Cleaning Restaurants Attire: strip the u prefix and fold 'formal' into 'dressy'

attire = {"u'casual'": "'casual'","u'dressy'": "'dressy'","u'formal'":"'dressy'","'formal'":"'dressy'"}
# Pass only the mapping; an explicit `value=None` alongside a dict is rejected
# by pandas versions that honour it as a real replacement value.
clean_df.RestaurantsAttire = clean_df.RestaurantsAttire.replace(attire)

In [None]:
#Cleaning Alcohol: collapse the labels to "True" (any alcohol served) / "False"

alc = {"u'none'": "False","'none'": "False","None": "False","u'full_bar'": "True","'full_bar'": "True",
      "u'beer_and_wine'": "True","'beer_and_wine'":"True"}
# Pass only the mapping; an explicit `value=None` alongside a dict is rejected
# by pandas versions that honour it as a real replacement value.
clean_df.Alcohol = clean_df.Alcohol.replace(alc)

In [None]:
clean_df.WheelchairAccessible.value_counts()

In [None]:
clean_df.head(3)

In [None]:
# Filling NA values: a missing price range defaults to 1.
# The result is assigned back to the column; fillna(inplace=True) called through
# attribute access can act on a temporary copy and leave clean_df unchanged.

clean_df['RestaurantsPriceRange2'] = clean_df['RestaurantsPriceRange2'].fillna(1)
clean_df.RestaurantsPriceRange2.isna().sum()

In [None]:
clean_df.RestaurantsAttire.value_counts()

In [None]:


# Encode boolean-like attribute values as 0/1 ('None' counts as 0)
wch = {False:0, True:1,'False':0, 'True':1,'None':0}
boolean_like_cols = ['WheelchairAccessible', 'GoodForKids', 'HasTV', 'OutdoorSeating',
                     'BikeParking', 'Alcohol', 'BusinessAcceptsCreditCards',
                     'RestaurantsReservations', 'RestaurantsGoodForGroups']
for col in boolean_like_cols:
    # Only the mapping is passed: an explicit `value=None` next to a dict is
    # rejected by pandas versions that honour it as a real replacement value.
    clean_df[col] = clean_df[col].replace(wch)

# Missing values are filled by assigning the result back to the column;
# fillna(inplace=True) through attribute access can act on a temporary copy.
clean_df['RestaurantsAttire'] = clean_df['RestaurantsAttire'].fillna("'casual'")

clean_df['NoiseLevel'] = clean_df['NoiseLevel'].replace('None', value="'average'")
clean_df['NoiseLevel'] = clean_df['NoiseLevel'].fillna("'average'")

clean_df['WheelchairAccessible'] = clean_df['WheelchairAccessible'].fillna(2)  # 2 means unknown
clean_df = clean_df.fillna(0)
# Final pass over the whole frame for boolean-like values in any remaining column
clean_df = clean_df.replace(wch)

In [None]:
# change type
clean_df.RestaurantsPriceRange2 = clean_df.RestaurantsPriceRange2.astype(int)

In [None]:
clean_df = pd.concat([clean_df,pd.get_dummies(clean_df.NoiseLevel),pd.get_dummies(clean_df.RestaurantsAttire)],
                     axis = 1)
clean_df.drop(['NoiseLevel','RestaurantsAttire'], axis = 1, inplace = True)

In [None]:
clean_df.head(2)

In [None]:
clean_df = clean_df.reset_index()

In [None]:
clean_df.drop(['index'],axis = 1, inplace= True)

### Part 2: Modeling

In [None]:
X = clean_df.drop(['business_id'], axis = 1)

In [None]:
X.shape

#### Part 2 A: Clustering

In [None]:

k_means = KMeans(n_clusters=10) # Must set number of clusters at initialization time!
k_means.fit(X)

In [None]:
preds = k_means.predict(X)

In [None]:
centers = k_means.cluster_centers_

In [None]:
pd.DataFrame(k_means.cluster_centers_, columns = X.columns)

Optimal number of clusters

In [None]:
# Fit K-Means once per candidate number of clusters
k_means_4 = KMeans(n_clusters=4).fit(X)
k_means_7 = KMeans(n_clusters=7).fit(X)
k_means_10 = KMeans(n_clusters=10).fit(X)
k_means_13 = KMeans(n_clusters=13).fit(X)
k_means_16 = KMeans(n_clusters=16).fit(X)
k_means_19 = KMeans(n_clusters=19).fit(X)
k_means_22 = KMeans(n_clusters=22).fit(X)
# Named after its real cluster count (100), which is also the last tick of the
# score plot that follows; a name ending in 25 would misdescribe this model.
k_means_100 = KMeans(n_clusters=100).fit(X)

# Fitted models in increasing order of cluster count
k_list = [k_means_4,k_means_7,k_means_10,k_means_13,k_means_16,k_means_19,k_means_22,k_means_100]

In [None]:
# Calinski-Harabasz score of every fitted model in k_list, in the same order
CH_score = [calinski_harabaz_score(X, model.labels_) for model in k_list]

In [None]:
plt.plot([4,7,10,13,16,19,22,100], CH_score)
plt.xticks([4,7,10,13,16,19,22,100])
plt.title("Calinski Harabaz Scores for Different Values of K")
plt.ylabel("Variance Ratio")
plt.xlabel("K=")
plt.show()

In [None]:
# Testing different numbers of clusters to find the best-scoring one.
# Every model is fitted exactly once (a repeated fit of the 150-cluster model
# would only discard the first result). This cell scores the 16-30 range; the
# 120-190 models are scored in a following cell.

k_means_5 = KMeans(n_clusters=5).fit(X)
k_means_16 = KMeans(n_clusters=16).fit(X)
k_means_19 = KMeans(n_clusters=19).fit(X)
k_means_22 = KMeans(n_clusters=22).fit(X)
k_means_25 = KMeans(n_clusters=25).fit(X)
k_means_30 = KMeans(n_clusters=30).fit(X)
k_means_50 = KMeans(n_clusters=50).fit(X)
k_means_100 = KMeans(n_clusters=100).fit(X)

k_means_120 = KMeans(n_clusters=120).fit(X)
k_means_130 = KMeans(n_clusters=130).fit(X)
k_means_140 = KMeans(n_clusters=140).fit(X)
k_means_150 = KMeans(n_clusters=150).fit(X)
k_means_160 = KMeans(n_clusters=160).fit(X)
k_means_170 = KMeans(n_clusters=170).fit(X)
k_means_180 = KMeans(n_clusters=180).fit(X)
k_means_190 = KMeans(n_clusters=190).fit(X)
k_means_200 = KMeans(n_clusters=200).fit(X)


k_list = [k_means_16,k_means_19,k_means_22,k_means_25,k_means_30]


# Calinski-Harabasz score of every model in k_list, in the same order
CH_score = [calinski_harabaz_score(X, model.labels_) for model in k_list]


In [None]:
plt.plot([16,19,22,25,30], CH_score)
plt.xticks([16,19,22,25,30])
plt.title("Calinski Harabaz Scores for Different Values of K")
plt.ylabel("Variance Ratio")
plt.xlabel("K=")
plt.show()

In [None]:
# Models for 120 to 190 clusters, in steps of 10
k_list2 = [k_means_120, k_means_130, k_means_140, k_means_150,
           k_means_160, k_means_170, k_means_180, k_means_190]

# Calinski-Harabasz score of every model in k_list2, in the same order
CH_score = [calinski_harabaz_score(X, model.labels_) for model in k_list2]

In [None]:
plt.plot([120,130,140,150,160,170,180,190], CH_score)
plt.xticks([120,130,140,150,160,170,180,190])
plt.title("Calinski Harabaz Scores for Different Values of K")
plt.ylabel("Variance Ratio")
plt.xlabel("K=")
plt.show()

In [None]:
k_means_130 = KMeans(n_clusters=130, init= 'random').fit_predict(X)

In [None]:
X_and_Y = pd.concat([X, pd.Series(k_means_130)],axis = 1)
X_and_Y.rename(columns={0:'pred'}, inplace=True)

In [None]:
# Same sweep with random centroid initialisation (init='random').
# Every model is fitted exactly once; a repeated fit of the 150-cluster model
# would only discard the first result.

k_means_5 = KMeans(n_clusters=5, init = 'random').fit(X)
k_means_50 = KMeans(n_clusters=50, init = 'random').fit(X)
k_means_100 = KMeans(n_clusters=100, init = 'random').fit(X)
k_means_120 = KMeans(n_clusters=120, init = 'random').fit(X)
k_means_130 = KMeans(n_clusters=130, init = 'random').fit(X)
k_means_140 = KMeans(n_clusters=140, init = 'random').fit(X)
k_means_150 = KMeans(n_clusters=150, init = 'random').fit(X)
k_means_160 = KMeans(n_clusters=160, init = 'random').fit(X)
k_means_170 = KMeans(n_clusters=170, init = 'random').fit(X)
k_means_180 = KMeans(n_clusters=180, init = 'random').fit(X)
k_means_190 = KMeans(n_clusters=190, init = 'random').fit(X)
k_means_200 = KMeans(n_clusters=200, init = 'random').fit(X)

# Models for 120 to 190 clusters, in steps of 10
k_list2 = [k_means_120, k_means_130, k_means_140, k_means_150,
           k_means_160, k_means_170, k_means_180, k_means_190]

# Calinski-Harabasz score of every model in k_list2, in the same order
CH_score = [calinski_harabaz_score(X, model.labels_) for model in k_list2]

Calinski-Harabasz score: higher values of the variance ratio criterion (VRC) indicate a more appropriate number of clusters. The score accounts for both the separation between different clusters and the closeness of the data within each cluster (Caliński and Harabasz, 1974).

In [None]:
plt.plot([120,130,140,150,160,170,180,190], CH_score)
plt.xticks([120,130,140,150,160,170,180,190])
plt.title("Calinski Harabaz Scores for Different Values of K")
plt.ylabel("Variance Ratio")
plt.xlabel("K=")
plt.show()

Agglomerative clustering

In [None]:
def agg_cluster(X, n_list):
    """Fit Ward-linkage agglomerative clustering on X once per cluster count.

    Parameters
    ----------
    X : array-like or DataFrame
        Numeric feature matrix, one row per sample.
    n_list : iterable of int
        Numbers of clusters to try.

    Returns
    -------
    klist : list
        The fitted AgglomerativeClustering estimators, one per entry of n_list.
    preds_list : list
        The cluster label array of each estimator, aligned with klist.
    """
    klist = []
    preds_list = []
    for n in n_list:
        # Ward linkage only works with Euclidean distance, which is the default,
        # so the metric is not passed explicitly (the `affinity` keyword is not
        # accepted by newer scikit-learn releases).
        agg = AgglomerativeClustering(n_clusters = n, linkage = 'ward')
        # A single fit yields both the estimator and its labels
        agg.fit(X)
        klist.append(agg)
        preds_list.append(agg.labels_)
    return klist, preds_list

In [None]:
k_list, agg_preds= agg_cluster(X, [5,10,50,100])

In [None]:
def calinski_plot(X, n_list):
    """Plot the Calinski-Harabasz score of agglomerative clusterings of X.

    One clustering is fitted per cluster count in n_list (through agg_cluster)
    and its score is drawn against that count. Nothing is returned.

    NOTE: a later cell defines another function under this same name; once that
    cell has run, this version is no longer reachable.
    """
    fitted_models, _ = agg_cluster(X, n_list)
    CH_score = [calinski_harabaz_score(X, model.labels_) for model in fitted_models]

    plt.plot(n_list, CH_score)
    plt.xticks(n_list)
    plt.title("Calinski Harabaz Scores for Different Values of K")
    plt.ylabel("Variance Ratio")
    plt.xlabel("K=")
    plt.show()

In [None]:
calinski_plot(X, [120,130, 140, 150])

In [None]:
_, preds = agg_cluster(X, [140])

In [None]:
def concat_preds(X, preds):
    """Return a copy of X with the first label array of `preds` as column 'pred'.

    X : DataFrame of features.
    preds : sequence whose first element holds one cluster label per row of X.

    The labels are aligned with X on a 0..n-1 index. X itself is not modified.
    """
    labels = pd.Series(preds[0])
    combined = pd.concat([X, labels], axis = 1)
    # The unnamed Series arrives as column 0; give it a readable name
    return combined.rename(columns={0:'pred'})

In [None]:
X_and_Y = concat_preds(X, preds)

In [None]:
X_and_Y.head()

In [None]:
#Gaussian Mixture 

gmm = GaussianMixture(n_components=100)
gmm.fit(X)

In [None]:
gmm_preds = gmm.predict(X)

In [None]:
gmm_preds

In [None]:
def calinski_plot(X, n_list):
    """Plot the Calinski-Harabasz score of Gaussian-mixture clusterings of X.

    For every component count in n_list a GaussianMixture is fitted on X and the
    score of its predicted labels is drawn against that count. Nothing is returned.

    NOTE: this reuses the name of the agglomerative-clustering plot helper from an
    earlier cell and replaces it once this cell has run.
    """
    CH_score = [
        calinski_harabaz_score(X, GaussianMixture(n_components = n).fit_predict(X))
        for n in n_list
    ]

    plt.plot(n_list, CH_score)
    plt.xticks(n_list)
    plt.title("Calinski Harabaz Scores for Different Values of Components")
    plt.ylabel("Variance Ratio")
    plt.xlabel("Number of components=")
    plt.show()

In [None]:
calinski_plot(X, [50,100,120,130,140,150,200])

In [None]:
# An empty string is not a valid metric name; Euclidean is the library default
silhouette_score(X, gmm_preds, metric= 'euclidean')

In [None]:
# Pairwise cosine DISTANCES between the rows of X: despite the variable name,
# smaller values mean more similar rows (0 = same direction).
# Columns are labelled with the first column of clean_df; rows keep the default
# positional index.
similarity_matrix = pd.DataFrame(pairwise_distances(X, metric='cosine'), columns=clean_df.iloc[:,0])
similarity_matrix.head(2)

In [None]:
similarity_matrix.shape

In [None]:
df_similarity = pd.concat([clean_df.iloc[:,0],similarity_matrix], axis=1)

https://datascience.stackexchange.com/questions/8681/clustering-for-mixed-numeric-and-nominal-discrete-data

In [None]:
similarity_matrix.iloc[0].sort_values()

In [None]:
X_and_Y[X_and_Y.pred == 47.0]

In [None]:
sorted(list(similarity_matrix.iloc[0]))[1:11]

In [None]:
def gower_distance(X):
    """Gower-style pairwise distance matrix for a DataFrame with mixed column types.

    Every column contributes one n x n distance matrix:
    - object-dtype columns: Dice dissimilarity on their one-hot encoding;
    - all other columns: absolute difference divided by the column's range.
    The result is the unweighted mean of those per-column matrices.

    Parameters
    ----------
    X : pandas.DataFrame, one row per sample.

    Returns
    -------
    numpy.ndarray of shape (n_rows, n_rows).
    """
    individual_variable_distances = []
    for i in range(X.shape[1]):
        feature = X.iloc[:,[i]]
        # Compare against the builtin `object` (the np.object alias no longer
        # exists in recent NumPy) and read the dtype by position.
        if feature.dtypes.iloc[0] == object:
            feature_dist = DistanceMetric.get_metric('dice').pairwise(pd.get_dummies(feature))
        else:
            feature_dist = DistanceMetric.get_metric('manhattan').pairwise(feature)
            value_range = np.ptp(feature.values)
            # A constant column has range 0 and all-zero distances; skip the
            # division so it contributes 0 rather than NaN (0 / 0).
            if value_range != 0:
                feature_dist = feature_dist / value_range

        individual_variable_distances.append(feature_dist)

    return np.array(individual_variable_distances).mean(0)

In [None]:
!pip install kmodes

In [None]:
from kmodes.kprototypes import KPrototypes
from kmodes.kmodes import KModes

In [None]:
kmode = KModes(n_clusters = 100, verbose = 2)
clustered_modes = kmode.fit_predict(X.iloc[:,1:])

In [None]:
clustered_modes

### K-means & K-Mode combination -> K-Prototype algorithm



> The k-means algorithm is well known for its efficiency in clustering large data sets. However, working only on numeric values prohibits it from being used to cluster real world data containing categorical values.

In this paper we present two algorithms which extend the k-means algorithm to categorical domains and domains with mixed numeric and categorical values.

The k-modes algorithm uses a simple matching dissimilarity measure to deal with categorical objects, replaces the means of clusters with modes, and uses a frequency-based method to update modes in the clustering process to minimise the clustering cost function.

With these extensions, the k-modes algorithm enables the clustering of categorical data in a fashion similar to k-means.

The k-prototypes algorithm, through the definition of a combined dissimilarity measure, further integrates the k-means and k-modes algorithms to allow for clustering objects described by mixed numeric and categorical attributes. 

In [None]:
# Cluster the mixed numeric/categorical features with k-prototypes (2 clusters).
# Column positions 2..164 of X are declared categorical, which leaves positions
# 0 and 1 to be treated as numeric.
# NOTE(review): the upper bound 165 is hard-coded -- confirm it matches X.shape[1].
kproto = KPrototypes(n_clusters=2, init = 'cao', verbose=2)
clusters = kproto.fit_predict(X, categorical= list(range(2,165)))

In [None]:
df_test.name = df_test.name.str.lower()

In [None]:
df_test[["name", "city", "state"]] = df_test[["name", "city", "state"]].apply(lambda x: x.str.lower())

In [None]:
df_test.drop(['index'], axis=1, inplace=True)

In [None]:
df_test.iloc[:,8:].shape

In [None]:
# Distance columns only (everything after the first 8 columns of df_test),
# relabelled with the business id each column refers to.
# .copy() gives df_distance its own data, so the assignments below neither raise
# SettingWithCopyWarning nor risk writing through to df_test.
df_distance = df_test.iloc[:,8:].copy()
df_distance.columns = df_test.business_id
df_distance.columns.name = None

# Row identifier, so the matrix can be merged on 'business_id'
df_distance['business_id'] = df_test.business_id

In [None]:
df_distance.head()

In [None]:
df_final = df_similarity.merge(df_test[['name', 'business_id', 'city', 'state']], on='business_id')

In [None]:
df_final.head()

In [None]:
def best_restaurant(name, city=None, num = 10):
    """Recommend the restaurants most similar to a given restaurant.

    Parameters
    ----------
    name : str
        Restaurant name exactly as stored in df_final['name']. If several
        businesses share the name, the first match is used.
    city : str, optional
        When given, only restaurants in this city are returned. It is
        lower-cased before being compared with df_final['city'].
    num : int
        Maximum number of recommendations to return.

    Returns
    -------
    DataFrame with at most `num` rows, sorted by ascending feature distance
    (column "<business_id>_x"); column "<business_id>_y" is the matching column
    taken from df_distance.

    Raises
    ------
    IndexError
        If no business in df_final has the given name.
    """
    matches = df_final.loc[df_final['name'] == name, 'business_id'].tolist()
    if not matches:
        raise IndexError(f"no restaurant named {name!r} in df_final")
    biz_id = matches[0]
    rec_df = df_final[[biz_id, 'name', 'city', 'state', 'business_id']].merge(df_distance[['business_id', biz_id]], on='business_id')

    # Exclude the query restaurant by id. Skipping "the first sorted row" would
    # drop a genuine recommendation whenever the query is not in the filtered
    # city or does not happen to sort first.
    rec_df = rec_df[rec_df['business_id'] != biz_id]
    if city is not None:
        rec_df = rec_df[rec_df['city']==city.lower()]
    return rec_df.sort_values(by=f"{biz_id}_x", ascending=True).head(num)
        

In [None]:
best_restaurant('marathon diner', 'henderson')

In [None]:
a = best_restaurant("maria's mexican restaurant & bakery")

In [None]:
def find_closest(name, max_dist = None):
    """List businesses ordered by distance from the named restaurant.

    Parameters
    ----------
    name : str
        Restaurant name exactly as stored in df_final['name']. If several
        businesses share the name, the first match is used.
    max_dist : number, optional
        When given, only rows whose distance is at most this value are kept.
        The default (None) returns every row.

    Returns
    -------
    DataFrame sorted by ascending distance. The last column,
    "<business_id>_y", is the distance taken from df_distance;
    "<business_id>_x" is the matching column of df_final.

    Raises
    ------
    IndexError
        If no business in df_final has the given name.
    """
    matches = df_final.loc[df_final['name'] == name, 'business_id'].tolist()
    if not matches:
        raise IndexError(f"no restaurant named {name!r} in df_final")
    biz_id = matches[0]
    distance_col = f"{biz_id}_y"
    rec_df = df_final[[biz_id, 'name', 'city', 'state', 'business_id']].merge(df_distance[['business_id', biz_id]], on='business_id')
    rec_df = rec_df.sort_values(by=distance_col, ascending=True)
    if max_dist is not None:
        rec_df = rec_df[rec_df[distance_col] <= max_dist]
    return rec_df
    

In [None]:
def find_dessert(name, max_dist = 5, num = 10 ):
    """Dessert places located within `max_dist` of the named restaurant.

    name : restaurant name, forwarded to find_closest.
    max_dist : largest allowed value in the distance column (the last column
        returned by find_closest).
    num : accepted but currently not used by this function.

    Returns the inner join, on 'business_id', of X_dessert with the nearby rows.
    """
    nearby = find_closest(name)
    distance_col = nearby.columns[-1]
    within_range = nearby[nearby[distance_col] <= max_dist]
    return X_dessert.merge(within_range, how = 'inner', on = 'business_id')

In [None]:
find_dessert("maria's mexican restaurant & bakery", max_dist = 5)

In [None]:
a = find_closest("maria's mexican restaurant & bakery")
a[a[a.columns[-1]] <=5]

In [None]:
ex.sort_values(by = ['has_dessert',ex.columns[-1]], ascending = [False, True])

In [None]:
find_closest("maria's mexican restaurant & bakery", max_dist = 5)

In [None]:
def best_dessert(

In [None]:
# Businesses with at least one dessert-related flag; has_dessert is the sum of
# those flag columns.
dessert_cols = ['dessert', 'Desserts', 'BubbleTea', 'Cafes', 'Creperies',
                'IceCream&FrozenYogurt', 'Waffles']
X_dessert = clean_df.copy()
dessert_total = X_dessert[dessert_cols[0]]
for flag in dessert_cols[1:]:
    dessert_total = dessert_total + X_dessert[flag]
X_dessert['has_dessert'] = dessert_total
X_dessert = X_dessert[X_dessert['has_dessert'] != 0]
X_dessert.shape


In [None]:
X_bar = clean_df.copy()

In [None]:
for col in X_bar.columns:
    print(col)

In [None]:
# Columns that indicate a bar / nightlife venue
bar_indice = ['Alcohol', 'latenight', 'Bars', 'Nightlife', 'CocktailBars', 'WineBars','Venues&EventSpaces',
             'DiveBars','Gastropubs','Breweries','Brasseries','BeerBar','Lounges','Karaoke','MusicVenues',
             'divey','loud','very_loud']
# Sum only the bar indicators. Summing entire rows would also include
# non-numeric columns such as business_id and fail on the first string.
# NOTE(review): names missing from X_bar are skipped -- confirm every intended
# column is actually present under the exact name listed above.
present_bar_cols = [col for col in bar_indice if col in X_bar.columns]
X_bar['has_bar'] = X_bar[present_bar_cols].astype(float).sum(axis = 1)


In [None]:
X_bar = X_bar[X_bar['has_bar'] == 1]
X_bar.shape


In [None]:
from mpl_toolkits.mplot3d import Axes3D 

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
xs = X_dessert.stars
ys = X_dessert.has_dessert
zs = X_dessert.review_count
ax.scatter(xs, ys, zs, )

# Collaborative Filtering

In [10]:
user_to_user = pd.read_csv("DataToRun.csv")

In [176]:
user_to_user.count()

Unnamed: 0                          1800
# 1 Nails                              2
#YelpQueens Autumn’s Up                1
#YelpsThrowBackThursday                1
#getfried fry cafe                     1
$10 DRESS                              1
$2 Shoes                               1
$99 drain rooter                       1
& Waffles                              2
&pizza                                 2
-8℃ Ice Cream                          1
1 800 Flowers                          1
1 Brother’s Pizza                      1
1 Hawaiian Barbecue                    1
1 Way Installation                     1
1-800-Flowers                          1
1-800-GOT-JUNK? LA County East         1
1-800-GOT-JUNK? Las Vegas              1
10 Barrel Brewing - Denver             1
10 Barrel Brewing - Portland           1
10 Body Type Acupuncture Clinic        1
10 Speed Coffee-Calabasas              1
100 Degree Hot Pot                     2
100 North Kitchen And Lounge           1
100 Sails Restau

In [264]:
# Keep only the columns that have more than 2 non-null entries
non_null_counts = user_to_user.count()
y = non_null_counts[non_null_counts > 2].index.tolist()

user_df = user_to_user[y]

In [265]:
user_df = user_df.rename(index=str, columns={"Unnamed: 0": "User_ID"})

In [266]:
user_df.head()

Unnamed: 0,User_ID,100 Sails Restaurant & Bar,10e,16 A Handcrafted Experience,168 Market,4ur Nails & Spa,595 Craft And Kitchen,7 Leaves Cafe,7-Eleven,702 Nail Lounge,...,Zion Canyon Brewpub,Zion National Park,bin 702,friend_count,review_count,ssooniestyle,uBreakiFix Summerlin,what’s Crepe,zuma,é by José Andrés
0,9j2EDEvHL6m6vzITBUlvvA,,,,,,,,,,...,,,,56,52.0,,,,,
1,X2Cf71Ab7EM9Yz4qJnAfug,,,,,,,,,,...,,,,1432,166.0,,,,,
2,QluW09sYdJb4NBKutSeh1Q,,,,,,,,,,...,,,,9,206.0,,,,,
3,Z9uMICXDQeEX4RRM6xE75A,,,,,,,,,,...,,,,5,26.0,,,,,
4,xtZ5JHQARr2fF9yEhX_21g,,,,,,,,,,...,,,,6,10.0,,,,,


In [267]:
X = user_df.drop(["review_count", "friend_count"], axis=1)

In [268]:
X = X.set_index("User_ID")


In [269]:
X.head()

Unnamed: 0_level_0,100 Sails Restaurant & Bar,10e,16 A Handcrafted Experience,168 Market,4ur Nails & Spa,595 Craft And Kitchen,7 Leaves Cafe,7-Eleven,702 Nail Lounge,777 Towing,...,Zero Degrees,Zest - Bistro & Bar,Zion Canyon Brewpub,Zion National Park,bin 702,ssooniestyle,uBreakiFix Summerlin,what’s Crepe,zuma,é by José Andrés
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9j2EDEvHL6m6vzITBUlvvA,,,,,,,,,,,...,,,,,,,,,,
X2Cf71Ab7EM9Yz4qJnAfug,,,,,,,,,,,...,,,,,,,,,,
QluW09sYdJb4NBKutSeh1Q,,,,,,,,,,,...,,,,,,,,,,
Z9uMICXDQeEX4RRM6xE75A,,,,,,,,,,,...,,,,,,,,,,
xtZ5JHQARr2fF9yEhX_21g,,,,,,,,,,,...,,,,,,,,,,


In [270]:
A = X.copy()
A = A.fillna(0)
A.head()

Unnamed: 0_level_0,100 Sails Restaurant & Bar,10e,16 A Handcrafted Experience,168 Market,4ur Nails & Spa,595 Craft And Kitchen,7 Leaves Cafe,7-Eleven,702 Nail Lounge,777 Towing,...,Zero Degrees,Zest - Bistro & Bar,Zion Canyon Brewpub,Zion National Park,bin 702,ssooniestyle,uBreakiFix Summerlin,what’s Crepe,zuma,é by José Andrés
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9j2EDEvHL6m6vzITBUlvvA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
X2Cf71Ab7EM9Yz4qJnAfug,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
QluW09sYdJb4NBKutSeh1Q,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Z9uMICXDQeEX4RRM6xE75A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xtZ5JHQARr2fF9yEhX_21g,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [274]:
B.shape

(1508, 2541)

In [273]:
# Keep the users (rows of A) whose ratings add up to more than 9.
# One row-wise sum replaces transposing the whole frame once per user.
active_mask = A.sum(axis=1) > 9
# NOTE: despite its name, `drop` lists the user ids that are KEPT
drop = A.index[active_mask].tolist()

B = A[active_mask]

In [275]:
UID = pd.DataFrame(B.index)

In [276]:
UID

Unnamed: 0,User_ID
0,9j2EDEvHL6m6vzITBUlvvA
1,X2Cf71Ab7EM9Yz4qJnAfug
2,QluW09sYdJb4NBKutSeh1Q
3,xtZ5JHQARr2fF9yEhX_21g
4,uaFOvyGPRvSETSS6Lk8-5A
5,pa8zvcFxnj6jFtywlxKf_g
6,LH5TWnLrS5-V3_ORAV2ImA
7,ZJYqaNXVTayu8oFIGC9D5Q
8,AQaDEFPvOpT9gYRb2MaR_g
9,AXN85O1z1qSpuBGYsr3b0A


In [277]:
# Centre every row of B on its own mean (the mean is taken over ALL columns)
B_vals = B.values
ratings_mean = B_vals.mean(axis=1)
B_norm = B_vals - ratings_mean[:, np.newaxis]
B_norm[:20]

array([[-0.03423849, -0.03423849, -0.03423849, ..., -0.03423849,
        -0.03423849, -0.03423849],
       [-0.04919323, -0.04919323, -0.04919323, ..., -0.04919323,
        -0.04919323, -0.04919323],
       [-0.01338056, -0.01338056, -0.01338056, ..., -0.01338056,
        -0.01338056, -0.01338056],
       ...,
       [-0.03896104, -0.03896104, -0.03896104, ..., -0.03896104,
        -0.03896104, -0.03896104],
       [-0.01338056, -0.01338056, -0.01338056, ..., -0.01338056,
        -0.01338056, -0.01338056],
       [-0.01810311, -0.01810311, -0.01810311, ..., -0.01810311,
        -0.01810311, -0.01810311]])

# Matrix Factorization With SVD

In [278]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(B_norm, k = 50)

In [320]:
def recommender_preds(A, k):
    """Rank-k SVD reconstruction of a row-centred ratings table.

    Parameters
    ----------
    A : pandas.DataFrame
        Numeric ratings table; each row is centred on its own mean before the
        decomposition and the mean is added back afterwards.
    k : int
        Number of singular values/vectors computed by svds.

    Returns
    -------
    sigma : numpy.ndarray of shape (k, k)
        Diagonal matrix of the computed singular values.
    predictions_df : pandas.DataFrame
        Reconstructed ratings with A's columns and a fresh 0..n-1 row index.
    """
    ratings = A.values
    row_means = np.mean(ratings, axis=1).reshape(-1, 1)
    U, singular_values, Vt = svds(ratings - row_means, k = k)
    sigma = np.diag(singular_values)
    predictions = (U @ sigma) @ Vt + row_means
    return sigma, pd.DataFrame(predictions, columns = A.columns)

In [321]:
sigma, predictions_df = recommender_preds(B, 50)

In [322]:
sigma, sigma.shape

(array([[30.37615158,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        , 30.40894938,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        , 30.59705129, ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 0.        ,  0.        ,  0.        , ..., 42.37823552,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         47.31045124,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        , 56.11920173]]), (50, 50))

In [323]:
predictions_df.head()

Unnamed: 0,100 Sails Restaurant & Bar,10e,16 A Handcrafted Experience,168 Market,4ur Nails & Spa,595 Craft And Kitchen,7 Leaves Cafe,7-Eleven,702 Nail Lounge,777 Towing,...,Zero Degrees,Zest - Bistro & Bar,Zion Canyon Brewpub,Zion National Park,bin 702,ssooniestyle,uBreakiFix Summerlin,what’s Crepe,zuma,é by José Andrés
0,0.033631,0.027246,0.046246,0.048525,0.038238,0.045162,-0.07881,-0.024603,0.03999,0.036469,...,0.034634,0.199557,0.051155,0.043602,0.02683,0.055561,0.061538,0.017996,-0.139447,0.011995
1,0.000837,-0.03769,0.067168,-0.03693,0.047474,0.058762,-0.381835,-0.18254,0.101034,-0.003425,...,0.157523,0.420378,-0.021162,0.097478,-0.024293,0.049386,-0.100822,-0.070572,0.037218,-0.021835
2,0.000202,-0.004596,-0.009867,0.009757,-0.027041,-0.025636,-0.06934,0.019538,0.021049,-0.013472,...,-0.019358,-0.036709,-0.021479,-0.019928,-0.049892,-0.032679,-0.030494,0.043319,0.035417,-0.01474
3,-0.000351,0.004521,-0.008957,0.027085,0.005978,-0.028247,-0.033224,0.03517,0.001485,0.004324,...,0.007673,0.00741,0.00522,-0.005931,0.00888,-0.002428,-0.002167,0.044579,0.032046,-0.000786
4,0.024176,-0.008589,0.053265,0.010459,-0.005673,-0.042172,-0.030244,-0.016981,-0.004384,-0.010093,...,0.011139,-0.059035,-0.000763,-0.008274,0.012502,0.002469,-0.013268,0.121863,-0.057847,-0.017959


In [324]:
user_row_id = UID.iloc[1].values[0]
user_row_id

'X2Cf71Ab7EM9Yz4qJnAfug'

In [325]:
test = pd.DataFrame(X.loc['X2Cf71Ab7EM9Yz4qJnAfug'])
test = test.merge(pd.DataFrame(predictions_df.iloc[0]), on=test.index)


In [326]:
test.head()

Unnamed: 0,key_0,X2Cf71Ab7EM9Yz4qJnAfug,0
0,100 Sails Restaurant & Bar,,0.033631
1,10e,,0.027246
2,16 A Handcrafted Experience,,0.046246
3,168 Market,,0.048525
4,4ur Nails & Spa,,0.038238


In [327]:
# Recommending top restaurant not yet rated by user
def recommender(predictions_df, user, original_ratings_df, num_recommendations=5):
    """Split one user's locations into already-rated ones and top recommendations.

    Parameters
    ----------
    predictions_df : DataFrame
        Predicted ratings, one row per user in the same order as the global UID
        frame, one column per location.
    user : int
        1-based position of the user in UID / predictions_df.
    original_ratings_df : DataFrame
        Original ratings indexed by user id, with NaN where the user has not
        rated a location. Its columns must be in the same order as those of
        predictions_df, because the two are joined by position.
    num_recommendations : int
        Maximum number of recommendations to return.

    Returns
    -------
    rated : DataFrame
        Locations the user already rated, highest rating first.
    recommendations : DataFrame
        Unrated locations with the highest predicted rating.
    Both frames have the columns "Locations", "User Rating", "Recommendation".

    Raises
    ------
    IndexError
        If `user` is smaller than 1 or beyond the number of users.
    """
    # Position 0 would silently wrap around to the last user
    if user < 1:
        raise IndexError("user is a 1-based position and must be >= 1")

    # Id of the user at this position
    user_row_id = UID.iloc[user -1].values[0]

    # Original ratings of the user next to the predicted ratings, row by row
    user_data = pd.DataFrame(original_ratings_df.loc[user_row_id])
    user_data = user_data.merge(pd.DataFrame(predictions_df.iloc[user - 1]), on=user_data.index)

    # Resolve the prediction column once, before any renaming, so that both
    # result frames get identical headers.
    prediction_col = user_data.columns[-1]
    column_labels = {"key_0": "Locations",
                     f"{user_row_id}" : "User Rating",
                     prediction_col : "Recommendation"}

    not_rated = user_data[user_row_id].isnull()
    # Highest predicted ratings among locations the user has not rated yet
    recommendations = (user_data[not_rated]
                       .sort_values(by=prediction_col, ascending =False)[:num_recommendations]
                       .rename(columns=column_labels))
    rated = (user_data[~not_rated]
             .sort_values(by=user_row_id, ascending =False)
             .rename(columns=column_labels))

    # Print user information
    print ('User {0} has already rated {1} places.'.format(user_row_id, rated.shape[0]))
    print ('Recommending highest {0} predicted ratings locations not already rated.'.format(num_recommendations))

    return rated, recommendations

In [343]:
# Run the recommender for the user at 1-based position 1 (first row of UID),
# using the original predictions and the ratings frame X.
rated, recommendations = recommender(predictions_df, 1, X)

User 9j2EDEvHL6m6vzITBUlvvA has already rated 20 places.
Recommending highest 5 predicted ratings locations not already rated.


In [344]:
# Print the already-rated table, a blank-line separator, then the recommendations table.
print(rated, "\n\n", recommendations)

                                  Locations  User Rating         0
193                           Big Ern’s BBQ          5.0  0.132122
1357                            Mom’s Diner          5.0  0.165800
2181                        The Great Greek          5.0  0.645043
2030                        Sweet Addiction          5.0  0.179131
1772                        Ronald’s Donuts          5.0  0.244845
1741                          Rice n Noodle          5.0  0.209090
1591                          Phat Phrank’s          5.0  0.195134
1563                        Park on Fremont          5.0  0.378265
377               Casa Don Juan  - Downtown          5.0  0.217256
1364         Montana Meat Company - Durango          5.0  0.276946
1343                     Mint Indian Bistro          5.0  0.222975
1336                              Milkywave          5.0  0.235832
1111  La Belle Terre French Bakery And Café          5.0  0.092960
1064                           Kids Kingdom          5.0  0.18

# Using a different K value

In [336]:
# Recompute the predictions with 100 as the second argument of recommender_preds
# (defined earlier in the notebook; presumably the K referred to in the heading above).
sigma2, preds2 = recommender_preds(B, 100)

In [353]:
# Run the recommender on the alternative predictions (preds2) for the user
# at 1-based position 22.
rated, recommendations = recommender(preds2, 22, X)

User q8DpClBORerI6CGWIOOfAw has already rated 10 places.
Recommending highest 5 predicted ratings locations not already rated.


In [354]:
# Print the already-rated table, a blank-line separator, then the recommendations table.
print(rated, "\n\n", recommendations)

                                    Locations  User Rating        21
641      Ellis Island Hotel, Casino & Brewery          5.0  0.223703
1043                           Joël  Robuchon          5.0  0.265342
2394               Vic & Anthony’s Steakhouse          5.0  1.746757
163                                  Bay Poke          4.0  0.190619
560                              Din Tai Fung          4.0  1.698017
1026  Joe Vicari’s Andiamo Italian Steakhouse          4.0  0.706244
1211                   Lou Malnati’s Pizzeria          4.0  0.166423
1598                         Pho Lantern Cafe          4.0  0.177451
2162                       The D Casino Hotel          4.0  0.208982
1586                                    Perch          3.0  0.086713 

                       Locations  User Rating  Recommendation
169              Beauty & Essex          NaN        0.543711
808   Gordon Ramsay Pub & Grill          NaN        0.538382
1607          Pieology Pizzeria          NaN        0.4