In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)


import os
from modules.dataImporter import yelp_import

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"

# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id="data"></a>
# Data

We are using subsets of each table since we have a large dataset to work with. For this notebook, we used _business_ and _review_ tables.

In [2]:
# Load the "small" Yelp sample. Each table is consumed by iteration in the
# next cell (presumably a chunked JSON reader — confirm in modules.dataImporter).
datasets = yelp_import("small")

businesses, reviews = datasets["businesses"], datasets["reviews"]

KeyboardInterrupt: 

In [None]:
# Keep only the first chunk of each table: the dataset is large, so the rest
# of the notebook works on these subsets.
# next() replaces the loop-and-break idiom and fails at this cell with a clear
# message when a reader yields nothing, instead of leaving the subset
# undefined for later cells.
try:
    subset_business = next(iter(businesses))
    subset_review = next(iter(reviews))
except StopIteration:
    raise ValueError(
        "the 'businesses' or 'reviews' reader yielded no data"
    ) from None

In [None]:
# peek at the first two rows of each table
for table in (subset_business, subset_review):
    display(table.head(2))

In [None]:
# (rows, columns) of each subset, business table first
print(subset_business.shape, subset_review.shape, sep="\n")

In [None]:
# No New York here — maybe it is stored under another naming convention?
city_has_new = subset_business["city"].str.contains("New")
subset_business.loc[city_has_new, "city"].value_counts()

In [None]:
# No San Diego or San Francisco — maybe stored under another naming convention?
city_has_san = subset_business["city"].str.contains("San")
subset_business.loc[city_has_san, "city"].value_counts()

In [None]:
# No Paris — maybe stored under another naming convention?
city_has_paris = subset_business["city"].str.contains("Paris")
subset_business.loc[city_has_paris, "city"].value_counts()

In [None]:
# Businesses per city, most frequent first. value_counts() already returns the
# counts in descending order, so the extra sort_values() was redundant.
subset_business["city"].value_counts()

<a id="preprocessing-data"></a>
# Preprocessing the Data

We chose Philadelphia since it has the highest number of restaurants. Restaurants are the most popular category among businesses.

In [None]:
# Currently open businesses whose city contains "Philadelphia"
in_philadelphia = subset_business["city"].str.contains("Philadelphia")
currently_open = subset_business["is_open"] == 1
city = subset_business[in_philadelphia & currently_open]

# Narrow the table to the columns of interest
philadelphia_columns = [
    "business_id",
    "name",
    "address",
    "categories",
    "attributes",
    "stars",
]
Philadelphia = city[philadelphia_columns]
Philadelphia

In [None]:
# Keep only the Philadelphia businesses whose categories mention "Restaurant".
# na=False treats businesses without categories as non-matches (this replaces
# the `== True` comparison), and drop=True discards the old index instead of
# leaving it behind as a stray "index" column in `rest`.
is_restaurant = Philadelphia["categories"].str.contains("Restaurant", na=False)
rest = Philadelphia[is_restaurant].reset_index(drop=True)
rest

<a id="get-dummies"></a>
* **Get dummies from the attributes and categories columns**

> The "attributes" column contains nested attributes. To create a feature table, we need to separate those nested attributes into their own columns. The following functions are used to achieve this.

In [None]:
def extract_keys(attr, key):
    """Remove ``key`` from an attributes dict and return its value.

    Parameters
    ----------
    attr : dict or None
        Attributes of one business. It is modified in place: ``key`` is
        popped so that it no longer appears among the flat attributes.
    key : str
        Name of the nested attribute group to pull out.

    Returns
    -------
    object
        The value stored under ``key``; the string ``"{}"`` when ``attr`` is
        missing (``None``, NaN, or anything that is not a dict); ``None`` when
        ``attr`` is a dict that does not contain ``key``.
    """
    # Missing attributes can surface as None or as a float NaN; neither
    # supports `in` / `pop`, so treat every non-dict as "no attributes".
    if not isinstance(attr, dict):
        return "{}"
    if key in attr:
        return attr.pop(key)
    return None


# convert string to dictionary
import ast


def str_to_dict(attr):
    """Parse the string form of a dict literal into a dict.

    Parameters
    ----------
    attr : str, dict or None
        Text such as ``"{'garage': False}"``. A dict is returned unchanged;
        ``None`` or any other non-string value counts as "no data".

    Returns
    -------
    dict
        The parsed dictionary, or an empty dict when ``attr`` is missing or
        its literal does not evaluate to a dict (for example ``"None"``).

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when ``attr`` is a string that is
        not a valid Python literal.
    """
    if isinstance(attr, dict):
        return attr
    if not isinstance(attr, str):
        return {}
    parsed = ast.literal_eval(attr)
    # A literal such as "None" parses successfully but is not a mapping.
    return parsed if isinstance(parsed, dict) else {}

In [None]:
# Pull each nested attribute group out of "attributes" into its own column of
# dicts. NOTE: extract_keys pops the key, so this cell mutates the dicts held
# in rest["attributes"]; re-running it without rebuilding `rest` finds the
# keys already removed.
nested_attributes = ["BusinessParking", "Ambience", "GoodForMeal", "Dietary", "Music"]
for nested_key in nested_attributes:
    rest[nested_key] = rest.apply(
        lambda row: str_to_dict(extract_keys(row["attributes"], nested_key)),
        axis=1,
    )

In [None]:
# inspect `rest` after the nested attribute groups were split into their own columns
rest

In [None]:
# Expand "attributes" and the nested attribute dicts into one column per key,
# place the expansions side by side, then one-hot encode the combined table.
# NOTE(review): "Music" is extracted earlier but not expanded here — confirm
# that the omission is intentional.
expanded_columns = ["attributes", "BusinessParking", "Ambience", "GoodForMeal", "Dietary"]
df_attr = pd.concat(
    [rest[column].apply(pd.Series) for column in expanded_columns],
    axis=1,
)
df_attr_dummies = pd.get_dummies(df_attr)
df_attr_dummies

In [None]:
# One indicator column per category.
# Splitting on a bare "," keeps any space that follows the comma, so the same
# category could end up in two columns ("Restaurants" and " Restaurants").
# Normalise the separator first so each category maps to exactly one column.
# (assumes "categories" holds comma-separated text — TODO confirm against the data)
df_categories_dummies = (
    rest["categories"]
    .str.replace(r"\s*,\s*", ",", regex=True)
    .str.strip()
    .str.get_dummies(sep=",")
)
df_categories_dummies

In [None]:
# restaurant names and star ratings, to be appended after the dummy columns
result = rest.loc[:, ["name", "stars"]]
result

In [None]:
# Place attribute dummies, category dummies and name/stars side by side, then
# remove the "Restaurants" column.
combined_tables = [df_attr_dummies, df_categories_dummies, result]
df_final = pd.concat(combined_tables, axis=1).drop(columns="Restaurants")

In [None]:
# Convert half-star ratings to whole stars, rounding halves up
# (1.0 -> 1, 1.5 -> 2, 2.0 -> 2, ..., 4.5 -> 5, 5.0 -> 5).
mapper = {half_stars / 2: (half_stars + 1) // 2 for half_stars in range(2, 11)}
df_final["stars"] = df_final["stars"].map(mapper)

In [None]:
# Final table for the models: the dummy feature columns followed by "name" and "stars"
df_final

## Check how many attributes (tags) the dataset has for restaurants for our recommendation algorithms

In [None]:
# Raise pandas' display limits so the many attribute (tag) columns are shown,
# then preview the table used by the recommendation algorithms.
display_limits = {
    "display.max_rows": 100,
    "display.max_columns": 1000,
    "display.width": 1000,
}
for option_name, option_value in display_limits.items():
    pd.set_option(option_name, option_value)
df_final.head()

In [None]:
# Column totals for every attribute/tag, largest first (top 100)
tag_columns = df_final.drop(columns=["name", "stars"])
tag_columns.sum().sort_values(ascending=False).head(100)

<a id="content-based"></a>
# Content Based Filtering- Model

In this section, we are going to build a system that recognizes the similarity between restaurants based on specific features and recommends restaurants that are most similar to a particular restaurant. __df_final__ (features) table used to build this system.

In [None]:
# X: every feature column; y: the integer star rating (target).
# Features are selected by column name rather than by position, so this no
# longer depends on "name" and "stars" being the last two columns of df_final.
X = df_final.drop(columns=["name", "stars"])
y = df_final["stars"]

* **Split the data into train and test set (80:20)**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
knn_split = train_test_split(X, y, test_size=0.2, random_state=1)
X_train_knn, X_test_knn, y_train_knn, y_test_knn = knn_split

* **Instantiate and fit the model**

In [None]:
# preview the first few training labels
y_train_knn.head()

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Fit a 20-nearest-neighbour classifier on the training split
# (fit() returns the estimator itself).
knn = KNeighborsClassifier(n_neighbors=20).fit(X_train_knn, y_train_knn)

# Score the fitted model on the training split, then on the held-out split
accuracy_train, accuracy_test = (
    knn.score(features, labels)
    for features, labels in (
        (X_train_knn, y_train_knn),
        (X_test_knn, y_test_knn),
    )
)

print(f"Score on training set: {accuracy_train}")
print(f"Score on test set: {accuracy_test}")

The restaurant of the validation set

In [None]:
# show the last row of df_final, which is used as the query restaurant
held_out_row = df_final.iloc[-1:]
display(held_out_row)

# and the name of that restaurant
print("Validation set (Restaurant name): ", held_out_row["name"].values[-1])

* **Test the model:** 

> We used the last row as a validation set (we didn't include this last row for modeling). 

In [None]:
# Feature columns only: everything except the trailing "name" and "stars"
feature_table = df_final.iloc[:, :-2]

# Query row: the last restaurant in df_final
# (noted as "Steak & Cheese & Quick Pita Restaurant")
test_set = feature_table.iloc[-1:]

# Fitting data: every restaurant except that last row
X_val = feature_table.iloc[:-1]
y_val = df_final["stars"].iloc[:-1]

In [None]:
# Fit on every restaurant except the held-out last row.
# A fresh estimator with the same hyper-parameters is used because fit()
# refits in place and returns the same object: `n_knn = knn.fit(...)` made
# n_knn an alias of knn and silently overwrote the model that was scored on
# the train/test split.
n_knn = KNeighborsClassifier(**knn.get_params()).fit(X_val, y_val)

After fitting the KNN model to the validation set, we are going to find the distances between the validation set and the other restaurants based on their similar features. 

In [None]:
# Distances to, and positions of, the nearest neighbours of the held-out
# restaurant. kneighbors() is called once and its result reused; it was
# previously recomputed three times while these two variables went unused.
distances, indeces = n_knn.kneighbors(test_set)

# One row per neighbour; [0] selects the results for the single query row.
final_table = pd.DataFrame({"distance": distances[0], "index": indeces[0]})

# Displayed indexed by neighbour position. final_table itself keeps "index" as
# a regular column, because set_index() returns a new frame.
final_table.set_index("index")

We create the following ***result*** table, which lists the restaurants most similar to the validation restaurant together with their distances. In this recommendation system, a shorter distance means greater similarity to the validation restaurant.

In [None]:
# Attach each neighbour's row from df_final (matched on the "index" column)
# and show the first five neighbours with their names and stars.
result = final_table.join(df_final, on="index")
shown_columns = ["distance", "index", "name", "stars"]
result.loc[:, shown_columns].head(5)