# ADA EndSem P3

- Sampad Kumar Kar
- MCS202215

# 0. Imports

In [20]:
import os, sys
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Silence all Python warnings for the rest of the session so that library
# warnings do not clutter the cell outputs.
# NOTE(review): this also hides deprecation warnings that may matter; consider
# narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')

# 1. Data Loading and Cleaning

The dataset is split by region into several `.txt` files. We first merge them into a single `DataFrame`.

In [7]:
# Directory holding the raw per-city files, relative to the current working
# directory (i.e. data/raw/data).
data_dir_path = os.path.join('data', 'raw', 'data')

# list all the files in the data directory
# (os.listdir returns entries in arbitrary order)
print(os.listdir(data_dir_path))

['chicago.txt', 'los_angeles.txt', 'features.txt', 'new_york.txt', 'README', 'washington_dc.txt', 'atlanta.txt', 'boston.txt', 'new_orleans.txt', 'san_francisco.txt']


In [8]:
# Per-city restaurant files to load. The other directory entries
# ('features.txt' and 'README') are deliberately left out.
city_files = [
    'chicago.txt',
    'los_angeles.txt',
    'new_york.txt',
    'washington_dc.txt',
    'atlanta.txt',
    'boston.txt',
    'new_orleans.txt',
    'san_francisco.txt',
]

In [9]:
# column names for the three tab-separated fields of every city file
cols = ['rest_id', 'rest_name', 'rest_features']


def _read_city_file(file_path):
    """Read one tab-separated city file into a dataframe with columns `cols`.

    The file is opened in a `with` block so the handle is always closed, and
    blank lines are skipped instead of producing malformed rows.
    """
    with open(file_path, 'r') as handle:
        rows = [line.strip().split('\t') for line in handle if line.strip()]
    return pd.DataFrame(rows, columns=cols)


# build one dataframe per city, keyed by the file name without its extension
city_dfs = {
    city_file.split('.')[0]: _read_city_file(os.path.join(data_dir_path, city_file))
    for city_file in city_files
}

# Bind each city's dataframe by name. Positional unpacking of `city_dfs` would
# follow the order of `city_files`, not the alphabetical order of these
# variable names, and would silently attach the wrong city to each variable.
df_atlanta = city_dfs['atlanta']
df_boston = city_dfs['boston']
df_chicago = city_dfs['chicago']
df_los_angeles = city_dfs['los_angeles']
df_new_orleans = city_dfs['new_orleans']
df_new_york = city_dfs['new_york']
df_san_francisco = city_dfs['san_francisco']
df_washington_dc = city_dfs['washington_dc']

# Concatenating all the dataframes into one; the dictionary keys become the
# 'Region' column and the per-city row number ends up in 'level_1'
df = pd.concat(city_dfs.values(), keys=city_dfs.keys(), names=['Region']).reset_index()

df

Unnamed: 0,Region,level_1,rest_id,rest_name,rest_features
0,chicago,0,0000000,Moti Mahal,214 035 149 021 117 075 204 051 163
1,chicago,1,0000001,Village,026 249 174 004 132 249 198 191 192 125 075 20...
2,chicago,2,0000002,Millrose Brewing Company,137 249 194 215 213 174 249 191 192 008 075 20...
3,chicago,3,0000003,Dover Straits,137 190 174 249 212 075 205 053 165
4,chicago,4,0000004,Eat Your Hearts Out,214 249 249 197 111 025 025 112 075 205 053 164
...,...,...,...,...,...
4155,san_francisco,409,0000409,Carrara's,123 075 204 053 163
4156,san_francisco,410,0000410,THE LARK CREEK INN,100 080 253 099 231 250 200 189 191 192 063 11...
4157,san_francisco,411,0000411,Bonta,231 250 123 124 076 205 053 166
4158,san_francisco,412,0000412,Phnom Penh,253 250 036 032 076 205 052 164


# 2. Recommender System using Jaccard Similarity

## 2.1 Jaccard Similarity

Since each restaurant has a corresponding *bag* of features, we can treat each bag as a set (ignoring repeated features). We can then use the Jaccard similarity to measure how similar two restaurants are.

The Jaccard similarity between two sets $A$ and $B$ is defined as:

$$
J(A, B) = \frac{|A \cap B|}{|A \cup B|}
$$




In [1]:
def get_features(feature_string):
    """Parse a whitespace-separated string of feature codes into a set of ints.

    Each code (e.g. '035') is converted with `int`, so leading zeros are
    dropped and repeated codes collapse into a single set element.
    """
    return {int(code) for code in feature_string.split()}

In [19]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity |set1 ∩ set2| / |set1 ∪ set2|.

    When both sets are empty the union is empty as well; 0 is returned in
    that case instead of dividing by zero.
    """
    combined = set1.union(set2)
    if not combined:
        return 0
    shared = set1.intersection(set2)
    return len(shared) / len(combined)

In [24]:
def recommend_similar_restaurant(restaurant_id, region, top_n=5):
    """Recommend the restaurants of a region most similar to a given restaurant.

    Similarity is the Jaccard similarity between the feature sets of the
    restaurants, computed only against restaurants of the same region. Reads
    the module-level dataframe `df` (columns 'Region', 'rest_id', 'rest_name',
    'rest_features'), which is assumed to have unique index labels.

    Parameters
    ----------
    restaurant_id : str
        Value of 'rest_id' identifying the query restaurant, e.g. '0000000'.
    region : str
        Value of 'Region' the restaurant belongs to, e.g. 'chicago'.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pd.DataFrame
        Columns 'Restaurant ID' and 'Restaurant Names', ordered from most to
        least similar. The query restaurant itself is never included.

    Raises
    ------
    IndexError
        If no restaurant with `restaurant_id` exists in `region`.
    """
    in_region = df['Region'] == region

    # locate the query restaurant by index label
    matches = df.index[in_region & (df['rest_id'] == restaurant_id)]
    if len(matches) == 0:
        raise IndexError(
            f"no restaurant with id {restaurant_id!r} in region {region!r}"
        )
    restaurant_idx = matches[0]

    # feature set of the query restaurant
    restaurant_features = get_features(df.loc[restaurant_idx, 'rest_features'])

    # feature sets of all the restaurants in the region
    region_features = df.loc[in_region, 'rest_features'].apply(get_features)

    # jaccard similarity of the query restaurant with every restaurant in the region
    jaccard_similarities = region_features.apply(
        lambda x: jaccard_similarity(restaurant_features, x)
    )

    # Drop the query restaurant explicitly instead of assuming it sorts first:
    # another restaurant with an identical feature set also scores 1.0, and a
    # restaurant with no features scores 0 against itself. A stable sort keeps
    # the ordering of ties deterministic.
    similar_restaurants = (
        jaccard_similarities
        .drop(index=restaurant_idx)
        .sort_values(ascending=False, kind='mergesort')
        .head(max(top_n, 0))
    )

    # the Series index holds labels of `df`, so look rows up with .loc (not .iloc)
    top_rows = df.loc[similar_restaurants.index]

    # return a dataframe containing the restaurant ids and names
    return pd.DataFrame({
        'Restaurant ID': top_rows['rest_id'].values,
        'Restaurant Names': top_rows['rest_name'].values,
    })

## 2.2 Recommender System

We try our recommender system on some sample restaurants.

In [25]:
restaurant_id = '0000000'
region = 'chicago'

similar_restaurants = recommend_similar_restaurant(restaurant_id, region, top_n=5)
print(similar_restaurants)

  Restaurant ID Restaurant Names
0       0000664   Standard India
1       0000237    Gandhi Indian
2       0000435          Hi Howe
3       0000520         Hashalom
4       0000261    Old Jerusalem


In [26]:
restaurant_id = '0000001'
region = 'chicago'

similar_restaurants = recommend_similar_restaurant(restaurant_id, region, top_n=5)
print(similar_restaurants)

  Restaurant ID       Restaurant Names
0       0000040                 Rico's
1       0000198          Mill Race Inn
2       0000239  Spavone's Seven Hills
3       0000545               Como Inn
4       0000450             Bravissimo


In [27]:
restaurant_id = '0000002'
region = 'chicago'

similar_restaurants = recommend_similar_restaurant(restaurant_id, region, top_n=5)
print(similar_restaurants)

  Restaurant ID    Restaurant Names
0       0000413  Outback Steakhouse
1       0000281  Barn of Barrington
2       0000198       Mill Race Inn
3       0000146      Southgate Cafe
4       0000053               Bones


In [28]:
restaurant_id = '0000003'
region = 'chicago'

similar_restaurants = recommend_similar_restaurant(restaurant_id, region, top_n=5)
print(similar_restaurants)

  Restaurant ID           Restaurant Names
0       0000335  Don's Fishmarket & Tavern
1       0000506                   Pomodoro
2       0000024          Timbers Charhouse
3       0000177               Ceiling Zero
4       0000038           Shaw's Deerfield


In [29]:
restaurant_id = '0000004'
region = 'chicago'

similar_restaurants = recommend_similar_restaurant(restaurant_id, region, top_n=5)
print(similar_restaurants)

  Restaurant ID Restaurant Names
0       0000119           Jane's
1       0000342      Basta Pasta
2       0000273       Bertucci's
3       0000414       Wild Onion
4       0000613          Frida's


So, our recommender system works as expected.