# SI 507 Final Project

Name: Shushun Ren, UM-ID: 24284779; Date: 2023/12/08

## 1. Yelp API Interaction

In [2]:
import pandas as pd

In [3]:
import requests

def fetch_data_from_yelp(api_key, query_params, offset=0, limit=20):
    """Fetch one page of results from the Yelp business search endpoint.

    Args:
        api_key: Yelp API key, sent as a Bearer token.
        query_params: Search parameters (e.g. ``term``, ``location``). The
            mapping is copied, so the caller's dict is not modified.
        offset: Value sent as the ``offset`` query parameter.
        limit: Value sent as the ``limit`` query parameter.

    Returns:
        The decoded JSON body when the response status is 200, else ``None``.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` on
            connection problems or when the timeout elapses.
    """
    endpoint = "https://api.yelp.com/v3/businesses/search"
    headers = {'Authorization': f'Bearer {api_key}'}
    # Build a fresh dict so paging values never leak into the caller's params.
    params = {**query_params, 'offset': offset, 'limit': limit}

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(endpoint, headers=headers, params=params, timeout=10)
    if response.status_code == 200:
        return response.json()
    return None

# Example usage
# The API key is read from the YELP_API_KEY environment variable so that the
# credential is not stored in the notebook. Set it before running the fetch
# cells; a key that was ever committed in source should be rotated.
import os

api_key = os.environ.get('YELP_API_KEY', '')
query_params = {'term': 'restaurants', 'location': 'San Francisco'}

In [4]:
## using cache
import json
import os

def save_to_cache(file_name, data):
    """Write ``data`` to ``file_name`` as JSON.

    The JSON is written to a sibling ``<file_name>.tmp`` file first and then
    moved into place, so a failed or interrupted dump never leaves a truncated
    cache file behind (and never destroys a previous good cache).

    Args:
        file_name: Path of the cache file to create or overwrite.
        data: Any JSON-serializable object.

    Raises:
        TypeError: If ``data`` is not JSON-serializable; an existing cache
            file is left unchanged in that case.
        OSError: If the file cannot be written or moved into place.
    """
    tmp_name = f"{file_name}.tmp"
    try:
        with open(tmp_name, 'w', encoding='utf-8') as f:
            json.dump(data, f)
    except Exception:
        # Do not leave the partial temporary file lying around.
        try:
            os.remove(tmp_name)
        except OSError:
            pass
        raise
    os.replace(tmp_name, file_name)

def load_from_cache(file_name):
    """Return the JSON content of ``file_name``, or ``None`` on a cache miss.

    A missing file and a file that does not contain valid JSON (for example a
    cache truncated by an interrupted write) are both treated as a miss, so
    the caller can fall back to fetching fresh data.

    Args:
        file_name: Path of the cache file to read.

    Returns:
        The decoded JSON value, or ``None`` if the cache is absent or corrupt.
    """
    # Open directly instead of checking existence first: the file could
    # disappear between the check and the open.
    try:
        with open(file_name, 'r') as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return None

# Usage
cache_file = 'yelp_data_cache.json'
data = load_from_cache(cache_file)

if not data:
    # Cache miss: page through the API and collect every business.
    print("Couldn't find cache file! No worries, we can load data for you.\n")
    all_data = []
    offset = 0
    limit = 50  # Max limit per request as per Yelp API documentation
    while True:
        # Keep each page in its own name so `data` is not overwritten by the
        # last (empty or failed) response.
        page = fetch_data_from_yelp(api_key, query_params, offset, limit)
        if page and page.get('businesses'):
            all_data.extend(page['businesses'])
            offset += limit
        else:
            break
    save_to_cache(cache_file, all_data)
    print("Total number of restaurants fetched:", len(all_data))
    # Downstream cells read `data`; give them the same list of businesses
    # that a cache hit would have produced.
    data = all_data

In [5]:
# Load the business records into a DataFrame and preview it.
yelp_data = pd.DataFrame(data)
yelp_data.head()

# Keep only the columns of interest: drop the identifier, image and phone fields.
yelp_data = yelp_data.drop(
    columns=['id', 'alias', 'image_url', 'phone', 'display_phone'],
)
yelp_data.head()

## flatten each list of category dicts into one comma-separated string
def concatenate_titles(row):
    """Join the ``title`` of every category dict in ``row`` with ``', '``."""
    titles = (entry['title'] for entry in row)
    return ', '.join(titles)
yelp_data['categories'] = yelp_data['categories'].apply(concatenate_titles)

# Pull latitude/longitude out of the nested 'coordinates' dict into their own
# columns, then discard the nested column.
yelp_data['latitude'] = yelp_data['coordinates'].apply(
    lambda coords: coords['latitude']
)
yelp_data['longitude'] = yelp_data['coordinates'].apply(
    lambda coords: coords['longitude']
)
yelp_data = yelp_data.drop(columns=['coordinates'])

# Collapse the nested 'location' dict to its display address joined with ', ';
# rows without a display address become None.
yelp_data['location'] = yelp_data['location'].apply(
    lambda loc: ', '.join(loc['display_address']) if loc.get('display_address') else None
)
yelp_data.head()

Unnamed: 0,name,is_closed,url,review_count,categories,rating,transactions,price,location,distance,latitude,longitude
0,Marufuku Ramen,False,https://www.yelp.com/biz/marufuku-ramen-san-fr...,4932,Ramen,4.5,"[pickup, delivery]",$$,"1581 Webster St, Ste 235, San Francisco, CA 94115",2720.804034,37.78522,-122.43157
1,Bottega,False,https://www.yelp.com/biz/bottega-san-francisco...,963,"Italian, Pasta Shops, Pizza",4.5,"[delivery, pickup]",$$,"1132 Valencia St, San Francisco, CA 94110",1506.250343,37.75472,-122.4212
2,Dumpling Story,False,https://www.yelp.com/biz/dumpling-story-san-fr...,123,"Asian Fusion, Chinese, Noodles",4.5,[],,"2114 Fillmore St, San Francisco, CA 94115",3167.354822,37.789307,-122.433806
3,Starbelly,False,https://www.yelp.com/biz/starbelly-san-francis...,2320,"New American, Breakfast & Brunch, Cocktail Bars",4.0,"[pickup, delivery, restaurant_reservation]",$$,"3583 16th St, San Francisco, CA 94114",490.660444,37.764075,-122.432572
4,Oodle Yunnan Rice Noodle,False,https://www.yelp.com/biz/oodle-yunnan-rice-noo...,84,"Chinese, Noodles, Soup",4.5,[],,"3420 Balboa St, San Francisco, CA 94121",5466.943193,37.776005,-122.495635


**Variables of the Yelp Fusion Dataset**:
- id: A unique identifier for the business.
- alias: A human-readable identifier, often a simplified name with hyphens.
- name: The name of the business.
- image_url: URL of an image associated with the business.
- is_closed: Boolean indicating whether the business has permanently closed (it does not reflect current opening hours).
- url: Yelp page URL of the business.
- **review_count**: The number of reviews the business has received.
- **categories**: List of categories the business is associated with (e.g. Types of cuisine).
- **rating**: Average rating of the business.
- coordinates: Geographic coordinates of the business.
- **transactions**: Types of transactions offered by the business (e.g., delivery, pickup).
- location: Address and location details of the business.
- phone: Contact phone number.
- display_phone: Formatted phone number.
- **distance**: Distance, in meters, of the business from the search location.
- **price**: Price level of the business ($, $$, $$$, $$$$); missing for some businesses.

## Data Preprocessing

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
print(yelp_data.describe())

# Summary (count, unique, top, freq) for the object-typed columns, e.g. name, url, categories
print(yelp_data.describe(include=['object']))

       review_count       rating     distance     latitude    longitude
count   1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean     717.738000     4.276000  3116.109408    37.774829  -122.432403
std     1148.679581     0.389841  1475.388829     0.017891     0.026474
min        0.000000     0.000000    42.382355    37.706848  -122.508242
25%      107.750000     4.000000  1883.428930    37.762920  -122.447352
50%      308.500000     4.500000  3235.352783    37.776952  -122.429347
75%      794.750000     4.500000  4246.722894    37.787365  -122.413147
max    12234.000000     5.000000  9299.919531    37.837545  -122.369560
          name                                                url  categories  \
count     1000                                               1000        1000   
unique     969                                               1000         728   
top     Souvla  https://www.yelp.com/biz/marufuku-ramen-san-fr...  Vietnamese   
freq         5              

## Foursquare API

In [6]:
## using cache
import json
import os
import requests
import pandas as pd
from urllib.parse import urlparse, parse_qs

# parameters
url = "https://api.foursquare.com/v3/places/search"

# The API key is read from the FOURSQUARE_API_KEY environment variable so that
# the credential is not stored in the notebook. Set it before running the fetch
# cell; a key that was ever committed in source should be rotated.
headers = {
    "accept": "application/json",
    "Authorization": os.environ.get("FOURSQUARE_API_KEY", ""),
}

# Define the center (latitude and longitude) and radius of your search
center_lat, center_lon = 37.7749, -122.4194  # San Francisco, matching the Yelp search location
radius = 5000  # Search radius in meters

# Initialize an empty list to store results
all_results = []

# Set initial parameters
params = {
    "ll": f"{center_lat},{center_lon}",  # Latitude and Longitude
    "radius": radius,  # Radius in meters
    "limit": 50  # Adjust based on API's max limit per request
}

# Usage
cache_file = 'foursq_data_cache.json'
data2 = load_from_cache(cache_file)

if not data2:
    # Cache miss: follow the cursor-based pagination until enough results
    # are collected or the API stops offering a next page.
    print("Couldn't find cache file! No worries, we can load data for you.\n")

    while len(all_results) < 1000:
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(url, headers=headers, params=params, timeout=10)

        # Stop if the request was unsuccessful
        if response.status_code != 200:
            print(f"Failed to fetch data: {response.status_code}")
            break

        # Use a dedicated name so the notebook-level `data` (the Yelp
        # records) is not overwritten by a Foursquare page.
        page = response.json()
        all_results.extend(page.get('results', []))

        # The next page, if any, is advertised in the 'link' response header.
        link_header = response.headers.get('link')
        if not link_header:
            break  # No 'Link' field in the header, stop fetching more data
        next_page_url = link_header.split(';')[0].strip('<>')

        # Extract the cursor from the URL and reuse it for the next request
        next_cursor = parse_qs(urlparse(next_page_url).query).get('cursor', [None])[0]
        if not next_cursor:
            break  # No more results
        params['cursor'] = next_cursor

    save_to_cache(cache_file, all_results)
    print("Total number of restaurants fetched:", len(all_results))
    # Downstream cells read `data2`; give them the same list of places that a
    # cache hit would have produced.
    data2 = all_results


In [7]:
# Build the Foursquare DataFrame from the place records in data2.
foursq = pd.DataFrame(data2)

# Keep only the columns of interest: drop the id, link and timezone fields.
foursq = foursq.drop(columns=['fsq_id', 'link', 'timezone'])

In [8]:
# Collapse each list of chain dicts into a comma-separated string of chain
# names; empty (falsy) entries become None.
foursq['chains'] = foursq['chains'].apply(
    lambda chain_list: ', '.join(chain['name'] for chain in chain_list) if chain_list else None
)

In [9]:
# Look up the 'categories' value stored under index label 0 (a quick look at the raw structure).
foursq['categories'][0]
def combine_names(category_list):
    """Join the ``name`` of each category dict in ``category_list`` with ``'; '``.

    Entries without a ``name`` key are skipped. Returns ``None`` when
    ``category_list`` is empty/falsy or is not a ``list``.
    """
    if not category_list or not isinstance(category_list, list):
        return None
    names = [entry['name'] for entry in category_list if 'name' in entry]
    return '; '.join(names)

# Replace each row's category list with the '; '-joined string built by combine_names.
foursq['categories'] = foursq['categories'].apply(combine_names)

In [10]:
# Reduce each geocodes dict to a "latitude, longitude" string taken from its
# 'main' entry; rows without a 'main' entry become None.
foursq['geocodes'] = foursq['geocodes'].apply(
    lambda geo: f"{geo['main']['latitude']}, {geo['main']['longitude']}" if geo.get('main') else None
)

In [11]:
# Replace each location dict with its 'formatted_address' string, or None when
# that key is absent.
foursq['location'] = foursq['location'].apply(
    lambda loc: loc['formatted_address'] if 'formatted_address' in loc else None
)

In [32]:
# Split the "latitude, longitude" strings held in 'geocodes' into two columns.
foursq[['latitude', 'longitude']] = foursq['geocodes'].str.split(', ', expand=True)

# The split pieces are strings; turn both columns into numeric values.
foursq['latitude'] = pd.to_numeric(foursq['latitude'])
foursq['longitude'] = pd.to_numeric(foursq['longitude'])

# 'geocodes' is now redundant, so remove it from the frame in place.
foursq.drop(columns='geocodes', inplace=True)

## Preprocessing and merge data

In [29]:
# Preview the first Yelp row.
# NOTE(review): the output shown for this cell has a 'main_category' column, but no
# visible cell adds it to yelp_data — presumably created in a cell not shown; verify.
yelp_data.head(1)

Unnamed: 0,name,is_closed,url,review_count,categories,rating,transactions,price,location,distance,latitude,longitude,main_category
0,Marufuku Ramen,False,https://www.yelp.com/biz/marufuku-ramen-san-fr...,4932,Ramen,4.5,"[pickup, delivery]",$$,"1581 Webster St, Ste 235, San Francisco, CA 94115",2720.804034,37.78522,-122.43157,Ramen


In [33]:
# Preview the first Foursquare row.
# NOTE(review): the output shown for this cell has a 'main_category' column, but no
# visible cell adds it to foursq — presumably created in a cell not shown; verify.
foursq.head(1)

Unnamed: 0,categories,chains,closed_bucket,distance,location,name,related_places,main_category,latitude,longitude
0,Concert Hall; Jazz and Blues Venue; Restaurant,,VeryLikelyOpen,247,"201 Franklin St (at Fell St), San Francisco, C...",SFJAZZ Center,{'children': [{'fsq_id': '59068431a22db720a43d...,Concert Hall,37.776278,-122.421445


In [47]:
# Outer-join the Yelp and Foursquare frames. No `on=` is given, so pandas joins
# on every column name the two frames share; rows that do not match on all of
# them are kept from both sides with the other side's columns left missing.
combined_df = yelp_data.merge(foursq, how='outer')
combined_df.head(5)



Unnamed: 0,name,is_closed,url,review_count,categories,rating,transactions,price,location,distance,latitude,longitude,main_category,chains,closed_bucket,related_places
0,Marufuku Ramen,False,https://www.yelp.com/biz/marufuku-ramen-san-fr...,4932.0,Ramen,4.5,"[pickup, delivery]",$$,"1581 Webster St, Ste 235, San Francisco, CA 94115",2720.804034,37.78522,-122.43157,Ramen,,,
1,Bottega,False,https://www.yelp.com/biz/bottega-san-francisco...,963.0,"Italian, Pasta Shops, Pizza",4.5,"[delivery, pickup]",$$,"1132 Valencia St, San Francisco, CA 94110",1506.250343,37.75472,-122.4212,Italian,,,
2,Dumpling Story,False,https://www.yelp.com/biz/dumpling-story-san-fr...,123.0,"Asian Fusion, Chinese, Noodles",4.5,[],,"2114 Fillmore St, San Francisco, CA 94115",3167.354822,37.789307,-122.433806,Asian Fusion,,,
3,Starbelly,False,https://www.yelp.com/biz/starbelly-san-francis...,2320.0,"New American, Breakfast & Brunch, Cocktail Bars",4.0,"[pickup, delivery, restaurant_reservation]",$$,"3583 16th St, San Francisco, CA 94114",490.660444,37.764075,-122.432572,New American,,,
4,Oodle Yunnan Rice Noodle,False,https://www.yelp.com/biz/oodle-yunnan-rice-noo...,84.0,"Chinese, Noodles, Soup",4.5,[],,"3420 Balboa St, San Francisco, CA 94121",5466.943193,37.776005,-122.495635,Chinese,,,


In [38]:
import networkx as nx
# Create a graph
from itertools import combinations

# Yelp categories were joined with ', ' while Foursquare categories were joined
# with '; ', so cut at both separators to get the leading category of either
# source (splitting on ', ' alone would keep a whole Foursquare string).
combined_df['main_category'] = (
    combined_df['categories'].str.split(', ').str[0].str.split('; ').str[0]
)
G = nx.Graph()

# Add nodes (restaurants), keyed by name and carrying category, rating and price
for _, row in combined_df.iterrows():
    G.add_node(row['name'], category=row['main_category'], rating=row['rating'], price=row['price'])

# Add edges based on shared categories. The graph is undirected, so each
# unordered pair of nodes only needs to be compared once.
for (name1, attr1), (name2, attr2) in combinations(G.nodes(data=True), 2):
    if attr1['category'] == attr2['category']:
        G.add_edge(name1, name2)

In [39]:
## json file for graphs
import json
from networkx.readwrite import json_graph

# Convert graph G to node-link data and write it to graph.json (4-space indented).
# Note: this rebinds the notebook-level name `data`.
data = json_graph.node_link_data(G)
with open('graph.json', 'w') as f:
    json.dump(data, f, indent=4)

In [40]:
## standalone python file for json
import json
from networkx.readwrite import json_graph

# Read graph.json back and rebuild the graph from its node-link data.
# Note: this rebinds the notebook-level names `data` and `G`.
with open('graph.json', 'r') as f:
    data = json.load(f)
    G = json_graph.node_link_graph(data)

# Here you can add code to visualize or analyze the graph

In [51]:
class TreeNode:
    """A named node that keeps an ordered list of child nodes."""

    def __init__(self, name):
        # Every node starts out as a leaf with its own, unshared child list.
        self.name, self.children = name, []

    def add_child(self, child):
        """Attach ``child`` as the last child of this node."""
        self.children.append(child)

def get_rating_category(rating):
    """Map a numeric rating to one of ``">4.5"``, ``"4.0-4.5"`` or ``"<4.0"``.

    Anything that is neither above 4.5 nor within [4.0, 4.5] falls into
    ``"<4.0"``.
    """
    if rating > 4.5:
        return ">4.5"
    if 4.5 >= rating >= 4.0:
        return "4.0-4.5"
    return "<4.0"

def get_distance_category(distance):
    """Map a distance to one of ``"<500m"``, ``"500-1500m"`` or ``">1500m"``.

    Anything that is neither below 500 nor within [500, 1500] falls into
    ``">1500m"``.
    """
    if distance < 500:
        return "<500m"
    if 1500 >= distance >= 500:
        return "500-1500m"
    return ">1500m"

# Create root node
root = TreeNode("Restaurants")

# Bucket every restaurant by its own rating and distance up front, so the
# filters below compare per-row values rather than a single leftover `row`.
rating_buckets = yelp_data['rating'].apply(get_rating_category)
distance_buckets = yelp_data['distance'].apply(get_distance_category)

# Create category nodes and price range nodes
category_nodes = {}
for category in yelp_data['main_category'].unique():
    category_node = TreeNode(category)
    root.add_child(category_node)
    category_nodes[category] = category_node
    in_category = yelp_data['main_category'] == category

    # Subdivide by price range
    for price in yelp_data.loc[in_category, 'price'].unique():
        if pd.isna(price):
            # A missing price never compares equal to itself, so select those
            # rows with isna() and give the node a JSON-safe label.
            price_node = TreeNode("Unknown")
            at_price = in_category & yelp_data['price'].isna()
        else:
            price_node = TreeNode(price)
            at_price = in_category & (yelp_data['price'] == price)
        category_node.add_child(price_node)

        # Subdivide by rating category
        for rating_category in [">4.5", "4.0-4.5", "<4.0"]:
            rating_node = TreeNode(rating_category)
            price_node.add_child(rating_node)
            at_rating = at_price & (rating_buckets == rating_category)

            # Subdivide by distance category
            for distance_category in ["<500m", "500-1500m", ">1500m"]:
                distance_node = TreeNode(distance_category)
                rating_node.add_child(distance_node)

                # Add restaurants as children of the corresponding distance category
                selected = at_rating & (distance_buckets == distance_category)
                for restaurant_name in yelp_data.loc[selected, 'name']:
                    distance_node.add_child(TreeNode(restaurant_name))

In [49]:
## export_tree_json.py - json file to serialize the trees
import json

def serialize_tree(node):
    """Recursively convert ``node`` into a nested dict.

    The result has the keys ``"name"`` (the node's name) and ``"children"``
    (a list of the serialized children, in order).
    """
    serialized_children = []
    for child in node.children:
        serialized_children.append(serialize_tree(child))
    return {"name": node.name, "children": serialized_children}

# Serialize the tree rooted at `root` to a nested dict and write it to
# tree.json (4-space indented).
tree_data = serialize_tree(root)
with open('tree.json', 'w') as f:
    json.dump(tree_data, f, indent=4)

In [50]:
## Standalone Python File for Reading JSON
import json

def deserialize_tree(data):
    """Rebuild a ``TreeNode`` hierarchy from a nested dict.

    ``data`` must have a ``'name'`` key; a missing ``'children'`` key is
    treated as having no children.
    """
    rebuilt = TreeNode(data['name'])
    child_payloads = data.get('children', [])
    for payload in child_payloads:
        rebuilt.add_child(deserialize_tree(payload))
    return rebuilt

# Read tree.json back and rebuild the TreeNode hierarchy.
# Note: this rebinds the notebook-level names `tree_data` and `root`.
with open('tree.json', 'r') as f:
    tree_data = json.load(f)
    root = deserialize_tree(tree_data)


In [42]:
# Convert combined_df to a list of dicts, one per row (orient='records').
restaurant = combined_df.to_dict(orient='records')

In [46]:
# Preview the first two merged rows.
combined_df.head(2)

Unnamed: 0,name,is_closed,url,review_count,categories,rating,transactions,price,location,distance,latitude,longitude,main_category,chains,closed_bucket,related_places
0,Marufuku Ramen,False,https://www.yelp.com/biz/marufuku-ramen-san-fr...,4932.0,Ramen,4.5,"[pickup, delivery]",$$,"1581 Webster St, Ste 235, San Francisco, CA 94115",2720.804034,37.78522,-122.43157,Ramen,,,
1,Bottega,False,https://www.yelp.com/biz/bottega-san-francisco...,963.0,"Italian, Pasta Shops, Pizza",4.5,"[delivery, pickup]",$$,"1132 Valencia St, San Francisco, CA 94110",1506.250343,37.75472,-122.4212,Italian,,,


In [44]:
import json

# Write the `restaurant` list of row dicts to restaurants.json (4-space indented).
# NOTE(review): the merged frame shows missing values; if they reach json.dump as
# float NaN they are written as the non-standard token NaN — confirm that the
# consumers of this file accept it.
with open('restaurants.json', 'w') as file:
    json.dump(restaurant, file, indent=4)