In [1]:
# imports
import pandas as pd
from pandas import json_normalize
import json
import requests
import os
from pprint import pp
import time
from IPython.display import clear_output
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Foursquare

Send a request to Foursquare with a small radius (1000m) for all the bike stations in your city of choice. 

In [26]:
# Set the Foursquare API endpoint
foursquare_url = "https://api.foursquare.com/v3/places/search"


foursquare_headers = {
    "accept": "application/json",
    "Authorization": "fsq3nvnxiw/vWtTRRy+EStwXYp/YdZtmYClsPAg4ZioXp38="
}

radius = 1000 # 1km
response = []
categories = '13000'
limit = 30
fields = 'fsq_id,name,rating,popularity,price,categories,distance,geocodes'
time_per_request = 1 / 100

fs_response = requests.get(foursquare_url, headers=foursquare_headers)

# Check if the request was successful 
if fs_response.status_code == 200:

    data = fs_response.json()
else:
    # Print an error message if the request was not successful
    print(f"Error: {fs_response.status_code}")


In [9]:
# Load locations from stations.csv
stations = pd.read_csv('../data/stations.csv', index_col=None)

# Create a list of paired (latitude, longitude)
locations = [f"{lat},{long}" for lat, long in zip(stations['latitude'], stations['longitude'])]

Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

In [10]:
# Load source
with open('../data/responses_fsq.json', 'r') as f:
        responses = json.load(f)

In [11]:
# Validate if categories can have mulitple items

# Flatten the JSON structure using json_normalize
df = json_normalize(responses, record_path='results')

# Calculate the number of categories for each business
df['num_categories'] = df['categories'].apply(len)

# Check if any business has multiple categories
businesses_with_multiple_categories = df[df['num_categories'] > 1]
print(f"Businesses with multiple categories: {len(businesses_with_multiple_categories)}")
print(businesses_with_multiple_categories[['name', 'categories', 'num_categories']])

Businesses with multiple categories: 5179
                       name  \
2                  Buceo 95   
3                      Bánh   
4     Malecon Restaurant II   
5               Osteria 106   
6                Mama's Too   
...                     ...   
7471     Mike's Coffee Shop   
7472  Brooklyn Public House   
7474            Graziella's   
7477                   Olea   
7480                 Karasu   

                                             categories  num_categories  
2     [{'id': 13025, 'name': 'Wine Bar', 'short_name...               2  
3     [{'id': 13032, 'name': 'Cafe, Coffee, and Tea ...               3  
4     [{'id': 13097, 'name': 'Caribbean Restaurant',...               3  
5     [{'id': 13236, 'name': 'Italian Restaurant', '...               2  
6     [{'id': 13064, 'name': 'Pizzeria', 'short_name...               2  
...                                                 ...             ...  
7471  [{'id': 13035, 'name': 'Coffee Shop', 'short_n...            

In [12]:
# List to store extracted items
extracted_items = []

# Iterate through each data entry
for entry in responses:
    # Use json_normalize to flatten the 'businesses' data
    flattened_businesses = json_normalize(entry['results'])

    # Iterate through each row in flattened_businesses
    for idx, business in flattened_businesses.iterrows():
        # Loop through categories and create a new entry for each category
        for category in business['categories']:
            extracted_item = {
                'fsq_id': business['fsq_id'],
                'name': business['name'],
                's_lat': entry['context']['geo_bounds']['circle']['center']['latitude'],
                's_lon': entry['context']['geo_bounds']['circle']['center']['longitude'],
                'p_lat': business.get('geocodes.main.latitude', np.nan),
                'p_lon': business.get('geocodes.main.longitude', np.nan),
                'category_id': category['id'],
                'category_name': category['name'],
                'distance': business['distance'],
                'rating': business.get('rating', np.nan),
                'popularity': business.get('popularity', np.nan),
                'price': business.get('price', np.nan)
            }
            extracted_items.append(extracted_item)

Put your parsed results into a DataFrame

In [13]:
# Create dataframe
df_businesses = pd.DataFrame(extracted_items)
df_businesses.head()

Unnamed: 0,fsq_id,name,s_lat,s_lon,p_lat,p_lon,category_id,category_name,distance,rating,popularity,price
0,4fdc9a79e4b0735a6deafc25,Starbucks,40.795,-73.9645,40.795045,-73.965638,13035,Coffee Shop,100,8.1,0.997958,1.0
1,554c1424498e6819b466c0e7,Bob's Your Uncle,40.795,-73.9645,40.798842,-73.962943,13010,Dive Bar,447,8.4,0.979331,2.0
2,49c079b1f964a52068551fe3,Buceo 95,40.795,-73.9645,40.793603,-73.97109,13025,Wine Bar,581,8.6,0.972596,3.0
3,49c079b1f964a52068551fe3,Buceo 95,40.795,-73.9645,40.793603,-73.97109,13347,Tapas Restaurant,581,8.6,0.972596,3.0
4,5f9dfef4780f611751f28d05,Bánh,40.795,-73.9645,40.800819,-73.96573,13032,"Cafe, Coffee, and Tea House",654,8.8,0.976677,2.0


In [14]:
#Pre-Cleaning
print(df_businesses.duplicated().sum())


2


In [16]:
fsq_with_dupes = df_businesses.copy() 
df_businesses.drop_duplicates(inplace=True)
print(df_businesses.duplicated().sum())

0


In [17]:
#Null values
df_businesses.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15437 entries, 0 to 15438
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   fsq_id         15437 non-null  object 
 1   name           15437 non-null  object 
 2   s_lat          15437 non-null  float64
 3   s_lon          15437 non-null  float64
 4   p_lat          15437 non-null  float64
 5   p_lon          15437 non-null  float64
 6   category_id    15437 non-null  int64  
 7   category_name  15437 non-null  object 
 8   distance       15437 non-null  int64  
 9   rating         15189 non-null  float64
 10  popularity     15408 non-null  float64
 11  price          14261 non-null  float64
dtypes: float64(7), int64(2), object(3)
memory usage: 1.5+ MB


In [21]:
#create CSV file

# Create csv file
df_businesses.to_csv('../data/fsq_businesses.csv', index=False)

# Yelp

Send a request to Yelp with a small radius (1000m) for all the bike stations in your city of choice. 

In [30]:
import requests
import json
from IPython.display import clear_output  # Import clear_output if using in a Jupyter Notebook

# Set the Yelp API endpoint
yelp_url = 'https://api.yelp.com/v3/businesses/search'

# Set the headers with the correct Yelp API key format
yelp_headers = {
    "Authorization": "Bearer jnLK-fvgtspnRi-ZB6c8hr8urzohPxgKF_HD0Qlc67YFpPpRlGDlQsUsLvF6mO6X28aEEvO8IeCBFe7Y6fBHkbZ1Se6BLgWX7htJdQLO8a97pBBcZf4jh4eo4-laZXYx"
}

# Set the parameters for the request
latitude = 40.795000
longitude = -73.964500
yelp_params = {
    'latitude': latitude,
    'longitude': longitude,
    'radius': 1000,
}

# Send the request to Yelp API
yelp_response = requests.get(yelp_url, headers=yelp_headers, params=yelp_params)

# Check if the request was successful
if yelp_response.status_code == 200:
    yelp_data = yelp_response.json()
    # Save intermediate progress after each successful request
    with open('../data/responses_yelp.json', 'w') as f:
        json.dump(yelp_data, f)

    # Print progress
    remaining = yelp_response.headers.get('RateLimit-Remaining')
    clear_output(wait=True)
    print(f"Remaining calls: {remaining}")
else:
    # Print an error message if the request was not successful
    print(f"Error: {yelp_response.status_code}")


Remaining calls: 496.0


Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

In [45]:
# Load source
with open('../data/responses_yelp.json', 'r') as f:
        responses = json.load(f)



In [32]:
#Validate if categories can have mulitple items

# Flatten the JSON structure using json_normalize
df = json_normalize(responses, record_path='businesses')

# Calculate the number of categories for each business
df['num_categories'] = df['categories'].apply(len)

# Check if any business has multiple categories
businesses_with_multiple_categories = df[df['num_categories'] > 1]
print(f"Businesses with multiple categories: {len(businesses_with_multiple_categories)}")
print(businesses_with_multiple_categories[['name', 'categories', 'num_categories']])

Businesses with multiple categories: 18
                            name  \
0                Jacob's Pickles   
1                      Arco Cafe   
2                  The Calaveras   
4                 Nobody Told Me   
5                The Grand Feast   
6     The Tang - Upper West Side   
7   Bareburger - Upper West Side   
8                        e's BAR   
9                         Amelie   
11         Fumo- Upper West Side   
12                       Sushi W   
13          The Shell Restaurant   
14                        Bosino   
15                  Flor de Mayo   
16                 The Ellington   
17      Smoke Jazz & Supper Club   
18            Melba's Restaurant   
19                 Marlow Bistro   

                                           categories  num_categories  
0   [{'alias': 'comfortfood', 'title': 'Comfort Fo...               3  
1   [{'alias': 'sardinian', 'title': 'Sardinian'},...               2  
2   [{'alias': 'bars', 'title': 'Bars'}, {'alias':...      

In [None]:
import numpy as np

# List to store extracted items
extracted_items = []

# Iterate through each data entry
for entry in responses:

    # Initialize desired info for each business
    for idx, business in flattened_businesses.iterrows():
        # Extract business info
        extracted_item = {
            'id': business['id'],
            'name': business['name'],
            'rating': business.get('rating', np.nan),
            'price': business.get('price', np.nan),
            'review_count': business['review_count'],
            's_lat': entry['region']['center']['latitude'],
            's_lon': entry['region']['center']['longitude'],
            'p_lat': business.get('coordinates.latitude', np.nan),
            'p_lon': business.get('coordinates.longitude', np.nan),
            'categories': [category['alias'] for category in business['categories']],
            'distance': business['distance']
        }
        extracted_items.append(extracted_item)


Put your parsed results into a DataFrame

In [46]:
# Create dataframe
df_businesses = pd.DataFrame(extracted_items)
df_businesses.head()

In [47]:
#Pre-cleaning

#duplicates
print(df_businesses.duplicated().sum())

#Null-values
print(df_businesses.info())

0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame
None


In [48]:
df_businesses.head()

In [50]:
# Create csv file
df_businesses.to_csv('../data/yelp_businesses.csv', index=False)

# Comparing Results

Which API provided you with more complete data? Provide an explanation. 

Yelp tends to have a larger database of businesses, providing detailed reviews and a wide range of business attributes. The wealth of user-generated content, such as reviews and ratings, contributes to a more informative and thorough understanding of each POI. Users can access authentic reviews, overall ratings, and user-uploaded photos, offering a holistic view of a business's popularity and quality. Since I am looking for detailed reviews and a wide range of business attributes, Yelp's API stands out as an optimal choice.

Get the top 10 restaurants according to their rating

In [52]:
# Convert 'poi_rating' column to numeric to enable sorting
df['rating'] = pd.to_numeric(df['rating'])

# Sort the DataFrame by 'poi_rating' in descending order
df_sorted = df.sort_values(by='rating', ascending=False)

# Get the top 10 restaurants
top_10_restaurants = df_sorted.head(10)

# Display the result
print(top_10_restaurants)

                        id                              alias  \
5   WWyVq90yo4u3kWEj1Lr7rw           the-grand-feast-new-york   
13  UAMbfUodEHMAIHTltCtyLA      the-shell-restaurant-new-york   
10  Fit_iIDn__NPVZpVlKFBFQ               osteria-106-new-york   
2   xrKa2SpgWzPEEbji3Iw2DQ             the-calaveras-new-york   
3   sKIYGvENchifk1YwFe77pA               saiguette-new-york-2   
4   xEF3Kvd0yw74pjnlFB2Sgg            nobody-told-me-new-york   
6   TzhAlljC_843JO7UDDUIaQ  the-tang-upper-west-side-new-york   
9   UJe9_p6NEl0kA6xHVo5PKw                  amelie-new-york-2   
1   f_1BjTChf6bobreSRpqhsw                 arco-cafe-new-york   
16  h9nuvIu8TyrQcYy8J1AOxg           the-ellington-new-york-4   

                          name  \
5              The Grand Feast   
13        The Shell Restaurant   
10                 Osteria 106   
2                The Calaveras   
3                    Saiguette   
4               Nobody Told Me   
6   The Tang - Upper West Side   
9           