In [1]:
# import necessary libraries
import numpy as np
import pandas as pd
import json
import requests as req
import os

# set the path where the data is stored
assets_path = '../data/'

# read the city bikes data and select the columns 'Latitude' and 'Longitude'
cb = pd.read_csv(assets_path + 'citybikes.csv',
                 usecols=['Latitude', 'Longitude', 'Id'])

# set the Point of Interest (POI) types and their corresponding codes
POI = {'restaurants': 13065, 'bars': 13003,
       'parking': 19020, 'museums': 10027,
       'hotels': 19014, 'servicestations': 19007}

# set the search parameters: limit of results and search radius
limit = 50
radius = 500

# set the name of the city to search for POIs
city = 'Vancouver'


# Foursquare


- Send a request to Foursquare with a small radius (1000m) for all the bike stations in your city of choice.
- Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)
- Put your parsed results into a DataFrame

In [2]:
# Retrieve the Foursquare API key from environment variables
FOUR_SQUARE_API = os.getenv('FOURSQUARE')

# Define the base URL and endpoint for the Foursquare API
base_url = "https://api.foursquare.com"
endpoint = '/v3/places/search'

# Define a function to retrieve data from the Foursquare API
def get_fs_data(df, categories):

    # Initialize an empty list to store the raw data from Foursquare
    foursquare_raw = []

    # Iterate over the categories dictionary
    for key, value in categories.items():

        # Iterate over the rows in the input dataframe
        for index, row in df.iterrows():

            # Define the parameters for the API request
            params = f"?ll={row['Latitude']}%2C{row['Longitude']}&radius={radius}&categories={value}&limit={limit}"
            full_url = f"{base_url}{endpoint}{params}"

            # Define the headers for the API request
            headers = {
                "accept": "application/json",
                "Authorization": f"{FOUR_SQUARE_API}"
            }

            # Send the API request and retrieve the results
            res = req.get(full_url, headers=headers).json()['results']

            # Iterate over the results and add the relevant data to the foursquare_raw list
            for i in res:
                foursquare_raw.append({
                    "fsq_id": i['fsq_id'],
                    "category_id": value,
                    "category_name": key,
                    "chains": i['chains'],
                    "distance": i['distance'],
                    "latitude": i['geocodes']['main']['latitude'],
                    "longitude": i['geocodes']['main']['longitude'],
                    "location_country": i.get('location', {}).get('country', None),
                    "location_cross_street": i.get('location', {}).get('cross_street', None),
                    "location_formatted_address": i['location']['formatted_address'],
                    "location_locality": i.get('location', {}).get('locality', None),
                    "location_postcode": i.get('location', {}).get('postcode', None),
                    "location_region": i.get('location', {}).get('region', None),
                    "location_timezone": i.get('timezone', None),
                    "name": i['name'],
                    "bike_station_id": row['Id'],
                })

    # Return the raw data from Foursquare
    return foursquare_raw


In [4]:
# # Calling a function 'get_fs_data' to retrieve financial statement data and creating a dataframe 'fs_data' to store it
# fs_data = pd.DataFrame(get_fs_data(cb,POI)).sample(n=2500)

# # Saving the dataframe 'fs_data' as a CSV file to the specified path
# fs_data.to_csv(assets_path + 'fs_data.csv', index=False)

# Reading the saved CSV file from the specified path and storing the data in the dataframe 'fs_data'
fs_data = pd.read_csv(assets_path + 'fs_data.csv')

# Displaying the first few rows of the dataframe 'fs_data'
fs_data.head()

Unnamed: 0,fsq_id,category_id,category_name,chains,distance,latitude,longitude,location_country,location_cross_street,location_formatted_address,location_locality,location_postcode,location_region,location_timezone,name,bike_station_id
8329,8e5beb5205f149ec8254f51d,13003,bars,[],326,49.280365,-123.123017,CA,,"938 Howe St, Vancouver BC V6Z 1N9",Vancouver,V6Z 1N9,BC,,Ceili's Irish Pub Van Ltd,fad9ebcf614dd1a72593a34072ff76f8
11501,4b99bfbdf964a520e38f35e3,19014,hotels,[],501,49.286283,-123.116951,CA,Hastings,"921 Pender St W (Hastings), Vancouver BC V6C 1M2",Vancouver,V6C 1M2,BC,America/Vancouver,Days Inn by Wyndham Vancouver Downtown,b3bb9f58d3530d33a1a09b6be9e973e0
11941,51f1db45498e806a76d53301,19014,hotels,[],469,49.283128,-123.116519,CA,,"610 Granville St, Vancouver BC V6C 3T3",Vancouver,V6C 3T3,BC,America/Vancouver,The Hudson,0b543fc4e694fe07a54dac48bb1b3390
3186,553bdde9498e4eca654a36d3,13065,restaurants,[],375,49.283844,-123.120961,CA,btwn Burrard & Hornby St,"900 Georgia St W (btwn Burrard & Hornby St), V...",Vancouver,V6C 2W6,BC,America/Vancouver,Notch8 Restaurant,bf8408067b0e0c963f3ff526977bcef3
2688,4f1b5861e4b08382320d3f2e,13065,restaurants,[],273,49.283798,-123.064405,CA,btwn Victoria & Semlin,"1950 Triumph St (btwn Victoria & Semlin), Vanc...",Vancouver,V5L 1K5,BC,America/Vancouver,Parallel 49 Brewing Co,7c17f51469f1a7145455f65ef6afbd87


# Yelp


- Send a request to Yelp with a small radius (1000m) for all the bike stations in your city of choice.
- Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)
- Put your parsed results into a DataFrame


In [5]:
# Retrieving Yelp API key from environment variables
YELP_API = os.getenv('YELP')

# Setting base URL and endpoint for Yelp API
base_url = "https://api.yelp.com"
endpoint = '/v3/businesses/search'

# Defining function to retrieve Yelp data
def get_yelp_data(df, categories):
    # Initializing empty list to store Yelp data
    yelp_raw = []

    # Looping through each category and location in the input dictionary
    for key, value in categories.items():
        for index, row in df.iterrows():
            # Building API query parameters
            params = f"?latitude={row['Latitude']}&longitude={row['Longitude']}&radius={radius}&categories={value}&sort_by=best_match&limit={limit}"
            # Combining base URL, endpoint, and query parameters to form full API URL
            full_url = f"{base_url}{endpoint}{params}"
            # Adding authorization header with Yelp API key
            headers = {
                "accept": "application/json",
                "Authorization": f"Bearer {YELP_API}"
            }
            # Making API request and retrieving JSON response
            res = req.get(full_url, headers=headers).json()['businesses']
            
            # Looping through each business in the response and adding relevant data to the list
            for i in res:
                yelp_raw.append({
                    'name': i['name'],
                    "yelp_id": i.get('id', None),
                    "category_id": value,
                    "category_name": key,
                    'rating': i.get('rating', None),
                    'review_count': i.get('review_count', None),
                    'price': i.get('price', None),
                    'distance': i.get('distance', None),
                    "latitude": i.get('coordinates', {}).get('latitude', None),
                    "longitude": i.get('coordinates', {}).get('longitude', None),
                    "bike_station_id": row['Id'],
                })

    # Returning the list of Yelp data
    return yelp_raw


In [6]:
# # Create a pandas DataFrame object by calling the `get_yelp_data()` function, passing in two arguments: 
# # `cb` (a Yelp API client object) and `POI` (a point of interest location).
# yelp_data = pd.DataFrame(get_yelp_data(cb, POI)).sample(n=2500)

# # Save the DataFrame as a CSV file, excluding the index column, to the specified file path in the assets directory.
# yelp_data.to_csv(assets_path + 'yelp_data.csv', index=False)

# Read the CSV file containing Yelp data into a new DataFrame object.
yelp_data = pd.read_csv(assets_path + 'yelp_data.csv')

# Display the first 5 rows of the DataFrame to check that data was loaded successfully.
yelp_data.head()


Unnamed: 0,name,yelp_id,category_id,category_name,rating,review_count,price,distance,latitude,longitude,bike_station_id
33787,The Shameful Tiki Room,CwL5jwXhImT_7K5IB7mOvA,10027,museums,4.5,361,$$,510.925128,49.245958,-123.100954,97388f4e4ea102a30345524bda465f4e
45703,Donair Dude,hQoFgRJhYZn4AM5uE3c9fA,19007,servicestations,3.5,264,$,455.811297,49.281217,-123.132758,a6aec7b8c25da4c02829cd7839b8933d
17234,Salad Loop,1h7FlHHPXpaRBAi9WAq0IA,13003,bars,4.5,3,$,457.613461,49.26082,-123.125595,2ca388bba54706770900ca2a9d7aa5be
48260,Bistro Sakana,0zErZVaaOhJUV1lNEqEc6Q,19007,servicestations,3.5,133,$$$,238.161396,49.27528,-123.120964,fedff7a263c182df94bda7307059cc52
50063,Juno Provisions,9ltXAbwXKZa7FECq9tdGgg,19007,servicestations,4.0,12,,134.81742,49.264409,-123.070016,983e58671adcc67af94457d7887dbb15


# Comparing Results


Which API provided you with more complete data? Provide an explanation.


- FS provides only companies names, categories and coordinates whereas YELP gives basic coordinates to find place and on top of that rating, review count, price category and others(not included)
- Yelp return more data for the same radius.

In [10]:
fs_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12591 entries, 0 to 12590
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   fsq_id                      12591 non-null  object 
 1   category_id                 12591 non-null  int64  
 2   category_name               12591 non-null  object 
 3   chains                      12591 non-null  object 
 4   distance                    12591 non-null  int64  
 5   latitude                    12591 non-null  float64
 6   longitude                   12591 non-null  float64
 7   location_country            12591 non-null  object 
 8   location_cross_street       6158 non-null   object 
 9   location_formatted_address  12591 non-null  object 
 10  location_locality           12591 non-null  object 
 11  location_postcode           11913 non-null  object 
 12  location_region             12591 non-null  object 
 13  location_timezone           966

In [11]:
yelp_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52176 entries, 0 to 52175
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             52176 non-null  object 
 1   yelp_id          52176 non-null  object 
 2   category_id      52176 non-null  int64  
 3   category_name    52176 non-null  object 
 4   rating           52176 non-null  float64
 5   review_count     52176 non-null  int64  
 6   price            37103 non-null  object 
 7   distance         52176 non-null  float64
 8   latitude         52170 non-null  float64
 9   longitude        52170 non-null  float64
 10  bike_station_id  52176 non-null  object 
dtypes: float64(4), int64(2), object(5)
memory usage: 4.4+ MB


Get the top 10 restaurants according to their rating


In [12]:
yelp_data.sort_values('rating', ascending=False).head(10)


Unnamed: 0,name,yelp_id,category_id,category_name,rating,review_count,price,distance,latitude,longitude,bike_station_id
19782,Manoush'eh,K1nbiOrySlw_-XG-3NmErQ,19020,parking,5.0,225,$,413.179279,49.276671,-123.125701,6620c54bd6b3a79800b2b67e999b24d0
17464,Word.,zu58Fz_lur97NGf4GjwQYA,19020,parking,5.0,1,,557.568787,49.266923,-123.091882,17a3320a52bcc62161908967f8a06613
17429,Granville Island Water Park,TMJ04QPJx-xf-UtpUk-AMQ,19020,parking,5.0,3,,558.858379,49.269683,-123.133993,988f8f74c4a12d1a30ddf2f7cd6318f8
17434,International Sandwich Factory,IxdERYPleFd5aPuer7tABQ,19020,parking,5.0,1,,485.896133,49.263305,-123.128656,988f8f74c4a12d1a30ddf2f7cd6318f8
17437,The Cranky Old Fork,aI2C7Kma_RrnWD9Sh-4HvQ,19020,parking,5.0,1,,437.764582,49.26375,-123.12711,988f8f74c4a12d1a30ddf2f7cd6318f8
17438,Granville Island Turkey Trot,jwmsT8wdUQgzL08WPePdDg,19020,parking,5.0,1,,353.355189,49.26903,-123.13178,988f8f74c4a12d1a30ddf2f7cd6318f8
25011,Uncle Pu's Sichuan Taste,vSOo_zxOwnye6ZWGgmoKkA,19020,parking,5.0,1,,91.481523,49.264007,-123.174955,9c1397c06e07bf74fc049f1a0872eea0
43664,The Magnet,JGZbGEKXmZ7OeqjYaGp4bw,19007,servicestations,5.0,16,,357.204221,49.282402,-123.111079,9413725a5d3bb5ce2ad0682b07a3ffab
17453,Quizine Kitchen,s2SbCuEKb85ETbzCOwj7TA,19020,parking,5.0,6,,336.883108,49.262535,-123.080673,17a3320a52bcc62161908967f8a06613
24999,Starbucks,2SQpPcjdPfEvUJGS5I4PLg,19020,parking,5.0,1,,18.12448,49.264354,-123.173773,9c1397c06e07bf74fc049f1a0872eea0
