In [63]:
# imports
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from geopy.distance import great_circle
from geopy.point import Point
import numpy as np
import pandas as pd
import requests
from dotenv import load_dotenv
import os

load_dotenv()

True

In [64]:
# Extract csv data for get the long and lat
df = pd.read_csv("../data/bike_stations.csv")

# Extract coordinates
coords = df[['latitude', 'longitude']].to_numpy()

# Convert meters to radians (for DBSCAN with haversine distance)
kms_per_radian = 6371.0088
epsilon = 0.5 / kms_per_radian  # 0.5 km = 500 meters

# Apply DBSCAN clustering
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine')
df['cluster'] = db.fit_predict(np.radians(coords))

# print("Clusters assigned:")
# print(df[['name', 'latitude', 'longitude', 'cluster']].head())
# Mean coordinates per cluster
cluster_centroids = df.groupby('cluster')[['latitude', 'longitude']].mean().reset_index()
print(cluster_centroids.head())

   cluster   latitude  longitude
0        0  43.684513 -79.373033
1        1  43.665939 -79.311547
2        2  43.691500 -79.294351
3        3  43.655026 -79.393131
4        4  43.762444 -79.500654


# Foursquare

Send a request to Foursquare with a small radius (1000m) for all the bike stations in your city of choice. 

In [84]:
def get_foursquare_places(lat, lon, radius=500, category="13065"):  # Food & Drink category
    api_key = os.getenv('FOURSQUARE_API_KEY')
    url = "https://places-api.foursquare.com/places/search"
    # url = "https://api.foursquare.com/v3/places/search"
    headers = {
    "Accept": "application/json",
    "Authorization": f"Bearer {api_key}",
    "X-Places-API-Version": '2025-06-17'  # Use this format, not a future date
}
    params = {
        "ll": f"{lat},{lon}",
        "radius": radius,
        "categories": category,
        "limit": 50
    }
    
    response = requests.get(url, headers=headers, params=params)
    
    if response.status_code == 200:
        return response.json().get("results", [])
    else:
        print(f"Foursquare API error at ({lat},{lon}): {response.status_code}")
        return []

Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

In [85]:
fsq_pois = []
#  Apply clustering (e.g., 10 clusters)
kmeans = KMeans(n_clusters=10, random_state=0)
df['cluster'] = kmeans.fit_predict(coords)

# Get centroids
centroids = kmeans.cluster_centers_
cluster_centroids = pd.DataFrame(centroids, columns=['latitude', 'longitude'])
cluster_centroids['cluster'] = cluster_centroids.index

for _, row in cluster_centroids.iterrows():
    lat, lon = row['latitude'], row['longitude']
    results = get_foursquare_places(lat, lon)
    
    for r in results:
        fsq_pois.append({
            "cluster": row['cluster'],
            "name": r.get("name"),
            "category": r["categories"][0]["name"] if r.get("categories") else None,
            "lat": r["latitude"],
            "lon": r["longitude"]
        })

Put your parsed results into a DataFrame

In [87]:
df_fsq = pd.DataFrame(fsq_pois)
print(df_fsq.head(10))

   cluster                             name             category        lat  \
0      0.0          St. Michael's Cathedral               Church  43.655007   
1      0.0  Hokkaido Ramen Santouka らーめん山頭火     Ramen Restaurant  43.656435   
2      0.0                   Kyoto Katsugyu  Japanese Restaurant  43.656890   
3      0.0                  Mackenzie House             Monument  43.655678   
4      0.0                    Page One Cafe                 Café  43.657243   
5      0.0                     Burrito Boyz   Burrito Restaurant  43.656331   
6      0.0                         SukoThai     Asian Restaurant  43.655528   
7      0.0                   Ali Basha Cafe           Hookah Bar  43.656728   
8      0.0                      Jazz Bistro          Music Venue  43.655678   
9      0.0           The Senator Restaurant                Diner  43.655641   

         lon  
0 -79.377061  
1 -79.377586  
2 -79.376245  
3 -79.378250  
4 -79.376021  
5 -79.378541  
6 -79.374907  
7 -79.3753

# Yelp

Send a request to Yelp with a small radius (1000m) for all the bike stations in your city of choice. 

In [75]:
def get_yelp_data(lat, lon, radius=300):
    import requests
    api_key = os.getenv('YELP_API_KEY')
    YELP_API_KEY = "your_yelp_api_key"
    url = "https://api.yelp.com/v3/businesses/search"
    headers = {"Authorization": f"Bearer {api_key}"}
    params = {
        "latitude": lat,
        "longitude": lon,
        "radius": radius,
        "term": "restaurant",
        "limit": 50
    }

    response = requests.get(url, headers=headers, params=params)
    
    return response.json()

Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

In [101]:
poi_data = []

#  Apply clustering (e.g., 10 clusters)
kmeans = KMeans(n_clusters=10, random_state=0)
df['cluster'] = kmeans.fit_predict(coords)

# Get centroids
centroids = kmeans.cluster_centers_
cluster_centers = pd.DataFrame(centroids, columns=['latitude', 'longitude'])
cluster_centers['cluster'] = cluster_centers.index

for _, row in cluster_centers.iterrows():
    lat, lon = row['latitude'], row['longitude']
    result = get_yelp_data(lat, lon)
    
    for biz in result.get("businesses", []):
        
        poi_data.append({
            "cluster": row['cluster'],
            "name": biz['name'],
            "category": biz['categories'][0]['title'] if biz.get("categories") else None,
            "rating": biz.get("rating"),
            "review_count": biz.get("review_count"),
            "latitude": biz['coordinates']['latitude'],
            "longitude": biz['coordinates']['longitude'],
            "address": biz['location']['display_address']
        })

Put your parsed results into a DataFrame

In [109]:
df_ylp = pd.DataFrame(poi_data)
df_ylp.shape[0]
# df_ylp.head()

208

# Comparing Results

Which API provided you with more complete data? Provide an explanation. 


Both Four square and Yelp provide the category of the name pf places which is good. But when we compare all the data, Yelp provides more complete data as it includes 
1. review counts, user ratings of customers
2. distance of the location( coverage)
3. Detail information about the place like address and business hours 

Get the top 10 restaurants according to their rating

In [112]:
# As the YELP has detail data we are choosing YELP
restaurants_df = df_ylp[df_ylp['category'].str.contains('Restaurant', case=False, na=False)]
df_ylp.shape[0]

# top10_restaurants = restaurants_df.sort_values(by='rating', ascending=False).head()

# top10_restaurants.head(10)
# print(top10_restaurants[['name', 'rating', 'review_count', 'address']])

208