In [1]:
import pandas as pd
import geopandas as gpd
from meteostat import Stations, Hourly
from datetime import datetime
import numpy as np
from scipy.spatial import cKDTree
from geopy.distance import geodesic
import h3
import folium

In [2]:
df = pd.read_csv('Taxi_Trips__2013-2023__20240713.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6495570 entries, 0 to 6495569
Data columns (total 24 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   Trip ID                     object 
 1   Taxi ID                     object 
 2   Trip Start Timestamp        object 
 3   Trip End Timestamp          object 
 4   Trip Seconds                float64
 5   Trip Miles                  float64
 6   Pickup Census Tract         float64
 7   Dropoff Census Tract        float64
 8   Pickup Community Area       float64
 9   Dropoff Community Area      float64
 10  Fare                        float64
 11  Tips                        float64
 12  Tolls                       float64
 13  Extras                      float64
 14  Trip Total                  float64
 15  Payment Type                object 
 16  Company                     object 
 17  Pickup Centroid Latitude    float64
 18  Pickup Centroid Longitude   float64
 19  Pickup Centroid Locat

In [4]:
#Convert columns to string
df[['Pickup Census Tract','Dropoff Census Tract', 
    'Pickup Community Area','Dropoff Community Area']] = df[['Pickup Census Tract','Dropoff Census Tract', 
                                                             'Pickup Community Area','Dropoff Community Area']].fillna(0).astype(int).astype(str).replace('0', None)

df['Trip Start Timestamp'] = pd.to_datetime(df['Trip Start Timestamp'])
df['Trip End Timestamp'] = pd.to_datetime(df['Trip End Timestamp'])

df.drop('Community Areas', axis=1, inplace=True)



In [5]:
df.isnull().mean() * 100

Trip ID                        0.000000
Taxi ID                        0.002386
Trip Start Timestamp           0.000000
Trip End Timestamp             0.001278
Trip Seconds                   0.019783
Trip Miles                     0.000308
Pickup Census Tract           55.679271
Dropoff Census Tract          56.811211
Pickup Community Area          3.726186
Dropoff Community Area         9.538316
Fare                           0.186943
Tips                           0.186943
Tolls                          0.186943
Extras                         0.186943
Trip Total                     0.186943
Payment Type                   0.000000
Company                        0.000000
Pickup Centroid Latitude       3.689407
Pickup Centroid Longitude      3.689407
Pickup Centroid Location       3.689407
Dropoff Centroid Latitude      8.928300
Dropoff Centroid Longitude     8.928300
Dropoff Centroid  Location     8.928300
dtype: float64

Definition: The longitude (latitude) of the center of the pickup census tract or the community area if the census tract has been hidden for privacy. This column often will be blank for locations outside Chicago.

--> Two issues:
- 1. Null values for locations outside Chicago
  2. Mix longitude (latitude) of census tract and community area

--> If "Pickup Census Tract" is null, the longitude (latitude) is from "Community Area". Otherwise, the longitude (latitude) is from "Census Tract"

Strategy: 
- It is required to use "Census Tract" + No way to impute missing "Census Tract" --> Drop all missing values in Census Tract
- Use information from "Census Tract" to fill in missing values in "Community Areas" and longitudes and latitudes.
- Can drop missing values in other columns for simplicity


In [6]:
df = df.dropna(subset=['Pickup Census Tract', 'Dropoff Census Tract'])

In [7]:
df.isnull().mean() * 100

Trip ID                       0.000000
Taxi ID                       0.000000
Trip Start Timestamp          0.000000
Trip End Timestamp            0.000000
Trip Seconds                  0.015837
Trip Miles                    0.000253
Pickup Census Tract           0.000000
Dropoff Census Tract          0.000000
Pickup Community Area         0.112449
Dropoff Community Area        1.638250
Fare                          0.262237
Tips                          0.262237
Tolls                         0.262237
Extras                        0.262237
Trip Total                    0.262237
Payment Type                  0.000000
Company                       0.000000
Pickup Centroid Latitude      0.028572
Pickup Centroid Longitude     0.028572
Pickup Centroid Location      0.028572
Dropoff Centroid Latitude     0.254156
Dropoff Centroid Longitude    0.254156
Dropoff Centroid  Location    0.254156
dtype: float64

In [8]:
# Load the shapefiles or GeoJSON files
census_tracts = gpd.read_file('census_tract/geo_export_0caa4a0b-c5b9-4cef-b66d-72eced332409.shp')
#community_areas = gpd.read_file('community_area/geo_export_0ec1e45e-aa60-4dc6-a5c8-cb395ee19c94.shp')

census_tracts['count_areas'] = census_tracts.groupby('geoid10')['commarea'].transform('nunique')

census_tracts['count_areas'].unique()

array([1])

In Chicago, one Census Tract belongs exactly to one Community Area, so we can fill missing Community Area with Census Tract

In [9]:
census_tracts['centroid'] = census_tracts.geometry.centroid
census_tracts['latitude'] = census_tracts['centroid'].y
census_tracts['longitude'] = census_tracts['centroid'].x




In [10]:
df = pd.merge(df, census_tracts[['geoid10','commarea','centroid','latitude','longitude']], left_on='Pickup Census Tract', right_on='geoid10')
df['Pickup Community Area'] = df['Pickup Community Area'].fillna(df['commarea'])
df['Pickup Centroid Latitude'] = df['Pickup Centroid Latitude'].fillna(df['latitude'])
df['Pickup Centroid Longitude'] = df['Pickup Centroid Longitude'].fillna(df['longitude'])
df['Pickup Centroid Location'] = df['Pickup Centroid Location'].fillna(df['centroid'])
df = df.drop(columns=['geoid10','commarea','centroid','latitude','longitude'])

df = pd.merge(df, census_tracts[['geoid10','commarea','centroid','latitude','longitude']], left_on='Dropoff Census Tract', right_on='geoid10')
df['Dropoff Community Area'] = df['Dropoff Community Area'].fillna(df['commarea'])
df['Dropoff Centroid Latitude'] = df['Dropoff Centroid Latitude'].fillna(df['latitude'])
df['Dropoff Centroid Longitude'] = df['Dropoff Centroid Longitude'].fillna(df['longitude'])
df['Dropoff Centroid  Location'] = df['Dropoff Centroid  Location'].fillna(df['centroid'])
df = df.drop(columns=['geoid10','commarea','centroid','latitude','longitude'])

df = df.dropna()

In [11]:
df.isnull().mean() * 100

Trip ID                       0.0
Taxi ID                       0.0
Trip Start Timestamp          0.0
Trip End Timestamp            0.0
Trip Seconds                  0.0
Trip Miles                    0.0
Pickup Census Tract           0.0
Dropoff Census Tract          0.0
Pickup Community Area         0.0
Dropoff Community Area        0.0
Fare                          0.0
Tips                          0.0
Tolls                         0.0
Extras                        0.0
Trip Total                    0.0
Payment Type                  0.0
Company                       0.0
Pickup Centroid Latitude      0.0
Pickup Centroid Longitude     0.0
Pickup Centroid Location      0.0
Dropoff Centroid Latitude     0.0
Dropoff Centroid Longitude    0.0
Dropoff Centroid  Location    0.0
dtype: float64

In [12]:
df

Unnamed: 0,Trip ID,Taxi ID,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Pickup Community Area,Dropoff Community Area,...,Extras,Trip Total,Payment Type,Company,Pickup Centroid Latitude,Pickup Centroid Longitude,Pickup Centroid Location,Dropoff Centroid Latitude,Dropoff Centroid Longitude,Dropoff Centroid Location
0,716d7a0a2a097facc3f0f63e326830ecdf923d0a,2d72c5e6313ad93f663008a55045cad0c76164b057dcb7...,2023-12-31 23:45:00,2024-01-01 00:00:00,649.0,2.57,17031833000,17031330100,28,33,...,1.0,12.50,Credit Card,City Service,41.885281,-87.657233,POINT (-87.6572331997 41.8852813201),41.859350,-87.617358,POINT (-87.6173580061 41.859349715)
1,8142f4b1547f4e4a683a80b5a6c7d0325ce09559,f75191fdf728d7ed7f4277ee1e39372c16658b87abc26a...,2023-12-31 23:45:00,2024-01-01 00:00:00,600.0,1.10,17031320100,17031320400,32,32,...,2.0,9.50,Cash,Chicago Independents,41.884987,-87.620993,POINT (-87.6209929134 41.8849871918),41.877406,-87.621972,POINT (-87.6219716519 41.8774061234)
2,b68a7310d2ba573ce09f55fa546408264e0b3dd7,9454a3cb5d7e8ef84ca9bfff9f1a5d235021fef66b41a8...,2023-12-31 23:45:00,2024-01-01 00:00:00,755.0,2.37,17031833000,17031081201,28,8,...,0.0,11.28,Mobile,City Service,41.885281,-87.657233,POINT (-87.6572331997 41.8852813201),41.899156,-87.626211,POINT (-87.6262105324 41.8991556134)
3,ec183abaa7ff142f17ebcdafa1f3d4e611a9f494,f6d1b6c930d62f6d8cbbd8f86a593ff057408c82f76474...,2023-12-31 23:45:00,2024-01-01 00:00:00,786.0,2.02,17031081500,17031330100,8,33,...,0.0,10.00,Cash,Chicago Independents,41.892508,-87.626215,POINT (-87.6262149064 41.8925077809),41.859350,-87.617358,POINT (-87.6173580061 41.859349715)
4,ed445ada05f17c5f359892eda3c329e1445b5e7b,4b034948aceedd53262ae713f864b0364953a1852b6b24...,2023-12-31 23:45:00,2023-12-31 23:45:00,4.0,0.00,17031320100,17031320100,32,32,...,0.0,25.62,Credit Card,Sun Taxi,41.884987,-87.620993,POINT (-87.6209929134 41.8849871918),41.884987,-87.620993,POINT (-87.6209929134 41.8849871918)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2726032,3f07cb261574b204709ba5337494527094a9bae4,4ab7a7510c1ebcc9b2e3eaa7bdd6508dbea34da7986aca...,2023-01-01 00:00:00,2023-01-01 00:15:00,1341.0,16.63,17031980000,17031081500,76,8,...,6.0,53.00,Credit Card,Sun Taxi,41.979071,-87.903040,POINT (-87.9030396611 41.9790708201),41.892508,-87.626215,POINT (-87.6262149064 41.8925077809)
2726033,61ddfa9c7fecac1b43962d8447fa930371377925,90738dbca5b9c7c2de984bf7e96a81569178364ca8cd29...,2023-01-01 00:00:00,2023-01-01 00:00:00,384.0,0.48,17031081500,17031081403,8,8,...,0.0,8.00,Credit Card,Flash Cab,41.892508,-87.626215,POINT (-87.6262149064 41.8925077809),41.890922,-87.618868,POINT (-87.6188683546 41.8909220259)
2726034,5fa9587952ed348823fc68ce1e25f5cd031a5961,0602c4dcde4b0fa95e24da18797128a90565512392fffd...,2023-01-01 00:00:00,2023-01-01 00:30:00,1764.0,15.46,17031980000,17031071500,76,7,...,8.0,49.50,Cash,Medallion Leasin,41.979071,-87.903040,POINT (-87.9030396611 41.9790708201),41.914616,-87.631717,POINT (-87.6317173661 41.9146162864)
2726035,6e26e8341d032293c24990114d321d1607a83fd2,dba6a86e74669ab2eb5130718c23bf1800b66a4ec88836...,2023-01-01 00:00:00,2023-01-01 00:15:00,734.0,0.87,17031839100,17031081700,32,8,...,0.0,7.75,Cash,5 Star Taxi,41.880994,-87.632746,POINT (-87.6327464887 41.8809944707),41.892042,-87.631864,POINT (-87.6318639497 41.8920421365)


In [13]:
# Weather data

In [14]:
start = datetime(2023, 1, 1)
end = datetime(2024, 1, 1)

# Get weather stations in Chicago
stations = Stations()
chicago_stations = stations.nearby(41.8781, -87.6298)  # Latitude and Longitude of Chicago
station_list = chicago_stations.fetch(6) #6 closest stations in Chicago
station_list = station_list.drop('KCGX0', axis=0) #Remove one unavailable station

weather_data = Hourly(list(station_list.index), start, end) 
weather_df = weather_data.fetch()
weather_df.reset_index(inplace =True)
weather_df['time'] = pd.to_datetime(weather_df['time'])
weather_df.drop(['snow','wpgt','tsun'],axis=1,inplace=True) #weather stations do not track this info

In [15]:
# Idea from ChatGPT to shorten the time
gdf_stations = gpd.GeoDataFrame(station_list, geometry=gpd.points_from_xy(station_list.longitude, station_list.latitude))
gdf_stations.crs = "EPSG:4326"  # already used in census_tracts

gdf_taxis = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Pickup Centroid Longitude'], df['Pickup Centroid Latitude']))
gdf_taxis.crs = "EPSG:4326"  

tree = cKDTree(np.array(list(zip(gdf_stations.geometry.x, gdf_stations.geometry.y))))

closest_stations = []
for point in gdf_taxis.geometry:
    dist, idx = tree.query(np.array([point.x, point.y]), k=1)
    closest_stations.append(gdf_stations.iloc[idx].name)

df['closest_station'] = closest_stations

In [16]:
# Merge based on time and closest station
df = pd.merge_asof(df.sort_values('Trip Start Timestamp'), weather_df.sort_values('time'), left_on='Trip Start Timestamp', right_on='time', left_by='closest_station',right_by='station', direction='nearest', tolerance=pd.Timedelta('1 hour'))


In [17]:
df

Unnamed: 0,Trip ID,Taxi ID,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Pickup Community Area,Dropoff Community Area,...,station,time,temp,dwpt,rhum,prcp,wdir,wspd,pres,coco
0,8979106b3da9c97bce2e834ecfc32658d8878a77,20748c08a2cc5aef4b6f0f88f0e78db32cac1a858f7e09...,2023-01-01 00:00:00,2023-01-01 00:15:00,351.0,0.22,17031081700,17031081700,8,8,...,72534,2023-01-01,2.0,-1.3,79.0,0.0,121.0,6.0,1013.0,3.0
1,f8106d60f1c7777e068395b5b8b8ad97f6d9bb25,13c0599d1bce4a6239d30c3feeba903749ab197df436dc...,2023-01-01 00:00:00,2023-01-01 00:00:00,486.0,0.78,17031081700,17031081401,8,8,...,72534,2023-01-01,2.0,-1.3,79.0,0.0,121.0,6.0,1013.0,3.0
2,2f5125aa2c7916ad6957abd266dfcacf83d81cfb,b9ce4a1df5c8bf18e20212f8310e1b1e72b44dff6c9157...,2023-01-01 00:00:00,2023-01-01 00:15:00,507.0,0.52,17031081700,17031081500,8,8,...,72534,2023-01-01,2.0,-1.3,79.0,0.0,121.0,6.0,1013.0,3.0
3,6e26e8341d032293c24990114d321d1607a83fd2,dba6a86e74669ab2eb5130718c23bf1800b66a4ec88836...,2023-01-01 00:00:00,2023-01-01 00:15:00,734.0,0.87,17031839100,17031081700,32,8,...,72534,2023-01-01,2.0,-1.3,79.0,0.0,121.0,6.0,1013.0,3.0
4,61ddfa9c7fecac1b43962d8447fa930371377925,90738dbca5b9c7c2de984bf7e96a81569178364ca8cd29...,2023-01-01 00:00:00,2023-01-01 00:00:00,384.0,0.48,17031081500,17031081403,8,8,...,72534,2023-01-01,2.0,-1.3,79.0,0.0,121.0,6.0,1013.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2718773,363810b6cfd667eace3ef3266ec553a546729ff5,847cf962bd6f62040673e6c24c24940aeb2d7fdaa54677...,2023-12-31 23:45:00,2024-01-01 00:00:00,420.0,0.90,17031320100,17031320400,32,32,...,72534,2024-01-01,1.0,-2.2,79.0,0.0,320.0,24.0,1018.0,3.0
2718774,4e366fa290c59b3d3c6ced770bc8b6b1d3519a0c,071d031c64f608418d27905c9ffe95bf52695615683d5f...,2023-12-31 23:45:00,2023-12-31 23:45:00,709.0,1.21,17031081202,17031081700,8,8,...,72534,2024-01-01,1.0,-2.2,79.0,0.0,320.0,24.0,1018.0,3.0
2718775,52297b62f701559c614a0b281f4e0e0dda83d39d,b5f958b8508ba66e43f28559d6d54c66b9e7a7df9b45da...,2023-12-31 23:45:00,2024-01-01 00:15:00,1703.0,18.29,17031980000,17031320100,76,32,...,72530,2024-01-01,0.6,-2.1,82.0,0.0,320.0,27.7,1017.4,14.0
2718776,0ed132d8a023dde8abb89dba58fa04f35fe9bdb1,80919866bf50fd1163efd329eb8a9fdf198fda2465473a...,2023-12-31 23:45:00,2024-01-01 00:00:00,769.0,0.74,17031081800,17031081500,8,8,...,72534,2024-01-01,1.0,-2.2,79.0,0.0,320.0,24.0,1018.0,3.0


In [18]:
#Use geometry 
def polygon_to_h3(polygon, resolution):
    geojson = polygon.__geo_interface__
    exterior = geojson['coordinates'][0]
    hexagons = h3.polyfill({
        'type': 'Polygon',
        'coordinates': [exterior]
    }, resolution, geo_json_conformant=True)
    return list(hexagons)

resolution = [8,9,10]
for res in resolution:
    census_tracts[f'hexagons_{res}'] = census_tracts['geometry'].apply(polygon_to_h3, resolution=res)

In [19]:
def plot_hexagons(hex_ids, map_object):
    for hex_id in hex_ids:
        hex_boundary = h3.h3_to_geo_boundary(hex_id, geo_json=True)
        hex_boundary = [(coord[1], coord[0]) for coord in hex_boundary]
        folium.Polygon(locations=hex_boundary, color='blue', fill=True, fill_opacity=0.5).add_to(map_object)

center_lat = census_tracts.geometry.centroid.y.mean()
center_lon = census_tracts.geometry.centroid.x.mean()
map = folium.Map(location=[center_lat, center_lon], zoom_start=10)

for hex_list in census_tracts['hexagons_8']:
    plot_hexagons(hex_list, map)

map





In [22]:
import osmnx as ox

place_name = "Chicago, Illinois, USA"
tags = {'amenity': 'cafe'}
#tags = {'boundariy': 'statistical'}
# Get cafes in Chicago
stat = ox.geometries_from_place(place_name, tags)






In [23]:
stat

Unnamed: 0_level_0,Unnamed: 1_level_0,amenity,name,geometry,addr:city,addr:housenumber,addr:postcode,addr:state,addr:street,branch,brand,...,nodes,building,building:levels,roof:shape,diet:kosher:certification,diet:kosher:chalav_yisrael,diet:kosher:dairy,diet:kosher:pas_yisrael,diet:kosher:yoshon,start_date
element_type,osmid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
node,318276729,cafe,Park Place Cafe,POINT (-87.63321 41.92084),,,,,,,,...,,,,,,,,,,
node,346234680,cafe,Starbucks,POINT (-87.67616 41.96164),Chicago,1900,60613,IL,West Montrose Avenue,1900 West Montrose,Starbucks,...,,,,,,,,,,
node,347159885,cafe,Starbucks,POINT (-87.63489 41.9113),Chicago,210,60610,IL,West North Avenue,,Starbucks,...,,,,,,,,,,
node,347162170,cafe,Savor the Flavor,POINT (-87.65337 41.92853),,,,,,,,...,,,,,,,,,,
node,347301872,cafe,Starbucks,POINT (-87.67893 41.9547),Chicago,4015,60618,IL,North Lincoln Avenue,Lincoln/Damen/Irving,Starbucks,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
way,1162077529,cafe,Starbucks,"POLYGON ((-87.8073 41.9831, -87.80739 41.9831,...",Chicago,5614,60656,IL,North Harlem Avenue,Harlem & Bryn Mawr,Starbucks,...,"[10807594196, 11482311282, 11482311283, 114823...",retail,,,,,,,,
way,1178693753,cafe,Cyrano's Café & Wine Bar,"POLYGON ((-87.62124 41.88829, -87.62125 41.888...",,,,,,,,...,"[4306418293, 10950680190, 10950680191, 1095068...",yes,,,,,,,,
way,1182086850,cafe,HERO Coffee Bar,"POLYGON ((-87.62673 41.87849, -87.62671 41.878...",Chicago,22,60604,IL,East Jackson Boulevard,,,...,"[10977487787, 2330813788, 10977487789, 1097748...",yes,,,,,,,,
way,1219122205,cafe,Starbucks,"POLYGON ((-87.80578 41.91655, -87.80574 41.916...",Chicago,2007,60707,IL,North Harlem Avenue,Harlem and Armitage,Starbucks,...,"[11296630078, 11296630082, 11296630085, 112966...",retail,,,,,,,,


In [21]:
#Use centroid lat and lon

resolution = 9  

def get_covering_hexagons(lat, lon, resolution):
    hex_id = h3.geo_to_h3(lat, lon, resolution)
    hexagons = h3.k_ring(hex_id, 1)
    return list(hexagons)

#df['hexagons_9'] = df.apply(
#    lambda row: get_covering_hexagons(row['Pickup Centroid Latitude'], row['Pickup Centroid Longitude'], resolution),
#    axis=1
#)

def plot_hexagons(hex_ids, map_object):
    for hex_id in hex_ids:
        hex_boundary = h3.h3_to_geo_boundary(hex_id, geo_json=False)
        hex_boundary = [(coord[0], coord[1]) for coord in hex_boundary]
        folium.Polygon(locations=hex_boundary, color='blue', fill=True, fill_color='blue', fill_opacity=0.4).add_to(map_object)

map = folium.Map(location=[41.8781, -87.6298], zoom_start=11)

In [39]:
distinct_combinations = df.groupby(['Pickup Centroid Latitude', 'Pickup Centroid Longitude']).size().reset_index(name='Count')

distinct_combinations['hexagons_9'] = distinct_combinations.apply(
    lambda row: get_covering_hexagons(row['Pickup Centroid Latitude'], row['Pickup Centroid Longitude'], resolution),
    axis=1
)

for hex_list in distinct_combinations['hexagons_9']:
    plot_hexagons(hex_list, map)
map