In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
from meteostat import Hourly, Stations, Point
from datetime import datetime, timedelta

### Extracting weather data for Toronto Sites

In [2]:
toronto_sites = pd.read_csv('stations.csv')[['station_id','name','lat','lon']]
toronto_sites = toronto_sites.rename(columns={'lat': 'Latitude', 'lon':'Longitude'})
toronto_sites.head()

Unnamed: 0,station_id,name,Latitude,Longitude
0,7000,Fort York Blvd / Capreol Ct,43.639832,-79.395954
1,7001,Wellesley Station Green P,43.664964,-79.38355
2,7002,St. George St / Bloor St W,43.667131,-79.399555
3,7003,Madison Ave / Bloor St W,43.667018,-79.402796
4,7005,King St W / York St,43.648001,-79.383177


In [3]:
len(toronto_sites)

852

In [4]:
# Date range for Toronto Data
start = datetime(2023, 1, 1)
end = datetime(2024, 9, 30, 23, 59)

In [5]:
weather_results = []

# Loop through each row in the DataFrame to get weather data
for index, row in toronto_sites.iterrows():
    station_id = row['station_id']
    location_name = row['name']
    latitude = row['Latitude']
    longitude = row['Longitude']
    
    stations = Stations()
    nearby_stations = stations.nearby(latitude, longitude).fetch(1)

    data = Hourly(nearby_stations, start, end)
    data = data.fetch()

    data['station_id'] = station_id
    data['location_name'] = location_name
    data['latitude'] = latitude
    data['longitude'] = longitude

    weather_results.append(data)
    
final_df = pd.concat(weather_results)
final_df.head()




Unnamed: 0_level_0,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco,station_id,location_name,latitude,longitude
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2023-01-01 00:00:00,6.2,5.5,95.0,0.0,,330.0,25.9,,1008.4,,3.0,7000,Fort York Blvd / Capreol Ct,43.639832,-79.395954
2023-01-01 01:00:00,5.0,4.0,93.0,0.0,,320.0,25.9,,1008.9,,3.0,7000,Fort York Blvd / Capreol Ct,43.639832,-79.395954
2023-01-01 02:00:00,5.0,3.0,87.0,0.0,,320.0,22.3,,1009.5,,3.0,7000,Fort York Blvd / Capreol Ct,43.639832,-79.395954
2023-01-01 03:00:00,4.5,2.7,88.0,0.0,,310.0,27.7,,1010.5,,3.0,7000,Fort York Blvd / Capreol Ct,43.639832,-79.395954
2023-01-01 04:00:00,4.0,1.0,81.0,0.0,,310.0,22.3,,1011.0,,3.0,7000,Fort York Blvd / Capreol Ct,43.639832,-79.395954


In [6]:
final_df['state'] = 'ON'
final_df['City'] = 'Toronto'

final_df.head()

Unnamed: 0_level_0,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco,station_id,location_name,latitude,longitude,state,City
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2023-01-01 00:00:00,6.2,5.5,95.0,0.0,,330.0,25.9,,1008.4,,3.0,7000,Fort York Blvd / Capreol Ct,43.639832,-79.395954,ON,Toronto
2023-01-01 01:00:00,5.0,4.0,93.0,0.0,,320.0,25.9,,1008.9,,3.0,7000,Fort York Blvd / Capreol Ct,43.639832,-79.395954,ON,Toronto
2023-01-01 02:00:00,5.0,3.0,87.0,0.0,,320.0,22.3,,1009.5,,3.0,7000,Fort York Blvd / Capreol Ct,43.639832,-79.395954,ON,Toronto
2023-01-01 03:00:00,4.5,2.7,88.0,0.0,,310.0,27.7,,1010.5,,3.0,7000,Fort York Blvd / Capreol Ct,43.639832,-79.395954,ON,Toronto
2023-01-01 04:00:00,4.0,1.0,81.0,0.0,,310.0,22.3,,1011.0,,3.0,7000,Fort York Blvd / Capreol Ct,43.639832,-79.395954,ON,Toronto


In [7]:
len(final_df)

9983565

In [8]:
final_df.to_csv('toronto_from_Jan_2023_weather_data.csv')

### Extracting Chicago Data

In [2]:
import requests
import polars as pl

api_url = "https://data.cityofchicago.org/resource/ajtu-isnz.json"  

response = requests.get(api_url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response into a list of dictionaries
    data = response.json()
    
    # Convert the data to a Polars DataFrame
    df = pl.DataFrame(data)
    
    # Select only the required columns (replace 'location_col', 'lat_col', 'lon_col' with actual column names)
    filtered_df = df.select([
        pl.col('pickup_community_area'),  
        pl.col('pickup_centroid_latitude'),       
        pl.col('pickup_centroid_longitude')        
    ])

    unique_locations_df = filtered_df.unique()
    
    print(unique_locations_df)

else:
    print(f"Failed to fetch data from the API. Status Code: {response.status_code}")


shape: (72, 3)
┌───────────────────────┬──────────────────────────┬───────────────────────────┐
│ pickup_community_area ┆ pickup_centroid_latitude ┆ pickup_centroid_longitude │
│ ---                   ┆ ---                      ┆ ---                       │
│ str                   ┆ str                      ┆ str                       │
╞═══════════════════════╪══════════════════════════╪═══════════════════════════╡
│ 29                    ┆ 41.860190019             ┆ -87.7172201               │
│ 24                    ┆ 41.901206994             ┆ -87.676355989             │
│ 11                    ┆ 41.978829526             ┆ -87.771166703             │
│ 7                     ┆ 41.922686284             ┆ -87.649488729             │
│ 49                    ┆ 41.706587882             ┆ -87.623366512             │
│ …                     ┆ …                        ┆ …                         │
│ 50                    ┆ 41.706125752             ┆ -87.598255838             │
│ 8          

In [3]:
df_pandas = unique_locations_df.to_pandas().dropna()

df_pandas

Unnamed: 0,pickup_community_area,pickup_centroid_latitude,pickup_centroid_longitude
0,29,41.860190019,-87.7172201
1,24,41.901206994,-87.676355989
2,11,41.978829526,-87.771166703
3,7,41.922686284,-87.649488729
4,49,41.706587882,-87.623366512
...,...,...,...
67,50,41.706125752,-87.598255838
68,8,41.89503345,-87.619710672
69,24,41.89830587,-87.653613982
70,68,41.777196255,-87.642497527


In [41]:
df_pandas.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71 entries, 0 to 71
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   pickup_community_area      71 non-null     object
 1   pickup_centroid_latitude   71 non-null     object
 2   pickup_centroid_longitude  71 non-null     object
dtypes: object(3)
memory usage: 2.2+ KB


In [4]:
df_pandas['pickup_community_area'] = pd.to_numeric(df_pandas['pickup_community_area'], errors='coerce')
df_pandas['pickup_centroid_latitude'] = pd.to_numeric(df_pandas['pickup_centroid_latitude'], errors='coerce')
df_pandas['pickup_centroid_longitude'] = pd.to_numeric(df_pandas['pickup_centroid_longitude'], errors='coerce')

In [43]:
from meteostat import Stations

# Define the geographical coordinates (latitude and longitude)
latitude = 41.89503345   
longitude = -87.619710672  

# Create a Stations object and find the nearest weather stations (within a 50 km radius)
stations = Stations()

nearby_stations = stations.nearby(latitude, longitude).fetch(1)

# Display the nearest 1 station

print(list(nearby_stations.index))


['KCGX0']


In [45]:
%%time

# Date range for Chicago Data
start = datetime(2024, 1, 1)
end = datetime(2024, 9, 30, 23, 59)

weather_results = []

# Loop through each row in the DataFrame to get weather data
for index, row in df_pandas.iterrows():
    
    location_name = row['pickup_community_area']
    latitude = row['pickup_centroid_latitude']
    longitude = row['pickup_centroid_longitude']
    
    stations = Stations()
    nearby_stations = stations.nearby(latitude, longitude).fetch(1)

    if nearby_stations.empty:
        print(f"No nearby stations found for location: {location_name}")
        continue

    station_id = nearby_stations.index[0]

    data = Hourly(station_id, start, end)
    data = data.fetch()

    if data.empty:
        print(f"No weather data available for station: {station_id} at location: {location_name}")
        continue

    data['station_id'] = station_id
    data['location_name'] = location_name
    data['latitude'] = latitude
    data['longitude'] = longitude
    data['pickup_community_area'] = row['pickup_community_area']


    weather_results.append(data)
    
final_df = pd.concat(weather_results)

final_df['state'] = 'Illinois'
final_df['City'] = 'Chicago'

final_df.head()


No weather data available for station: KCGX0 at location: 5
No weather data available for station: KCGX0 at location: 7
No weather data available for station: KCGX0 at location: 22
No weather data available for station: KCGX0 at location: 27
No weather data available for station: KCGX0 at location: 1
No weather data available for station: KCGX0 at location: 24
No weather data available for station: KCGX0 at location: 28
No weather data available for station: KCGX0 at location: 34
No weather data available for station: KCGX0 at location: 37
No weather data available for station: KCGX0 at location: 14
No weather data available for station: KCGX0 at location: 36
No weather data available for station: KCGX0 at location: 46
No weather data available for station: KCGX0 at location: 31
No weather data available for station: KCGX0 at location: 43
No weather data available for station: KCGX0 at location: 8
No weather data available for station: KCGX0 at location: 8
No weather data available for

Unnamed: 0_level_0,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco,station_id,location_name,latitude,longitude,pickup_community_area,state,City
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-01-01 00:00:00,1.0,-2.2,79.0,0.0,,320.0,24.0,,1018.0,,3.0,72534,30,41.839087,-87.714004,30,Illinois,Chicago
2024-01-01 01:00:00,1.1,-2.1,79.0,0.0,,320.0,24.1,,1018.7,,3.0,72534,30,41.839087,-87.714004,30,Illinois,Chicago
2024-01-01 02:00:00,1.1,-2.1,79.0,0.0,,330.0,25.9,,1019.1,,3.0,72534,30,41.839087,-87.714004,30,Illinois,Chicago
2024-01-01 03:00:00,1.1,-2.1,79.0,0.0,,330.0,25.9,,1019.8,,3.0,72534,30,41.839087,-87.714004,30,Illinois,Chicago
2024-01-01 04:00:00,1.1,-2.1,79.0,0.0,,330.0,29.5,,1020.0,,3.0,72534,30,41.839087,-87.714004,30,Illinois,Chicago


In [46]:
len(set(final_df['pickup_community_area']))

24

In [47]:
final_df.to_csv('chicago_weather_data.csv')

### New York Weather Data

In [23]:
import geopandas as gpd

# Define the path to the ZIP file and the file within the ZIP
zip_file_path = "taxi_zones.zip"
file_within_zip = "taxi_zones.shp"  # The file inside the ZIP archive

# Construct the full path using the ZIP protocol
zip_path = f"zip://{zip_file_path}!{file_within_zip}"

# Read the shapefile within the ZIP using GeoPandas
taxi_zones_gdf = gpd.read_file(zip_path)

# Display the GeoDataFrame
print(taxi_zones_gdf.head())

   OBJECTID  Shape_Leng  Shape_Area                     zone  LocationID  \
0         1    0.116357    0.000782           Newark Airport           1   
1         2    0.433470    0.004866              Jamaica Bay           2   
2         3    0.084341    0.000314  Allerton/Pelham Gardens           3   
3         4    0.043567    0.000112            Alphabet City           4   
4         5    0.092146    0.000498            Arden Heights           5   

         borough                                           geometry  
0            EWR  POLYGON ((933100.918 192536.086, 933091.011 19...  
1         Queens  MULTIPOLYGON (((1033269.244 172126.008, 103343...  
2          Bronx  POLYGON ((1026308.77 256767.698, 1026495.593 2...  
3      Manhattan  POLYGON ((992073.467 203714.076, 992068.667 20...  
4  Staten Island  POLYGON ((935843.31 144283.336, 936046.565 144...  


In [38]:
len(set(taxi_zones_gdf['borough']))

6

In [36]:
## Extracting the centroid of each zone to get latitude and longitude
taxi_zones_gdf = taxi_zones_gdf.to_crs(epsg=4326)

taxi_zones_gdf['centroid'] = taxi_zones_gdf.geometry.centroid
taxi_zones_gdf['latitude'] = taxi_zones_gdf.centroid.y
taxi_zones_gdf['longitude'] = taxi_zones_gdf.centroid.x

taxi_zones_gdf.head(20)






Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,geometry,centroid,latitude,longitude
0,1,0.116357,0.000782,Newark Airport,1,EWR,"POLYGON ((-74.18445 40.695, -74.18449 40.6951,...",POINT (-74.174 40.69183),40.691831,-74.174
1,2,0.43347,0.004866,Jamaica Bay,2,Queens,"MULTIPOLYGON (((-73.82338 40.63899, -73.82277 ...",POINT (-73.8313 40.61675),40.616745,-73.831299
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,"POLYGON ((-73.84793 40.87134, -73.84725 40.870...",POINT (-73.84742 40.86447),40.864474,-73.847422
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,"POLYGON ((-73.97177 40.72582, -73.97179 40.725...",POINT (-73.97697 40.72375),40.723752,-73.976968
4,5,0.092146,0.000498,Arden Heights,5,Staten Island,"POLYGON ((-74.17422 40.56257, -74.17349 40.562...",POINT (-74.18848 40.55266),40.552659,-74.188484
5,6,0.150491,0.000606,Arrochar/Fort Wadsworth,6,Staten Island,"POLYGON ((-74.06367 40.6022, -74.06351 40.6021...",POINT (-74.07177 40.60032),40.600324,-74.071771
6,7,0.107417,0.00039,Astoria,7,Queens,"POLYGON ((-73.90414 40.76752, -73.90325 40.767...",POINT (-73.91969 40.76149),40.761493,-73.919694
7,8,0.027591,2.7e-05,Astoria Park,8,Queens,"POLYGON ((-73.92334 40.77513, -73.92398 40.774...",POINT (-73.92309 40.77856),40.778559,-73.923086
8,9,0.099784,0.000338,Auburndale,9,Queens,"POLYGON ((-73.78502 40.76104, -73.78486 40.760...",POINT (-73.78795 40.75104),40.751035,-73.787949
9,10,0.099839,0.000436,Baisley Park,10,Queens,"POLYGON ((-73.78327 40.68999, -73.78234 40.688...",POINT (-73.79099 40.67895),40.678953,-73.790986


In [43]:
station_list = taxi_zones_gdf[['zone','geometry']].drop_duplicates()
len(station_list)

263

In [44]:
## Extracting the centroid of each zone to get latitude and longitude
station_list = station_list.to_crs(epsg=4326)

station_list['centroid'] = station_list.geometry.centroid
station_list['latitude'] = station_list.centroid.y
station_list['longitude'] = station_list.centroid.x

station_list.head(20)






Unnamed: 0,zone,geometry,centroid,latitude,longitude
0,Newark Airport,"POLYGON ((-74.18445 40.695, -74.18449 40.6951,...",POINT (-74.174 40.69183),40.691831,-74.174
1,Jamaica Bay,"MULTIPOLYGON (((-73.82338 40.63899, -73.82277 ...",POINT (-73.8313 40.61675),40.616745,-73.831299
2,Allerton/Pelham Gardens,"POLYGON ((-73.84793 40.87134, -73.84725 40.870...",POINT (-73.84742 40.86447),40.864474,-73.847422
3,Alphabet City,"POLYGON ((-73.97177 40.72582, -73.97179 40.725...",POINT (-73.97697 40.72375),40.723752,-73.976968
4,Arden Heights,"POLYGON ((-74.17422 40.56257, -74.17349 40.562...",POINT (-74.18848 40.55266),40.552659,-74.188484
5,Arrochar/Fort Wadsworth,"POLYGON ((-74.06367 40.6022, -74.06351 40.6021...",POINT (-74.07177 40.60032),40.600324,-74.071771
6,Astoria,"POLYGON ((-73.90414 40.76752, -73.90325 40.767...",POINT (-73.91969 40.76149),40.761493,-73.919694
7,Astoria Park,"POLYGON ((-73.92334 40.77513, -73.92398 40.774...",POINT (-73.92309 40.77856),40.778559,-73.923086
8,Auburndale,"POLYGON ((-73.78502 40.76104, -73.78486 40.760...",POINT (-73.78795 40.75104),40.751035,-73.787949
9,Baisley Park,"POLYGON ((-73.78327 40.68999, -73.78234 40.688...",POINT (-73.79099 40.67895),40.678953,-73.790986


In [55]:
# Date range for New York Data
start = datetime(2023, 1, 1)
end = datetime(2024, 9, 30, 23, 59)

weather_results = []

# Loop through each row in the DataFrame to get weather data
for index, row in station_list.iterrows():
    
    location_name = row['zone']
    latitude = row['latitude']
    longitude = row['longitude']
    
    stations = Stations()
    nearby_stations = stations.nearby(latitude, longitude).fetch(1)

    if nearby_stations.empty:
        print(f"No nearby stations found for location: {location_name}")
        continue

    station_id = nearby_stations.index[0]

    data = Hourly(station_id, start, end)
    data = data.fetch()

    if data.empty:
        print(f"No weather data available for station: {station_id} at location: {location_name}")
        continue

    data['station_id'] = station_id
    data['location_name'] = location_name
    data['latitude'] = latitude
    data['longitude'] = longitude


    weather_results.append(data)
    
final_df = pd.concat(weather_results)

final_df['state'] = 'New York'
final_df['City'] = 'New York'

final_df.head()

Unnamed: 0_level_0,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco,station_id,location_name,latitude,longitude,state,City
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2023-01-01 00:00:00,10.0,10.0,100.0,1.0,,208.0,8.3,,1010.0,,9.0,KLDJ0,Newark Airport,40.691831,-74.174,New York,New York
2023-01-01 01:00:00,9.8,9.8,100.0,1.0,,0.0,0.0,,1010.0,,9.0,KLDJ0,Newark Airport,40.691831,-74.174,New York,New York
2023-01-01 02:00:00,11.0,11.0,100.0,0.5,,290.0,6.0,,1009.0,,8.0,KLDJ0,Newark Airport,40.691831,-74.174,New York,New York
2023-01-01 03:00:00,10.9,10.9,100.0,2.2,,290.0,5.4,,1008.0,,9.0,KLDJ0,Newark Airport,40.691831,-74.174,New York,New York
2023-01-01 04:00:00,11.3,11.3,100.0,1.2,,270.0,7.6,,1008.0,,9.0,KLDJ0,Newark Airport,40.691831,-74.174,New York,New York


In [56]:
len(final_df)

4010416

In [57]:
final_df.to_csv('NewYork_weather_data.csv')

In [52]:
len(set(final_df['location_name']))

260

In [54]:
final_df['latitude']

time
2024-01-01 00:00:00    40.691831
2024-01-01 01:00:00    40.691831
2024-01-01 02:00:00    40.691831
2024-01-01 03:00:00    40.691831
2024-01-01 04:00:00    40.691831
                         ...    
2024-09-30 19:00:00    40.778766
2024-09-30 20:00:00    40.778766
2024-09-30 21:00:00    40.778766
2024-09-30 22:00:00    40.778766
2024-09-30 23:00:00    40.778766
Name: latitude, Length: 1706536, dtype: float64

## Merging Chicago ride share data and Weather data

In [16]:
chicago_rides = pd.read_csv('combined_data_2024_04.csv')
chicago_rides.head()

Unnamed: 0,trip_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,percent_time_chicago,percent_distance_chicago,pickup_census_tract,dropoff_census_tract,pickup_community_area,...,trip_total,shared_trip_authorized,shared_trip_match,trips_pooled,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location
0,0009afd3724da60cd49549f9dc56888251b75660,2024-04-01T00:00:00.000,2024-04-01T00:15:00.000,239.0,1.1069,0.9916,0.97579,,,24.0,...,8.01,False,False,1,41.901207,-87.676356,POINT (-87.6763559892 41.9012069941),41.901207,-87.676356,POINT (-87.6763559892 41.9012069941)
1,000ed319d6cab8a7f3e108953c170841ec55ed6a,2024-04-01T00:00:00.000,2024-04-01T00:00:00.000,310.0,2.97,1.0,1.0,17031980000.0,17031770000.0,76.0,...,16.23,False,False,1,41.979071,-87.90304,POINT (-87.9030396611 41.9790708201),41.982775,-87.877305,POINT (-87.8773053996 41.9827750091)
2,007b21e5cc9636775ec28fc3a98e0aa24ab7a6d2,2024-04-01T00:00:00.000,2024-04-01T00:15:00.000,205.0,0.63,1.0,1.0,17031410000.0,17031410000.0,41.0,...,8.73,False,False,1,41.797971,-87.598945,POINT (-87.5989445134 41.7979711911),41.797827,-87.603746,POINT (-87.6037457654 41.7978270187)
3,00858e69b1832ac03cdf726f98d130f798465216,2024-04-01T00:00:00.000,2024-04-01T00:30:00.000,1753.0,16.973,0.99943,1.0,17031980000.0,17031060000.0,56.0,...,46.15,False,False,1,41.785999,-87.750934,POINT (-87.7509342894 41.785998518),41.942577,-87.647079,POINT (-87.6470785093 41.942577185)
4,00e5425778c5593fecc8f98732f1272e8d25cd97,2024-04-01T00:00:00.000,2024-04-01T00:00:00.000,283.0,0.8966,0.9965,1.0,,,21.0,...,5.65,False,False,1,41.938666,-87.711211,POINT (-87.7112105933 41.9386661962),41.938666,-87.711211,POINT (-87.7112105933 41.9386661962)


In [17]:
chicago_rides.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7371563 entries, 0 to 7371562
Data columns (total 24 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   trip_id                     object 
 1   trip_start_timestamp        object 
 2   trip_end_timestamp          object 
 3   trip_seconds                float64
 4   trip_miles                  float64
 5   percent_time_chicago        float64
 6   percent_distance_chicago    float64
 7   pickup_census_tract         float64
 8   dropoff_census_tract        float64
 9   pickup_community_area       float64
 10  dropoff_community_area      float64
 11  fare                        float64
 12  tip                         float64
 13  additional_charges          float64
 14  trip_total                  float64
 15  shared_trip_authorized      bool   
 16  shared_trip_match           bool   
 17  trips_pooled                int64  
 18  pickup_centroid_latitude    float64
 19  pickup_centroid_longi

In [20]:
chicago_rides['unique_id'] = range(1,len(chicago_rides)+1)

In [24]:
len(set(chicago_rides['trip_id'])), len(set(chicago_rides['unique_id'])), len(chicago_rides['unique_id'])

(7371560, 7371563, 7371563)

In [25]:
chicago_rides = chicago_rides.drop('trip_id',axis=1)

In [28]:
chicago_rides['unique_id'] = chicago_rides['unique_id'].astype('int32')

In [30]:
chicago_rides = chicago_rides.drop(['pickup_centroid_latitude','pickup_centroid_longitude','dropoff_centroid_latitude','dropoff_centroid_longitude'],axis=1)
chicago_rides.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7371563 entries, 0 to 7371562
Data columns (total 20 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   trip_start_timestamp       object 
 1   trip_end_timestamp         object 
 2   trip_seconds               float64
 3   trip_miles                 float64
 4   percent_time_chicago       float64
 5   percent_distance_chicago   float64
 6   pickup_census_tract        float64
 7   dropoff_census_tract       float64
 8   pickup_community_area      float64
 9   dropoff_community_area     float64
 10  fare                       float64
 11  tip                        float64
 12  additional_charges         float64
 13  trip_total                 float64
 14  shared_trip_authorized     bool   
 15  shared_trip_match          bool   
 16  trips_pooled               int64  
 17  pickup_centroid_location   object 
 18  dropoff_centroid_location  object 
 19  unique_id                  int32  
dtypes:

In [31]:
def reduce_size(df):
    '''
    Reduce pandas dataframe memory usage by optimising datatypes
    '''
    mem_before_reduction = df.memory_usage().sum()/1024**2
    print(f'Memory usage before optimising datatyes: {mem_before_reduction:.0f} MB')

    for col in df.columns:

        if str(df[col].dtype)[:3] == 'int':
            min_val = df[col].min()
            max_val = df[col].max()
            
            if min_val > np.iinfo(np.int8).min and max_val < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif min_val > np.iinfo(np.int16).min and max_val < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif min_val > np.iinfo(np.int32).min and max_val < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            elif min_val > np.iinfo(np.int64).min and max_val < np.iinfo(np.int64).max:
                df[col] = df[col].astype(np.int64)
        elif str(df[col].dtype)[:5] == 'float':
            min_val = df[col].min()
            max_val = df[col].max()
            
            if min_val > np.finfo(np.float16).min and max_val < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif min_val > np.finfo(np.float32).min and max_val < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    
    
    mem_after_reduction = df.memory_usage().sum()/1024**2
    print(f'Memory usage after optimising datatyes: {mem_after_reduction:.0f} MB')
    return df
    

In [32]:
reduce_size(chicago_rides)

Memory usage before optimising datatyes: 998 MB
Memory usage after optimising datatyes: 471 MB


Unnamed: 0,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,percent_time_chicago,percent_distance_chicago,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,fare,tip,additional_charges,trip_total,shared_trip_authorized,shared_trip_match,trips_pooled,pickup_centroid_location,dropoff_centroid_location,unique_id
0,2024-04-01T00:00:00.000,2024-04-01T00:15:00.000,239.0,1.106445,0.991699,0.975586,,,24.0,24.0,5.0,1.0,2.009766,8.007812,False,False,1,POINT (-87.6763559892 41.9012069941),POINT (-87.6763559892 41.9012069941),1
1,2024-04-01T00:00:00.000,2024-04-01T00:00:00.000,310.0,2.970703,1.000000,1.000000,1.703198e+10,1.703177e+10,76.0,,10.0,0.0,6.230469,16.234375,False,False,1,POINT (-87.9030396611 41.9790708201),POINT (-87.8773053996 41.9827750091),2
2,2024-04-01T00:00:00.000,2024-04-01T00:15:00.000,205.0,0.629883,1.000000,1.000000,1.703141e+10,1.703141e+10,41.0,41.0,7.5,0.0,1.230469,8.726562,False,False,1,POINT (-87.5989445134 41.7979711911),POINT (-87.6037457654 41.7978270187),3
3,2024-04-01T00:00:00.000,2024-04-01T00:30:00.000,1753.0,16.968750,0.999512,1.000000,1.703198e+10,1.703106e+10,56.0,6.0,22.5,3.0,20.656250,46.156250,False,False,1,POINT (-87.7509342894 41.785998518),POINT (-87.6470785093 41.942577185),4
4,2024-04-01T00:00:00.000,2024-04-01T00:00:00.000,283.0,0.896484,0.996582,1.000000,,,21.0,21.0,2.5,1.0,2.150391,5.648438,False,False,1,POINT (-87.7112105933 41.9386661962),POINT (-87.7112105933 41.9386661962),5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7371558,2024-04-30T23:45:00.000,2024-05-01T00:00:00.000,389.0,2.886719,0.997559,0.990723,,,24.0,32.0,7.5,1.0,1.049805,9.546875,True,True,1,POINT (-87.6763559892 41.9012069941),POINT (-87.6251921424 41.8788655841),7371559
7371559,2024-04-30T23:45:00.000,2024-05-01T00:00:00.000,421.0,3.802734,0.997559,1.000000,,,28.0,27.0,7.5,0.0,1.320312,8.820312,True,True,1,POINT (-87.6635175498 41.874005383),POINT (-87.7058971305 41.8789144956),7371560
7371560,2024-04-30T23:45:00.000,2024-05-01T00:00:00.000,1209.0,6.007812,1.000000,1.000000,,,24.0,30.0,10.0,3.0,1.440430,14.437500,True,True,1,POINT (-87.6763559892 41.9012069941),POINT (-87.714003807 41.8390869059),7371561
7371561,2024-04-30T23:45:00.000,2024-05-01T00:00:00.000,724.0,5.082031,0.978027,0.896973,,,26.0,28.0,7.5,0.0,1.959961,9.460938,True,True,1,POINT (-87.7302324284 41.8785943576),POINT (-87.6635175498 41.874005383),7371562


In [9]:
chicago_rides.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7371563 entries, 0 to 7371562
Data columns (total 24 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   trip_id                     object 
 1   trip_start_timestamp        object 
 2   trip_end_timestamp          object 
 3   trip_seconds                float16
 4   trip_miles                  float16
 5   percent_time_chicago        float16
 6   percent_distance_chicago    float16
 7   pickup_census_tract         float32
 8   dropoff_census_tract        float32
 9   pickup_community_area       float16
 10  dropoff_community_area      float16
 11  fare                        float16
 12  tip                         float16
 13  additional_charges          float16
 14  trip_total                  float16
 15  shared_trip_authorized      bool   
 16  shared_trip_match           bool   
 17  trips_pooled                int8   
 18  pickup_centroid_latitude    float16
 19  pickup_centroid_longi

In [49]:
## Extracting date and hour fields

chicago_rides['trip_start_timestamp'] = pd.to_datetime(chicago_rides['trip_start_timestamp'])

## Creating separate 'date' and 'hour' columns
chicago_rides['ride_date'] = pd.to_datetime(chicago_rides['trip_start_timestamp'].dt.date)
chicago_rides['ride_hour'] = chicago_rides['trip_start_timestamp'].dt.hour

print(chicago_rides.columns)
chicago_rides.head()



Index(['trip_id', 'trip_start_timestamp', 'trip_end_timestamp', 'trip_seconds',
       'trip_miles', 'percent_time_chicago', 'percent_distance_chicago',
       'pickup_census_tract', 'dropoff_census_tract', 'pickup_community_area',
       'dropoff_community_area', 'fare', 'tip', 'additional_charges',
       'trip_total', 'shared_trip_authorized', 'shared_trip_match',
       'trips_pooled', 'pickup_centroid_latitude', 'pickup_centroid_longitude',
       'pickup_centroid_location', 'dropoff_centroid_latitude',
       'dropoff_centroid_longitude', 'dropoff_centroid_location', 'ride_date',
       'ride_hour'],
      dtype='object')


Unnamed: 0,trip_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,percent_time_chicago,percent_distance_chicago,pickup_census_tract,dropoff_census_tract,pickup_community_area,...,shared_trip_match,trips_pooled,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location,ride_date,ride_hour
0,0009afd3724da60cd49549f9dc56888251b75660,2024-04-01,2024-04-01T00:15:00.000,239.0,1.1069,0.9916,0.97579,,,24.0,...,False,1,41.901207,-87.676356,POINT (-87.6763559892 41.9012069941),41.901207,-87.676356,POINT (-87.6763559892 41.9012069941),2024-04-01,0
1,000ed319d6cab8a7f3e108953c170841ec55ed6a,2024-04-01,2024-04-01T00:00:00.000,310.0,2.97,1.0,1.0,17031980000.0,17031770000.0,76.0,...,False,1,41.979071,-87.90304,POINT (-87.9030396611 41.9790708201),41.982775,-87.877305,POINT (-87.8773053996 41.9827750091),2024-04-01,0
2,007b21e5cc9636775ec28fc3a98e0aa24ab7a6d2,2024-04-01,2024-04-01T00:15:00.000,205.0,0.63,1.0,1.0,17031410000.0,17031410000.0,41.0,...,False,1,41.797971,-87.598945,POINT (-87.5989445134 41.7979711911),41.797827,-87.603746,POINT (-87.6037457654 41.7978270187),2024-04-01,0
3,00858e69b1832ac03cdf726f98d130f798465216,2024-04-01,2024-04-01T00:30:00.000,1753.0,16.973,0.99943,1.0,17031980000.0,17031060000.0,56.0,...,False,1,41.785999,-87.750934,POINT (-87.7509342894 41.785998518),41.942577,-87.647079,POINT (-87.6470785093 41.942577185),2024-04-01,0
4,00e5425778c5593fecc8f98732f1272e8d25cd97,2024-04-01,2024-04-01T00:00:00.000,283.0,0.8966,0.9965,1.0,,,21.0,...,False,1,41.938666,-87.711211,POINT (-87.7112105933 41.9386661962),41.938666,-87.711211,POINT (-87.7112105933 41.9386661962),2024-04-01,0


In [50]:
mode_pickup_community_area = chicago_rides['pickup_community_area'].mode()[0]
chicago_rides['pickup_community_area'].fillna(mode_pickup_community_area, inplace=True)

In [52]:
len(set(chicago_rides['pickup_community_area']))

77

In [53]:
chicago_weather = pd.read_csv('chicago_weather_data.csv')
chicago_weather.head()



Unnamed: 0,time,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco,station_id,location_name,latitude,longitude,pickup_community_area,state,City
0,2024-01-01 00:00:00,1.0,-2.2,79.0,0.0,,320.0,24.0,,1018.0,,3.0,72534,30,41.839087,-87.714004,30,Illinois,Chicago
1,2024-01-01 01:00:00,1.1,-2.1,79.0,0.0,,320.0,24.1,,1018.7,,3.0,72534,30,41.839087,-87.714004,30,Illinois,Chicago
2,2024-01-01 02:00:00,1.1,-2.1,79.0,0.0,,330.0,25.9,,1019.1,,3.0,72534,30,41.839087,-87.714004,30,Illinois,Chicago
3,2024-01-01 03:00:00,1.1,-2.1,79.0,0.0,,330.0,25.9,,1019.8,,3.0,72534,30,41.839087,-87.714004,30,Illinois,Chicago
4,2024-01-01 04:00:00,1.1,-2.1,79.0,0.0,,330.0,29.5,,1020.0,,3.0,72534,30,41.839087,-87.714004,30,Illinois,Chicago


In [54]:
## Extracting date and hour fields

chicago_weather['time'] = pd.to_datetime(chicago_weather['time'])

## Creating separate 'date' and 'hour' columns
chicago_weather['weather_date'] = pd.to_datetime(chicago_weather['time'].dt.date)
chicago_weather['weather_hour'] = chicago_weather['time'].dt.hour


In [55]:
merged_chicago_data = pd.merge(chicago_rides,chicago_weather,left_on=['pickup_community_area','ride_date','ride_hour'],
                               right_on=['pickup_community_area','weather_date','weather_hour'],
                               how='left')

In [56]:
merged_chicago_data.head()

Unnamed: 0,trip_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,percent_time_chicago,percent_distance_chicago,pickup_census_tract,dropoff_census_tract,pickup_community_area,...,tsun,coco,station_id,location_name,latitude,longitude,state,City,weather_date,weather_hour
0,0009afd3724da60cd49549f9dc56888251b75660,2024-04-01,2024-04-01T00:15:00.000,239.0,1.1069,0.9916,0.97579,,,24.0,...,,,,,,,,,NaT,
1,000ed319d6cab8a7f3e108953c170841ec55ed6a,2024-04-01,2024-04-01T00:00:00.000,310.0,2.97,1.0,1.0,17031980000.0,17031770000.0,76.0,...,,4.0,72530.0,76.0,41.980264,-87.913625,Illinois,Chicago,2024-04-01,0.0
2,000ed319d6cab8a7f3e108953c170841ec55ed6a,2024-04-01,2024-04-01T00:00:00.000,310.0,2.97,1.0,1.0,17031980000.0,17031770000.0,76.0,...,,4.0,72530.0,76.0,41.979071,-87.90304,Illinois,Chicago,2024-04-01,0.0
3,007b21e5cc9636775ec28fc3a98e0aa24ab7a6d2,2024-04-01,2024-04-01T00:15:00.000,205.0,0.63,1.0,1.0,17031410000.0,17031410000.0,41.0,...,,,,,,,,,NaT,
4,00858e69b1832ac03cdf726f98d130f798465216,2024-04-01,2024-04-01T00:30:00.000,1753.0,16.973,0.99943,1.0,17031980000.0,17031060000.0,56.0,...,,8.0,72534.0,56.0,41.792592,-87.769615,Illinois,Chicago,2024-04-01,0.0


In [59]:
merged_chicago_data.to_csv('chicago_weather_data_merged_2024_04.csv')

In [58]:
len(set(merged_chicago_data['station_id']))

5