In [87]:
from pathlib import Path

import duckdb
import pandas as pd

# Directory holding the merged, pre-processed trip CSVs (relative to the notebook).
PROCESSED_DIR = '../../../data/processed'

# Each entry is both the DuckDB table name and the CSV file stem inside PROCESSED_DIR.
PROCESSED_TABLES = ('processed_last12', 'processed_longterm')

# Open an in-memory DuckDB connection
con = duckdb.connect()

# Materialise every CSV as a DuckDB table so later cells query the table
# instead of re-parsing the file. CREATE OR REPLACE keeps the cell re-runnable.
# Table names and paths are notebook constants, so f-string interpolation is safe here.
for table_name in PROCESSED_TABLES:
    con.execute(f"""
    CREATE OR REPLACE TABLE {table_name} AS
    SELECT *
    FROM read_csv_auto('{PROCESSED_DIR}/{table_name}.csv', HEADER=TRUE);
    """)

# Verify the tables exist (bare last expression -> rich DataFrame display)
con.execute("SHOW TABLES;").df()

                 name
0    processed_last12
1  processed_longterm


In [88]:
# Peek at the five earliest rides (by started_at) in the last-12-months table.
last12_preview_sql = """
  SELECT * 
  FROM processed_last12 
  ORDER BY started_at 
  LIMIT 5;
"""
last12_preview_result = con.execute(last12_preview_sql)
processed_last12_preview = last12_preview_result.df()
processed_last12_preview

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,ride_length,day_of_week,hour_of_day
0,E92D44EC0F271F6D,electric_bike,2024-05-01 00:01:58,2024-05-01 00:05:45,Dearborn Pkwy & Delaware Pl,TA1307000128,Clark St & Schiller St,TA1309000024,41.899015,-87.629916,41.907993,-87.631501,member,3.783333,Wednesday,0
1,55FA739D575E7AE3,classic_bike,2024-05-01 00:02:35,2024-05-01 00:05:54,Dearborn Pkwy & Delaware Pl,TA1307000128,Clark St & Schiller St,TA1309000024,41.898969,-87.629912,41.907993,-87.631501,member,3.316667,Wednesday,0
2,7A296E31619D21E9,classic_bike,2024-05-01 00:04:31,2024-05-01 00:20:13,Broadway & Barry Ave,13137,Broadway & Wilson Ave,13074,41.937582,-87.644098,41.965221,-87.658139,member,15.7,Wednesday,0
3,2FB602B952B83DB7,electric_bike,2024-05-01 00:04:34,2024-05-01 00:12:17,Broadway & Granville Ave,15571,Sheridan Rd & Loyola Ave,RP-009,41.99477,-87.660287,42.001044,-87.661198,casual,7.716667,Wednesday,0
4,2D37389234DB639E,electric_bike,2024-05-01 00:05:13,2024-05-01 00:05:48,Western Ave & Ardmore Ave,464,Western Ave & Ardmore Ave,464,41.986607,-87.689669,41.986764,-87.68988,casual,0.583333,Wednesday,0


In [89]:
# Peek at the five earliest rides (by started_at) in the long-term table.
longterm_preview_sql = """
  SELECT * 
  FROM processed_longterm 
  ORDER BY started_at 
  LIMIT 5;
"""
longterm_preview_result = con.execute(longterm_preview_sql)
processed_longterm_preview = longterm_preview_result.df()
processed_longterm_preview

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,ride_length,day_of_week,hour_of_day
0,0D139A3203274B87,classic_bike,2021-01-01 00:02:24,2021-01-01 00:08:39,State St & 33rd St,13216,MLK Jr Dr & 29th St,TA1307000139,41.834734,-87.625813,41.842052,-87.617,member,6.25,Friday,0
1,C7AE8E9CDB197A8E,classic_bike,2021-01-01 00:06:55,2021-01-01 00:26:36,Lakeview Ave & Fullerton Pkwy,TA1309000019,Ritchie Ct & Banks St,KA1504000134,41.925858,-87.638973,41.906866,-87.626217,member,19.683333,Friday,0
2,3097EF26414C7016,classic_bike,2021-01-01 00:12:21,2021-01-01 00:12:33,Montrose Harbor,TA1308000012,Montrose Harbor,TA1308000012,41.963982,-87.638181,41.963982,-87.638181,member,0.2,Friday,0
3,938D5D1998A5470E,classic_bike,2021-01-01 00:12:27,2021-01-01 00:12:30,Montrose Harbor,TA1308000012,Montrose Harbor,TA1308000012,41.963982,-87.638181,41.963982,-87.638181,casual,0.05,Friday,0
4,6604F61AE4B14BC1,electric_bike,2021-01-01 00:12:49,2021-01-01 00:43:59,Western Ave & Howard St,527,Campbell Ave & Fullerton Ave,15648,42.018858,-87.690022,41.92468,-87.689328,member,31.166667,Friday,0


In [90]:
# Per rider type (member_casual): number of trips plus mean and maximum
# ride_length, largest group first. Source: last-12-months table.
trip_summary_last12_sql = """
  SELECT 
    member_casual,
    COUNT(*) AS trip_count,
    AVG(ride_length) AS avg_ride_length,
    MAX(ride_length) AS max_ride_length
  FROM processed_last12
  GROUP BY member_casual
  ORDER BY trip_count DESC;
"""
trip_summary_last12_result = con.execute(trip_summary_last12_sql)
member_casual_count_last12 = trip_summary_last12_result.df()

# Caption first, then the frame itself as the cell's displayed value.
print("🔷 Trip Summary (Last 12 Months May 2024 - May 2025):")
member_casual_count_last12

🔷 Trip Summary (Last 12 Months May 2024 - May 2025):


Unnamed: 0,member_casual,trip_count,avg_ride_length,max_ride_length
0,member,2791749,12.288877,1497.475417
1,casual,1609958,23.651827,1499.6377


In [91]:
# Per rider type (member_casual): number of trips plus mean and maximum
# ride_length, largest group first. Source: long-term table.
trip_summary_longterm_sql = """
  SELECT 
    member_casual,
    COUNT(*) AS trip_count,
    AVG(ride_length) AS avg_ride_length,
    MAX(ride_length) AS max_ride_length
  FROM processed_longterm
  GROUP BY member_casual
  ORDER BY trip_count DESC;
"""
trip_summary_longterm_result = con.execute(trip_summary_longterm_sql)
member_casual_count_longterm = trip_summary_longterm_result.df()

# Caption first, then the frame itself as the cell's displayed value.
print("\n🔶 Trip Summary (Long-Term 2021-2023):")
member_casual_count_longterm


🔶 Trip Summary (Long-Term 2021-2023):


Unnamed: 0,member_casual,trip_count,avg_ride_length,max_ride_length
0,member,7951222,12.572112,1497.866667
1,casual,5338509,26.957304,55944.15


In [92]:
# Directory that receives every CSV exported by this notebook: one level up
# from the working directory, then data/4_exports. Created if it is missing.
# NOTE(review): the input CSVs are read from '../../../data/processed' while
# exports go to '../data/4_exports' — confirm both resolve as intended.
export_dir = Path('../data/4_exports')
export_dir.mkdir(parents=True, exist_ok=True)

# Destination files for the two member/casual trip summaries.
last12_path = export_dir / 'trip_summary_last12.csv'
longterm_path = export_dir / 'trip_summary_longterm.csv'

# Write each summary without the DataFrame index. The file name shown in the
# confirmation comes from the path itself, so message and file cannot diverge.
for summary_df, csv_path in (
    (member_casual_count_last12, last12_path),
    (member_casual_count_longterm, longterm_path),
):
    summary_df.to_csv(csv_path, index=False)
    print(f"✅ {csv_path.name} successfully saved to {csv_path}")


✅ trip_summary_last12.csv successfully saved to ..\data\4_exports\trip_summary_last12.csv
✅ trip_summary_longterm.csv successfully saved to ..\data\4_exports\trip_summary_longterm.csv


In [93]:
# Trip count, mean and maximum ride_length for every (rider type, bike type)
# pair in the last-12-months table, sorted descending on both keys.
ride_type_last12_sql = """
  SELECT 
    member_casual,
    rideable_type,
    COUNT(*) AS trip_count,
    AVG(ride_length) AS avg_ride_length,
    MAX(ride_length) AS max_ride_length
  FROM processed_last12
  GROUP BY member_casual, rideable_type
  ORDER BY member_casual DESC, rideable_type DESC;
"""
ride_type_last12_result = con.execute(ride_type_last12_sql)
ride_by_type_last12 = ride_type_last12_result.df()

# Caption first, then the frame itself as the cell's displayed value.
print("🔷 Trip Summary by Ride Type (Last 12 Months May 2024 - May 2025):")
ride_by_type_last12

🔷 Trip Summary by Ride Type (Last 12 Months May 2024 - May 2025):


Unnamed: 0,member_casual,rideable_type,trip_count,avg_ride_length,max_ride_length
0,member,electric_scooter,22087,7.889804,97.823533
1,member,electric_bike,1042602,10.689217,479.9112
2,member,classic_bike,1727060,13.310829,1497.475417
3,casual,electric_scooter,25740,11.299602,186.55355
4,casual,electric_bike,598585,15.236447,479.751283
5,casual,classic_bike,985633,29.085154,1499.6377


In [94]:
# Trip count, mean and maximum ride_length for every (rider type, bike type)
# pair in the long-term table, sorted descending on both keys.
ride_type_longterm_sql = """
  SELECT 
    member_casual,
    rideable_type,
    COUNT(*) AS trip_count,
    AVG(ride_length) AS avg_ride_length,
    MAX(ride_length) AS max_ride_length
  FROM processed_longterm
  GROUP BY member_casual, rideable_type
  ORDER BY member_casual DESC, rideable_type DESC;
"""
ride_type_longterm_result = con.execute(ride_type_longterm_sql)
ride_by_type_longterm = ride_type_longterm_result.df()

# Caption first, then the frame itself as the cell's displayed value.
print("\n🔶 Trip Summary by Ride Type (Long-Term 2021-2023):")
ride_by_type_longterm


🔶 Trip Summary by Ride Type (Long-Term 2021-2023):


Unnamed: 0,member_casual,rideable_type,trip_count,avg_ride_length,max_ride_length
0,member,electric_bike,2444319,11.01785,480.0
1,member,docked_bike,1,2.633333,2.633333
2,member,classic_bike,5506902,13.261996,1497.866667
3,casual,electric_bike,1751962,16.88825,480.0
4,casual,docked_bike,563146,66.325544,55944.15
5,casual,classic_bike,3023401,25.459168,1499.9


In [95]:
# Export the per-bike-type summary for the last 12 months (index column omitted).
ride_type_last12_path = export_dir.joinpath('ride_by_type_last12.csv')
ride_by_type_last12.to_csv(path_or_buf=ride_type_last12_path, index=False)
print(f"✅ ride_by_type_last12.csv successfully saved to {ride_type_last12_path}")

# Export the per-bike-type summary for the long-term data (index column omitted).
ride_type_longterm_path = export_dir.joinpath('ride_by_type_longterm.csv')
ride_by_type_longterm.to_csv(path_or_buf=ride_type_longterm_path, index=False)
print(f"✅ ride_by_type_longterm.csv successfully saved to {ride_type_longterm_path}")

✅ ride_by_type_last12.csv successfully saved to ..\data\4_exports\ride_by_type_last12.csv
✅ ride_by_type_longterm.csv successfully saved to ..\data\4_exports\ride_by_type_longterm.csv


In [96]:
# Hourly ride distribution per rider type for the last-12-months table.
# One row per (member_casual, hour_of_day): trip count, that hour's share of the
# rider type's trips (percent, 2 decimals) and the mean ride_length (2 decimals).
hourly_distribution_last12 = con.execute("""
SELECT
  member_casual,
  hour_of_day,
  COUNT(*) AS trip_count,
  -- window over the grouped rows: denominator is the rider type's total trips
  ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (PARTITION BY member_casual), 2) AS trip_percentage,
  ROUND(AVG(ride_length), 2) AS avg_ride_length
FROM (
  SELECT
    member_casual,
    -- take the hour as a date part instead of formatting to text and parsing it back
    CAST(hour(started_at::timestamp) AS INTEGER) AS hour_of_day,
    ride_length
  FROM processed_last12
)
GROUP BY member_casual, hour_of_day
-- rounded percentages can tie; the extra keys keep the exported row order reproducible
ORDER BY trip_percentage DESC, member_casual, hour_of_day;
""").df()

print("🔷 Hourly Distribution (Last 12 Months May 2024 - May 2025):")
hourly_distribution_last12


🔷 Hourly Distribution (Last 12 Months May 2024 - May 2025):


Unnamed: 0,member_casual,hour_of_day,trip_count,trip_percentage,avg_ride_length
0,member,17,304381,10.9,13.22
1,casual,17,155743,9.67,22.17
2,member,16,265970,9.53,12.81
3,casual,16,146516,9.1,23.24
4,member,18,228012,8.17,12.9
5,casual,15,129288,8.03,25.16
6,casual,18,129126,8.02,22.14
7,casual,14,118113,7.34,26.79
8,member,8,203209,7.28,11.07
9,casual,13,113254,7.03,27.54


In [97]:
# Hourly ride distribution per rider type for the long-term table.
# One row per (member_casual, hour_of_day): trip count, that hour's share of the
# rider type's trips (percent, 2 decimals) and the mean ride_length (2 decimals).
hourly_distribution_longterm = con.execute("""
SELECT
  member_casual,
  hour_of_day,
  COUNT(*) AS trip_count,
  -- window over the grouped rows: denominator is the rider type's total trips
  ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (PARTITION BY member_casual), 2) AS trip_percentage,
  ROUND(AVG(ride_length), 2) AS avg_ride_length
FROM (
  SELECT
    member_casual,
    -- take the hour as a date part instead of formatting to text and parsing it back
    CAST(hour(started_at::timestamp) AS INTEGER) AS hour_of_day,
    ride_length
  FROM processed_longterm
)
GROUP BY member_casual, hour_of_day
-- rounded percentages can tie; the extra keys keep the exported row order reproducible
ORDER BY trip_percentage DESC, member_casual, hour_of_day;
""").df()

print("🔶 Hourly Distribution (Long-Term 2021-2023):")
hourly_distribution_longterm

🔶 Hourly Distribution (Long-Term 2021-2023):


Unnamed: 0,member_casual,hour_of_day,trip_count,trip_percentage,avg_ride_length
0,member,17,858440,10.8,13.29
1,casual,17,516514,9.68,24.95
2,member,16,707923,8.9,13.03
3,member,18,687186,8.64,13.21
4,casual,16,458619,8.59,26.52
5,casual,18,453151,8.49,25.1
6,casual,15,408468,7.65,29.09
7,casual,14,377260,7.07,30.51
8,casual,13,362566,6.79,30.53
9,member,15,526236,6.62,12.93


In [98]:
# Export the hourly distribution for the last 12 months (index column omitted).
hourly_last12_path = export_dir.joinpath('hourly_distribution_last12.csv')
hourly_distribution_last12.to_csv(path_or_buf=hourly_last12_path, index=False)
print(f"✅ hourly_distribution_last12.csv successfully saved to {hourly_last12_path}")

# Export the hourly distribution for the long-term data (index column omitted).
hourly_longterm_path = export_dir.joinpath('hourly_distribution_longterm.csv')
hourly_distribution_longterm.to_csv(path_or_buf=hourly_longterm_path, index=False)
print(f"✅ hourly_distribution_longterm.csv successfully saved to {hourly_longterm_path}")

✅ hourly_distribution_last12.csv successfully saved to ..\data\4_exports\hourly_distribution_last12.csv
✅ hourly_distribution_longterm.csv successfully saved to ..\data\4_exports\hourly_distribution_longterm.csv


In [99]:
# Day-of-week ride distribution per rider type for the last-12-months table.
# One row per (member_casual, day_of_week): trip count, that day's share of the
# rider type's trips (percent, 2 decimals) and the mean ride_length (2 decimals).
# The table is queried directly; a pass-through subquery would add nothing.
daily_distribution_last12 = con.execute("""
SELECT
  member_casual,
  day_of_week,
  COUNT(*) AS trip_count,
  -- window over the grouped rows: denominator is the rider type's total trips
  ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (PARTITION BY member_casual), 2) AS trip_percentage,
  ROUND(AVG(ride_length), 2) AS avg_ride_length
FROM processed_last12
GROUP BY member_casual, day_of_week
-- rounded percentages can tie; the extra keys keep the exported row order reproducible
ORDER BY trip_percentage DESC, member_casual, day_of_week;
""").df()

print("🔷 Daily Distribution (Last 12 Months May 2024 - May 2025):")
daily_distribution_last12


🔷 Daily Distribution (Last 12 Months May 2024 - May 2025):


Unnamed: 0,member_casual,day_of_week,trip_count,trip_percentage,avg_ride_length
0,casual,Saturday,336853,20.92,26.79
1,casual,Sunday,271530,16.87,27.09
2,member,Wednesday,458159,16.41,11.94
3,member,Thursday,437015,15.65,11.79
4,casual,Friday,251117,15.6,23.05
5,member,Tuesday,430158,15.41,11.68
6,member,Friday,410870,14.72,12.1
7,member,Monday,405465,14.52,11.68
8,member,Saturday,350267,12.55,13.76
9,casual,Thursday,197948,12.3,20.85


In [100]:
# Day-of-week ride distribution per rider type for the long-term table.
# One row per (member_casual, day_of_week): trip count, that day's share of the
# rider type's trips (percent, 2 decimals) and the mean ride_length (2 decimals).
# The table is queried directly; a pass-through subquery would add nothing.
daily_distribution_longterm = con.execute("""
SELECT
  member_casual,
  day_of_week,
  COUNT(*) AS trip_count,
  -- window over the grouped rows: denominator is the rider type's total trips
  ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (PARTITION BY member_casual), 2) AS trip_percentage,
  ROUND(AVG(ride_length), 2) AS avg_ride_length
FROM processed_longterm
GROUP BY member_casual, day_of_week
-- rounded percentages can tie; the extra keys keep the exported row order reproducible
ORDER BY trip_percentage DESC, member_casual, day_of_week;
""").df()

print("🔶 Daily Distribution (Long-Term 2021-2023):")
daily_distribution_longterm

🔶 Daily Distribution (Long-Term 2021-2023):


Unnamed: 0,member_casual,day_of_week,trip_count,trip_percentage,avg_ride_length
0,casual,Saturday,1145870,21.46,29.85
1,casual,Sunday,959901,17.98,31.41
2,member,Wednesday,1263259,15.89,11.95
3,member,Tuesday,1248265,15.7,11.92
4,member,Thursday,1242054,15.62,11.97
5,casual,Friday,766730,14.36,25.58
6,member,Friday,1126372,14.17,12.34
7,member,Monday,1108387,13.94,12.08
8,member,Saturday,1046022,13.16,14.13
9,casual,Thursday,653170,12.24,23.24


In [101]:
# Export the day-of-week distribution for the last 12 months (index column omitted).
daily_last12_path = export_dir.joinpath('daily_distribution_last12.csv')
daily_distribution_last12.to_csv(path_or_buf=daily_last12_path, index=False)
print(f"✅ daily_distribution_last12.csv successfully saved to {daily_last12_path}")

# Export the day-of-week distribution for the long-term data (index column omitted).
daily_longterm_path = export_dir.joinpath('daily_distribution_longterm.csv')
daily_distribution_longterm.to_csv(path_or_buf=daily_longterm_path, index=False)
print(f"✅ daily_distribution_longterm.csv successfully saved to {daily_longterm_path}")

✅ daily_distribution_last12.csv successfully saved to ..\data\4_exports\daily_distribution_last12.csv
✅ daily_distribution_longterm.csv successfully saved to ..\data\4_exports\daily_distribution_longterm.csv


In [102]:
# Ten busiest start stations for each rider type in the last-12-months table.
# The inner query counts trips per (member_casual, start_station_name) and ranks
# stations within each rider type; the outer query keeps ranks 1-10.
start_stations_last12 = con.execute("""
SELECT *
FROM (
  SELECT
    member_casual,
    start_station_name,
    COUNT(*) AS trip_count,
    -- station name breaks ties so equal counts always receive the same rank
    ROW_NUMBER() OVER (
      PARTITION BY member_casual
      ORDER BY COUNT(*) DESC, start_station_name
    ) AS rn
  FROM processed_last12
  GROUP BY member_casual, start_station_name
)
WHERE rn <= 10
ORDER BY member_casual, rn;
""").df()

print("🔷 Top 10 Start Stations By Member And Casual (Last 12 Months May 2024 - May 2025):")
start_stations_last12



🔷 Top 10 Start Stations By Member And Casual (Last 12 Months May 2024 - May 2025):


Unnamed: 0,member_casual,start_station_name,trip_count,rn
0,casual,Streeter Dr & Grand Ave,50462,1
1,casual,DuSable Lake Shore Dr & Monroe St,34230,2
2,casual,Michigan Ave & Oak St,24548,3
3,casual,DuSable Lake Shore Dr & North Blvd,22320,4
4,casual,Millennium Park,22139,5
5,casual,Shedd Aquarium,21050,6
6,casual,Dusable Harbor,18696,7
7,casual,Theater on the Lake,16092,8
8,casual,Michigan Ave & 8th St,13144,9
9,casual,Adler Planetarium,12506,10


In [103]:
# Ten busiest start stations for each rider type in the long-term table.
# The inner query counts trips per (member_casual, start_station_name) and ranks
# stations within each rider type; the outer query keeps ranks 1-10.
start_stations_longterm = con.execute("""
SELECT *
FROM (
  SELECT
    member_casual,
    start_station_name,
    COUNT(*) AS trip_count,
    -- station name breaks ties so equal counts always receive the same rank
    ROW_NUMBER() OVER (
      PARTITION BY member_casual
      ORDER BY COUNT(*) DESC, start_station_name
    ) AS rn
  FROM processed_longterm
  GROUP BY member_casual, start_station_name
)
WHERE rn <= 10
ORDER BY member_casual, rn;
""").df()

print("🔶 Top 10 Start Stations By Member And Casual (Long-Term 2021-2023):")
start_stations_longterm


🔶 Top 10 Start Stations By Member And Casual (Long-Term 2021-2023):


Unnamed: 0,member_casual,start_station_name,trip_count,rn
0,casual,Streeter Dr & Grand Ave,162882,1
1,casual,Millennium Park,75018,2
2,casual,DuSable Lake Shore Dr & Monroe St,74649,3
3,casual,Michigan Ave & Oak St,73524,4
4,casual,Shedd Aquarium,58919,5
5,casual,DuSable Lake Shore Dr & North Blvd,55236,6
6,casual,Theater on the Lake,53013,7
7,casual,Wells St & Concord Ln,44543,8
8,casual,Dusable Harbor,42517,9
9,casual,Indiana Ave & Roosevelt Rd,39463,10


In [104]:
# Destination files for the two top-start-station tables.
start_stations_last12_path = export_dir / 'start_stations_last12.csv'
start_stations_longterm_path = export_dir / 'start_stations_longterm.csv'

# Write each table without the DataFrame index. The file name in the
# confirmation is read from the path, so it always matches the file actually
# written (the hand-typed message previously said "start_stations__longterm.csv").
for stations_df, csv_path in (
    (start_stations_last12, start_stations_last12_path),
    (start_stations_longterm, start_stations_longterm_path),
):
    stations_df.to_csv(csv_path, index=False)
    print(f"✅ {csv_path.name} successfully saved to {csv_path}")

✅ start_stations_last12.csv successfully saved to ..\data\4_exports\start_stations_last12.csv
✅ start_stations__longterm.csv successfully saved to ..\data\4_exports\start_stations_longterm.csv
