In [26]:
import pandas as pd
import os
import matplotlib.pyplot as plt

In [27]:
# --- 1. CONFIGURATION ---
# Everything a reader might want to tune lives in this cell.

# LODES8 origin-destination "main" file (JT00, 2021) for New York State,
# served as a gzip-compressed CSV.
LODES_URL = (
    "https://lehd.ces.census.gov/data/lodes/LODES8/ny/od/"
    "ny_od_main_JT00_2021.csv.gz"
)

# Folder that receives the generated O-D matrix CSV.
OUTPUT_DIR = "output"

# File name of the saved O-D matrix inside OUTPUT_DIR.
OD_MATRIX_FILE = "OD_Matrix_LODES_by_Tract.csv"

# How many top-ranked items each chart shows.
TOP_N = 15

In [28]:
# --- 2. LOAD LODES COMMUTER DATA ---
# Read the statewide origin-destination table directly from LODES_URL.
# compression="infer" is the pandas default, spelled out here because the
# source is a .csv.gz file and decompression is chosen from that suffix.
print("➡️ Step 1: Loading LODES commuter data...")
df_lodes = pd.read_csv(LODES_URL, compression="infer")
print(f"✅ LODES data loaded: {df_lodes.shape[0]} total routes in NY State.")

➡️ Step 1: Loading LODES commuter data...
✅ LODES data loaded: 7144618 total routes in NY State.


In [29]:
# --- 3. FILTER TO NYC BOROUGHS ---
print("\n➡️ Step 2: Filtering data to the five NYC boroughs...")

# Five-character state+county FIPS prefixes of the NYC boroughs.
nyc_county_fips = ['36005', '36047', '36061', '36081', '36085']

# The first five characters of a geocode are its state+county prefix.
# A route survives only when BOTH its home end and its work end are in NYC.
home_in_nyc = df_lodes['h_geocode'].astype(str).str[:5].isin(nyc_county_fips)
work_in_nyc = df_lodes['w_geocode'].astype(str).str[:5].isin(nyc_county_fips)
df_lodes = df_lodes.loc[home_in_nyc & work_in_nyc]
print(f"✅ Data filtered to NYC-only routes: {len(df_lodes)} remaining.")


➡️ Step 2: Filtering data to the five NYC boroughs...
✅ Data filtered to NYC-only routes: 2466029 remaining.


In [30]:
# --- 4. CREATE CUSTOM TRACT IDS TO MATCH TAXI DATA (FIXED) ---
print("\n➡️ Step 3: Creating custom tract IDs to match taxi data format...")

# County FIPS code -> single borough digit. The custom tract ID is this digit
# followed by the six-character tract part of the geocode.
county_to_borough_digit = {
    '36061': '1',  # Manhattan (New York County)
    '36005': '2',  # Bronx
    '36047': '3',  # Brooklyn (Kings County)
    '36081': '4',  # Queens
    '36085': '5',  # Staten Island (Richmond County)
}

# (output column prefix, source geocode column) for each end of a route.
route_ends = (('origin', 'h_geocode'), ('destination', 'w_geocode'))

# Pass 1: slice each geocode into its county prefix (chars 0-4) and its
# tract part (chars 5-10), converting to text once per route end.
for end_name, geocode_col in route_ends:
    geocode_text = df_lodes[geocode_col].astype(str)
    df_lodes[f'{end_name}_county'] = geocode_text.str[:5]
    df_lodes[f'{end_name}_tract_part'] = geocode_text.str[5:11]

# Pass 2: borough digit + tract part -> custom tract ID. Kept as a separate
# pass so the tract ID columns land after all four helper columns.
for end_name, geocode_col in route_ends:
    borough_digit = df_lodes[f'{end_name}_county'].map(county_to_borough_digit)
    df_lodes[f'{end_name}_tract'] = borough_digit + df_lodes[f'{end_name}_tract_part']
print("✅ Custom tract IDs created successfully.")


➡️ Step 3: Creating custom tract IDs to match taxi data format...
✅ Custom tract IDs created successfully.


In [31]:
# --- 5. AGGREGATE TO CREATE THE O-D MATRIX ---
print("\n➡️ Step 4: Creating the tract-level O-D matrix...")

# Collapse the rows to one row per (origin tract, destination tract) pair,
# summing the S000 column, which is exposed downstream as 'total_commuters'.
tract_pair = ['origin_tract', 'destination_tract']
od_matrix_lodes = (
    df_lodes.groupby(tract_pair)['S000']
    .sum()
    .rename('total_commuters')  # name the Series before it becomes a column
    .reset_index()
)
print("✅ LODES O-D matrix created successfully.")


➡️ Step 4: Creating the tract-level O-D matrix...
✅ LODES O-D matrix created successfully.


In [32]:
# --- 6. SAVE THE O-D MATRIX & DISPLAY SAMPLE ---
print("\n➡️ Step 5: Saving the O-D matrix output...")

# Resolve the target path first, then make sure its folder exists before writing.
output_path = os.path.join(OUTPUT_DIR, OD_MATRIX_FILE)
os.makedirs(OUTPUT_DIR, exist_ok=True)
od_matrix_lodes.to_csv(output_path, index=False)
print(f"✅ Final LODES Commuter O-D matrix saved to: '{output_path}'")

# Header and the first rows of the matrix, one per line.
print("\n--- Sample of the Final LODES O-D Matrix ---", od_matrix_lodes.head(), sep="\n")


➡️ Step 5: Saving the O-D matrix output...
✅ Final LODES Commuter O-D matrix saved to: 'output/OD_Matrix_LODES_by_Tract.csv'

--- Sample of the Final LODES O-D Matrix ---
  origin_tract destination_tract  total_commuters
0      1000100           1000900                2
1      1000100           1002202                1
2      1000100           1002601                1
3      1000100           1002901                1
4      1000100           1002902                1


In [35]:
# --- 7. ANALYSIS & PLOTTING ---
print("\n➡️ Step 6: Generating analysis plots...")
# Using a universally available style to prevent errors
plt.style.use('ggplot')


def _save_barh_plot(series, title, ylabel, color, filename, images_dir="images"):
    """Draw a Series as a horizontal bar chart, save it as a PNG, return the path.

    Parameters
    ----------
    series : pandas.Series
        Values to plot; the index supplies the bar labels. Bars are drawn in
        the order given, so pass the series sorted ascending to get the
        largest bar at the top.
    title : str
        Figure title.
    ylabel : str
        Label for the categorical (y) axis.
    color : str
        Matplotlib color for the bars.
    filename : str
        File name of the image inside `images_dir`.
    images_dir : str, default "images"
        Folder that receives the image; created if it does not exist.

    Returns
    -------
    str
        Path of the saved image.
    """
    # savefig does not create missing folders, so a fresh checkout without
    # the images folder would otherwise fail here.
    os.makedirs(images_dir, exist_ok=True)

    fig, ax = plt.subplots(figsize=(10, 8))
    series.plot(kind='barh', color=color, ax=ax)
    ax.set_title(title, fontsize=16)
    ax.set_xlabel('Total Commuters', fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    fig.tight_layout()

    plot_path = os.path.join(images_dir, filename)
    fig.savefig(plot_path)
    # Close explicitly so the figures do not accumulate in kernel memory.
    plt.close(fig)
    return plot_path


# PLOT 1: Top N Busiest Origin Tracts (by Commuters)
top_origins = (
    od_matrix_lodes.groupby('origin_tract')['total_commuters']
    .sum()
    .nlargest(TOP_N)
    .sort_values(ascending=True)
)
origin_plot_path = _save_barh_plot(
    top_origins,
    title=f'Top {TOP_N} Commuter Origin Tracts (LODES)',
    ylabel='Origin Census Tract ID',
    color='darkcyan',
    filename='lodes_top_origin_tracts.png',
)
print(f"✅ Saved Top Origins plot to: '{origin_plot_path}'")

# PLOT 2: Top N Busiest Destination Tracts (by Commuters)
top_destinations = (
    od_matrix_lodes.groupby('destination_tract')['total_commuters']
    .sum()
    .nlargest(TOP_N)
    .sort_values(ascending=True)
)
destination_plot_path = _save_barh_plot(
    top_destinations,
    title=f'Top {TOP_N} Commuter Destination Tracts (LODES)',
    ylabel='Destination Census Tract ID',
    color='orangered',
    filename='lodes_top_destination_tracts.png',
)
print(f"✅ Saved Top Destinations plot to: '{destination_plot_path}'")

# PLOT 3: Top N Busiest Commuter Routes
# Built as one chain so no column is assigned onto a slice of the matrix and
# nothing is sorted in place.
top_routes = (
    od_matrix_lodes
    .sort_values(by='total_commuters', ascending=False)
    .head(TOP_N)
    .assign(
        route=lambda routes: routes['origin_tract'].astype(str)
        + ' → '
        + routes['destination_tract'].astype(str)
    )
    .sort_values(by='total_commuters', ascending=True)
)
routes_plot_path = _save_barh_plot(
    top_routes.set_index('route')['total_commuters'],
    title=f'Top {TOP_N} Busiest Commuter Routes (LODES)',
    ylabel='Origin → Destination Route',
    color='darkviolet',
    filename='lodes_top_routes.png',
)
print(f"✅ Saved Top Routes plot to: '{routes_plot_path}'")

print("\n✅ Analysis and plotting complete.")


➡️ Step 6: Generating analysis plots...
✅ Saved Top Origins plot to: 'images/lodes_top_origin_tracts.png'
✅ Saved Top Destinations plot to: 'images/lodes_top_destination_tracts.png'
✅ Saved Top Routes plot to: 'images/lodes_top_routes.png'

✅ Analysis and plotting complete.
