In [1]:
import os


# Project root: the relative input/output paths used by later cells are
# resolved against this directory. The location can be overridden per machine
# with the BE_ACTIVE_TRAVEL_DIR environment variable; the original cluster
# path stays the default, so existing runs behave exactly as before.
project_dir = os.environ.get(
    'BE_ACTIVE_TRAVEL_DIR',
    '/blue/shenhaowang/qingqisong/be-and-active-travel',
)
if not os.path.isdir(project_dir):
    # Same exception type os.chdir would raise, but with an actionable message.
    raise FileNotFoundError(
        f"Project directory not found: {project_dir!r}. "
        "Set the BE_ACTIVE_TRAVEL_DIR environment variable to the project root."
    )
os.chdir(project_dir)

In [2]:
"""
Chicago Travel Survey - Complete Variable Visualization
========================================================
Map every configured survey variable as a coloured point layer.

Colour scale per variable: vmin is the smallest observed value; vmax starts
at 10 × mean, is capped at 1.5 × the observed maximum, and is then raised to
at least 2 × mean.
"""

In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
from matplotlib_scalebar.scalebar import ScaleBar
import contextily as ctx
import warnings
import os
from datetime import datetime

warnings.filterwarnings('ignore')
np.random.seed(42)

In [2]:
# ------------------------------------------------------------
# Run banner with the wall-clock start time
# ------------------------------------------------------------
banner_rule = "=" * 80
for banner_line in (
    banner_rule,
    "    CHICAGO TRAVEL SURVEY - COMPLETE VARIABLE VISUALIZATION",
    banner_rule,
):
    print(banner_line)
print(f"\nExecution Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# ============================================================
# CONFIGURATION
# ============================================================

# Input table and output folder, both relative to the working directory.
INPUT_PATH = './city_home_based_chicago_research_ready_v3_clean.csv'
OUTPUT_DIR = './city_home_based_visualizations'

# Coordinate reference systems: geographic lon/lat and the tile-basemap CRS.
CRS_WGS84, CRS_WEBMERCATOR = 'EPSG:4326', 'EPSG:3857'

# Rendering settings for the single-variable maps.
FIGURE_DPI = 200
MARKER_SIZE = 3
MARKER_ALPHA = 0.6

# Make sure the output folder exists before any figure is written.
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"\nOutput directory: {OUTPUT_DIR}")

In [2]:
# ============================================================
# VARIABLE CONFIGURATIONS
# ============================================================

# Define all variables to visualize with their settings
# Field names of one settings dict, in the order used by every row below.
_CONFIG_FIELDS = ('title_en', 'title_cn', 'cmap', 'cbar_label', 'category')

# One row per variable:
# (column name, English title, Chinese title, colormap, colourbar label, category)
_CONFIG_ROWS = [
    # Dependent variables (outcomes)
    ('Y_total_MET_minutes', 'Total MET Minutes', '总MET分钟数', 'OrRd', 'MET-minutes', 'outcome'),
    ('Y_transit_duration_min', 'Transit Duration', '公交出行时长', 'Blues', 'Minutes', 'outcome'),
    ('Y_active_duration', 'Active Travel Duration', '主动出行时长', 'YlOrRd', 'Minutes', 'outcome'),

    # Trip statistics
    ('n_walking_trips', 'Walking Trips', '步行出行次数', 'Oranges', 'Trips', 'trips'),
    ('n_cycling_trips', 'Cycling Trips', '骑行出行次数', 'Greens', 'Trips', 'trips'),
    ('n_transit_trips', 'Transit Trips', '公交出行次数', 'Blues', 'Trips', 'trips'),
    ('n_active_trips', 'Active Travel Trips', '主动出行次数', 'YlOrBr', 'Trips', 'trips'),
    ('n_total_trips', 'Total Trips', '总出行次数', 'Purples', 'Trips', 'trips'),
    ('total_travel_time_min', 'Total Travel Time', '总出行时间', 'RdPu', 'Minutes', 'trips'),

    # Demographics
    ('age', 'Age', '年龄', 'viridis', 'Years', 'demographics'),

    # Household
    ('hhsize', 'Household Size', '家庭人数', 'YlGn', 'Persons', 'household'),
    ('hhveh', 'Household Vehicles', '家庭车辆数', 'BuPu', 'Vehicles', 'household'),
    ('hhinc', 'Household Income Category', '家庭收入类别', 'Greens', 'Income Category', 'household'),

    # Spatial variables - transit
    ('dist_rail_ft', 'Distance to Rail (ft)', '到地铁站距离(英尺)', 'Purples_r', 'Feet', 'spatial'),
    ('dist_rail_mi', 'Distance to Rail (mi)', '到地铁站距离(英里)', 'Purples_r', 'Miles', 'spatial'),
    ('dist_bus_ft', 'Distance to Bus (ft)', '到公交站距离(英尺)', 'Greens_r', 'Feet', 'spatial'),
    ('dist_bus_mi', 'Distance to Bus (mi)', '到公交站距离(英里)', 'Greens_r', 'Miles', 'spatial'),
    ('bus_count_14mile', 'Bus Stops within 1/4 Mile', '1/4英里内公交站数', 'YlGn', 'Stops', 'spatial'),

    # Spatial variables - CBD
    ('dist_cbd_ft', 'Distance to CBD (ft)', '到CBD距离(英尺)', 'Blues_r', 'Feet', 'spatial'),
    ('dist_cbd_mi', 'Distance to CBD (mi)', '到CBD距离(英里)', 'Blues_r', 'Miles', 'spatial'),

    # Built environment (H_ prefix)
    ('H_intersection_density', 'Intersection Density', '交叉口密度', 'plasma', 'Density', 'built_env'),
    ('H_road_network_complexity', 'Road Network Complexity', '路网复杂度', 'inferno', 'Complexity', 'built_env'),
    ('H_building_density', 'Building Density', '建筑密度', 'viridis', 'Density', 'built_env'),
    ('H_land_use_diversity', 'Land Use Diversity', '土地利用多样性', 'cividis', 'Diversity Index', 'built_env'),
    ('H_amenity_density', 'Amenity Density', '设施密度', 'magma', 'Density', 'built_env'),
]

# Mapping: column name -> settings dict. Row order is preserved, so iteration
# over VARIABLE_CONFIGS follows the grouping above.
VARIABLE_CONFIGS = {
    column: dict(zip(_CONFIG_FIELDS, settings))
    for column, *settings in _CONFIG_ROWS
}

In [2]:
# ============================================================
# STEP 1: LOAD DATA
# ============================================================
# Reads the survey CSV, builds a point GeoDataFrame from the home-location
# columns, reprojects it to Web Mercator for the tile basemap and draws a
# fixed-seed sample that the later plotting steps use.

print("\n" + "=" * 80)
print("[STEP 1] Loading Data / 加载数据")
print("=" * 80)

df = pd.read_csv(INPUT_PATH)
print(f"\n  ✓ Loaded: {len(df):,} observations")
print(f"  ✓ Columns: {len(df.columns)}")

# Identify coordinate columns: projected home_x/home_y are preferred,
# otherwise fall back to the first recognised latitude/longitude pair.
use_projected = 'home_x' in df.columns and 'home_y' in df.columns
if use_projected:
    print("  ✓ Using projected coordinates (home_x, home_y)")
else:
    lat_col = next((c for c in ['home_lat', 'latitude', 'lat'] if c in df.columns), None)
    lon_col = next((c for c in ['home_lon', 'longitude', 'lon'] if c in df.columns), None)
    if lat_col is None or lon_col is None:
        # Without this guard the lookup below fails with an opaque "KeyError: None".
        raise KeyError(
            "No usable coordinate columns found: expected home_x/home_y, or one of "
            "home_lat/latitude/lat together with one of home_lon/longitude/lon."
        )
    print(f"  ✓ Using geographic coordinates ({lat_col}, {lon_col})")

# Filter valid coordinates and create GeoDataFrame
if use_projected:
    # Drop missing coordinates and the 0/0 placeholder position.
    valid_mask = df['home_x'].notna() & df['home_y'].notna() & (df['home_x'] != 0) & (df['home_y'] != 0)
    df_valid = df[valid_mask].copy()
    # Vectorised point construction; home_x/home_y are read as EPSG:3435 coordinates.
    geometry = gpd.points_from_xy(df_valid['home_x'], df_valid['home_y'])
    gdf = gpd.GeoDataFrame(df_valid, geometry=geometry, crs='EPSG:3435')
else:
    valid_mask = df[lat_col].notna() & df[lon_col].notna()
    # Keep only points inside the lon/lat bounding box used for the study area.
    chicago_mask = (
        (df[lat_col] >= 41.5) & (df[lat_col] <= 42.2) &
        (df[lon_col] >= -88.0) & (df[lon_col] <= -87.4)
    )
    df_valid = df[valid_mask & chicago_mask].copy()
    geometry = gpd.points_from_xy(df_valid[lon_col], df_valid[lat_col])
    gdf = gpd.GeoDataFrame(df_valid, geometry=geometry, crs=CRS_WGS84)

# Convert to Web Mercator for basemap
gdf = gdf.to_crs(CRS_WEBMERCATOR)
print(f"  ✓ Valid points: {len(gdf):,}")

# Sample for faster plotting. gdf_plot is always an independent copy so that
# temporary columns added while plotting never end up in gdf itself.
max_points = 25000
if len(gdf) > max_points:
    gdf_plot = gdf.sample(n=max_points, random_state=42).copy()
    print(f"  ✓ Sampled to {max_points:,} points for visualization")
else:
    gdf_plot = gdf.copy()


# ============================================================
# STEP 2: IDENTIFY AVAILABLE VARIABLES
# ============================================================
# Splits the configured variables into those that can be mapped (present in
# the data with non-zero spread) and those that cannot, and lists numeric
# columns that have no configuration entry.

print("\n" + "=" * 80)
print("[STEP 2] Identifying Variables / 识别变量")
print("=" * 80)

available_vars, missing_vars = [], []

for var in VARIABLE_CONFIGS:
    # Column absent from the data: report it under its plain name.
    if var not in gdf_plot.columns:
        missing_vars.append(var)
        continue

    # Column present: it is usable only if it has observations that vary.
    observed = gdf_plot[var].dropna()
    if len(observed) > 0 and observed.std() > 0:
        available_vars.append(var)
    else:
        missing_vars.append(var + " (no variance)")

# Numeric columns without a config entry, ignoring identifier/coordinate columns.
skipped_columns = ['sampno', 'perno', 'home_x', 'home_y', 'geometry']
all_numeric = gdf_plot.select_dtypes(include=[np.number]).columns.tolist()
extra_vars = [
    v for v in all_numeric
    if not (v in VARIABLE_CONFIGS or v in skipped_columns)
]

print(f"\n  Available configured variables: {len(available_vars)}")
for var in available_vars:
    cat = VARIABLE_CONFIGS[var]['category']
    print(f"    ✓ {var} [{cat}]")

# Only the first ten entries of each of the two lists below are shown.
if missing_vars:
    print(f"\n  Missing/Invalid variables: {len(missing_vars)}")
    print("\n".join(f"    ✗ {var}" for var in missing_vars[:10]))

if extra_vars:
    print(f"\n  Additional numeric variables found: {len(extra_vars)}")
    print("\n".join(f"    + {var}" for var in extra_vars[:10]))


# ============================================================
# STEP 3: CALCULATE VMIN/VMAX (10 × mean)
# ============================================================

def _color_range(values):
    """Derive the colour-scale limits for one variable.

    Parameters
    ----------
    values : pandas.Series
        Non-missing observations of the variable.

    Returns
    -------
    dict
        Keys ``vmin``, ``vmax``, ``mean`` and ``actual_max``.

    Notes
    -----
    ``vmin`` is the smallest observed value (it is not forced to 0).
    ``vmax`` starts at 10 × mean, is capped at 1.5 × the observed maximum and
    is then raised to at least 2 × mean; if it still does not exceed ``vmin``
    it is set to ``vmin + 1`` so the colour normalisation has a positive span.
    """
    mean_val = values.mean()
    min_val = values.min()
    max_val = values.max()

    vmin = min_val

    vmax = mean_val * 10
    vmax = min(vmax, max_val * 1.5)  # Cap at 1.5× actual max
    vmax = max(vmax, mean_val * 2)   # But at least 2× mean

    # Ensure vmax > vmin
    if vmax <= vmin:
        vmax = vmin + 1

    return {
        'vmin': vmin,
        'vmax': vmax,
        'mean': mean_val,
        'actual_max': max_val
    }


print("\n" + "=" * 80)
print("[STEP 3] Calculating Vmin/Vmax (10 × mean) / 计算颜色范围")
print("=" * 80)

# Limits are computed on the plotted sample, ignoring missing values.
vmin_vmax_dict = {
    var: _color_range(gdf_plot[var].dropna())
    for var in available_vars
}

# The Vmax column shows the final value after capping, which is often
# different from a plain 10 × mean, hence the neutral column label.
print(f"\n  {'Variable':<30} {'Mean':>10} {'Vmax (capped)':>15} {'Actual Max':>12}")
print("  " + "-" * 70)
for var in available_vars:
    info = vmin_vmax_dict[var]
    print(f"  {var:<30} {info['mean']:>10.2f} {info['vmax']:>15.2f} {info['actual_max']:>12.2f}")

In [2]:
# ============================================================
# STEP 4: CREATE VISUALIZATIONS
# ============================================================
# Writes one PNG per available variable: points coloured by the clipped
# value, tile basemap, colourbar, scale bar, north arrow and category tag.

print("\n" + "=" * 80)
print("[STEP 4] Creating Visualizations / 创建可视化")
print("=" * 80)

successful = []
failed = []

# Web Mercator stretches distances by 1 / cos(latitude), so one map unit
# corresponds to cos(latitude) metres on the ground. The scale bar uses the
# mean latitude of the plotted points; 1.0 is the fallback when it cannot be
# computed.
metres_per_map_unit = 1.0
if len(gdf_plot) > 0:
    _mean_lat = gdf_plot.to_crs(CRS_WGS84).geometry.y.mean()
    if np.isfinite(_mean_lat):
        metres_per_map_unit = float(np.cos(np.radians(_mean_lat)))


def _add_basemap_with_fallback(ax, zoom=11):
    """Draw a tile basemap on ``ax``, trying a dark and then a light provider.

    The provider lookups are wrapped in lambdas so that a provider missing
    from the installed tile catalogue is handled like a download failure.
    Returns True when a basemap was drawn, False when every attempt failed
    (the map is then saved without a basemap).
    """
    candidates = (
        (lambda: ctx.providers.CartoDB.DarkMatter, 0.9),
        # Replaces the former Stamen.TonerLite fallback, which current
        # provider catalogues no longer ship.
        (lambda: ctx.providers.OpenStreetMap.Mapnik, 0.7),
    )
    for get_source, alpha in candidates:
        try:
            ctx.add_basemap(ax, source=get_source(), zoom=zoom, alpha=alpha)
            return True
        except Exception as exc:
            print(f"      ! Basemap attempt failed ({type(exc).__name__}: {exc})")
    return False


for i, var in enumerate(available_vars):
    config = VARIABLE_CONFIGS[var]
    vmin = vmin_vmax_dict[var]['vmin']
    vmax = vmin_vmax_dict[var]['vmax']

    print(f"\n  [{i+1}/{len(available_vars)}] {var}...")
    print(f"      vmin={vmin:.2f}, vmax={vmax:.2f}")

    fig = None
    try:
        # Create figure
        fig, ax = plt.subplots(figsize=(12, 10))

        # Plot only rows that have an observation: the colour limits were
        # computed without missing values, so drawing them as 0 would show
        # fabricated data. Values are clipped to the colour range. The
        # temporary column lives on a derived frame, leaving gdf_plot intact.
        observed = gdf_plot.loc[gdf_plot[var].notna()]
        layer = observed.assign(_plot_var=observed[var].clip(lower=vmin, upper=vmax))

        # Plot points
        layer.plot(
            ax=ax,
            column='_plot_var',
            cmap=config['cmap'],
            markersize=MARKER_SIZE,
            alpha=MARKER_ALPHA,
            legend=False,
            vmin=vmin,
            vmax=vmax
        )

        # Add basemap (best effort: the map is still saved without one)
        _add_basemap_with_fallback(ax, zoom=11)

        # Colorbar
        sm = plt.cm.ScalarMappable(
            cmap=config['cmap'],
            norm=plt.Normalize(vmin=vmin, vmax=vmax)
        )
        sm.set_array([])
        cbar = fig.colorbar(sm, ax=ax, shrink=0.7, pad=0.02)
        cbar.set_label(config['cbar_label'], fontsize=11)
        cbar.ax.tick_params(labelsize=9)

        # Title with statistics
        mean_val = vmin_vmax_dict[var]['mean']
        title = (f"{config['title_en']}\n{config['title_cn']}\n"
                f"(Mean: {mean_val:.2f}, Vmax: {vmax:.2f})")
        ax.set_title(title, fontsize=14, fontweight='bold', pad=10)

        # Clean axes
        ax.set_xticks([])
        ax.set_yticks([])
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Scale bar (best effort)
        try:
            scalebar = ScaleBar(metres_per_map_unit, location='lower right',
                                length_fraction=0.2,
                                box_color='white', box_alpha=0.7)
            ax.add_artist(scalebar)
        except Exception as exc:
            print(f"      ! Scale bar skipped ({type(exc).__name__}: {exc})")

        # North arrow
        ax.annotate('N', xy=(0.95, 0.95), xytext=(0.95, 0.87),
                   xycoords='axes fraction', textcoords='axes fraction',
                   fontsize=12, fontweight='bold', color='white', ha='center',
                   arrowprops=dict(arrowstyle='->', color='white', lw=2))

        # Add category label
        category = config['category']
        ax.text(0.02, 0.98, f"[{category.upper()}]", transform=ax.transAxes,
               fontsize=10, color='white', ha='left', va='top',
               bbox=dict(boxstyle='round', facecolor='black', alpha=0.7))

        # Save
        output_file = os.path.join(OUTPUT_DIR, f'{var}.png')
        fig.savefig(output_file, dpi=FIGURE_DPI, bbox_inches='tight',
                    facecolor='white', edgecolor='none')

        successful.append(var)
        print(f"      ✓ Saved: {output_file}")

    except Exception as e:
        # Record the failure and continue with the next variable.
        failed.append((var, str(e)))
        print(f"      ✗ Error: {e}")

    finally:
        # Close exactly this figure, whether or not the map was saved.
        if fig is not None:
            plt.close(fig)

In [2]:
# ============================================================
# STEP 5: CREATE SUMMARY GRID
# ============================================================
# For every category with at least two successfully mapped variables, writes
# one multi-panel PNG (up to three panels per row).

print("\n" + "=" * 80)
print("[STEP 5] Creating Summary Grid / 创建汇总网格")
print("=" * 80)

# Group variables by category (first-seen order is kept)
categories = {}
for var in successful:
    categories.setdefault(VARIABLE_CONFIGS[var]['category'], []).append(var)

print(f"\n  Variables by category:")
for cat, cat_vars in categories.items():
    print(f"    {cat}: {len(cat_vars)} variables")

# Create a summary grid for each category
for cat, cat_vars in categories.items():
    if len(cat_vars) < 2:
        continue

    n_vars = len(cat_vars)
    n_cols = min(3, n_vars)
    n_rows = (n_vars + n_cols - 1) // n_cols  # ceiling division

    print(f"\n  Creating {cat} summary grid ({n_rows}×{n_cols})...")

    # squeeze=False always returns a 2-D array of axes, so [row, col]
    # indexing works for any grid shape without manual reshaping.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(6*n_cols, 5*n_rows),
                             squeeze=False)
    try:
        basemap_warned = False

        for idx, var in enumerate(cat_vars):
            ax = axes[idx // n_cols, idx % n_cols]

            config = VARIABLE_CONFIGS[var]
            vmin = vmin_vmax_dict[var]['vmin']
            vmax = vmin_vmax_dict[var]['vmax']

            # Plot only observed values, clipped to the colour range. Missing
            # values are left out rather than drawn as 0, and the temporary
            # column lives on a derived frame so gdf_plot is not modified.
            observed = gdf_plot.loc[gdf_plot[var].notna()]
            layer = observed.assign(_plot_var=observed[var].clip(lower=vmin, upper=vmax))

            layer.plot(
                ax=ax,
                column='_plot_var',
                cmap=config['cmap'],
                markersize=1,
                alpha=0.5,
                legend=False,
                vmin=vmin,
                vmax=vmax
            )

            # Basemap is best effort; report the first failure per grid only.
            try:
                ctx.add_basemap(ax, source=ctx.providers.CartoDB.DarkMatter, zoom=10, alpha=0.8)
            except Exception as exc:
                if not basemap_warned:
                    print(f"    ! Basemap unavailable ({type(exc).__name__}: {exc})")
                    basemap_warned = True

            ax.set_title(f"{config['title_en']}\n{config['title_cn']}", fontsize=10, fontweight='bold')
            ax.set_xticks([])
            ax.set_yticks([])
            for spine in ax.spines.values():
                spine.set_visible(False)

        # Hide empty subplots
        for idx in range(n_vars, n_rows * n_cols):
            axes[idx // n_cols, idx % n_cols].axis('off')

        fig.suptitle(f'{cat.upper()} Variables\n{cat.upper()}类变量',
                     fontsize=16, fontweight='bold', y=1.02)

        fig.tight_layout()
        output_file = os.path.join(OUTPUT_DIR, f'_summary_{cat}.png')
        fig.savefig(output_file, dpi=150, bbox_inches='tight', facecolor='white')
        print(f"    ✓ Saved: {output_file}")
    finally:
        # Release the figure even when plotting or saving raised.
        plt.close(fig)

In [2]:
# ============================================================
# FINAL SUMMARY
# ============================================================
# Prints the run report: counts, generated files, failures and the vmax
# rule. The headline and closing banner reflect whether any map failed.

print("\n" + "=" * 80)
print("FINAL SUMMARY")
print("=" * 80)

summary_rule = "─" * 76
headline = "VISUALIZATION COMPLETE" if not failed else "VISUALIZATION FINISHED WITH ERRORS"

print()
print("═" * 80)
print(headline.center(80))
print("═" * 80)

print(f"""
  Output Directory: {OUTPUT_DIR}

  Statistics:
  {summary_rule}
    ✓ Successful: {len(successful)} maps
    ✗ Failed: {len(failed)} maps

  Generated Files:
  {summary_rule}
""")

# List all files
for var in successful:
    config = VARIABLE_CONFIGS[var]
    print(f"    ✓ {var}.png - {config['title_en']}")

# A summary grid exists only for categories with at least two mapped variables.
print("\n  Summary Grids:")
for cat, cat_vars in categories.items():
    if len(cat_vars) >= 2:
        print(f"    ✓ _summary_{cat}.png")

if failed:
    print("\n  Failed Variables:")
    for var, error in failed:
        print(f"    ✗ {var}: {error}")

if failed:
    closing = f"✗ {len(failed)} OF {len(successful) + len(failed)} VISUALIZATIONS FAILED"
else:
    closing = "✓ ALL VISUALIZATIONS COMPLETE"

print(f"""
  Vmax Calculation:
  {summary_rule}
    vmax = min(10 × mean, 1.5 × actual_max)
    vmax = max(vmax, 2 × mean)

{"═" * 80}
{closing.center(80)}
{"═" * 80}
""")

    CHICAGO TRAVEL SURVEY - COMPLETE VARIABLE VISUALIZATION

Execution Time: 2026-02-10 21:12:41

Output directory: ./city_home_based_visualizations

[STEP 1] Loading Data / 加载数据

  ✓ Loaded: 6,129 observations
  ✓ Columns: 56
  ✓ Using projected coordinates (home_x, home_y)
  ✓ Valid points: 6,129

[STEP 2] Identifying Variables / 识别变量

  Available configured variables: 25
    ✓ Y_total_MET_minutes [outcome]
    ✓ Y_transit_duration_min [outcome]
    ✓ Y_active_duration [outcome]
    ✓ n_walking_trips [trips]
    ✓ n_cycling_trips [trips]
    ✓ n_transit_trips [trips]
    ✓ n_active_trips [trips]
    ✓ n_total_trips [trips]
    ✓ total_travel_time_min [trips]
    ✓ age [demographics]
    ✓ hhsize [household]
    ✓ hhveh [household]
    ✓ hhinc [household]
    ✓ dist_rail_ft [spatial]
    ✓ dist_rail_mi [spatial]
    ✓ dist_bus_ft [spatial]
    ✓ dist_bus_mi [spatial]
    ✓ bus_count_14mile [spatial]
    ✓ dist_cbd_ft [spatial]
    ✓ dist_cbd_mi [spatial]
    ✓ H_intersection_density [