In [None]:
# NYC 311 Service Request Response Time Prediction
# Notebook 1: Data Loading, Sampling & Initial EDA

# Import libraries
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import os
import gc  # For garbage collection
import pandas as pd  # For compatibility with visualization libs if needed

# Set plotting style
# NOTE(review): seaborn's theme is applied after the matplotlib 'ggplot' style,
# so it presumably overrides overlapping style settings - confirm which look is wanted.
plt.style.use('ggplot')
sns.set(style="whitegrid")

# For reproducibility
# This seeds NumPy's global random generator only.
# NOTE(review): sampling done through Polars likely needs its own `seed=` argument - confirm.
np.random.seed(42)

In [None]:
# Make modules that live one level above the working directory importable.
import sys

# Resolve the parent of the current working directory once, as an absolute path,
# so the entry stays valid even if the working directory changes later.
# (The previous relative '../' entry pointed at the same directory.)
PARENT_DIR = os.path.dirname(os.path.abspath(os.getcwd()))

# Append only when missing so re-running this cell does not pile up duplicates.
if PARENT_DIR not in sys.path:
    sys.path.append(PARENT_DIR)

# Data Loading

In [None]:
# Function to load data with Polars
def load_nyc_311_data(file_path, **scan_kwargs):
    """
    Lazily open the NYC 311 service request CSV with Polars and report its size

    Parameters:
    -----------
    file_path : str
        Path to the data file
    **scan_kwargs
        Optional keyword arguments forwarded unchanged to ``pl.scan_csv``
        (for example ``infer_schema_length``). Defaults to none, which
        reproduces the previous behaviour.

    Returns:
    --------
    pl.LazyFrame
        Lazy query over the CSV file. No rows are kept in memory by this
        function; callers decide what to ``collect()``.
    """
    print(f"Loading data from {file_path}...")
    start_time = datetime.now()

    # scan_csv builds a lazy query instead of reading the whole file eagerly.
    lazy_df = pl.scan_csv(file_path, **scan_kwargs)

    # Newer Polars versions expose collect_schema(); fall back to the
    # .schema property on versions that do not have it.
    if hasattr(lazy_df, "collect_schema"):
        schema = lazy_df.collect_schema()
    else:
        schema = lazy_df.schema
    print("Data schema:")
    print(schema)

    # Count rows with an aggregation-only query so only a single value is collected.
    # pl.len() replaces the deprecated pl.count() where it is available.
    count_expr = pl.len() if hasattr(pl, "len") else pl.count()
    row_count = lazy_df.select(count_expr).collect()[0, 0]
    print(f"Total rows: {row_count:,}")

    end_time = datetime.now()
    print(f"Data loaded in {(end_time - start_time).total_seconds():.2f} seconds")

    return lazy_df

# Path to the data file - adjust as needed
# (relative path, so it is resolved against the notebook's working directory)
data_file = "../NYC_311_Data/NYC_311_complete.csv"

# Load data
# The returned object is the result of pl.scan_csv (a lazy query), hence the name df_lazy.
df_lazy = load_nyc_311_data(data_file)

In [14]:
# Display first few rows to verify data loading
print("Preview of data:")
# Restrict the lazy query to five rows before collecting, so only those rows are materialized.
df_sample = df_lazy.limit(5).collect()
# `display` is not imported in this notebook; presumably it is the notebook environment's rich display helper.
display(df_sample)

Preview of data:


:@computed_region_7mpf_4k6g,:@computed_region_92fq_4b7q,:@computed_region_efsh_h5xi,:@computed_region_f5dn_yrer,:@computed_region_sbqj_enih,:@computed_region_yeji_bk3q,address_type,agency,agency_name,bbl,borough,bridge_highway_direction,bridge_highway_name,bridge_highway_segment,city,closed_date,community_board,complaint_type,created_date,cross_street_1,cross_street_2,descriptor,due_date,facility_type,incident_address,incident_zip,intersection_street_1,intersection_street_2,landmark,latitude,location,location_type,longitude,open_data_channel_type,park_borough,park_facility_name,resolution_action_updated_date,resolution_description,road_ramp,status,street_name,taxi_company_borough,taxi_pick_up_location,unique_key,vehicle_type,x_coordinate_state_plane,y_coordinate_state_plane
i64,i64,i64,i64,i64,i64,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,f64,str,str,f64,str,str,str,str,str,str,str,str,str,str,i64,str,i64,i64
57,38,18182,36,57,2,"""ADDRESS""","""NYPD""","""New York City Police Departmen…",3026997501,"""BROOKLYN""",,,,"""BROOKLYN""","""2025-02-20T09:46:41.000""","""01 BROOKLYN""","""Illegal Parking""","""2025-02-20T09:18:31.000""","""GRAHAM AVENUE""","""ECKFORD STREET""","""Blocked Crosswalk""",,,"""247 DRIGGS AVENUE""",11222,"""GRAHAM AVENUE""","""ECKFORD STREET""","""DRIGGS AVENUE""",40.722686,"""{'latitude': '40.7226864471804…","""Street/Sidewalk""",-73.947772,"""MOBILE""","""BROOKLYN""","""Unspecified""","""2025-02-20T09:46:44.000""","""The Police Department responde…",,"""Closed""","""DRIGGS AVENUE""",,,64141859,,998727,202575
44,11,13509,17,44,2,"""ADDRESS""","""HPD""","""Department of Housing Preserva…",3050260070,"""BROOKLYN""",,,,"""BROOKLYN""","""2025-02-20T19:36:41.000""","""09 BROOKLYN""","""HEAT/HOT WATER""","""2025-02-20T09:18:30.000""",,,"""ENTIRE BUILDING""",,,"""552 FLATBUSH AVENUE""",11225,,,,40.660528,"""{'latitude': '40.6605280296451…","""RESIDENTIAL BUILDING""",-73.960644,"""ONLINE""","""BROOKLYN""","""Unspecified""","""2025-02-20T00:00:00.000""","""The Department of Housing Pres…",,"""Closed""","""FLATBUSH AVENUE""",,,64140044,,995169,179927
28,12,11270,43,28,5,"""ADDRESS""","""HPD""","""Department of Housing Preserva…",2041637501,"""BRONX""",,,,"""BRONX""","""2025-02-20T17:56:49.000""","""10 BRONX""","""HEAT/HOT WATER""","""2025-02-20T09:18:27.000""",,,"""ENTIRE BUILDING""",,,"""1725 EDISON AVENUE""",10461,,,,40.845857,"""{'latitude': '40.8458565695313…","""RESIDENTIAL BUILDING""",-73.832637,"""ONLINE""","""BRONX""","""Unspecified""","""2025-02-20T00:00:00.000""","""The Department of Housing Pres…",,"""Closed""","""EDISON AVENUE""",,,64139849,,1030555,247490
22,39,13098,47,22,4,"""ADDRESS""","""NYPD""","""New York City Police Departmen…",1021790511,"""MANHATTAN""",,,,"""NEW YORK""","""2025-02-21T06:21:25.000""","""12 MANHATTAN""","""Blocked Driveway""","""2025-02-20T09:18:09.000""","""WEST 190 STREET""","""CABRINI BOULEVARD""","""Partial Access""",,,"""701 FORT WASHINGTON AVENUE""",10040,"""WEST 190 STREET""","""CABRINI BOULEVARD""","""FORT WASHINGTON AVENUE""",40.8578,"""{'latitude': '40.8578003775818…","""Street/Sidewalk""",-73.9351,"""PHONE""","""MANHATTAN""","""Unspecified""","""2025-02-21T06:21:29.000""","""The Police Department responde…",,"""Closed""","""FORT WASHINGTON AVENUE""",,,64141525,,1002203,251804
61,6,24332,41,61,3,"""ADDRESS""","""DOT""","""Department of Transportation""",4108670050,"""QUEENS""",,,,"""HOLLIS""","""2025-02-20T16:32:04.000""","""12 QUEENS""","""Street Condition""","""2025-02-20T09:18:08.000""","""100 AVENUE""","""104 AVENUE""","""Blocked - Construction""",,,"""100-35 200 STREET""",11423,"""100 AVENUE""","""104 AVENUE""","""200 STREET""",40.710279,"""{'latitude': '40.7102791635360…","""Street""",-73.759318,"""ONLINE""","""QUEENS""","""Unspecified""","""2025-02-20T16:32:08.000""","""The Department of Transportati…",,"""Closed""","""200 STREET""",,,64144217,,1050976,198142


In [15]:
# List the column names of the five-row preview; later cells must use these exact names.
df_sample.columns

[':@computed_region_7mpf_4k6g',
 ':@computed_region_92fq_4b7q',
 ':@computed_region_efsh_h5xi',
 ':@computed_region_f5dn_yrer',
 ':@computed_region_sbqj_enih',
 ':@computed_region_yeji_bk3q',
 'address_type',
 'agency',
 'agency_name',
 'bbl',
 'borough',
 'bridge_highway_direction',
 'bridge_highway_name',
 'bridge_highway_segment',
 'city',
 'closed_date',
 'community_board',
 'complaint_type',
 'created_date',
 'cross_street_1',
 'cross_street_2',
 'descriptor',
 'due_date',
 'facility_type',
 'incident_address',
 'incident_zip',
 'intersection_street_1',
 'intersection_street_2',
 'landmark',
 'latitude',
 'location',
 'location_type',
 'longitude',
 'open_data_channel_type',
 'park_borough',
 'park_facility_name',
 'resolution_action_updated_date',
 'resolution_description',
 'road_ramp',
 'status',
 'street_name',
 'taxi_company_borough',
 'taxi_pick_up_location',
 'unique_key',
 'vehicle_type',
 'x_coordinate_state_plane',
 'y_coordinate_state_plane']

# Smart Sampling Strategy

In [None]:
def create_stratified_sample(df_lazy, sample_size=100000, seed=42):
    """
    Create an agency-stratified random sample of closed service requests

    The input must provide the columns ``created_date``, ``closed_date``,
    ``agency`` and ``complaint_type`` (the names shown in the data preview).
    The result keeps every input column and adds ``created_datetime``,
    ``closed_datetime``, ``response_time_hours``, ``year`` and ``month``.

    Parameters:
    -----------
    df_lazy : pl.LazyFrame
        Original data in lazy format
    sample_size : int
        Target sample size (upper bound on the number of returned rows)
    seed : int
        Seed passed to Polars' sampling so the sample is reproducible

    Returns:
    --------
    pl.DataFrame
        Sampled data
    """
    print(f"Creating stratified sample of approximately {sample_size:,} rows...")

    # The data preview shows ISO-8601 timestamps such as "2025-02-20T09:46:41.000".
    timestamp_format = "%Y-%m-%dT%H:%M:%S%.f"

    # 1. Parse timestamps and derive the target variable (hours from creation to closure).
    #    strict=False turns unparseable values into nulls, which the next filter removes.
    prepared = (
        df_lazy.with_columns([
            pl.col("created_date").str.to_datetime(timestamp_format, strict=False).alias("created_datetime"),
            pl.col("closed_date").str.to_datetime(timestamp_format, strict=False).alias("closed_datetime"),
        ])
        .filter(
            # Drop requests that are still open or have no usable creation time
            pl.col("closed_datetime").is_not_null()
            & pl.col("created_datetime").is_not_null()
        )
        .with_columns([
            ((pl.col("closed_datetime") - pl.col("created_datetime")).dt.total_seconds() / 3600).alias("response_time_hours"),
        ])
        .filter(
            # Negative durations are data errors
            pl.col("response_time_hours") >= 0
        )
        # 2. Calendar fields so later analysis can check coverage over time
        .with_columns([
            pl.col("created_datetime").dt.year().alias("year"),
            pl.col("created_datetime").dt.month().alias("month"),
        ])
    )

    # 3. Collect the distinct agencies and complaint types in one call.
    agencies_info, complaint_types_info = pl.collect_all([
        prepared.select(pl.col("agency")).unique(),
        prepared.select(pl.col("complaint_type")).unique(),
    ])

    print(f"Number of unique agencies: {len(agencies_info)}")
    print(f"Number of unique complaint types: {len(complaint_types_info)}")

    if len(agencies_info) == 0:
        # Nothing survived the filters; return the (empty) prepared frame so
        # callers still receive the expected columns.
        final_sample = prepared.collect()
        print(f"Final sample size: {len(final_sample):,} rows")
        return final_sample

    # Equal allocation per agency, with a floor of 100 rows
    target_per_agency = max(100, sample_size // len(agencies_info))

    # Sort the agencies (nulls last) so the result does not depend on the
    # unspecified order returned by unique().
    agency_values = sorted(
        agencies_info["agency"].to_list(),
        key=lambda value: (value is None, "" if value is None else value),
    )

    agency_samples = []
    for agency in agency_values:
        # `== None` never matches in Polars, so null agencies need is_null()
        if agency is None:
            agency_filter = pl.col("agency").is_null()
        else:
            agency_filter = pl.col("agency") == agency

        # LazyFrame has no sample(); materialize one agency at a time and
        # sample the eager frame. This keeps at most one agency in memory.
        agency_rows = prepared.filter(agency_filter).collect()

        # Small agencies contribute everything they have instead of failing
        rows_to_take = min(target_per_agency, agency_rows.height)
        if rows_to_take > 0:
            agency_samples.append(
                agency_rows.sample(n=rows_to_take, with_replacement=False, seed=seed)
            )
        del agency_rows

    if not agency_samples:
        final_sample = prepared.collect()
        print(f"Final sample size: {len(final_sample):,} rows")
        return final_sample

    # Combine all agency samples
    combined_sample = pl.concat(agency_samples)

    # Cap at the requested size; this also shuffles the agency blocks together
    final_sample = combined_sample.sample(
        n=min(sample_size, combined_sample.height),
        with_replacement=False,
        shuffle=True,
        seed=seed,
    )

    print(f"Final sample size: {len(final_sample):,} rows")

    return final_sample

# Create a stratified sample
# Uses the function's default sample_size (100000).
sample_df = create_stratified_sample(df_lazy)

# Display sample information
print("\nSample information:")
# shape is (rows, columns)
print(f"Shape: {sample_df.shape}")

# Initial Data Quality Assessment

In [None]:
def assess_data_quality(df, key_col="unique_key"):
    """
    Print a data quality report: missing values, duplicate keys, dtypes
    and basic statistics of numeric columns

    Parameters:
    -----------
    df : pl.DataFrame
        Data to assess (eager frame)
    key_col : str
        Column expected to identify a request uniquely; used for the
        duplicate check. The check is skipped when the column is absent.

    Returns:
    --------
    None
    """
    print("Assessing data quality...")

    n_rows = df.height

    # 1. Check missing values
    # null_count() yields a single row holding one count per column.
    null_counts = [int(count) for count in df.null_count().row(0)] if df.width else []
    missing_pct = [
        round(count / n_rows * 100, 2) if n_rows else 0.0
        for count in null_counts
    ]

    # Create a summary DataFrame (explicit schema keeps dtypes stable when empty)
    missing_summary = pl.DataFrame(
        {
            "Column": df.columns,
            "Missing Count": null_counts,
            "Missing Percentage": missing_pct,
        },
        schema={
            "Column": pl.Utf8,
            "Missing Count": pl.Int64,
            "Missing Percentage": pl.Float64,
        },
    ).filter(
        pl.col("Missing Count") > 0
    ).sort(
        "Missing Percentage",
        descending=True
    )

    print("\nMissing Values Summary:")
    print(missing_summary)

    # 2. Check for duplicates (rows sharing the same key value)
    if key_col in df.columns:
        duplicate_count = n_rows - df[key_col].n_unique()
        duplicate_pct = duplicate_count / n_rows * 100 if n_rows else 0.0
        print(f"\nDuplicate Rows: {duplicate_count} ({duplicate_pct:.2f}%)")
    else:
        print(f"\nDuplicate Rows: skipped (column '{key_col}' not found)")

    # 3. Check data types
    print("\nData Types:")
    for col, dtype in zip(df.columns, df.dtypes):
        print(f"{col}: {dtype}")

    # 4. Check for outliers in numeric columns
    print("\nNumeric Column Statistics:")
    numeric_dtypes = (pl.Int64, pl.Float64, pl.Int32, pl.Float32)
    numeric_cols = [
        col for col, dtype in zip(df.columns, df.dtypes)
        if dtype in numeric_dtypes
    ]

    # Always report the target, but never twice
    if "response_time_hours" in df.columns and "response_time_hours" not in numeric_cols:
        numeric_cols.append("response_time_hours")

    for col in numeric_cols:
        # df is eager, so select() already returns the computed result
        stats = df.select([
            pl.col(col).min().alias("Min"),
            pl.col(col).max().alias("Max"),
            pl.col(col).mean().alias("Mean"),
            pl.col(col).median().alias("Median"),
            pl.col(col).std().alias("Std"),
        ])

        print(f"\n{col}:")
        print(stats)

# Assess data quality
# Runs on the collected sample, not on the full lazy dataset.
assess_data_quality(sample_df)

# Distribution Analysis

In [None]:
def _finish_figure(filename):
    """Tighten the current figure's layout, save it to `filename`, show it and close it."""
    plt.tight_layout()
    plt.savefig(filename)
    plt.show()
    # Close explicitly so repeated calls do not accumulate open figures.
    plt.close()


def _bar_chart(x, y, title, xlabel, ylabel, filename, figsize, rotate_labels=False):
    """Draw one labelled seaborn bar chart and write it to `filename`."""
    plt.figure(figsize=figsize)
    sns.barplot(x=list(x), y=list(y))
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    if rotate_labels:
        plt.xticks(rotation=90)
    _finish_figure(filename)


def analyze_distributions(df, agency_col="agency", complaint_col="complaint_type"):
    """
    Plot distributions of the response time and of key categorical and
    temporal features, saving each figure as a PNG in the working directory

    Parameters:
    -----------
    df : pl.DataFrame
        Data to analyze. Must contain ``response_time_hours``,
        ``created_datetime`` and the two columns named below.
    agency_col : str
        Name of the agency column
    complaint_col : str
        Name of the complaint type column

    Returns:
    --------
    None

    Raises:
    -------
    KeyError
        If a required column is missing from ``df``.
    """
    print("Analyzing distributions of key features...")

    required = ["response_time_hours", "created_datetime", agency_col, complaint_col]
    missing = [name for name in required if name not in df.columns]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Convert to pandas for easier plotting with matplotlib/seaborn
    pdf = df.to_pandas()
    if pdf.empty:
        print("No rows to plot.")
        return

    response = pdf["response_time_hours"]
    median_hours = response.median()
    mean_hours = response.mean()

    # 1. Response Time Distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(response, bins=50, kde=True)
    plt.title("Distribution of Response Time (Hours)")
    plt.xlabel("Response Time (Hours)")
    plt.ylabel("Frequency")
    plt.axvline(median_hours, color='r', linestyle='--', label=f'Median: {median_hours:.2f} hours')
    plt.axvline(mean_hours, color='g', linestyle='--', label=f'Mean: {mean_hours:.2f} hours')
    plt.legend()
    _finish_figure("response_time_distribution.png")

    # Same data on a log1p scale to judge whether a log transform is needed
    plt.figure(figsize=(10, 6))
    sns.histplot(np.log1p(response), bins=50, kde=True)
    plt.title("Distribution of Log Response Time (Hours)")
    plt.xlabel("Log Response Time (Hours)")
    plt.ylabel("Frequency")
    _finish_figure("log_response_time_distribution.png")

    # 2. Agency Distribution
    agency_counts = pdf[agency_col].value_counts()
    _bar_chart(
        agency_counts.index, agency_counts.values,
        "Distribution of Service Requests by Agency", "Agency", "Count",
        "agency_distribution.png", (12, 8), rotate_labels=True,
    )

    # 3. Complaint Type Distribution (Top 20)
    complaint_counts = pdf[complaint_col].value_counts().head(20)
    _bar_chart(
        complaint_counts.index, complaint_counts.values,
        "Top 20 Complaint Types", "Complaint Type", "Count",
        "complaint_type_distribution.png", (14, 8), rotate_labels=True,
    )

    # 4. Response Time by Agency (ascending median)
    agency_median = pdf.groupby(agency_col)["response_time_hours"].median().sort_values()
    _bar_chart(
        agency_median.index, agency_median.values,
        "Median Response Time by Agency", "Agency", "Median Response Time (Hours)",
        "median_response_time_by_agency.png", (14, 8), rotate_labels=True,
    )

    # 5. Response Time by Complaint Type (20 slowest by median)
    complaint_median = (
        pdf.groupby(complaint_col)["response_time_hours"]
        .median()
        .sort_values(ascending=False)
        .head(20)
    )
    _bar_chart(
        complaint_median.index, complaint_median.values,
        "Top 20 Complaint Types by Median Response Time", "Complaint Type",
        "Median Response Time (Hours)",
        "median_response_time_by_complaint_type.png", (16, 8), rotate_labels=True,
    )

    # 6. Temporal Analysis
    created = pd.to_datetime(pdf["created_datetime"])
    pdf["hour_of_day"] = created.dt.hour
    pdf["day_of_week"] = created.dt.dayofweek  # 0 = Monday
    pdf["month"] = created.dt.month

    # By Hour
    plt.figure(figsize=(10, 6))
    hour_response = pdf.groupby("hour_of_day")["response_time_hours"].median()
    sns.lineplot(x=hour_response.index, y=hour_response.values)
    plt.title("Median Response Time by Hour of Day")
    plt.xlabel("Hour of Day")
    plt.ylabel("Median Response Time (Hours)")
    _finish_figure("response_time_by_hour.png")

    # By Day of Week
    day_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    day_response = pdf.groupby("day_of_week")["response_time_hours"].median()
    _bar_chart(
        [day_names[i] for i in day_response.index], day_response.values,
        "Median Response Time by Day of Week", "Day of Week",
        "Median Response Time (Hours)",
        "response_time_by_day.png", (10, 6),
    )

    # By Month (month numbers are 1-based)
    month_names = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
    month_response = pdf.groupby("month")["response_time_hours"].median()
    _bar_chart(
        [month_names[i - 1] for i in month_response.index], month_response.values,
        "Median Response Time by Month", "Month",
        "Median Response Time (Hours)",
        "response_time_by_month.png", (10, 6),
    )

# Analyze distributions
# Each figure is shown inline and also written as a PNG into the working directory.
analyze_distributions(sample_df)

# Save Results

In [None]:
# Save the sample for future use
sample_df.write_csv("nyc_311_stratified_sample.csv")

# Save summary statistics
# Column names follow the loaded data's schema (`agency`, `complaint_type`).
# Note: total_rows re-runs a count query over the full lazy dataset.
summary_stats = {
    "total_rows": df_lazy.select(pl.count()).collect()[0, 0],
    "sample_rows": len(sample_df),
    "median_response_time": sample_df["response_time_hours"].median(),
    "mean_response_time": sample_df["response_time_hours"].mean(),
    "agency_count": sample_df["agency"].n_unique(),
    "complaint_type_count": sample_df["complaint_type"].n_unique()
}

# Write to a text file, one "key: value" pair per line
with open("data_summary.txt", "w") as f:
    for key, value in summary_stats.items():
        f.write(f"{key}: {value}\n")

print("Results saved!")