# NYC 311 Service Request Response Time Prediction
# Notebook 1: Data Loading and Cleaning

In [None]:
# Import libraries
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import os
import gc  # For garbage collection
import pandas as pd  # For compatibility with visualization libs if needed

# Seed NumPy's global random generator so runs are repeatable
np.random.seed(42)

# Plot appearance: matplotlib's 'ggplot' style sheet is applied first,
# then seaborn's "whitegrid" style is set afterwards
plt.style.use('ggplot')
sns.set(style="whitegrid")

In [None]:
# Make modules that live one level above the notebook's working directory
# importable: register the parent directory both as a relative entry and as
# an absolute path derived from the current working directory.
import sys

sys.path.extend([
    '../',
    os.path.dirname(os.path.abspath(os.getcwd())),
])

# Data Loading

In [None]:
import polars as pl
from datetime import datetime

def load_nyc_311_data(file_path) -> "pl.LazyFrame":
    """
    Open the NYC 311 CSV as a lazy Polars scan and print a short report.

    The report consists of the resolved schema, the total number of rows
    and the elapsed time. Counting the rows runs a query over the file,
    so this call is not free even though the returned frame is lazy.

    Parameters:
    -----------
    file_path : str or path-like
        Location of the CSV file to scan

    Returns:
    --------
    pl.LazyFrame
        Lazy scan of the CSV with explicit dtypes for the known columns
    """
    print(f"Loading data from {file_path}...")
    started_at = datetime.now()

    # Known columns grouped by the dtype they should be read as
    datetime_columns = [
        "created_date",
        "closed_date",
        "due_date",
        "resolution_action_updated_date",
    ]
    categorical_columns = [
        "agency",
        "agency_name",
        "complaint_type",
        "descriptor",
        "status",
        "location_type",
        "address_type",
        "community_board",
        "borough",
        "open_data_channel_type",
    ]
    float_columns = [
        "x_coordinate_state_plane",
        "y_coordinate_state_plane",
        "latitude",
        "longitude",
    ]
    string_columns = [
        "unique_key",
        "incident_zip",
        "incident_address",
        "street_name",
        "cross_street_1",
        "cross_street_2",
        "intersection_street_1",
        "intersection_street_2",
        "city",
        "landmark",
        "facility_type",
        "bbl",
        "location",
        "park_facility_name",
        "park_borough",
        "vehicle_type",
        "taxi_company_borough",
        "taxi_pick_up_location",
        "bridge_highway_name",
        "bridge_highway_direction",
        "road_ramp",
        "bridge_highway_segment",
    ]

    # Overrides are matched to CSV columns by name
    dtype_by_column = {
        **dict.fromkeys(datetime_columns, pl.Datetime),
        **dict.fromkeys(categorical_columns, pl.Categorical),
        **dict.fromkeys(float_columns, pl.Float64),
        **dict.fromkeys(string_columns, pl.Utf8),
    }

    # These placeholder strings are read as nulls; ignore_errors=True is
    # passed so the reader does not abort on values it cannot parse.
    lazy_frame = pl.scan_csv(
        file_path,
        schema_overrides=dtype_by_column,
        null_values=["N/A", "Unknown", "Unkno", "", "null"],
        infer_schema_length=1000000,
        ignore_errors=True
    )

    print("Data schema:")
    for column_name, column_dtype in lazy_frame.collect_schema().items():
        print(f"{column_name}: {column_dtype}")

    # Single-cell result of the count query
    total_rows = lazy_frame.select(pl.len()).collect().item()
    print(f"Total rows: {total_rows:,}")

    elapsed = datetime.now() - started_at
    print(f"Data loaded in {elapsed.total_seconds():.2f} seconds")

    return lazy_frame

In [None]:
# Path to the raw NYC 311 CSV export, relative to the notebook's working
# directory - adjust as needed
data_file = "../NYC_311_Data/NYC_311_complete.csv"

# Open the CSV as a lazy frame; the loader also prints the schema, the total
# row count and the elapsed time
df_lazy = load_nyc_311_data(data_file)

In [None]:
# Sanity check: materialize only the first five rows of the lazy frame
# and render them in the notebook
print("Preview of data:")
df_sample = df_lazy.head(5).collect()
display(df_sample)

# Data Cleaning - Keep only relevant columns; drop rows with null closed dates, invalid response times, or missing key features

In [None]:
import polars as pl

def clean_nyc_311_data(df_lazy: "pl.LazyFrame", *, max_response_hours: float = 8760) -> "pl.LazyFrame":
    """
    Clean NYC 311 data by calculating response time, filtering invalid rows,
    and selecting relevant columns for prediction.

    The response time is ``closed_date - created_date`` expressed in hours.
    A row is kept only when all of the following hold:

    * ``closed_date`` is present and the response time could be computed
    * the response time lies in ``[0, max_response_hours]``
    * ``agency``, ``complaint_type`` and ``borough`` are not null

    All operations are lazy; nothing is executed until the returned frame
    is collected.

    Parameters:
    -----------
    df_lazy : pl.LazyFrame
        Loaded NYC 311 data
    max_response_hours : float, optional
        Upper bound (inclusive) on the response time in hours. Defaults to
        8760, i.e. 365 days * 24 hours.

    Returns:
    --------
    pl.LazyFrame
        Cleaned data ready for response time prediction, containing the
        feature columns plus the ``response_time_hours`` target
    """
    # Target variable: elapsed time between creation and closure, in hours.
    # Built once here; only this unit is part of the selected output.
    response_time_hours = (
        (pl.col('closed_date') - pl.col('created_date')).dt.total_seconds() / 3600
    ).alias('response_time_hours')

    # A row must satisfy every one of these conditions to be kept
    is_valid_row = (
        pl.col('response_time_hours').is_not_null()                 # Both dates are usable
        & (pl.col('response_time_hours') >= 0)                      # No negative response times
        & (pl.col('response_time_hours') <= max_response_hours)     # Cap on response time
        & pl.col('agency').is_not_null()                            # Key feature
        & pl.col('complaint_type').is_not_null()                    # Key feature
        & pl.col('borough').is_not_null()                           # Key feature
    )

    # Columns retained for prediction
    columns_to_keep = [
        'created_date',              # For temporal features
        'agency',                    # Responding agency
        'complaint_type',            # Type of complaint
        'descriptor',                # Additional detail
        'location_type',             # Type of location
        'incident_zip',              # Zip code
        'borough',                   # Borough
        'x_coordinate_state_plane',  # Coordinate
        'y_coordinate_state_plane',  # Coordinate
        'open_data_channel_type',    # Submission channel
        'latitude',                  # Geographic coordinate
        'longitude',                 # Geographic coordinate
        'community_board',           # Local governance area
        'response_time_hours',       # Target variable
    ]

    return (
        df_lazy
        # Requests that were never closed have no response time to learn from
        .filter(pl.col('closed_date').is_not_null())
        .with_columns(response_time_hours)
        .filter(is_valid_row)
        .select(columns_to_keep)
    )

# Build the lazy cleaning pipeline on top of the loaded data; the query is
# only defined here and is executed when the frame is collected
df_cleaned_lazy = clean_nyc_311_data(df_lazy)


In [None]:
# Execute the lazy cleaning query and materialize the result as a DataFrame
df_full = df_cleaned_lazy.collect()

In [None]:
# Persist the cleaned dataset as Parquet. The output directory is created
# first (no error if it already exists) so the write does not fail on a
# fresh checkout where "outputs/" has not been created yet.
output_path = "../NYC_311_Data/outputs/nyc_311_cleaned.parquet"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_full.write_parquet(output_path)