In [46]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import (
    StructType, StructField,
    StringType, 
    IntegerType, LongType, FloatType, DoubleType, DecimalType,
    BooleanType, 
    DateType, TimestampType,
    ArrayType, MapType, StructType,
    BinaryType, ByteType, ShortType,
)
from typing import Optional

In [47]:
spark = (
    SparkSession.builder
    .appName("Testing")
    .config("spark.driver.extraJavaOptions", "--add-opens=java.base/javax.security.auth=ALL-UNNAMED --enable-native-access=ALL-UNNAMED")
    .config("spark.executor.extraJavaOptions", "--add-opens=java.base/javax.security.auth=ALL-UNNAMED --enable-native-access=ALL-UNNAMED")
    .getOrCreate()
)

In [48]:
df = pd.read_csv(
    filepath_or_buffer="dataset/NYC Accidents 2020.csv"
)

In [49]:
df.head()

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,2020-08-29,15:40:00,BRONX,10466.0,40.8921,-73.83376,POINT (-73.83376 40.8921),PRATT AVENUE,STRANG AVENUE,,...,Unspecified,,,,4342908,Sedan,Station Wagon/Sport Utility Vehicle,,,
1,2020-08-29,21:00:00,BROOKLYN,11221.0,40.6905,-73.919914,POINT (-73.919914 40.6905),BUSHWICK AVENUE,PALMETTO STREET,,...,Unspecified,,,,4343555,Sedan,Sedan,,,
2,2020-08-29,18:20:00,,,40.8165,-73.946556,POINT (-73.946556 40.8165),8 AVENUE,,,...,,,,,4343142,Station Wagon/Sport Utility Vehicle,,,,
3,2020-08-29,00:00:00,BRONX,10459.0,40.82472,-73.89296,POINT (-73.89296 40.82472),,,1047 SIMPSON STREET,...,Unspecified,Unspecified,Unspecified,,4343588,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,Sedan,Motorcycle,
4,2020-08-29,17:10:00,BROOKLYN,11203.0,40.64989,-73.93389,POINT (-73.93389 40.64989),,,4609 SNYDER AVENUE,...,Unspecified,,,,4342953,Sedan,Sedan,,,


In [50]:
# I am not going to use these features, thus I am dropping them.
df = df.drop(["ZIP CODE", "LONGITUDE", "LATITUDE", "LOCATION"], axis=1)

In [51]:
df.head()

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,2020-08-29,15:40:00,BRONX,PRATT AVENUE,STRANG AVENUE,,0,0,0,0,...,Unspecified,,,,4342908,Sedan,Station Wagon/Sport Utility Vehicle,,,
1,2020-08-29,21:00:00,BROOKLYN,BUSHWICK AVENUE,PALMETTO STREET,,2,0,0,0,...,Unspecified,,,,4343555,Sedan,Sedan,,,
2,2020-08-29,18:20:00,,8 AVENUE,,,1,0,1,0,...,,,,,4343142,Station Wagon/Sport Utility Vehicle,,,,
3,2020-08-29,00:00:00,BRONX,,,1047 SIMPSON STREET,0,0,0,0,...,Unspecified,Unspecified,Unspecified,,4343588,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,Sedan,Motorcycle,
4,2020-08-29,17:10:00,BROOKLYN,,,4609 SNYDER AVENUE,0,0,0,0,...,Unspecified,,,,4342953,Sedan,Sedan,,,


In [52]:
# COLLISION_ID is our unique values containing feature.
df["COLLISION_ID"].unique()

array([4342908, 4343555, 4343142, ..., 4269230, 4267482, 4268376],
      shape=(74881,))

### Fixing values of car types.

In [53]:
# Fixing misspellings and standardize to proper case
misspellings = {
    'AMBULENCE': 'Ambulance',   # "ENCE" -> "ANCE"
    'Ambulance': 'Ambulance',   # Already correct case
    'GEN  AMBUL': 'Ambulance',  # Abbreviation
    'abulance': 'Ambulance',    # Missing "m"
    'ambulance': 'Ambulance',   # Just capitalized
    'AMB': 'Ambulance',         # Abbreviation
    'AMBU': 'Ambulance',        # Abbreviation
    'Amb': 'Ambulance',         # Mixed case
    'AMBULANCE': 'Ambulance',   # ALL CAPS to proper case
}

# Standardize capitalization - all to proper case (first letter capital)
capitalization_fixes = {
    # Ambulance variations
    'FDNY Ambul': 'Ambulance',
    'FDNY AMBUL': 'Ambulance',
    'Fdny ambul': 'Ambulance',
    'NYC AMBULA': 'Ambulance',
    'NYS AMBULA': 'Ambulance',
    'White ambu': 'Ambulance',
    
    # Fire Truck variations
    'FDNY fire': 'Fire Truck',
    'FDNY FIRET': 'Fire Truck',
    'FDNY TRUCK': 'Fire Truck',
    'FDNY FIRE': 'Fire Truck',
    'FDNY Engin': 'Fire Truck',
    'FDNY ENGIN': 'Fire Truck',
    'Fire Truck': 'Fire Truck',
    'Fire truck': 'Fire Truck',
    'fire truck': 'Fire Truck',
    'Firetruck': 'Fire Truck',
    'FIRETRUCK': 'Fire Truck',
    'FIRE TRUCK': 'Fire Truck',
    'FIRE ENGIN': 'Fire Truck',
    
    # Box Truck
    'BOX TRUCK': 'Box Truck',
    'box truck': 'Box Truck',
    'Box Truck': 'Box Truck',
    
    # Pick-up Truck
    'Pick up Tr': 'Pick-up Truck',
    'PICK-UP TR': 'Pick-up Truck',
    'PICK UP TR': 'Pick-up Truck',
    'Pick up': 'Pick-up Truck',
    'Pickup with mounted Camper': 'Pick-up Truck',
    'PICKUP TRU': 'Pick-up Truck',
    'PICK UP': 'Pick-up Truck',
    'Pick-up Truck': 'Pick-up Truck',
    
    # Van
    'WORK VAN': 'Van',
    'Work van': 'Van',
    'Work Van': 'Van',
    'TRUCK VAN': 'Van',
    'van': 'Van',
    'DELIVERY V': 'Van',
    'delivery v': 'Van',
    'CARGO VAN': 'Van',
    'Cargo Van': 'Van',
    'Van': 'Van',
    
    # Dump Truck
    'Dump': 'Dump Truck',
    'DUMP': 'Dump Truck',
    'dump truck': 'Dump Truck',
    'Dump truck': 'Dump Truck',
    'Dump Truck': 'Dump Truck',
    
    # Tractor
    'Tractor tr': 'Tractor Truck',
    'Tractor Tr': 'Tractor Truck',
    'tractor tr': 'Tractor Truck',
    'TRACTOR': 'Tractor Truck',
    'Tractor': 'Tractor Truck',
    'Tractor Truck Diesel': 'Tractor Truck',
    'Tractor Truck Gasoline': 'Tractor Truck',
    'Tractor tr': 'Tractor Truck',
    'Tractor Truck': 'Tractor Truck',
    
    # Motorcycle variations
    'Motorscooter': 'Motorcycle',
    'MOTORSCOOT': 'Motorcycle',
    'MOTOR SCOO': 'Motorcycle',
    'MOPED': 'Motorcycle',
    'moped': 'Motorcycle',
    'Motorbike': 'Motorcycle',
    'MOTORSCOOTER': 'Motorcycle',
    'Motorscooter': 'Motorcycle',
    'Motorcycle': 'Motorcycle',
    
    # Scooter
    'SCOOTER': 'Scooter',
    'E REVEL SC': 'E-scooter',
    'PUSH SCOOT': 'Scooter',
    'Scooter': 'Scooter',
    
    # Sedan
    '4 dr sedan': 'Sedan',
    '2 dr sedan': 'Sedan',
    '3-Door': 'Sedan',
    'Sedan': 'Sedan',
    
    # E-Bike
    'E-BIKE': 'E-bike',
    'E-Bik': 'E-bike',
    'E bike': 'E-bike',
    'E-Bike': 'E-bike',
    
    # Trailer
    'trailer': 'Trailer',
    'TRAILER': 'Trailer',
    'TRAIL': 'Trailer',
    'TRL': 'Trailer',
    'TR-Trailer': 'Trailer',
    'Trailer': 'Trailer',
    
    # Tow Truck
    'tow truck': 'Tow Truck',
    'TOW TRUCK': 'Tow Truck',
    'Tow truck': 'Tow Truck',
    'Tow Truck': 'Tow Truck',
    'Tow Truck / Wrecker': 'Tow Truck',
    
    # USPS/Mail
    'USPS VAN': 'USPS',
    'USPS TRUCK': 'USPS',
    'USPS POSTA': 'USPS',
    'USPS #7530': 'USPS',
    'postal tru': 'USPS',
    'postal bus': 'USPS',
    'POSTAL TRU': 'USPS',
    'MAIL TRUCK': 'USPS',
    'US POSTAL': 'USPS',
    'postal ser': 'USPS',
    'USPS': 'USPS',
    
    # Delivery
    'DELIVERY': 'Delivery',
    'DELIVERY T': 'Delivery',
    'DELIVERY V': 'Delivery',
    'delviery': 'Delivery',
    'Delv': 'Delivery',
    'DELV': 'Delivery',
    'Delivery': 'Delivery',
    
    # Commercial
    'COM': 'Commercial',
    'com': 'Commercial',
    'commercial': 'Commercial',
    'COM TRANS': 'Commercial',
    'COMMERCIAL': 'Commercial',
    
    # Utility
    'UTIL': 'Utility',
    'UTILITY VE': 'Utility',
    'UT': 'Utility',
    'UTILITY': 'Utility',
    'UTILITY TR': 'Utility',
    'UTILITY.': 'Utility',
    
    # Truck (generic)
    'TRUCK': 'Truck',
    'truck': 'Truck',
    'TRK': 'Truck',
    'Trc': 'Truck',
    'Tr': 'Truck',
    'Truck': 'Truck',
    
    # Garbage
    'GARBAGE TR': 'Garbage Truck',
    'Garbage or Refuse': 'Garbage Truck',
    'Garbage Truck': 'Garbage Truck',
    
    # Freight
    'FREIGHT FL': 'Freight',
    'FREIGHT TR': 'Freight',
    'FREIG': 'Freight',
    'FREIG DELV': 'Freight',
    'FREIGHTLIN': 'Freight',
    'Freight': 'Freight',
    
    # Flat Bed
    'Flat Bed': 'Flat Bed',
    'Flat Rack': 'Flat Bed',
    'FLATBED': 'Flat Bed',
    'TRUCK FLAT': 'Flat Bed',
    
    # Forklift
    'Fork lift': 'Forklift',
    'FORK LIFT': 'Forklift',
    'FORKLIFT': 'Forklift',
    'forklift': 'Forklift',
    
    # Convertible
    'Convertible': 'Convertible',
    'CONVERTIBLE': 'Convertible',
    
    # Golf Cart
    'GOLF CART': 'Golf Cart',
    'Golf Cart': 'Golf Cart',
    
    # Suburban
    'suburban': 'Suburban',
    'SUBN WHI': 'Suburban',
    'SUBURBAN': 'Suburban',
    
    # Bike
    'Bike': 'Bike',
    'BIKE': 'Bike',
    
    # Taxi
    'Taxi': 'Taxi',
    'TAXI': 'Taxi',
    
    # Bus
    'Bus': 'Bus',
    'BUS': 'Bus',
    
    # Station Wagon
    'Station Wagon/Sport Utility Vehicle': 'Station Wagon',
    
    # Convertible
    'Convertible': 'Convertible',
    
    # Bike
    'Bike': 'Bike',
    
    # E-scooter
    'E-Scooter': 'E-scooter',
    
    # Lawnmower
    'Lawnmower': 'Lawnmower',
    
    # Concrete Mixer
    'Concrete Mixer': 'Concrete Mixer',
    
    # Refrigerated Van
    'Refrigerated Van': 'Refrigerated Van',
    
    # Armored Truck
    'Armored Truck': 'Armored Truck',
    
    # Tanker
    'Tanker': 'Tanker',
    
    # Beverage Truck
    'Beverage Truck': 'Beverage Truck',
    
    # Forklift
    'Forklift': 'Forklift',
    
    # Go kart
    'Go kart': 'Go Kart',
    
    # Camper
    'Van Camper': 'Camper',
    
    # Backhoe
    'backhoe': 'Backhoe',
    'BACK HOE': 'Backhoe',
    'BACKHOE': 'Backhoe',
    
    # Bobcat
    'Bobcat': 'Bobcat',
    'BOBCAT FOR': 'Bobcat',
    
    # Snow Plow
    'Snow Plow': 'Snow Plow',
    
    # Hearse
    'Hearse': 'Hearse',
}

# Expanding abbreviations, otherwise no one gonna understand them...
abbreviations = {
    'PK': 'Pickup',
    'PSD': 'Public Safety Vehicle',
    'FDNY': 'Fire Truck',
    'EMS': 'Ambulance',
    'MTA': 'Bus',
    'USPS': 'Mail Truck',
    'FDNY #226': 'Fire Truck',
    'NYC FD': 'Fire Truck',
    'FDNY EMT': 'Ambulance',
    'FDNY LADDE': 'Fire Truck',
    'ESU RESCUE': 'Rescue Vehicle',
    'UNK': 'Unknown',
    'UNKNOWN': 'Unknown',
    'UNKN': 'Unknown',
    'Unknown': 'Unknown',
    'OTH': 'Other',
    'OTHER': 'Other',
    'TRAC': 'Tractor',
    'SWT': 'Station Wagon',
    'LIMO': 'Limousine',
    'PK': 'Pickup',
    'PC': 'Passenger Car',
    'HRSE': 'Horse',
    'H1': 'Hummer H1',
    'J1': 'Jeep',
    '1C': 'One Car',
    'SE': 'Special Equipment',
    'OMS': 'Office of Management Services',
    'OMR': 'Other Motorized Road',
    'LCOMM': 'Light Commercial',
}

In [54]:
VEHICLE_TYPE_COLUMNS = [
    "VEHICLE TYPE CODE 1",
    "VEHICLE TYPE CODE 2", 
    "VEHICLE TYPE CODE 3",
    "VEHICLE TYPE CODE 4",
    "VEHICLE TYPE CODE 5"
]

def clean_vehicle_type_columns(
        df: pd.DataFrame,
        misspellings: dict,
        capitalization_fixes: dict,
        abbreviations: dict
) -> pd.DataFrame:
    """
    Clean vehicle type columns by applying multiple replacement dictionaries.
    
    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing vehicle type columns
    misspellings : dict
        Dictionary of misspelling corrections
    capitalization_fixes : dict
        Dictionary of capitalization standardizations
    abbreviations : dict
        Dictionary of abbreviation expansions
        
    Returns:
    --------
    pd.DataFrame
        DataFrame with cleaned vehicle type columns
    """

    # Creating copy to avoid modifying original df
    df_clean = df.copy()

    # Now, combining all replacement dictionaries into one
    # By the way, order matters: misspellings -> capitalization -> abbreviations
    all_replacements = {**misspellings, **capitalization_fixes, **abbreviations} 

    for column in df_clean.columns:
        if column in VEHICLE_TYPE_COLUMNS:
            df_clean[column] = df_clean[column].replace(all_replacements)

    return df_clean

In [55]:
df = clean_vehicle_type_columns(
    df=df,
    misspellings=misspellings,
    capitalization_fixes=capitalization_fixes,
    abbreviations=abbreviations
)

# NOTE: Not all type names are FIXED. I WILL DO IT LATER.

In [56]:
df["VEHICLE TYPE CODE 1"].unique() # Just to ensure that it works

array(['Sedan', 'Station Wagon', 'Bus', 'Pick-up Truck', 'Box Truck',
       'Taxi', 'Bike', 'Convertible', 'Pickup', 'Flat Bed', 'E-bike', nan,
       'Motorcycle', 'Ambulance', 'Dump Truck', 'Carry All',
       'Refrigerated Van', 'Van', 'Tractor Truck', 'E-scooter',
       'Tow Truck', 'Lawnmower', 'Armored Truck', 'Concrete Mixer',
       'Unknown', 'Golf Cart', 'Garbage Truck', 'Tanker',
       'Bulk Agriculture', 'Trailer', 'Tractor', 'Moped', 'COURIER',
       'Minibike', 'Public Safety Vehicle', 'Fire Truck', 'Limousine',
       'Multi-Wheeled Vehicle', 'Chassis Cab', 'Lift Boom', 'dilevery t',
       'DRILL RIG', 'Delivery', 'Pumper', 'Other', 'Stake or Rack',
       'Beverage Truck', 'Front-Load', 'government', 'LIGHT TRAI',
       'JOHN DEERE', 'Commercial', 'cross', 'Forklift', 'Go Kart',
       'Truck', 'Camper', 'Freight', 'Open Body', 'Scooter',
       'Livestock Rack', 'Utility', 'USPS', '18 WHEELER', 'FOOD TRUCK',
       'MOVING VAN', 'Backhoe', 'Suburban', 'Mail Truck

### Renaming Columns

In [57]:
RENAMING_RULES = {
    "CRASH DATE": "date",
    "CRASH TIME": "time",
    "NUMBER OF PERSONS INJURED": "persons_injured",
    "NUMBER OF PERSONS KILLED": "persons_killed",
    "NUMBER OF PEDESTRIANS INJURED": "pedestrians_injured",
    "NUMBER OF PEDESTRIANS KILLED": "pedestrians_killed",
    "VEHICLE TYPE CODE 1": "vehicle_type_1",
    "VEHICLE TYPE CODE 2": "vehicle_type_2", 
    "VEHICLE TYPE CODE 3": "vehicle_type_3",
    "VEHICLE TYPE CODE 4": "vehicle_type_4",
    "VEHICLE TYPE CODE 5": "vehicle_type_5",
    "COLLISION_ID": "collision_id"
}

df = df.rename(columns=RENAMING_RULES)

# Converting all remaining columns to lowercase with underscores (aka Snake Case)
for col in df.columns:
    if col not in RENAMING_RULES.values():  # Skiping already renamed columns
        new_name = col.lower().replace(' ', '_')
        df = df.rename(columns={col: new_name})

In [58]:
df.head()

Unnamed: 0,date,time,borough,on_street_name,cross_street_name,off_street_name,persons_injured,persons_killed,pedestrians_injured,pedestrians_killed,...,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,collision_id,vehicle_type_1,vehicle_type_2,vehicle_type_3,vehicle_type_4,vehicle_type_5
0,2020-08-29,15:40:00,BRONX,PRATT AVENUE,STRANG AVENUE,,0,0,0,0,...,Unspecified,,,,4342908,Sedan,Station Wagon,,,
1,2020-08-29,21:00:00,BROOKLYN,BUSHWICK AVENUE,PALMETTO STREET,,2,0,0,0,...,Unspecified,,,,4343555,Sedan,Sedan,,,
2,2020-08-29,18:20:00,,8 AVENUE,,,1,0,1,0,...,,,,,4343142,Station Wagon,,,,
3,2020-08-29,00:00:00,BRONX,,,1047 SIMPSON STREET,0,0,0,0,...,Unspecified,Unspecified,Unspecified,,4343588,Station Wagon,Station Wagon,Sedan,Motorcycle,
4,2020-08-29,17:10:00,BROOKLYN,,,4609 SNYDER AVENUE,0,0,0,0,...,Unspecified,,,,4342953,Sedan,Sedan,,,


In [59]:
# Extracting hour. Will need this one in the future. 
df["hour"] = df["time"].astype(str).str.split(":").str[0]

In [60]:
# Exporting cleaned DataFrame
df.to_csv(path_or_buf="nyc_traffic_processed.csv", index=False)

In [61]:
# Now, it is time FOR APACHE SPARK
# Schema with proper types
schema = StructType([
    StructField("date", TimestampType(), True),
    StructField("time", TimestampType(), True),
    StructField("borough", StringType(), True),
    StructField("on_street_name", StringType(), True),
    StructField("cross_street_name", StringType(), True),
    StructField("off_street_name", StringType(), True),
    StructField("persons_injured", IntegerType(), True),  # Should be integer
    StructField("persons_killed", IntegerType(), True),   # Should be integer
    StructField("pedestrians_injured", IntegerType(), True),
    StructField("pedestrians_killed", IntegerType(), True),
    StructField("number_of_cyclist_injured", IntegerType(), True),
    StructField("number_of_cyclist_killed", IntegerType(), True),
    StructField("number_of_motorist_injured", IntegerType(), True),
    StructField("number_of_motorist_killed", IntegerType(), True),
    StructField("contributing_factor_vehicle_1", StringType(), True),
    StructField("contributing_factor_vehicle_2", StringType(), True),
    StructField("contributing_factor_vehicle_3", StringType(), True),
    StructField("contributing_factor_vehicle_4", StringType(), True),
    StructField("contributing_factor_vehicle_5", StringType(), True),
    StructField("collision_id", StringType(), True),
    StructField("vehicle_type_1", StringType(), True),
    StructField("vehicle_type_2", StringType(), True),
    StructField("vehicle_type_3", StringType(), True),
    StructField("vehicle_type_4", StringType(), True),
    StructField("vehicle_type_5", StringType(), True),
    StructField("hour", IntegerType(), True),              # Should be integer 0-23
])

In [62]:
df = spark.read.csv("nyc_traffic_processed.csv", header=True, schema=schema)

# Defining temprary view to use it for SQL Queries. 
df.createOrReplaceTempView("accidents_table")

## Analysis

### Basic Selection of Locations

In [63]:
def select_crashes(borough: str, limit: int = 100) -> Optional[DataFrame]:
    """
    Select crash data for a specific borough.
    
    Parameters:
    -----------
    borough : str
        Name of the borough to filter by
    limit : int, default=100
        Maximum number of rows to return
        
    Returns:
    --------
    DataFrame or None
        Filtered DataFrame or None if error occurs
    """
    
    # Input validation. 
    if not borough or not isinstance(borough, str):
        print("Error: Borough must be a non-empty string")
        return None

    # After INPUT VALIDATION, cleaning and standardizing input
    borough_standard = borough.strip().upper()

    # Valid boroughs check
    valid_boroughs = {"MANHATTAN", "BROOKLYN", "QUEENS", "BRONX", "STATEN ISLAND"}
    if borough_standard not in valid_boroughs:
        print(f"Warning: '{borough_standard}' may not be a valid borough")

    try:
        result = spark.sql(
            f"""
            SELECT *
            FROM accidents_table
            WHERE borough = '{borough}'
            LIMIT {limit}
            """,
        )

    except Exception as e:
        print(f"Bro, something bad happened: {e}")
        print("Returning None")
        return None

    return result

In [64]:
result = select_crashes(
    borough="BROOKLYN",
    limit=10
)   # User can pass whatever borough 
    # he needs to analyze. 

result.show(truncate=False)


+-------------------+-------------------+--------+------------------------+-----------------+------------------+---------------+--------------+-------------------+------------------+-------------------------+------------------------+--------------------------+-------------------------+------------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+------------+--------------+--------------+--------------+--------------+--------------+----+
|date               |time               |borough |on_street_name          |cross_street_name|off_street_name   |persons_injured|persons_killed|pedestrians_injured|pedestrians_killed|number_of_cyclist_injured|number_of_cyclist_killed|number_of_motorist_injured|number_of_motorist_killed|contributing_factor_vehicle_1 |contributing_factor_vehicle_2|contributing_factor_vehicle_3|contributing_factor_vehicle_4|contributing_factor_vehicle_5|collision_id|vehicle_type_1|vehi

### Injured People

In [65]:
df.printSchema()

root
 |-- date: timestamp (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- borough: string (nullable = true)
 |-- on_street_name: string (nullable = true)
 |-- cross_street_name: string (nullable = true)
 |-- off_street_name: string (nullable = true)
 |-- persons_injured: integer (nullable = true)
 |-- persons_killed: integer (nullable = true)
 |-- pedestrians_injured: integer (nullable = true)
 |-- pedestrians_killed: integer (nullable = true)
 |-- number_of_cyclist_injured: integer (nullable = true)
 |-- number_of_cyclist_killed: integer (nullable = true)
 |-- number_of_motorist_injured: integer (nullable = true)
 |-- number_of_motorist_killed: integer (nullable = true)
 |-- contributing_factor_vehicle_1: string (nullable = true)
 |-- contributing_factor_vehicle_2: string (nullable = true)
 |-- contributing_factor_vehicle_3: string (nullable = true)
 |-- contributing_factor_vehicle_4: string (nullable = true)
 |-- contributing_factor_vehicle_5: string (nullable = true)
 

In [66]:
def someone_injured(injured_people: int, limit: int = 100) -> Optional[DataFrame]:
    """
    Select crash data with a specific number of injured people.
    
    Parameters:
    -----------
    injured_people : int
        Number of people who was injured
    limit : int, default=100
        Maximum number of rows to return
        
    Returns:
    --------
    DataFrame or None
        Filtered DataFrame or None if error occurs
    """
    
    # Input validation. 
    if not injured_people or not isinstance(injured_people, int):
        print("Error: Number of injured people must be non-empty and Integer")
        return None

    try:
        result = spark.sql(
            f"""
            SELECT *
            FROM accidents_table
            WHERE 
                persons_injured >= {injured_people}
                OR pedestrians_injured >= {injured_people}
                OR number_of_cyclist_injured >= {injured_people}
                OR number_of_motorist_injured >= {injured_people}
            LIMIT {limit}
            """
        )

    except Exception as e:
        print(f"Bro, something bad happened: {e}")
        print("Returning None")
        return None

    return result

In [67]:
result = someone_injured(
    injured_people=2,
    limit=20
)

result.show()

+-------------------+-------------------+-------------+--------------------+--------------------+------------------+---------------+--------------+-------------------+------------------+-------------------------+------------------------+--------------------------+-------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+------------+--------------+--------------+--------------+--------------+--------------+----+
|               date|               time|      borough|      on_street_name|   cross_street_name|   off_street_name|persons_injured|persons_killed|pedestrians_injured|pedestrians_killed|number_of_cyclist_injured|number_of_cyclist_killed|number_of_motorist_injured|number_of_motorist_killed|contributing_factor_vehicle_1|contributing_factor_vehicle_2|contributing_factor_vehicle_3|contributing_factor_vehicle_4|contributing_factor_vehicle_5|collision_id|vehicle_type_

### Contributing Factors (aka Multi Column) Analysis

In [None]:
def contibuting_factors(factor: str) -> Optional[DataFrame]:
    """
    Select crash data with a specific number of injured people.
    
    Parameters:
    -----------
    factor : str
        Contributing Factor
        
    Returns:
    --------
    DataFrame or None
        Filtered DataFrame or None if error occurs
    """
    # Input validation. 
    if not factor or not isinstance(factor, str):
        print("Error: Contributing Factor must be non-empty and String")
        return None

    try:
        result = spark.sql(
            f"""
            SELECT 
                COUNT(collision_id) AS number_of_crashes
            FROM accidents_table
            WHERE 
                contributing_factor_vehicle_1 = "{factor}"
                OR
                contributing_factor_vehicle_2 = "{factor}"
                OR
                contributing_factor_vehicle_3 = "{factor}"
                OR
                contributing_factor_vehicle_4 = "{factor}"
                OR
                contributing_factor_vehicle_5 = "{factor}"
            """
        )
        
    except Exception as e:
        print(f"Bro, something bad happened: {e}")
        print("Returning None")
        return None

    return result

In [77]:
result = contibuting_factors(
    factor="Alcohol Involvement", 
)

result.show()

+-----------------+
|number_of_crashes|
+-----------------+
|             1019|
+-----------------+



### Crashed After Specific Time

In [100]:
def crashed_after_time(start_hour: int, limit: int = 100) -> Optional[DataFrame]:
    """
    Retrieve crash data for incidents occurring at or after a specified hour.
    
    This function filters accident records based on the hour of occurrence,
    returning crashes that happened at the specified hour or later in the day.
    
    Parameters:
    -----------
    start_hour : int
        The starting hour (0-23) to filter crashes. 
        Only crashes occurring at this hour or later will be returned.
    limit : int, optional
        Maximum number of rows to return (default: 100).
        Use to prevent memory issues with large result sets.
        
    Returns:
    --------
    Optional[DataFrame]
        DataFrame containing crash records filtered by hour, or None if:
        - Input validation fails
        - An error occurs during query execution
        
    Raises:
    -------
    This function handles exceptions internally and returns None on error.
    
    Examples:
    ---------
    >>> # Get crashes from 6 PM (18:00) onwards
    >>> evening_crashes = get_crashes_after_hour(18)
    
    >>> # Get first 50 crashes after midnight
    >>> night_crashes = get_crashes_after_hour(0, limit=50)
    
    Notes:
    ------
    - Hours should be provided in 24-hour format (0-23)
    - The function performs a case-insensitive filter on the hour column
    """
    
    # --- INPUT VALIDATION ---
    if start_hour is None:
        print("Error: Hour parameter cannot be None")
        return None
    
    if not isinstance(start_hour, int):
        print(f"Error: Hour must be an integer, got {type(start_hour).__name__}")
        return None
    
    # Validate hour range (0-23 for 24-hour format)
    if not (0 <= start_hour <= 23):
        print(f"Error: Hour must be between 0 and 23, got {start_hour}")
        return None
    
    if not isinstance(limit, int) or limit <= 0:
        print(f"Error: Limit must be a positive integer, got {limit}")
        return None
    
    # --- QUERY EXECUTION ---
    try:
        result = spark.sql(
            f"""
            SELECT 
                  date,
                  time,
                  borough,
                  on_street_name
            FROM accidents_table
            WHERE hour >= {start_hour}
            ORDER BY date, time
            LIMIT {limit}
            """
        )

    except Exception as e:
        print(f"Bro, something bad happened: {e}")
        print("Returning None")
        return None

    return result

In [101]:
result = crashed_after_time(
    start_hour=21,
    limit=20
)

result.show(truncate=False)

+-------------------+-------------------+---------+------------------------+
|date               |time               |borough  |on_street_name          |
+-------------------+-------------------+---------+------------------------+
|2020-01-01 00:00:00|2025-12-08 21:00:00|NULL     |2 avenue                |
|2020-01-01 00:00:00|2025-12-08 21:00:00|NULL     |MYRTLE AVENUE           |
|2020-01-01 00:00:00|2025-12-08 21:23:00|BRONX    |HULL AVENUE             |
|2020-01-01 00:00:00|2025-12-08 21:30:00|QUEENS   |WOODHAVEN BOULEVARD     |
|2020-01-01 00:00:00|2025-12-08 21:40:00|BROOKLYN |BOGART STREET           |
|2020-01-01 00:00:00|2025-12-08 21:40:00|QUEENS   |NULL                    |
|2020-01-01 00:00:00|2025-12-08 21:45:00|NULL     |NULL                    |
|2020-01-01 00:00:00|2025-12-08 21:45:00|NULL     |STEINWAY AVENUE         |
|2020-01-01 00:00:00|2025-12-08 22:00:00|NULL     |GERARD AVENUE           |
|2020-01-01 00:00:00|2025-12-08 22:00:00|BRONX    |VANCORTLANDT AVENUE EAST|

### Aggregation Tasks

In [None]:
# Counting how many crashes happened in each borough
spark.sql(
    """
    select borough, count(collision_id)
    from accidents_table
    group by borough
    """
).show()

+-------------+-------------------+
|      borough|count(collision_id)|
+-------------+-------------------+
|         NULL|              25741|
|       QUEENS|              14017|
|     BROOKLYN|              16907|
|        BRONX|               9417|
|    MANHATTAN|               7353|
|STATEN ISLAND|               1446|
+-------------+-------------------+



In [None]:
# Finding the number of persons killed per borough.
spark.sql(
    """
    select borough, 
        sum(persons_killed) as killed_persons 
    from accidents_table
    group by borough
    """
).show()

+-------------+--------------+
|      borough|killed_persons|
+-------------+--------------+
|         NULL|            72|
|       QUEENS|            20|
|     BROOKLYN|            27|
|        BRONX|            10|
|    MANHATTAN|             9|
|STATEN ISLAND|             6|
+-------------+--------------+



In [None]:
# Finding the top 5 streets with the highest number of crashes.
spark.sql(
    """
    SELECT on_street_name, 
        COUNT(collision_id) AS number_of_crashes 
    FROM accidents_table
    WHERE on_street_name IS NOT NULL
    GROUP BY on_street_name
    ORDER BY number_of_crashes DESC
    LIMIT 5
    """
).show(truncate=False)

+--------------------------+-----------------+
|on_street_name            |number_of_crashes|
+--------------------------+-----------------+
|BELT PARKWAY              |1241             |
|LONG ISLAND EXPRESSWAY    |745              |
|BROOKLYN QUEENS EXPRESSWAY|738              |
|FDR DRIVE                 |728              |
|MAJOR DEEGAN EXPRESSWAY   |591              |
+--------------------------+-----------------+



In [None]:
# Finding all streets that had more than 100 crashes.
spark.sql(
    """
    SELECT on_street_name, 
        COUNT(collision_id) AS number_of_crashes 
    FROM accidents_table
    WHERE 
        on_street_name IS NOT NULL
    GROUP BY on_street_name
        HAVING number_of_crashes > 100
    """
).show(truncate=False)

+-----------------------+-----------------+
|on_street_name         |number_of_crashes|
+-----------------------+-----------------+
|WOODHAVEN BOULEVARD    |177              |
|MYRTLE AVENUE          |165              |
|FLUSHING AVENUE        |111              |
|EAST 149 STREET        |115              |
|BEDFORD AVENUE         |191              |
|FULTON STREET          |177              |
|FLATLANDS AVENUE       |115              |
|PITKIN AVENUE          |102              |
|BROADWAY               |575              |
|ROCKAWAY BOULEVARD     |225              |
|MAJOR DEEGAN EXPRESSWAY|591              |
|CHURCH AVENUE          |127              |
|BRUCKNER EXPRESSWAY    |337              |
|CANAL STREET           |114              |
|BUSHWICK AVENUE        |216              |
|EAST FORDHAM ROAD      |137              |
|GRAND CENTRAL PARKWAY  |120              |
|WEST FORDHAM ROAD      |118              |
|EAST TREMONT AVENUE    |192              |
|HILLSIDE AVENUE        |166    

### Working with NULL Values

In [None]:
# Counting how many crashes have no borough information (BOROUGH IS NULL).
spark.sql(
    """
    SELECT borough, 
        COUNT(collision_id) AS number_of_crashes
    FROM accidents_table
    WHERE borough is NULL
    GROUP BY borough
    """
).show(truncate=False)

+-------+-----------------+
|borough|number_of_crashes|
+-------+-----------------+
|NULL   |25741            |
+-------+-----------------+



In [None]:
df.printSchema()

root
 |-- date: timestamp (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- borough: string (nullable = true)
 |-- on_street_name: string (nullable = true)
 |-- cross_street_name: string (nullable = true)
 |-- off_street_name: string (nullable = true)
 |-- persons_injured: integer (nullable = true)
 |-- persons_killed: integer (nullable = true)
 |-- pedestrians_injured: integer (nullable = true)
 |-- pedestrians_killed: integer (nullable = true)
 |-- number_of_cyclist_injured: integer (nullable = true)
 |-- number_of_cyclist_killed: integer (nullable = true)
 |-- number_of_motorist_injured: integer (nullable = true)
 |-- number_of_motorist_killed: integer (nullable = true)
 |-- contributing_factor_vehicle_1: string (nullable = true)
 |-- contributing_factor_vehicle_2: string (nullable = true)
 |-- contributing_factor_vehicle_3: string (nullable = true)
 |-- contributing_factor_vehicle_4: string (nullable = true)
 |-- contributing_factor_vehicle_5: string (nullable = true)
 

### CASE Statements

In [None]:
spark.sql(
    """
    SELECT 
        *,
        CASE 
            WHEN persons_killed > 0 
                OR pedestrians_killed > 0 
                OR number_of_cyclist_killed > 0 
                OR number_of_motorist_killed > 0
                THEN 'fatal'
            WHEN persons_injured > 0 
                OR pedestrians_injured > 0 
                OR number_of_cyclist_injured > 0 
                OR number_of_motorist_injured > 0
                THEN 'injury_only'
            ELSE 'property_damage'
        END AS crash_severity
    FROM accidents_table
    """
).show(truncate=False)

+-------------------+-------------------+--------+------------------------+-----------------------+-------------------+---------------+--------------+-------------------+------------------+-------------------------+------------------------+--------------------------+-------------------------+------------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+------------+-----------------------------------+-----------------------------------+-----------------------------------+--------------+--------------+----+---------------+
|date               |time               |borough |on_street_name          |cross_street_name      |off_street_name    |persons_injured|persons_killed|pedestrians_injured|pedestrians_killed|number_of_cyclist_injured|number_of_cyclist_killed|number_of_motorist_injured|number_of_motorist_killed|contributing_factor_vehicle_1 |contributing_factor_vehicle_2|contributing_factor_vehicle_3

### Ranking streets by number of crashes for each borough

In [113]:
spark.sql(
    """
    SELECT 
        *,
        DENSE_RANK() OVER(PARTITION BY borough ORDER BY number_of_crashes DESC) as rank
    FROM 
        (SELECT 
            borough,
            on_street_name,
            COUNT(collision_id) AS number_of_crashes
        FROM accidents_table
        WHERE 
            borough IS NOT NULL
            AND
            on_street_name IS NOT NULL
        GROUP BY borough, on_street_name
        ) AS t
    ORDER BY number_of_crashes DESC
    """
).show()

# NOTE: I have checked the result - it is correct, do not worry. After BRONX, there will be BROOKLYN

+---------+--------------------+-----------------+----+
|  borough|      on_street_name|number_of_crashes|rank|
+---------+--------------------+-----------------+----+
| BROOKLYN|     ATLANTIC AVENUE|              286|   1|
|MANHATTAN|            2 AVENUE|              202|   1|
|   QUEENS|  NORTHERN BOULEVARD|              198|   1|
|    BRONX|  BRUCKNER BOULEVARD|              189|   1|
|   QUEENS|    QUEENS BOULEVARD|              171|   2|
| BROOKLYN|    LINDEN BOULEVARD|              163|   2|
|   QUEENS|NORTH CONDUIT AVENUE|              161|   3|
|   QUEENS| WOODHAVEN BOULEVARD|              153|   4|
| BROOKLYN|     FLATBUSH AVENUE|              152|   3|
|MANHATTAN|            3 AVENUE|              151|   2|
|MANHATTAN|            BROADWAY|              148|   3|
|    BRONX| EAST TREMONT AVENUE|              144|   2|
|   QUEENS|SOUTH CONDUIT AVENUE|              143|   5|
|    BRONX|       JEROME AVENUE|              141|   3|
| BROOKLYN|       OCEAN PARKWAY|              12

### Running Total of Injuries

In [114]:
# Calculating the running total of injuries ordered by date for every borough.
spark.sql(
    """
    SELECT 
        borough,
        date,
        time,
        SUM(persons_injured + pedestrians_injured + 
            number_of_cyclist_injured + number_of_motorist_injured)
            OVER(
                PARTITION BY borough 
                ORDER BY date, time
                ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
            ) as running_total_of_injuries
    FROM accidents_table
    WHERE borough IS NOT NULL
    ORDER BY borough, date, time
    """
).show()

+-------+-------------------+-------------------+-------------------------+
|borough|               date|               time|running_total_of_injuries|
+-------+-------------------+-------------------+-------------------------+
|  BRONX|2020-01-01 00:00:00|2025-12-08 00:00:00|                        0|
|  BRONX|2020-01-01 00:00:00|2025-12-08 00:27:00|                        0|
|  BRONX|2020-01-01 00:00:00|2025-12-08 00:37:00|                        0|
|  BRONX|2020-01-01 00:00:00|2025-12-08 02:05:00|                        0|
|  BRONX|2020-01-01 00:00:00|2025-12-08 02:20:00|                        2|
|  BRONX|2020-01-01 00:00:00|2025-12-08 02:24:00|                        8|
|  BRONX|2020-01-01 00:00:00|2025-12-08 03:30:00|                        8|
|  BRONX|2020-01-01 00:00:00|2025-12-08 03:45:00|                        8|
|  BRONX|2020-01-01 00:00:00|2025-12-08 04:46:00|                        8|
|  BRONX|2020-01-01 00:00:00|2025-12-08 04:50:00|                       10|
|  BRONX|202

### Street Share of Crushes

In [115]:
# For each street, calculating the percentage of crashes it contributes within its borough.
spark.sql(
    """
    WITH BoroughTotal AS (
        SELECT
            borough, 
            COUNT(collision_id) AS total_number_of_crashes
        FROM accidents_table
        WHERE 
            borough IS NOT NULL 
        GROUP BY borough 
    )
    SELECT
        at.borough,    
        at.on_street_name,
        COUNT(collision_id) as number_of_crashes,
        bt.total_number_of_crashes,
        ROUND(
            (COUNT(collision_id) * 100 / bt.total_number_of_crashes)
            , 5) AS percentage 
    FROM accidents_table AS at
    FULL JOIN BoroughTotal as bt on at.borough = bt.borough
    WHERE 
        at.borough IS NOT NULL 
        AND
        at.on_street_name IS NOT NULL
    GROUP BY at.borough, at.on_street_name, bt.total_number_of_crashes
    """
).show(truncate=False)



+---------+------------------+-----------------+-----------------------+----------+
|borough  |on_street_name    |number_of_crashes|total_number_of_crashes|percentage|
+---------+------------------+-----------------+-----------------------+----------+
|MANHATTAN|WEST 178 STREET   |21               |7353                   |0.2856    |
|BROOKLYN |FLATLANDS AVENUE  |94               |16907                  |0.55598   |
|BRONX    |MACOMBS ROAD      |7                |9417                   |0.07433   |
|QUEENS   |134 STREET        |10               |14017                  |0.07134   |
|MANHATTAN|WADSWORTH AVENUE  |7                |7353                   |0.0952    |
|QUEENS   |MYRTLE AVENUE     |42               |14017                  |0.29964   |
|BROOKLYN |HANCOCK STREET    |11               |16907                  |0.06506   |
|QUEENS   |111 AVENUE        |24               |14017                  |0.17122   |
|BROOKLYN |EAST 26 STREET    |4                |16907                  |0.02

### TOP N Most Dangerous Hours

In [116]:
def top_n_most_dangerous_hours(n: int = 5, limit: int = 20) -> Optional[DataFrame]:
    """
    Identify the N hours with the highest number of crashes using window ranking.
    
    This function analyzes crash frequency by hour of day and ranks them from
    most dangerous (highest crash count) to least dangerous. It returns the
    top N hours with their crash statistics and ranking.
    
    Parameters:
    -----------
    n : int, optional
        Number of top dangerous hours to return (default: 5).
        Must be between 1 and 24 (inclusive).
    limit : int, optional
        Maximum number of detailed crash records to return per hour (default: 20).
        Use to prevent memory issues when examining individual crashes.
        
    Returns:
    --------
    Optional[DataFrame]
        DataFrame with the following structure:
        - hour: The hour of day (0-23)
        - rank: Danger ranking (1 = most dangerous)
        - number_of_crashes: Total crashes in that hour
    
    Raises:
    -------
    This function handles exceptions internally and returns None on error.
    
    Examples:
    ---------
    >>> # Get top 3 most dangerous hours with 10 crash details each
    >>> dangerous_hours = get_top_n_most_dangerous_hours(n=3, limit=10)
    
    >>> # Get top 5 most dangerous hours (default)
    >>> top_hours = get_top_n_most_dangerous_hours()
    
    Notes:
    ------
    - Ranking uses DENSE_RANK() to handle ties appropriately
    - Hours are in 24-hour format (0 = midnight to 11 PM, 23 = 11 PM)
    """
    
    # --- INPUT VALIDATION ---
    if n is None:
        print("Error: Parameter 'n' cannot be None")
        return None
    
    if not isinstance(n, int):
        print(f"Error: Parameter 'n' must be an integer, got {type(n).__name__}")
        return None
    
    # Validate n range (1-24 for hours in a day)
    if not (1 <= n <= 24):
        print(f"Error: Parameter 'n' must be between 1 and 24, got {n}")
        return None
    
    if not isinstance(limit, int) or limit <= 0:
        print(f"Error: Parameter 'limit' must be a positive integer, got {limit}")
        return None


    # --- QUERY EXECUTION ---
    try:
        result = spark.sql(
            f"""
            SELECT
                *
            FROM
            (    SELECT 
                    *, 
                    DENSE_RANK() OVER(ORDER BY number_of_crashes DESC) as rank
                FROM 
                (
                    SELECT 
                        hour,
                        COUNT(collision_id) as number_of_crashes
                    FROM accidents_table
                    GROUP BY hour
                    ORDER BY hour ASC
                ) as t
            ) as k
            WHERE rank <= {n}
            """
        )

    except Exception as e:
        print(f"Bro, something bad happened: {e}")
        print("Returning None")
        return None

    return result

In [117]:
result = top_n_most_dangerous_hours(n=12, limit=20)
result.show()

+----+-----------------+----+
|hour|number_of_crashes|rank|
+----+-----------------+----+
|  16|             5219|   1|
|  14|             5016|   2|
|  17|             4974|   3|
|  18|             4696|   4|
|  15|             4677|   5|
|  13|             4458|   6|
|  12|             4054|   7|
|  11|             3803|   8|
|  19|             3738|   9|
|   8|             3678|  10|
|  10|             3525|  11|
|   9|             3439|  12|
+----+-----------------+----+



### Boroughs above average injures per borough (word salad, I know)

In [118]:
# Using a CTE, I will calculate average injuries per borough, 
# then gonna find boroughs above this average.
spark.sql(
    """
    WITH BoroughAverage AS 
    (
        SELECT 
            borough, 
            ROUND(AVG(persons_injured + pedestrians_injured + 
                number_of_cyclist_injured + number_of_motorist_injured), 2) 
                AS avg_injuries_per_borough
        FROM accidents_table
        WHERE borough IS NOT NULL
        GROUP BY borough
    ),
    OverallAverage AS
    (
        SELECT 
            ROUND(AVG(persons_injured + pedestrians_injured + 
                number_of_cyclist_injured + number_of_motorist_injured), 2) 
                AS overall_injuries_per_crash
        FROM accidents_table
        WHERE borough IS NOT NULL
    )
    SELECT
        ba.borough,
        ba.avg_injuries_per_borough,
        oa.overall_injuries_per_crash,
        CASE 
            WHEN ba.avg_injuries_per_borough > oa.overall_injuries_per_crash 
            THEN 'Above Average'
            WHEN ba.avg_injuries_per_borough < oa.overall_injuries_per_crash 
            THEN 'Below Average'
            ELSE 'Equal to Average'
        END AS comparison
    FROM BoroughAverage AS ba
    CROSS JOIN OverallAverage AS oa
    ORDER BY ba.avg_injuries_per_borough DESC
    """
).show(truncate=False)


+-------------+------------------------+--------------------------+-------------+
|borough      |avg_injuries_per_borough|overall_injuries_per_crash|comparison   |
+-------------+------------------------+--------------------------+-------------+
|STATEN ISLAND|0.84                    |0.68                      |Above Average|
|BROOKLYN     |0.71                    |0.68                      |Above Average|
|BRONX        |0.69                    |0.68                      |Above Average|
|QUEENS       |0.66                    |0.68                      |Below Average|
|MANHATTAN    |0.59                    |0.68                      |Below Average|
+-------------+------------------------+--------------------------+-------------+

