In [1]:
import pandas as pd
import numpy as np
from typing import Optional
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import (
    StructType, 
    StructField,
    StringType, 
    IntegerType, 
    LongType, 
    FloatType, 
    DoubleType, 
    DecimalType,
    BooleanType, 
    DateType, 
    TimestampType,
    ArrayType, 
    MapType, 
    StructType,
    BinaryType, 
    ByteType, 
    ShortType,
)

In [2]:
spark = (
    SparkSession.builder
    .appName("Testing")
    .config("spark.driver.extraJavaOptions", "--add-opens=java.base/javax.security.auth=ALL-UNNAMED --enable-native-access=ALL-UNNAMED")
    .config("spark.executor.extraJavaOptions", "--add-opens=java.base/javax.security.auth=ALL-UNNAMED --enable-native-access=ALL-UNNAMED")
    .getOrCreate()
)

In [3]:
df = pd.read_csv(
    filepath_or_buffer="dataset/NYC Accidents 2020.csv"
)

In [4]:
df.head()

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,2020-08-29,15:40:00,BRONX,10466.0,40.8921,-73.83376,POINT (-73.83376 40.8921),PRATT AVENUE,STRANG AVENUE,,...,Unspecified,,,,4342908,Sedan,Station Wagon/Sport Utility Vehicle,,,
1,2020-08-29,21:00:00,BROOKLYN,11221.0,40.6905,-73.919914,POINT (-73.919914 40.6905),BUSHWICK AVENUE,PALMETTO STREET,,...,Unspecified,,,,4343555,Sedan,Sedan,,,
2,2020-08-29,18:20:00,,,40.8165,-73.946556,POINT (-73.946556 40.8165),8 AVENUE,,,...,,,,,4343142,Station Wagon/Sport Utility Vehicle,,,,
3,2020-08-29,00:00:00,BRONX,10459.0,40.82472,-73.89296,POINT (-73.89296 40.82472),,,1047 SIMPSON STREET,...,Unspecified,Unspecified,Unspecified,,4343588,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,Sedan,Motorcycle,
4,2020-08-29,17:10:00,BROOKLYN,11203.0,40.64989,-73.93389,POINT (-73.93389 40.64989),,,4609 SNYDER AVENUE,...,Unspecified,,,,4342953,Sedan,Sedan,,,


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74881 entries, 0 to 74880
Data columns (total 29 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   CRASH DATE                     74881 non-null  object 
 1   CRASH TIME                     74881 non-null  object 
 2   BOROUGH                        49140 non-null  object 
 3   ZIP CODE                       49134 non-null  float64
 4   LATITUDE                       68935 non-null  float64
 5   LONGITUDE                      68935 non-null  float64
 6   LOCATION                       68935 non-null  object 
 7   ON STREET NAME                 55444 non-null  object 
 8   CROSS STREET NAME              35681 non-null  object 
 9   OFF STREET NAME                19437 non-null  object 
 10  NUMBER OF PERSONS INJURED      74881 non-null  int64  
 11  NUMBER OF PERSONS KILLED       74881 non-null  int64  
 12  NUMBER OF PEDESTRIANS INJURED  74881 non-null 

In [6]:
# I am not going to use these features, thus I am dropping them.
# Besides this, they do not seem to me as useful.
df = df.drop(["ZIP CODE", "LONGITUDE", "LATITUDE", "LOCATION"], axis=1)

In [7]:
df.head()

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,2020-08-29,15:40:00,BRONX,PRATT AVENUE,STRANG AVENUE,,0,0,0,0,...,Unspecified,,,,4342908,Sedan,Station Wagon/Sport Utility Vehicle,,,
1,2020-08-29,21:00:00,BROOKLYN,BUSHWICK AVENUE,PALMETTO STREET,,2,0,0,0,...,Unspecified,,,,4343555,Sedan,Sedan,,,
2,2020-08-29,18:20:00,,8 AVENUE,,,1,0,1,0,...,,,,,4343142,Station Wagon/Sport Utility Vehicle,,,,
3,2020-08-29,00:00:00,BRONX,,,1047 SIMPSON STREET,0,0,0,0,...,Unspecified,Unspecified,Unspecified,,4343588,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,Sedan,Motorcycle,
4,2020-08-29,17:10:00,BROOKLYN,,,4609 SNYDER AVENUE,0,0,0,0,...,Unspecified,,,,4342953,Sedan,Sedan,,,


In [8]:
# COLLISION_ID is our unique values containing feature.
df["COLLISION_ID"].unique()

array([4342908, 4343555, 4343142, ..., 4269230, 4267482, 4268376],
      shape=(74881,))

### Fixing values of car types.

In [9]:
# Fixing misspellings and standardize to proper case
misspellings = {
    'AMBULENCE': 'Ambulance',   # "ENCE" -> "ANCE"
    'Ambulance': 'Ambulance',   # Already correct case
    'GEN  AMBUL': 'Ambulance',  # Abbreviation
    'abulance': 'Ambulance',    # Missing "m"
    'ambulance': 'Ambulance',   # Just capitalized
    'AMB': 'Ambulance',         # Abbreviation
    'AMBU': 'Ambulance',        # Abbreviation
    'Amb': 'Ambulance',         # Mixed case
    'AMBULANCE': 'Ambulance',   # ALL CAPS to proper case
}

# Standardize capitalization - all to proper case (first letter capital)
capitalization_fixes = {
    # Ambulance variations
    'FDNY Ambul': 'Ambulance',
    'FDNY AMBUL': 'Ambulance',
    'Fdny ambul': 'Ambulance',
    'NYC AMBULA': 'Ambulance',
    'NYS AMBULA': 'Ambulance',
    'White ambu': 'Ambulance',
    
    # Fire Truck variations
    'FDNY fire': 'Fire Truck',
    'FDNY FIRET': 'Fire Truck',
    'FDNY TRUCK': 'Fire Truck',
    'FDNY FIRE': 'Fire Truck',
    'FDNY Engin': 'Fire Truck',
    'FDNY ENGIN': 'Fire Truck',
    'Fire Truck': 'Fire Truck',
    'Fire truck': 'Fire Truck',
    'fire truck': 'Fire Truck',
    'Firetruck': 'Fire Truck',
    'FIRETRUCK': 'Fire Truck',
    'FIRE TRUCK': 'Fire Truck',
    'FIRE ENGIN': 'Fire Truck',
    
    # Box Truck
    'BOX TRUCK': 'Box Truck',
    'box truck': 'Box Truck',
    'Box Truck': 'Box Truck',
    
    # Pick-up Truck
    'Pick up Tr': 'Pick-up Truck',
    'PICK-UP TR': 'Pick-up Truck',
    'PICK UP TR': 'Pick-up Truck',
    'Pick up': 'Pick-up Truck',
    'Pickup with mounted Camper': 'Pick-up Truck',
    'PICKUP TRU': 'Pick-up Truck',
    'PICK UP': 'Pick-up Truck',
    'Pick-up Truck': 'Pick-up Truck',
    
    # Van
    'WORK VAN': 'Van',
    'Work van': 'Van',
    'Work Van': 'Van',
    'TRUCK VAN': 'Van',
    'van': 'Van',
    'DELIVERY V': 'Van',
    'delivery v': 'Van',
    'CARGO VAN': 'Van',
    'Cargo Van': 'Van',
    'Van': 'Van',
    
    # Dump Truck
    'Dump': 'Dump Truck',
    'DUMP': 'Dump Truck',
    'dump truck': 'Dump Truck',
    'Dump truck': 'Dump Truck',
    'Dump Truck': 'Dump Truck',
    
    # Tractor
    'Tractor tr': 'Tractor Truck',
    'Tractor Tr': 'Tractor Truck',
    'tractor tr': 'Tractor Truck',
    'TRACTOR': 'Tractor Truck',
    'Tractor': 'Tractor Truck',
    'Tractor Truck Diesel': 'Tractor Truck',
    'Tractor Truck Gasoline': 'Tractor Truck',
    'Tractor tr': 'Tractor Truck',
    'Tractor Truck': 'Tractor Truck',
    
    # Motorcycle variations
    'Motorscooter': 'Motorcycle',
    'MOTORSCOOT': 'Motorcycle',
    'MOTOR SCOO': 'Motorcycle',
    'MOPED': 'Motorcycle',
    'moped': 'Motorcycle',
    'Motorbike': 'Motorcycle',
    'MOTORSCOOTER': 'Motorcycle',
    'Motorscooter': 'Motorcycle',
    'Motorcycle': 'Motorcycle',
    
    # Scooter
    'SCOOTER': 'Scooter',
    'E REVEL SC': 'E-scooter',
    'PUSH SCOOT': 'Scooter',
    'Scooter': 'Scooter',
    
    # Sedan
    '4 dr sedan': 'Sedan',
    '2 dr sedan': 'Sedan',
    '3-Door': 'Sedan',
    'Sedan': 'Sedan',
    
    # E-Bike
    'E-BIKE': 'E-bike',
    'E-Bik': 'E-bike',
    'E bike': 'E-bike',
    'E-Bike': 'E-bike',
    
    # Trailer
    'trailer': 'Trailer',
    'TRAILER': 'Trailer',
    'TRAIL': 'Trailer',
    'TRL': 'Trailer',
    'TR-Trailer': 'Trailer',
    'Trailer': 'Trailer',
    
    # Tow Truck
    'tow truck': 'Tow Truck',
    'TOW TRUCK': 'Tow Truck',
    'Tow truck': 'Tow Truck',
    'Tow Truck': 'Tow Truck',
    'Tow Truck / Wrecker': 'Tow Truck',
    
    # USPS/Mail
    'USPS VAN': 'USPS',
    'USPS TRUCK': 'USPS',
    'USPS POSTA': 'USPS',
    'USPS #7530': 'USPS',
    'postal tru': 'USPS',
    'postal bus': 'USPS',
    'POSTAL TRU': 'USPS',
    'MAIL TRUCK': 'USPS',
    'US POSTAL': 'USPS',
    'postal ser': 'USPS',
    'USPS': 'USPS',
    
    # Delivery
    'DELIVERY': 'Delivery',
    'DELIVERY T': 'Delivery',
    'DELIVERY V': 'Delivery',
    'delviery': 'Delivery',
    'Delv': 'Delivery',
    'DELV': 'Delivery',
    'Delivery': 'Delivery',
    
    # Commercial
    'COM': 'Commercial',
    'com': 'Commercial',
    'commercial': 'Commercial',
    'COM TRANS': 'Commercial',
    'COMMERCIAL': 'Commercial',
    
    # Utility
    'UTIL': 'Utility',
    'UTILITY VE': 'Utility',
    'UT': 'Utility',
    'UTILITY': 'Utility',
    'UTILITY TR': 'Utility',
    'UTILITY.': 'Utility',
    
    # Truck (generic)
    'TRUCK': 'Truck',
    'truck': 'Truck',
    'TRK': 'Truck',
    'Trc': 'Truck',
    'Tr': 'Truck',
    'Truck': 'Truck',
    
    # Garbage
    'GARBAGE TR': 'Garbage Truck',
    'Garbage or Refuse': 'Garbage Truck',
    'Garbage Truck': 'Garbage Truck',
    
    # Freight
    'FREIGHT FL': 'Freight',
    'FREIGHT TR': 'Freight',
    'FREIG': 'Freight',
    'FREIG DELV': 'Freight',
    'FREIGHTLIN': 'Freight',
    'Freight': 'Freight',
    
    # Flat Bed
    'Flat Bed': 'Flat Bed',
    'Flat Rack': 'Flat Bed',
    'FLATBED': 'Flat Bed',
    'TRUCK FLAT': 'Flat Bed',
    
    # Forklift
    'Fork lift': 'Forklift',
    'FORK LIFT': 'Forklift',
    'FORKLIFT': 'Forklift',
    'forklift': 'Forklift',
    
    # Convertible
    'Convertible': 'Convertible',
    'CONVERTIBLE': 'Convertible',
    
    # Golf Cart
    'GOLF CART': 'Golf Cart',
    'Golf Cart': 'Golf Cart',
    
    # Suburban
    'suburban': 'Suburban',
    'SUBN WHI': 'Suburban',
    'SUBURBAN': 'Suburban',
    
    # Bike
    'Bike': 'Bike',
    'BIKE': 'Bike',
    
    # Taxi
    'Taxi': 'Taxi',
    'TAXI': 'Taxi',
    
    # Bus
    'Bus': 'Bus',
    'BUS': 'Bus',
    
    # Station Wagon
    'Station Wagon/Sport Utility Vehicle': 'Station Wagon',
    
    # Convertible
    'Convertible': 'Convertible',
    
    # Bike
    'Bike': 'Bike',
    
    # E-scooter
    'E-Scooter': 'E-scooter',
    
    # Lawnmower
    'Lawnmower': 'Lawnmower',
    
    # Concrete Mixer
    'Concrete Mixer': 'Concrete Mixer',
    
    # Refrigerated Van
    'Refrigerated Van': 'Refrigerated Van',
    
    # Armored Truck
    'Armored Truck': 'Armored Truck',
    
    # Tanker
    'Tanker': 'Tanker',
    
    # Beverage Truck
    'Beverage Truck': 'Beverage Truck',
    
    # Forklift
    'Forklift': 'Forklift',
    
    # Go kart
    'Go kart': 'Go Kart',
    
    # Camper
    'Van Camper': 'Camper',
    
    # Backhoe
    'backhoe': 'Backhoe',
    'BACK HOE': 'Backhoe',
    'BACKHOE': 'Backhoe',
    
    # Bobcat
    'Bobcat': 'Bobcat',
    'BOBCAT FOR': 'Bobcat',
    
    # Snow Plow
    'Snow Plow': 'Snow Plow',
    
    # Hearse
    'Hearse': 'Hearse',
}

# Expanding abbreviations, otherwise no one gonna understand them...
abbreviations = {
    'PK': 'Pickup',
    'PSD': 'Public Safety Vehicle',
    'FDNY': 'Fire Truck',
    'EMS': 'Ambulance',
    'MTA': 'Bus',
    'USPS': 'Mail Truck',
    'FDNY #226': 'Fire Truck',
    'NYC FD': 'Fire Truck',
    'FDNY EMT': 'Ambulance',
    'FDNY LADDE': 'Fire Truck',
    'ESU RESCUE': 'Rescue Vehicle',
    'UNK': 'Unknown',
    'UNKNOWN': 'Unknown',
    'UNKN': 'Unknown',
    'Unknown': 'Unknown',
    'OTH': 'Other',
    'OTHER': 'Other',
    'TRAC': 'Tractor',
    'SWT': 'Station Wagon',
    'LIMO': 'Limousine',
    'PK': 'Pickup',
    'PC': 'Passenger Car',
    'HRSE': 'Horse',
    'H1': 'Hummer H1',
    'J1': 'Jeep',
    '1C': 'One Car',
    'SE': 'Special Equipment',
    'OMS': 'Office of Management Services',
    'OMR': 'Other Motorized Road',
    'LCOMM': 'Light Commercial',
}

In [10]:
VEHICLE_TYPE_COLUMNS = [
    "VEHICLE TYPE CODE 1",
    "VEHICLE TYPE CODE 2", 
    "VEHICLE TYPE CODE 3",
    "VEHICLE TYPE CODE 4",
    "VEHICLE TYPE CODE 5"
]

def clean_vehicle_type_columns(
        df: pd.DataFrame,
        misspellings: dict,
        capitalization_fixes: dict,
        abbreviations: dict
) -> pd.DataFrame:
    """
    Clean vehicle type columns by applying multiple replacement dictionaries.
    
    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing vehicle type columns
    misspellings : dict
        Dictionary of misspelling corrections
    capitalization_fixes : dict
        Dictionary of capitalization standardizations
    abbreviations : dict
        Dictionary of abbreviation expansions
        
    Returns:
    --------
    pd.DataFrame
        DataFrame with cleaned vehicle type columns
    """

    # Creating copy to avoid modifying original df
    df_clean = df.copy()

    # Now, combining all replacement dictionaries into one
    # By the way, order matters: misspellings -> capitalization -> abbreviations
    all_replacements = {**misspellings, **capitalization_fixes, **abbreviations} 

    for column in df_clean.columns:
        if column in VEHICLE_TYPE_COLUMNS:
            df_clean[column] = df_clean[column].replace(all_replacements)

    return df_clean

In [11]:
df = clean_vehicle_type_columns(
    df=df,
    misspellings=misspellings,
    capitalization_fixes=capitalization_fixes,
    abbreviations=abbreviations
)

# NOTE: Not all type names are FIXED. I WILL DO IT LATER.

In [12]:
df["VEHICLE TYPE CODE 1"].unique() # Just to ensure that it works

array(['Sedan', 'Station Wagon', 'Bus', 'Pick-up Truck', 'Box Truck',
       'Taxi', 'Bike', 'Convertible', 'Pickup', 'Flat Bed', 'E-bike', nan,
       'Motorcycle', 'Ambulance', 'Dump Truck', 'Carry All',
       'Refrigerated Van', 'Van', 'Tractor Truck', 'E-scooter',
       'Tow Truck', 'Lawnmower', 'Armored Truck', 'Concrete Mixer',
       'Unknown', 'Golf Cart', 'Garbage Truck', 'Tanker',
       'Bulk Agriculture', 'Trailer', 'Tractor', 'Moped', 'COURIER',
       'Minibike', 'Public Safety Vehicle', 'Fire Truck', 'Limousine',
       'Multi-Wheeled Vehicle', 'Chassis Cab', 'Lift Boom', 'dilevery t',
       'DRILL RIG', 'Delivery', 'Pumper', 'Other', 'Stake or Rack',
       'Beverage Truck', 'Front-Load', 'government', 'LIGHT TRAI',
       'JOHN DEERE', 'Commercial', 'cross', 'Forklift', 'Go Kart',
       'Truck', 'Camper', 'Freight', 'Open Body', 'Scooter',
       'Livestock Rack', 'Utility', 'USPS', '18 WHEELER', 'FOOD TRUCK',
       'MOVING VAN', 'Backhoe', 'Suburban', 'Mail Truck

### Renaming Columns

In [13]:
RENAMING_RULES = {
    "CRASH DATE": "date",
    "CRASH TIME": "time",
    "NUMBER OF PERSONS INJURED": "persons_injured",
    "NUMBER OF PERSONS KILLED": "persons_killed",
    "NUMBER OF PEDESTRIANS INJURED": "pedestrians_injured",
    "NUMBER OF PEDESTRIANS KILLED": "pedestrians_killed",
    "VEHICLE TYPE CODE 1": "vehicle_type_1",
    "VEHICLE TYPE CODE 2": "vehicle_type_2", 
    "VEHICLE TYPE CODE 3": "vehicle_type_3",
    "VEHICLE TYPE CODE 4": "vehicle_type_4",
    "VEHICLE TYPE CODE 5": "vehicle_type_5",
    "COLLISION_ID": "collision_id"
}

df = df.rename(columns=RENAMING_RULES)

# Converting all remaining columns to lowercase with underscores (aka Snake Case)
for col in df.columns:
    if col not in RENAMING_RULES.values():  # Skiping already renamed columns
        new_name = col.lower().replace(' ', '_')
        df = df.rename(columns={col: new_name})

In [14]:
df.head()

Unnamed: 0,date,time,borough,on_street_name,cross_street_name,off_street_name,persons_injured,persons_killed,pedestrians_injured,pedestrians_killed,...,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,collision_id,vehicle_type_1,vehicle_type_2,vehicle_type_3,vehicle_type_4,vehicle_type_5
0,2020-08-29,15:40:00,BRONX,PRATT AVENUE,STRANG AVENUE,,0,0,0,0,...,Unspecified,,,,4342908,Sedan,Station Wagon,,,
1,2020-08-29,21:00:00,BROOKLYN,BUSHWICK AVENUE,PALMETTO STREET,,2,0,0,0,...,Unspecified,,,,4343555,Sedan,Sedan,,,
2,2020-08-29,18:20:00,,8 AVENUE,,,1,0,1,0,...,,,,,4343142,Station Wagon,,,,
3,2020-08-29,00:00:00,BRONX,,,1047 SIMPSON STREET,0,0,0,0,...,Unspecified,Unspecified,Unspecified,,4343588,Station Wagon,Station Wagon,Sedan,Motorcycle,
4,2020-08-29,17:10:00,BROOKLYN,,,4609 SNYDER AVENUE,0,0,0,0,...,Unspecified,,,,4342953,Sedan,Sedan,,,


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74881 entries, 0 to 74880
Data columns (total 25 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   date                           74881 non-null  object
 1   time                           74881 non-null  object
 2   borough                        49140 non-null  object
 3   on_street_name                 55444 non-null  object
 4   cross_street_name              35681 non-null  object
 5   off_street_name                19437 non-null  object
 6   persons_injured                74881 non-null  int64 
 7   persons_killed                 74881 non-null  int64 
 8   pedestrians_injured            74881 non-null  int64 
 9   pedestrians_killed             74881 non-null  int64 
 10  number_of_cyclist_injured      74881 non-null  int64 
 11  number_of_cyclist_killed       74881 non-null  int64 
 12  number_of_motorist_injured     74881 non-null  int64 
 13  n

`off_street_name` has 19,437 non-null values, which in turn means that over 70% of the rows are missing this data, making it less reliable for analysis. Thus, I am dropping it. 

In [16]:
df = df.drop(columns=['off_street_name'], axis=1)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74881 entries, 0 to 74880
Data columns (total 24 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   date                           74881 non-null  object
 1   time                           74881 non-null  object
 2   borough                        49140 non-null  object
 3   on_street_name                 55444 non-null  object
 4   cross_street_name              35681 non-null  object
 5   persons_injured                74881 non-null  int64 
 6   persons_killed                 74881 non-null  int64 
 7   pedestrians_injured            74881 non-null  int64 
 8   pedestrians_killed             74881 non-null  int64 
 9   number_of_cyclist_injured      74881 non-null  int64 
 10  number_of_cyclist_killed       74881 non-null  int64 
 11  number_of_motorist_injured     74881 non-null  int64 
 12  number_of_motorist_killed      74881 non-null  int64 
 13  c

### Working With `NaN` Values

In [18]:
df.loc[:, "borough": "cross_street_name"] = \
    df.loc[:, "borough": "cross_street_name"] \
      .fillna("Not Specified")

In [19]:
df.head()

Unnamed: 0,date,time,borough,on_street_name,cross_street_name,persons_injured,persons_killed,pedestrians_injured,pedestrians_killed,number_of_cyclist_injured,...,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,collision_id,vehicle_type_1,vehicle_type_2,vehicle_type_3,vehicle_type_4,vehicle_type_5
0,2020-08-29,15:40:00,BRONX,PRATT AVENUE,STRANG AVENUE,0,0,0,0,0,...,Unspecified,,,,4342908,Sedan,Station Wagon,,,
1,2020-08-29,21:00:00,BROOKLYN,BUSHWICK AVENUE,PALMETTO STREET,2,0,0,0,0,...,Unspecified,,,,4343555,Sedan,Sedan,,,
2,2020-08-29,18:20:00,Not Specified,8 AVENUE,Not Specified,1,0,1,0,0,...,,,,,4343142,Station Wagon,,,,
3,2020-08-29,00:00:00,BRONX,Not Specified,Not Specified,0,0,0,0,0,...,Unspecified,Unspecified,Unspecified,,4343588,Station Wagon,Station Wagon,Sedan,Motorcycle,
4,2020-08-29,17:10:00,BROOKLYN,Not Specified,Not Specified,0,0,0,0,0,...,Unspecified,,,,4342953,Sedan,Sedan,,,


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74881 entries, 0 to 74880
Data columns (total 24 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   date                           74881 non-null  object
 1   time                           74881 non-null  object
 2   borough                        74881 non-null  object
 3   on_street_name                 74881 non-null  object
 4   cross_street_name              74881 non-null  object
 5   persons_injured                74881 non-null  int64 
 6   persons_killed                 74881 non-null  int64 
 7   pedestrians_injured            74881 non-null  int64 
 8   pedestrians_killed             74881 non-null  int64 
 9   number_of_cyclist_injured      74881 non-null  int64 
 10  number_of_cyclist_killed       74881 non-null  int64 
 11  number_of_motorist_injured     74881 non-null  int64 
 12  number_of_motorist_killed      74881 non-null  int64 
 13  c

### Exporting DataFrame

In [21]:
# Extracting hour. Will need this one in the future. 
df["hour"] = df["time"].astype(str).str.split(":").str[0]

In [22]:
# Exporting cleaned DataFrame
df.to_csv(path_or_buf="nyc_traffic_processed.csv", index=False)

### Configuring Apache Spark

In [23]:
# Now, it is time FOR APACHE SPARK
# Schema with proper types
schema = StructType([
    StructField("date", TimestampType(), True),
    StructField("time", TimestampType(), True),
    StructField("borough", StringType(), True),
    StructField("on_street_name", StringType(), True),
    StructField("cross_street_name", StringType(), True),
    StructField("off_street_name", StringType(), True),
    StructField("persons_injured", IntegerType(), True),  # Should be integer
    StructField("persons_killed", IntegerType(), True),   # Should be integer
    StructField("pedestrians_injured", IntegerType(), True),
    StructField("pedestrians_killed", IntegerType(), True),
    StructField("number_of_cyclist_injured", IntegerType(), True),
    StructField("number_of_cyclist_killed", IntegerType(), True),
    StructField("number_of_motorist_injured", IntegerType(), True),
    StructField("number_of_motorist_killed", IntegerType(), True),
    StructField("contributing_factor_vehicle_1", StringType(), True),
    StructField("contributing_factor_vehicle_2", StringType(), True),
    StructField("contributing_factor_vehicle_3", StringType(), True),
    StructField("contributing_factor_vehicle_4", StringType(), True),
    StructField("contributing_factor_vehicle_5", StringType(), True),
    StructField("collision_id", StringType(), True),
    StructField("vehicle_type_1", StringType(), True),
    StructField("vehicle_type_2", StringType(), True),
    StructField("vehicle_type_3", StringType(), True),
    StructField("vehicle_type_4", StringType(), True),
    StructField("vehicle_type_5", StringType(), True),
    StructField("hour", IntegerType(), True),              # Should be integer 0-23
])

In [24]:
df_sql = spark.read.csv("nyc_traffic_processed.csv", header=True, schema=schema)

df_pandas = pd.read_csv(
    filepath_or_buffer="nyc_traffic_processed.csv"
)

# Defining temprary view to use it for SQL Queries. 
df_sql.createOrReplaceTempView("accidents_table")

## Analysis

### Basic Selection of Locations

In [25]:
def select_crashes_sql(borough: str, limit: int = 100) -> Optional[DataFrame]:
    """
    Select crash data for a specific borough.
    
    Parameters:
    -----------
    borough : str
        Name of the borough to filter by
    limit : int, default=100
        Maximum number of rows to return
        
    Returns:
    --------
    DataFrame or None
        Filtered DataFrame or None if error occurs
    """
    
    # Input validation. 
    if not borough or not isinstance(borough, str):
        print("Error: Borough must be a non-empty string")
        return None

    # After INPUT VALIDATION, cleaning and standardizing input
    borough_standard = borough.strip().upper()

    # Valid boroughs check
    valid_boroughs = {"MANHATTAN", "BROOKLYN", "QUEENS", "BRONX", "STATEN ISLAND"}
    if borough_standard not in valid_boroughs:
        print(f"Warning: '{borough_standard}' may not be a valid borough")

    try:
        result = spark.sql(
            f"""
            SELECT *
            FROM accidents_table
            WHERE borough = '{borough}'
            LIMIT {limit}
            """,
        )

    except Exception as e:
        print(f"Bro, something bad happened: {e}")
        print("Returning None")
        return None

    return result

In [26]:
result = select_crashes_sql(
    borough="BROOKLYN",
    limit=10
)   # User can pass whatever borough 
    # he needs to analyze. 

result.show(truncate=False)


+-------------------+-------------------+--------+------------------------+-----------------+---------------+---------------+--------------+-------------------+------------------+-------------------------+------------------------+--------------------------+-------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------+--------------+--------------+--------------+--------------+--------------+----+
|date               |time               |borough |on_street_name          |cross_street_name|off_street_name|persons_injured|persons_killed|pedestrians_injured|pedestrians_killed|number_of_cyclist_injured|number_of_cyclist_killed|number_of_motorist_injured|number_of_motorist_killed|contributing_factor_vehicle_1|contributing_factor_vehicle_2|contributing_factor_vehicle_3|contributing_factor_vehicle_4|contributing_factor_vehicle_5|collision_id |vehicle_type_1|vehicle_ty

In [27]:
def select_crashes_pandas(borough: str, limit: int = 100) -> Optional[DataFrame]:
    """
    Select crash data for a specific borough using pandas.
    
    Parameters:
    -----------
    borough : str
        Name of the borough to filter by
    limit : int, default=100
        Maximum number of rows to return
        
    Returns:
    --------
    DataFrame or None
        Filtered DataFrame or None if error occurs
    """
    
    # Input validation. 
    if not borough or not isinstance(borough, str):
        print("Error: Borough must be a non-empty string")
        return None

    # After INPUT VALIDATION, cleaning and standardizing input
    borough_standard = borough.strip().upper()

    # Valid boroughs check
    valid_boroughs = {"MANHATTAN", "BROOKLYN", "QUEENS", "BRONX", "STATEN ISLAND"}
    if borough_standard not in valid_boroughs:
        print(f"Warning: '{borough_standard}' may not be a valid borough")

    try:
        result = df_pandas[df_pandas["borough"] == borough].iloc[:limit, :]

    except Exception as e:
        print(f"Bro, something bad happened: {e}")
        print("Returning None")
        return None

    return result

In [28]:
select_crashes_pandas(borough='BROOKLYN', limit=20)

Unnamed: 0,date,time,borough,on_street_name,cross_street_name,persons_injured,persons_killed,pedestrians_injured,pedestrians_killed,number_of_cyclist_injured,...,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,collision_id,vehicle_type_1,vehicle_type_2,vehicle_type_3,vehicle_type_4,vehicle_type_5,hour
1,2020-08-29,21:00:00,BROOKLYN,BUSHWICK AVENUE,PALMETTO STREET,2,0,0,0,0,...,,,,4343555,Sedan,Sedan,,,,21
4,2020-08-29,17:10:00,BROOKLYN,Not Specified,Not Specified,0,0,0,0,0,...,,,,4342953,Sedan,Sedan,,,,17
13,2020-08-29,22:53:00,BROOKLYN,WILLIAMSBURG STREET WEST,WYTHE AVENUE,0,0,0,0,0,...,,,,4343074,Sedan,,,,,22
16,2020-08-29,13:00:00,BROOKLYN,BEDFORD AVENUE,WALLABOUT STREET,0,0,0,0,0,...,,,,4343077,Station Wagon,Station Wagon,,,,13
19,2020-08-29,10:35:00,BROOKLYN,UNION AVENUE,GRAND STREET,1,0,0,0,0,...,,,,4343073,Sedan,Station Wagon,,,,10
20,2020-08-29,13:55:00,BROOKLYN,HAMILTON AVENUE,GARNET STREET,1,0,0,0,0,...,,,,4342786,Sedan,Sedan,,,,13
26,2020-08-29,23:19:00,BROOKLYN,NEWKIRK AVENUE,FLATBUSH AVENUE,1,0,0,0,0,...,,,,4342807,Pick-up Truck,Motorcycle,,,,23
31,2020-08-29,22:11:00,BROOKLYN,Not Specified,Not Specified,1,0,0,0,1,...,,,,4343158,Sedan,Bike,,,,22
34,2020-08-29,11:25:00,BROOKLYN,Not Specified,Not Specified,0,0,0,0,0,...,,,,4343421,Sedan,,,,,11
39,2020-08-29,17:00:00,BROOKLYN,Not Specified,Not Specified,0,0,0,0,0,...,,,,4342749,Station Wagon,,,,,17


### Injured People

In [29]:
df.printSchema()

AttributeError: 'DataFrame' object has no attribute 'printSchema'

In [None]:
def someone_injured_sql(injured_people: int, limit: int = 100) -> Optional[DataFrame]:
    """
    Select crash data with a specific number of injured people.
    
    Parameters:
    -----------
    injured_people : int
        Number of people who was injured
    limit : int, default=100
        Maximum number of rows to return
        
    Returns:
    --------
    DataFrame or None
        Filtered DataFrame or None if error occurs
    """
    
    # Input validation. 
    if not injured_people or not isinstance(injured_people, int):
        print("Error: Number of injured people must be non-empty and Integer")
        return None

    try:
        result = spark.sql(
            f"""
            SELECT *
            FROM accidents_table
            WHERE 
                persons_injured >= {injured_people}
                OR pedestrians_injured >= {injured_people}
                OR number_of_cyclist_injured >= {injured_people}
                OR number_of_motorist_injured >= {injured_people}
            LIMIT {limit}
            """
        )

    except Exception as e:
        print(f"Bro, something bad happened: {e}")
        print("Returning None")
        return None

    return result

In [None]:
result = someone_injured_sql(
    injured_people=2,
    limit=20
)

result.show()

+-------------------+-------------------+-------------+--------------------+-----------------+---------------+---------------+--------------+-------------------+------------------+-------------------------+------------------------+--------------------------+-------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+------------+--------------+--------------+--------------+--------------+--------------+----+
|               date|               time|      borough|      on_street_name|cross_street_name|off_street_name|persons_injured|persons_killed|pedestrians_injured|pedestrians_killed|number_of_cyclist_injured|number_of_cyclist_killed|number_of_motorist_injured|number_of_motorist_killed|contributing_factor_vehicle_1|contributing_factor_vehicle_2|contributing_factor_vehicle_3|contributing_factor_vehicle_4|contributing_factor_vehicle_5|collision_id|vehicle_type_1|vehicle_ty

In [None]:
def someone_injured_pandas(injured_people: int, limit: int = 100) -> Optional[DataFrame]:
    """
    Select crash data with a specific number of injured people using pandas.
    
    Parameters:
    -----------
    injured_people : int
        Number of people who was injured
    limit : int, default=100
        Maximum number of rows to return
        
    Returns:
    --------
    DataFrame or None
        Filtered DataFrame or None if error occurs
    """
    
    # Input validation. 
    if not injured_people or not isinstance(injured_people, int):
        print("Error: Number of injured people must be non-empty and Integer")
        return None

    try:
        result = df_pandas[
            (df_pandas['persons_injured'] > injured_people) |
            (df_pandas['pedestrians_injured'] > injured_people) |
            (df_pandas['number_of_cyclist_injured'] > injured_people) |
            (df_pandas['number_of_motorist_injured'] > injured_people)
        ].head(limit)

    except Exception as e:
        print(f"Bro, something bad happened: {e}")
        print("Returning None")
        return None

    return result

In [None]:
someone_injured_pandas(injured_people=8, limit=100)

Unnamed: 0,date,time,borough,on_street_name,cross_street_name,persons_injured,persons_killed,pedestrians_injured,pedestrians_killed,number_of_cyclist_injured,...,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,collision_id,vehicle_type_1,vehicle_type_2,vehicle_type_3,vehicle_type_4,vehicle_type_5,hour
4635,2020-08-13,18:08:00,BROOKLYN,EAST 87 STREET,AVENUE M,15,0,0,0,0,...,Unspecified,,,4340204,Station Wagon,Station Wagon,Bus,,,18
5990,2020-08-09,18:15:00,STATEN ISLAND,FOREST AVENUE,BEMENT AVENUE,9,0,0,0,0,...,,,,4337670,Sedan,Station Wagon,,,,18
7044,2020-08-05,13:49:00,BROOKLYN,CLARENDON ROAD,UTICA AVENUE,9,0,0,0,0,...,Unspecified,Unspecified,,4335789,Sedan,Sedan,Station Wagon,Bus,,13
10505,2020-07-26,17:00:00,Not Specified,VAN WYCK EXPWY,Not Specified,10,0,0,0,0,...,Unspecified,Unspecified,Unspecified,4331884,Sedan,Station Wagon,Station Wagon,Sedan,Station Wagon,17
22370,2020-06-14,20:29:00,STATEN ISLAND,VICTORY BOULEVARD,MINTHORNE STREET,9,0,0,0,0,...,,,,4320036,Station Wagon,,,,,20
40799,2020-03-12,18:16:00,Not Specified,ROCKAWAY BOULEVARD,BROOKVILLE BOULEVARD,9,0,0,0,0,...,Unspecified,,,4299529,Station Wagon,Station Wagon,Station Wagon,,,18
42852,2020-03-08,03:00:00,Not Specified,BRUCKNER EXPRESSWAY,Not Specified,10,0,0,0,0,...,Unspecified,,,4304994,Sedan,Station Wagon,,,,3
48964,2020-02-25,17:30:00,BRONX,SOUTHERN BOULEVARD,EAST 142 STREET,9,0,0,0,0,...,,,,4293463,Station Wagon,Sedan,,,,17
53703,2020-02-15,11:49:00,Not Specified,STATEN ISLAND EXPRESSWAY,Not Specified,9,0,0,0,0,...,Unspecified,,,4290569,Sedan,Station Wagon,Station Wagon,,,11


### Contributing Factors (aka Multi Column) Analysis

In [None]:
def contibuting_factors(factor: str) -> Optional[DataFrame]:
    """
    Select crash data with a specific number of injured people.
    
    Parameters:
    -----------
    factor : str
        Contributing Factor
        
    Returns:
    --------
    DataFrame or None
        Filtered DataFrame or None if error occurs
    """
    # Input validation. 
    if not factor or not isinstance(factor, str):
        print("Error: Contributing Factor must be non-empty and String")
        return None

    try:
        result = spark.sql(
            f"""
            SELECT 
                COUNT(collision_id) AS number_of_crashes
            FROM accidents_table
            WHERE 
                contributing_factor_vehicle_1 = "{factor}"
                OR
                contributing_factor_vehicle_2 = "{factor}"
                OR
                contributing_factor_vehicle_3 = "{factor}"
                OR
                contributing_factor_vehicle_4 = "{factor}"
                OR
                contributing_factor_vehicle_5 = "{factor}"
            """
        )
        
    except Exception as e:
        print(f"Bro, something bad happened: {e}")
        print("Returning None")
        return None

    return result

In [None]:
result = contibuting_factors(
    factor="Alcohol Involvement", 
)

result.show()

+-----------------+
|number_of_crashes|
+-----------------+
|             1019|
+-----------------+



In [None]:
def contibuting_factors_pandas(factor: str) -> Optional[DataFrame]:
    """
    Select crash data with a specific number of injured people using pandas.
    
    Parameters:
    -----------
    factor : str
        Contributing Factor
        
    Returns:
    --------
    DataFrame or None
        Filtered DataFrame or None if error occurs
    """
    # Input validation. 
    if not factor or not isinstance(factor, str):
        print("Error: Contributing Factor must be non-empty and String")
        return None

    try:
        result = df_pandas[
            (df_pandas['contributing_factor_vehicle_1'] == factor) |
            (df_pandas['contributing_factor_vehicle_2'] == factor) |
            (df_pandas['contributing_factor_vehicle_3'] == factor) |
            (df_pandas['contributing_factor_vehicle_4'] == factor)
        ].agg(number_of_crashes=('collision_id', 'count'))
        
    except Exception as e:
        print(f"Bro, something bad happened: {e}")
        print("Returning None")
        return None

    return result

In [None]:
contibuting_factors_pandas(factor='Alcohol Involvement')

Unnamed: 0,collision_id
number_of_crashes,1019


### Crashed After Specific Time

In [None]:
def crashed_after_time(start_hour: int, limit: int = 100) -> Optional[DataFrame]:
    """
    Retrieve crash data for incidents occurring at or after a specified hour.
    
    This function filters accident records based on the hour of occurrence,
    returning crashes that happened at the specified hour or later in the day.
    
    Parameters:
    -----------
    start_hour : int
        The starting hour (0-23) to filter crashes. 
        Only crashes occurring at this hour or later will be returned.
    limit : int, optional
        Maximum number of rows to return (default: 100).
        Use to prevent memory issues with large result sets.
        
    Returns:
    --------
    Optional[DataFrame]
        DataFrame containing crash records filtered by hour, or None if:
        - Input validation fails
        - An error occurs during query execution
        
    Raises:
    -------
    This function handles exceptions internally and returns None on error.
    
    Examples:
    ---------
    >>> # Get crashes from 6 PM (18:00) onwards
    >>> evening_crashes = get_crashes_after_hour(18)
    
    >>> # Get first 50 crashes after midnight
    >>> night_crashes = get_crashes_after_hour(0, limit=50)
    
    Notes:
    ------
    - Hours should be provided in 24-hour format (0-23)
    - The function performs a case-insensitive filter on the hour column
    """
    
    # --- INPUT VALIDATION ---
    if start_hour is None:
        print("Error: Hour parameter cannot be None")
        return None
    
    if not isinstance(start_hour, int):
        print(f"Error: Hour must be an integer, got {type(start_hour).__name__}")
        return None
    
    # Validate hour range (0-23 for 24-hour format)
    if not (0 <= start_hour <= 23):
        print(f"Error: Hour must be between 0 and 23, got {start_hour}")
        return None
    
    if not isinstance(limit, int) or limit <= 0:
        print(f"Error: Limit must be a positive integer, got {limit}")
        return None
    
    # --- QUERY EXECUTION ---
    try:
        result = spark.sql(
            f"""
            SELECT 
                  date,
                  time,
                  borough,
                  on_street_name
            FROM accidents_table
            WHERE hour >= {start_hour}
            ORDER BY date, time
            LIMIT {limit}
            """
        )

    except Exception as e:
        print(f"Bro, something bad happened: {e}")
        print("Returning None")
        return None

    return result

In [None]:
result = crashed_after_time(
    start_hour=21,
    limit=20
)

result.show(truncate=False)

+-------------------+-------------------+---------+------------------------+
|date               |time               |borough  |on_street_name          |
+-------------------+-------------------+---------+------------------------+
|2020-01-01 00:00:00|2025-12-11 21:00:00|NULL     |2 avenue                |
|2020-01-01 00:00:00|2025-12-11 21:00:00|NULL     |MYRTLE AVENUE           |
|2020-01-01 00:00:00|2025-12-11 21:23:00|BRONX    |HULL AVENUE             |
|2020-01-01 00:00:00|2025-12-11 21:30:00|QUEENS   |WOODHAVEN BOULEVARD     |
|2020-01-01 00:00:00|2025-12-11 21:40:00|BROOKLYN |BOGART STREET           |
|2020-01-01 00:00:00|2025-12-11 21:40:00|QUEENS   |NULL                    |
|2020-01-01 00:00:00|2025-12-11 21:45:00|NULL     |NULL                    |
|2020-01-01 00:00:00|2025-12-11 21:45:00|NULL     |STEINWAY AVENUE         |
|2020-01-01 00:00:00|2025-12-11 22:00:00|NULL     |GERARD AVENUE           |
|2020-01-01 00:00:00|2025-12-11 22:00:00|BRONX    |VANCORTLANDT AVENUE EAST|

In [30]:
def crashed_after_time_pandas(
        start_hour: int, 
        limit: int = 100) -> Optional[DataFrame]:
    """
    Retrieve crash data for incidents occurring at or after a specified hour using pandas
    
    This function filters accident records based on the hour of occurrence,
    returning crashes that happened at the specified hour or later in the day.
    
    Parameters:
    -----------
    start_hour : int
        The starting hour (0-23) to filter crashes. 
        Only crashes occurring at this hour or later will be returned.
    limit : int, optional
        Maximum number of rows to return (default: 100).
        Use to prevent memory issues with large result sets.
        
    Returns:
    --------
    Optional[DataFrame]
        DataFrame containing crash records filtered by hour, or None if:
        - Input validation fails
        - An error occurs during query execution
        
    Raises:
    -------
    This function handles exceptions internally and returns None on error.
    
    Examples:
    ---------
    >>> # Get crashes from 6 PM (18:00) onwards
    >>> evening_crashes = get_crashes_after_hour(18)
    
    >>> # Get first 50 crashes after midnight
    >>> night_crashes = get_crashes_after_hour(0, limit=50)
    
    Notes:
    ------
    - Hours should be provided in 24-hour format (0-23)
    - The function performs a case-insensitive filter on the hour column
    """
    
    # --- INPUT VALIDATION ---
    if start_hour is None:
        print("Error: Hour parameter cannot be None")
        return None
    
    if not isinstance(start_hour, int):
        print(f"Error: Hour must be an integer, got {type(start_hour).__name__}")
        return None
    
    # Validate hour range (0-23 for 24-hour format)
    if not (0 <= start_hour <= 23):
        print(f"Error: Hour must be between 0 and 23, got {start_hour}")
        return None
    
    if not isinstance(limit, int) or limit <= 0:
        print(f"Error: Limit must be a positive integer, got {limit}")
        return None
    
    # --- QUERY EXECUTION ---
    try:
        result = df_pandas[df_pandas['hour'] > 15] \
                .loc[:, ["date", "time", "borough", "on_street_name"]] \
                .sort_values(by=['date', 'time']) \
                .head(limit)

    except Exception as e:
        print(f"Bro, something bad happened: {e}")
        print("Returning None")
        return None

    return result

In [31]:
crashed_after_time_pandas(start_hour=16, limit=20)

Unnamed: 0,date,time,borough,on_street_name
74673,2020-01-01,16:00:00,Not Specified,RUTLAND ROAD
74754,2020-01-01,16:08:00,Not Specified,TRIBOROUGH BRIDGE
74821,2020-01-01,16:15:00,BROOKLYN,Not Specified
74547,2020-01-01,16:30:00,QUEENS,68 AVENUE
74649,2020-01-01,16:30:00,QUEENS,Not Specified
74708,2020-01-01,16:34:00,BRONX,BARTOW AVENUE
74587,2020-01-01,16:45:00,Not Specified,ROCKAWAY BOULEVARD
74875,2020-01-01,16:49:00,BROOKLYN,Not Specified
74791,2020-01-01,16:50:00,QUEENS,BREWER BOULEVARD
74639,2020-01-01,16:59:00,QUEENS,162 STREET


### How many crashes happened in each borough

In [None]:
spark.sql(
    """
    SELECT
          borough,
          count(collision_id) AS number_of_crashes
    FROM accidents_table
    WHERE
        borough IS NOT NULL
    GROUP BY borough
    """
).show()

+-------------+-----------------+
|      borough|number_of_crashes|
+-------------+-----------------+
|       QUEENS|            14017|
|     BROOKLYN|            16907|
|        BRONX|             9417|
|    MANHATTAN|             7353|
|STATEN ISLAND|             1446|
+-------------+-----------------+



### People killed per borough

In [None]:
spark.sql(
    """
    SELECT 
          borough,
          SUM(persons_killed) AS killed_persons,
          SUM(pedestrians_killed) AS killed_pedestrians,
          SUM(number_of_cyclist_killed) as killed_cyclists,
          SUM(number_of_motorist_killed) as killed_motorists
    FROM accidents_table
    WHERE borough IS NOT NULL
    GROUP BY borough
    """
).show()

+-------------+--------------+------------------+---------------+----------------+
|      borough|killed_persons|killed_pedestrians|killed_cyclists|killed_motorists|
+-------------+--------------+------------------+---------------+----------------+
|       QUEENS|            20|                12|              0|               8|
|     BROOKLYN|            27|                12|              2|              13|
|        BRONX|            10|                 3|              1|               6|
|    MANHATTAN|             9|                 4|              1|               4|
|STATEN ISLAND|             6|                 1|              0|               5|
+-------------+--------------+------------------+---------------+----------------+



### Streets that had more than N crashes.

In [None]:
# Finding all streets that had more than 100 crashes.
def more_than_n_crashes(n: int = 100, limit: int = 20) -> Optional[DataFrame]:
    """
    Identify streets that have experienced more than N crashes.
    
    This function analyzes crash frequency by street name and returns all streets
    where the total number of crashes exceeds the specified threshold. Useful for
    identifying high-risk locations that may require safety improvements.
    
    Parameters:
    -----------
    n : int, optional
        Minimum number of crashes threshold (default: 100).
        Only streets with more than this many crashes will be returned.
    limit : int, optional
        Maximum number of street records to return (default: 20).
        Use to prevent memory issues with large result sets.
        
    Returns:
    --------
    Optional[DataFrame]
        DataFrame with the following structure:
        - on_street_name: Name of the street
        - number_of_crashes: Total crashes recorded on that street
        Returns None if:
        - Input validation fails
        - An error occurs during query execution
    
    Raises:
    -------
    This function handles exceptions internally and returns None on error.
    
    Examples:
    ---------
    >>> # Find streets with more than 100 crashes
    >>> dangerous_streets = more_than_n_crashes(n=100)
    
    >>> # Find streets with more than 50 crashes, limit to 10 results
    >>> high_risk_streets = more_than_n_crashes(n=50, limit=10)
    
    Notes:
    ------
    - Only includes streets where on_street_name is not NULL
    - Uses COUNT(collision_id) to count distinct crashes
    - Results are ordered by number_of_crashes in descending order

    Besides all of these... Enjoy using the function!
    """

    # --- INPUT VALIDATION ---
    if n is None:
        print("Error: Parameter 'n' cannot be None")
        return None
    
    if not isinstance(n, int):
        print(f"Error: Parameter 'n' must be an integer, got {type(n).__name__}")
        return None
    
    # Checking whether N positive or negative.
    if not (n > 0):
        print(f"Error: Parameter 'n' must be positive, got {n}")
        return None
    
    if not isinstance(limit, int) or limit <= 0:
        print(f"Error: Parameter 'limit' must be a positive integer, got {limit}")
        return None

    # --- QUERY EXECUTION ---
    try:
        result = spark.sql(
            f"""
            SELECT on_street_name, 
                COUNT(collision_id) AS number_of_crashes 
            FROM accidents_table
            WHERE 
                on_street_name IS NOT NULL
            GROUP BY on_street_name
                HAVING number_of_crashes > {n}
            """
            )
    
    except Exception as e:
        print(f"Bro, something bad happened: {e}")
        print("Returning None")
        return None

    return result

In [None]:
result = more_than_n_crashes(n=400, limit=20)
result.show(truncate=False)

+--------------------------+-----------------+
|on_street_name            |number_of_crashes|
+--------------------------+-----------------+
|BROADWAY                  |575              |
|MAJOR DEEGAN EXPRESSWAY   |591              |
|FDR DRIVE                 |728              |
|LONG ISLAND EXPRESSWAY    |745              |
|CROSS BRONX EXPY          |526              |
|3 AVENUE                  |428              |
|BROOKLYN QUEENS EXPRESSWAY|738              |
|GRAND CENTRAL PKWY        |581              |
|CROSS ISLAND PARKWAY      |512              |
|BELT PARKWAY              |1241             |
|VAN WYCK EXPWY            |454              |
|ATLANTIC AVENUE           |532              |
+--------------------------+-----------------+



### How many crashes have no borough information

In [None]:
# Counting how many crashes have no borough information (BOROUGH IS NULL).
spark.sql(
    """
    SELECT borough, 
        COUNT(collision_id) AS number_of_crashes
    FROM accidents_table
    WHERE borough is NULL
    GROUP BY borough
    """
).show(truncate=False)

+-------+-----------------+
|borough|number_of_crashes|
+-------+-----------------+
|NULL   |25741            |
+-------+-----------------+



In [None]:
df.printSchema()

root
 |-- date: timestamp (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- borough: string (nullable = true)
 |-- on_street_name: string (nullable = true)
 |-- cross_street_name: string (nullable = true)
 |-- off_street_name: string (nullable = true)
 |-- persons_injured: integer (nullable = true)
 |-- persons_killed: integer (nullable = true)
 |-- pedestrians_injured: integer (nullable = true)
 |-- pedestrians_killed: integer (nullable = true)
 |-- number_of_cyclist_injured: integer (nullable = true)
 |-- number_of_cyclist_killed: integer (nullable = true)
 |-- number_of_motorist_injured: integer (nullable = true)
 |-- number_of_motorist_killed: integer (nullable = true)
 |-- contributing_factor_vehicle_1: string (nullable = true)
 |-- contributing_factor_vehicle_2: string (nullable = true)
 |-- contributing_factor_vehicle_3: string (nullable = true)
 |-- contributing_factor_vehicle_4: string (nullable = true)
 |-- contributing_factor_vehicle_5: string (nullable = true)
 

### Crash Severity Classification 

In [None]:
spark.sql(
    """
    SELECT 
        *,
        CASE 
            WHEN persons_killed > 0 
                OR pedestrians_killed > 0 
                OR number_of_cyclist_killed > 0 
                OR number_of_motorist_killed > 0
                THEN 'fatal'
            WHEN persons_injured > 0 
                OR pedestrians_injured > 0 
                OR number_of_cyclist_injured > 0 
                OR number_of_motorist_injured > 0
                THEN 'injury_only'
            ELSE 'property_damage'
        END AS crash_severity
    FROM accidents_table
    """
).show(truncate=False)

+-------------------+-------------------+--------+------------------------+-----------------------+-------------------+---------------+--------------+-------------------+------------------+-------------------------+------------------------+--------------------------+-------------------------+------------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+------------+--------------+--------------+--------------+--------------+--------------+----+---------------+
|date               |time               |borough |on_street_name          |cross_street_name      |off_street_name    |persons_injured|persons_killed|pedestrians_injured|pedestrians_killed|number_of_cyclist_injured|number_of_cyclist_killed|number_of_motorist_injured|number_of_motorist_killed|contributing_factor_vehicle_1 |contributing_factor_vehicle_2|contributing_factor_vehicle_3|contributing_factor_vehicle_4|contributing_factor_vehicle_5|co

### Ranking streets by number of crashes for each borough

In [None]:
spark.sql(
    """
    SELECT 
        *,
        DENSE_RANK() OVER(PARTITION BY borough ORDER BY number_of_crashes DESC) AS rank
    FROM 
        (SELECT 
            borough,
            on_street_name,
            COUNT(collision_id) AS number_of_crashes
        FROM accidents_table
        WHERE 
            borough IS NOT NULL
            AND
            on_street_name IS NOT NULL
        GROUP BY borough, on_street_name
        ) AS t
    ORDER BY number_of_crashes DESC
    """
).show()

# NOTE: I have checked the result - it is correct, do not worry. After BRONX, there will be BROOKLYN

+---------+--------------------+-----------------+----+
|  borough|      on_street_name|number_of_crashes|rank|
+---------+--------------------+-----------------+----+
| BROOKLYN|     ATLANTIC AVENUE|              286|   1|
|MANHATTAN|            2 AVENUE|              202|   1|
|   QUEENS|  NORTHERN BOULEVARD|              198|   1|
|    BRONX|  BRUCKNER BOULEVARD|              189|   1|
|   QUEENS|    QUEENS BOULEVARD|              171|   2|
| BROOKLYN|    LINDEN BOULEVARD|              163|   2|
|   QUEENS|NORTH CONDUIT AVENUE|              161|   3|
|   QUEENS| WOODHAVEN BOULEVARD|              153|   4|
| BROOKLYN|     FLATBUSH AVENUE|              152|   3|
|MANHATTAN|            3 AVENUE|              151|   2|
|MANHATTAN|            BROADWAY|              148|   3|
|    BRONX| EAST TREMONT AVENUE|              144|   2|
|   QUEENS|SOUTH CONDUIT AVENUE|              143|   5|
|    BRONX|       JEROME AVENUE|              141|   3|
| BROOKLYN|       OCEAN PARKWAY|              12

### Running Total of Injuries

In [None]:
# Calculating the running total of injuries ordered by date for every borough.
spark.sql(
    """
    SELECT 
        borough,
        date,
        time,
        SUM(persons_injured + pedestrians_injured + 
            number_of_cyclist_injured + number_of_motorist_injured)
            OVER(
                PARTITION BY borough 
                ORDER BY date, time
                ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
            ) AS running_total_of_injuries
    FROM accidents_table
    WHERE borough IS NOT NULL
    ORDER BY borough, date, time
    """
).show()

+-------+-------------------+-------------------+-------------------------+
|borough|               date|               time|running_total_of_injuries|
+-------+-------------------+-------------------+-------------------------+
|  BRONX|2020-01-01 00:00:00|2025-12-11 00:00:00|                        0|
|  BRONX|2020-01-01 00:00:00|2025-12-11 00:27:00|                        0|
|  BRONX|2020-01-01 00:00:00|2025-12-11 00:37:00|                        0|
|  BRONX|2020-01-01 00:00:00|2025-12-11 02:05:00|                        0|
|  BRONX|2020-01-01 00:00:00|2025-12-11 02:20:00|                        2|
|  BRONX|2020-01-01 00:00:00|2025-12-11 02:24:00|                        8|
|  BRONX|2020-01-01 00:00:00|2025-12-11 03:30:00|                        8|
|  BRONX|2020-01-01 00:00:00|2025-12-11 03:45:00|                        8|
|  BRONX|2020-01-01 00:00:00|2025-12-11 04:46:00|                        8|
|  BRONX|2020-01-01 00:00:00|2025-12-11 04:50:00|                       10|
|  BRONX|202

### Street Share of Crushes

In [None]:
# For each street, calculating the percentage of crashes it contributes within its borough.
spark.sql(
    """
    WITH BoroughTotal AS (
        SELECT
            borough, 
            COUNT(collision_id) AS total_number_of_crashes
        FROM accidents_table
        WHERE 
            borough IS NOT NULL 
        GROUP BY borough 
    )
    SELECT
        at.borough,    
        at.on_street_name,
        COUNT(collision_id) AS number_of_crashes,
        bt.total_number_of_crashes,
        ROUND(
            (COUNT(collision_id) * 100 / bt.total_number_of_crashes)
            , 5) AS percentage 
    FROM accidents_table AS at
    FULL JOIN BoroughTotal as bt on at.borough = bt.borough
    WHERE 
        at.borough IS NOT NULL 
        AND
        at.on_street_name IS NOT NULL
    GROUP BY at.borough, at.on_street_name, bt.total_number_of_crashes
    """
).show(truncate=False)



+---------+------------------+-----------------+-----------------------+----------+
|borough  |on_street_name    |number_of_crashes|total_number_of_crashes|percentage|
+---------+------------------+-----------------+-----------------------+----------+
|MANHATTAN|WEST 178 STREET   |21               |7353                   |0.2856    |
|BROOKLYN |FLATLANDS AVENUE  |94               |16907                  |0.55598   |
|BRONX    |MACOMBS ROAD      |7                |9417                   |0.07433   |
|QUEENS   |134 STREET        |10               |14017                  |0.07134   |
|MANHATTAN|WADSWORTH AVENUE  |7                |7353                   |0.0952    |
|QUEENS   |MYRTLE AVENUE     |42               |14017                  |0.29964   |
|BROOKLYN |HANCOCK STREET    |11               |16907                  |0.06506   |
|QUEENS   |111 AVENUE        |24               |14017                  |0.17122   |
|BROOKLYN |EAST 26 STREET    |4                |16907                  |0.02

### TOP N Most Dangerous Hours

In [None]:
def top_n_most_dangerous_hours(n: int = 5, limit: int = 20) -> Optional[DataFrame]:
    """
    Identify the N hours with the highest number of crashes using window ranking.
    
    This function analyzes crash frequency by hour of day and ranks them from
    most dangerous (highest crash count) to least dangerous. It returns the
    top N hours with their crash statistics and ranking.
    
    Parameters:
    -----------
    n : int, optional
        Number of top dangerous hours to return (default: 5).
        Must be between 1 and 24 (inclusive).
    limit : int, optional
        Maximum number of detailed crash records to return per hour (default: 20).
        Use to prevent memory issues when examining individual crashes.
        
    Returns:
    --------
    Optional[DataFrame]
        DataFrame with the following structure:
        - hour: The hour of day (0-23)
        - rank: Danger ranking (1 = most dangerous)
        - number_of_crashes: Total crashes in that hour
    
    Raises:
    -------
    This function handles exceptions internally and returns None on error.
    
    Examples:
    ---------
    >>> # Get top 3 most dangerous hours with 10 crash details each
    >>> dangerous_hours = get_top_n_most_dangerous_hours(n=3, limit=10)
    
    >>> # Get top 5 most dangerous hours (default)
    >>> top_hours = get_top_n_most_dangerous_hours()
    
    Notes:
    ------
    - Ranking uses DENSE_RANK() to handle ties appropriately
    - Hours are in 24-hour format (0 = midnight to 11 PM, 23 = 11 PM)
    """
    
    # --- INPUT VALIDATION ---
    if n is None:
        print("Error: Parameter 'n' cannot be None")
        return None
    
    if not isinstance(n, int):
        print(f"Error: Parameter 'n' must be an integer, got {type(n).__name__}")
        return None
    
    # Validate n range (1-24 for hours in a day)
    if not (1 <= n <= 24):
        print(f"Error: Parameter 'n' must be between 1 and 24, got {n}")
        return None
    
    if not isinstance(limit, int) or limit <= 0:
        print(f"Error: Parameter 'limit' must be a positive integer, got {limit}")
        return None


    # --- QUERY EXECUTION ---
    try:
        result = spark.sql(
            f"""
            SELECT
                *
            FROM
            (    SELECT 
                    *, 
                    DENSE_RANK() OVER(ORDER BY number_of_crashes DESC) AS rank
                FROM 
                (
                    SELECT 
                        hour,
                        COUNT(collision_id) AS number_of_crashes
                    FROM accidents_table
                    GROUP BY hour
                    ORDER BY hour ASC
                ) as t
            ) as k
            WHERE rank <= {n}
            """
        )

    except Exception as e:
        print(f"Bro, something bad happened: {e}")
        print("Returning None")
        return None

    return result

In [None]:
result = top_n_most_dangerous_hours(n=12, limit=20)
result.show()

+----+-----------------+----+
|hour|number_of_crashes|rank|
+----+-----------------+----+
|  16|             5219|   1|
|  14|             5016|   2|
|  17|             4974|   3|
|  18|             4696|   4|
|  15|             4677|   5|
|  13|             4458|   6|
|  12|             4054|   7|
|  11|             3803|   8|
|  19|             3738|   9|
|   8|             3678|  10|
|  10|             3525|  11|
|   9|             3439|  12|
+----+-----------------+----+



### Boroughs above average injures per borough (word salad, I know)

In [None]:
# Using a CTE, I will calculate average injuries per borough, 
# then gonna find boroughs above this average.
spark.sql(
    """
    WITH BoroughAverage AS 
    (
        SELECT 
            borough, 
            ROUND(AVG(persons_injured + pedestrians_injured + 
                number_of_cyclist_injured + number_of_motorist_injured), 2) 
                AS avg_injuries_per_borough
        FROM accidents_table
        WHERE borough IS NOT NULL
        GROUP BY borough
    ),
    OverallAverage AS
    (
        SELECT 
            ROUND(AVG(persons_injured + pedestrians_injured + 
                number_of_cyclist_injured + number_of_motorist_injured), 2) 
                AS overall_injuries_per_crash
        FROM accidents_table
        WHERE borough IS NOT NULL
    )
    SELECT
        ba.borough,
        ba.avg_injuries_per_borough,
        oa.overall_injuries_per_crash,
        CASE 
            WHEN ba.avg_injuries_per_borough > oa.overall_injuries_per_crash 
            THEN 'Above Average'
            WHEN ba.avg_injuries_per_borough < oa.overall_injuries_per_crash 
            THEN 'Below Average'
            ELSE 'Equal to Average'
        END AS comparison
    FROM BoroughAverage AS ba
    CROSS JOIN OverallAverage AS oa
    ORDER BY ba.avg_injuries_per_borough DESC
    """
).show(truncate=False)


+-------------+------------------------+--------------------------+-------------+
|borough      |avg_injuries_per_borough|overall_injuries_per_crash|comparison   |
+-------------+------------------------+--------------------------+-------------+
|STATEN ISLAND|0.84                    |0.68                      |Above Average|
|BROOKLYN     |0.71                    |0.68                      |Above Average|
|BRONX        |0.69                    |0.68                      |Above Average|
|QUEENS       |0.66                    |0.68                      |Below Average|
|MANHATTAN    |0.59                    |0.68                      |Below Average|
+-------------+------------------------+--------------------------+-------------+

