In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style
sns.set(style='whitegrid')


In [3]:
# Step 2: Load the Data
# Load the dataset
url = 'https://data.cityofnewyork.us/api/odata/v4/hhsa-x92p'
df = pd.read_csv(url)


In [4]:
# Step 3: Understand the Data
# Display the first few rows
print(df.head())

# Get basic information
print(df.info())

# Summary statistics
print(df.describe(include='all'))


Empty DataFrame
Columns: [{"@odata.context":"https://data.cityofnewyork.us/api/odata/v4/$metadata#hhsa-x92p", value:[{"__id":"row-nnpb-ei95.bzv4", crash_date:"2021-04-13T00:00:00.000", crash_time:"0:00", borough:"BROOKLYN", zip_code:"11222", latitude:40.7264440, longitude:-73.9523300, location:{"type":"Point", coordinates:[-73.95233, 40.726444]}, location_address:null, location_city:null, location_state:null, location_zip:null, on_street_name:null, off_street_name:null, cross_street_name:"745       MANHATTAN AVENUE              ", number_of_persons_injured:0, number_of_persons_killed:0, number_of_pedestrians_injured:0, number_of_pedestrians_killed:0, number_of_cyclist_injured:0, number_of_cyclist_killed:0, number_of_motorist_injured:0, number_of_motorist_killed:0, contributing_factor_vehicle_1:"Passing Too Closely", contributing_factor_vehicle_2:"Unspecified", contributing_factor_vehicle_3:null, contributing_factor_vehicle_4:null, contributing_factor_vehicle_5:null, collision_id:440687

In [5]:
# Step 4: Handle Missing Data
# Check for missing values
print(df.isnull().sum())

# Handle missing values
# Drop rows with missing values or fill them with appropriate values
df.dropna(inplace=True)  # Simplest approach, or you can use df.fillna() for imputation


{"@odata.context":"https://data.cityofnewyork.us/api/odata/v4/$metadata#hhsa-x92p"                                  0.0
value:[{"__id":"row-nnpb-ei95.bzv4"                                                                                 0.0
crash_date:"2021-04-13T00:00:00.000"                                                                                0.0
crash_time:"0:00"                                                                                                   0.0
borough:"BROOKLYN"                                                                                                  0.0
                                                                                                                   ... 
vehicle_type_code2:"Sedan".311                                                                                      0.0
vehicle_type_code_3:null.896                                                                                        0.0
vehicle_type_code_4:null.976            

In [6]:
# Step 5: Data Visualization
# Distribution of collision severities
sns.countplot(x='SEVERITYCODE', data=df)
plt.title('Distribution of Collision Severities')
plt.show()

# Collisions by weather condition
plt.figure(figsize=(12, 8))
sns.countplot(y='WEATHER', data=df, order=df['WEATHER'].value_counts().index)
plt.title('Collisions by Weather Condition')
plt.show()

# Collisions by road conditions
plt.figure(figsize=(12, 8))
sns.countplot(y='ROADCOND', data=df, order=df['ROADCOND'].value_counts().index)
plt.title('Collisions by Road Condition')
plt.show()

# Collisions by day of the week
df['DAYOFWEEK'] = pd.to_datetime(df['DATETIME']).dt.day_name()
plt.figure(figsize=(12, 8))
sns.countplot(y='DAYOFWEEK', data=df, order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
plt.title('Collisions by Day of the Week')
plt.show()

# Collisions over time
df['DATE'] = pd.to_datetime(df['DATETIME']).dt.date
plt.figure(figsize=(12, 8))
df.groupby('DATE').size().plot()
plt.title('Number of Collisions Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Collisions')
plt.show()


ValueError: Could not interpret input 'SEVERITYCODE'

In [7]:
# Step 6: Univariate Analysis
# Distribution of collision severities
sns.countplot(x='SEVERITYCODE', data=df)
plt.title('Collision Severity Distribution')
plt.show()

# Distribution of weather conditions
sns.countplot(y='WEATHER', data=df)
plt.title('Weather Condition Distribution')
plt.show()


ValueError: Could not interpret input 'SEVERITYCODE'

In [8]:
# Step 7: Bivariate Analysis
# Severity vs Weather Condition
sns.countplot(x='WEATHER', hue='SEVERITYCODE', data=df)
plt.title('Severity vs Weather Condition')
plt.xticks(rotation=90)
plt.show()

# Severity vs Road Condition
sns.countplot(x='ROADCOND', hue='SEVERITYCODE', data=df)
plt.title('Severity vs Road Condition')
plt.xticks(rotation=90)
plt.show()


ValueError: Could not interpret input 'WEATHER'

In [9]:
# Step 8: Multivariate Analysis
# Pair plot for selected features (limited by categorical nature)
sns.pairplot(df[['SEVERITYCODE', 'WEATHER', 'ROADCOND']])
plt.show()


KeyError: "None of [Index(['SEVERITYCODE', 'WEATHER', 'ROADCOND'], dtype='object')] are in the [columns]"

Step 9: Identify and Handle Outliers
This dataset is primarily categorical, so traditional numerical outlier detection may not apply. However, checking for rare or unusual categories can be informative.

In [None]:
# Step 10: Feature Engineering
# Extract additional features
df['MONTH'] = pd.to_datetime(df['DATETIME']).dt.month
df['HOUR'] = pd.to_datetime(df['DATETIME']).dt.hour


In [None]:
# Step 11: Summary and Insights
# Summarize key findings
print("Key Insights:")

# Severity distribution
severity_dist = df['SEVERITYCODE'].value_counts()
print(f"Severity Distribution:\n{severity_dist}")

# Weather condition distribution
weather_dist = df['WEATHER'].value_counts()
print(f"Weather Condition Distribution:\n{weather_dist}")

# Road condition distribution
road_cond_dist = df['ROADCOND'].value_counts()
print(f"Road Condition Distribution:\n{road_cond_dist}")

# Collisions over time
collisions_over_time = df.groupby('DATE').size().describe()
print(f"Collisions Over Time:\n{collisions_over_time}")

# Severity by weather condition
severity_weather = df.groupby('WEATHER')['SEVERITYCODE'].value_counts().unstack()
print(f"Severity by Weather Condition:\n{severity_weather}")

# Severity by road condition
severity_road_cond = df.groupby('ROADCOND')['SEVERITYCODE'].value_counts().unstack()
print(f"Severity by Road Condition:\n{severity_road_cond}")


Findings:
1. Collision Severity: The dataset shows how collisions are distributed across different severity levels.
2. Weather and Road Conditions: Certain weather and road conditions are associated with higher or lower severity of collisions.
3. Temporal Trends: Collisions show patterns over time, with certain days or months possibly exhibiting higher frequencies.
4. Seasonal and Daily Patterns: Understanding these patterns can help in traffic management and accident prevention strategies.