Motor Vehicle Collisions - Crashes -- https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Show every row and column when a frame is displayed (None disables truncation).
# Use the public pd.set_option rather than the incidental `pd.pandas` attribute.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [None]:
# Read the collisions export into memory and preview the first rows.
CSV_PATH = "Motor_Vehicle_Collisions.csv"
df = pd.read_csv(CSV_PATH)
df.head()


In [None]:
# Summarise the raw frame: column names, dtypes and non-null counts.
df.info()

Let's look at the data types and columns at a glance.

In [None]:
# Descriptive statistics for every column (include='all' covers non-numeric columns too).
df.describe(include='all')

In [None]:
# Drop crashes without a primary contributing factor, then set aside the ones
# whose factor is something other than the literal 'Unspecified'.
factor_1 = 'CONTRIBUTING FACTOR VEHICLE 1'
df = df.loc[df[factor_1].notna()]
dfspecified = df.loc[df[factor_1] != 'Unspecified']
dfspecified.shape

Let's have a look at the missing values.

In [None]:
# Keep only crashes that have a contributing factor, a longitude, a borough and
# an on-street name. `.copy()` makes the result an independent frame so the
# column assignments in later cells do not raise SettingWithCopyWarning.
dfspecified = dfspecified.dropna(
    subset=['CONTRIBUTING FACTOR VEHICLE 1', 'LONGITUDE', 'BOROUGH', 'ON STREET NAME']
).copy()
#dfspecified.to_csv('output.csv', index= False)

# Compute the mean and standard deviation of the latitude column
print("Mean latitude:", dfspecified['LATITUDE'].mean())
print("Standard deviation latitude:", dfspecified['LATITUDE'].std())

# Compute the count and percentage of accidents in each borough
borough_counts = dfspecified['BOROUGH'].value_counts()
print(borough_counts)
borough_percentages = dfspecified['BOROUGH'].value_counts(normalize=True) * 100
print(borough_percentages)


In [None]:
# Parse CRASH DATE once and reuse the parsed series for the weekday name.
crash_dates = pd.to_datetime(dfspecified['CRASH DATE'])
dfspecified['CRASH DATE'] = crash_dates
dfspecified['DAY OF WEEK'] = crash_dates.dt.day_name()

# Tally crashes per weekday name.
day_counts = dfspecified['DAY OF WEEK'].value_counts()
print(day_counts)


In [None]:
# Preview the filtered frame, now including the DAY OF WEEK column.
dfspecified.head()

In [None]:
# Keep only crashes with a cross street, a second contributing factor and both
# vehicle types recorded.
required_columns = [
    'CROSS STREET NAME',
    'CONTRIBUTING FACTOR VEHICLE 2',
    'VEHICLE TYPE CODE 1',
    'VEHICLE TYPE CODE 2',
]
dfspecified = dfspecified.dropna(subset=required_columns).copy()

# Treat missing casualty counts as zero. Assign to the columns directly:
# `.loc['<name>'] = ...` with a single label addresses a *row*, so it would
# append a junk row instead of filling the column.
for casualty_column in ['NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED']:
    dfspecified[casualty_column] = dfspecified[casualty_column].fillna(0)

# Check the updated DataFrame
dfspecified.isnull().sum()

In [None]:
# Remove any row still lacking either contributing factor.
dfspecified = dfspecified.dropna(
    subset=['CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2']
)

In [None]:
# Severity score: one point per injured person plus ten points per person killed.
dfspecified['INJURY_SEVERITY'] = dfspecified['NUMBER OF PERSONS INJURED'].add(
    dfspecified['NUMBER OF PERSONS KILLED'].mul(10)
)

In [None]:
def categorize_injury_severity(injury_severity):
    """Bucket a numeric severity score into a named injury category.

    0 -> 'No Injury'; (0, 2] -> 'Mild Injury'; (2, 4] -> 'Moderate Injury';
    (4, 6] -> 'Severe Injury'. Every other value (above 6, negative, or NaN)
    falls through to 'Fatal'.
    """
    if injury_severity == 0:
        return 'No Injury'

    # Upper bounds are inclusive and checked in ascending order.
    upper_bounds = ((2, 'Mild Injury'), (4, 'Moderate Injury'), (6, 'Severe Injury'))
    if injury_severity > 0:
        for upper_bound, label in upper_bounds:
            if injury_severity <= upper_bound:
                return label

    return 'Fatal'

# Label every crash with its injury category (requires the INJURY_SEVERITY column).
dfspecified['INJURY_CATEGORY'] = dfspecified['INJURY_SEVERITY'].map(categorize_injury_severity)

In [None]:
# Parse CRASH TIME and derive the time of day as seconds elapsed since midnight.
crash_time = pd.to_datetime(dfspecified['CRASH TIME'])
dfspecified['CRASH TIME'] = crash_time

clock = crash_time.dt
seconds_since_midnight = clock.hour * 3600 + clock.minute * 60 + clock.second

# Add the new column to the DataFrame
dfspecified['SECONDS_SINCE_MIDNIGHT'] = seconds_since_midnight


In [None]:
primary_factor = 'CONTRIBUTING FACTOR VEHICLE 1'

# Count the occurrences of each contributing factor across all boroughs
factor_counts = dfspecified.groupby(primary_factor).size().reset_index(name='counts')
most_common_factor = factor_counts.sort_values('counts', ascending=False).iloc[0][primary_factor]
print("The most common contributing factor to car accidents in NYC is:", most_common_factor)

# Count the occurrences of each contributing factor by borough, then keep the
# single highest-count row of each borough. Filtering on the citywide winner
# would only show that one factor's count per borough, not each borough's own
# most common factor.
borough_factor_counts = dfspecified.groupby(['BOROUGH', primary_factor]).size().reset_index(name='counts')
top_row_per_borough = borough_factor_counts.groupby('BOROUGH')['counts'].idxmax()
borough_most_common_factor = borough_factor_counts.loc[top_row_per_borough]
print("\nMost common contributing factor by borough:\n", borough_most_common_factor)

In [None]:
def standardize_street_name(name):
    """Upper-case and trim a street name and expand dotted suffix abbreviations.

    Non-string values (for example NaN) are returned unchanged. Only
    abbreviations written with a leading space and a trailing dot, such as
    ' ST.' or ' AVE.', are expanded.
    """
    if not isinstance(name, str):
        return name

    # Applied in this order to the upper-cased, trimmed name.
    expansions = (
        (' ST.', ' STREET'),
        (' AVE.', ' AVENUE'),
        (' BLVD.', ' BOULEVARD'),
        (' RD.', ' ROAD'),
        (' PL.', ' PLACE'),
        (' PKWY.', ' PARKWAY'),
        (' DR.', ' DRIVE'),
        (' LN.', ' LANE'),
        (' CT.', ' COURT'),
    )

    cleaned = name.upper().strip()
    for abbreviation, full_word in expansions:
        cleaned = cleaned.replace(abbreviation, full_word)
    return cleaned

# Normalise both street-name columns with standardize_street_name.
for street_column in ('ON STREET NAME', 'CROSS STREET NAME'):
    dfspecified[street_column] = dfspecified[street_column].map(standardize_street_name)


In [None]:
# Numeric weekday (Monday=0 ... Sunday=6) derived from the parsed crash date.
crash_date_parsed = pd.to_datetime(dfspecified['CRASH DATE'])
dfspecified['CRASH DATE'] = crash_date_parsed
dfspecified['DAY_OF_WEEK'] = crash_date_parsed.dt.weekday

In [None]:
def get_season(date):
    """Return the meteorological season name for an object exposing ``.month``.

    March-May is 'Spring', June-August 'Summer', September-November 'Fall';
    any other month value maps to 'Winter'.
    """
    season_by_month = {
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Fall', 10: 'Fall', 11: 'Fall',
    }
    return season_by_month.get(date.month, 'Winter')

# Tag each crash with the season of its crash date.
dfspecified['SEASON'] = dfspecified['CRASH DATE'].map(get_season)

In [None]:
# Ordered (keyword, consolidated label) rules. The first keyword found in a
# value decides its label; values matching no keyword are left as they are.
FACTOR_REWRITES = (
    ("Cell Phone", "Cell Phone"),
    ("Drugs", "Drugs"),
    ("Illnes", "Illness"),
    ("Texting", "Cell Phone"),
    ("Reaction to", "Reaction to Uninvolved Vehicle"),
    ("Pavement", "Pavement Issues"),
)


def _consolidate_factor(factor):
    """Return the consolidated label for one contributing-factor string."""
    for keyword, label in FACTOR_REWRITES:
        if keyword in factor:
            return label
    return factor


for factor_column in ('CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2'):
    dfspecified[factor_column] = dfspecified[factor_column].apply(_consolidate_factor)

In [None]:
# Count crashes per (on street, cross street) pair, busiest intersections first.
intersection_keys = ['ON STREET NAME', 'CROSS STREET NAME']
accident_frequency = dfspecified.groupby(intersection_keys).size().reset_index(name='ACCIDENT_COUNT')
accident_frequency_sorted = accident_frequency.sort_values('ACCIDENT_COUNT', ascending=False)

# Attach the per-intersection count to every crash row. Dropping any existing
# ACCIDENT_COUNT first keeps the cell idempotent: without it, re-running the
# merge would produce ACCIDENT_COUNT_x / ACCIDENT_COUNT_y columns instead.
dfspecified = (
    dfspecified
    .drop(columns='ACCIDENT_COUNT', errors='ignore')
    .merge(accident_frequency_sorted, on=intersection_keys, how='left')
)

# Display the accident frequency by location (last expression so it renders).
accident_frequency_sorted.head()

In [None]:
# (rows, columns) of the frame after the ACCIDENT_COUNT merge.
dfspecified.shape

In [None]:
# Column dtypes and non-null counts of the cleaned, enriched frame.
dfspecified.info()

In [None]:
import geohash

def safe_encode(latitude, longitude, precision=6):
    """Return the geohash of a coordinate pair, or None if encoding raises ValueError."""
    try:
        return geohash.encode(latitude, longitude, precision)
    except ValueError:
        return None

# Encode every crash location at precision 7. Iterating the two coordinate
# columns directly avoids building a full row Series per crash, which is what
# DataFrame.apply(axis=1) does.
dfspecified['geohash'] = [
    safe_encode(latitude, longitude, precision=7)
    for latitude, longitude in zip(dfspecified['LATITUDE'], dfspecified['LONGITUDE'])
]

# Find out which coordinates fall into a specific region.
# 'geohash_of_interest' is a placeholder: replace it with a real 7-character
# geohash, otherwise the selection below is empty.
specific_region = 'geohash_of_interest'
df_in_specific_region = dfspecified[dfspecified['geohash'] == specific_region]

# Count the number of distinct geohashes
num_distinct_geohashes = dfspecified['geohash'].nunique()

print(f"Number of distinct geohashes: {num_distinct_geohashes}")

# To find the count of coordinates in each geohash
geohash_counts = dfspecified['geohash'].value_counts()
print(geohash_counts)


In [None]:
# Number of distinct (non-null) geohash cells among the crashes.
uniqueGeo = len(dfspecified['geohash'].dropna().unique())

print(uniqueGeo)

In [None]:
# Number of distinct (non-null) ZIP codes.
# NOTE(review): this reads `df`, not `dfspecified` — confirm that is intended.
num_distinct_values = len(df['ZIP CODE'].dropna().unique())

print(num_distinct_values)

In [None]:
# Preview the frame including the engineered columns added so far.
dfspecified.head()

In [None]:
# Compute the correlation matrix over numeric columns only. The frame also
# holds string and datetime columns, which DataFrame.corr() refuses to handle
# on pandas >= 2.0.
numeric_columns = dfspecified.select_dtypes(include='number')
corr = numeric_columns.corr()

# Plot a heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation between numeric columns')
plt.show()

In [None]:
from scipy.stats import chi2_contingency

# Cross-tabulate the two contributing-factor columns.
contingency_table = pd.crosstab(
    dfspecified['CONTRIBUTING FACTOR VEHICLE 1'],
    dfspecified['CONTRIBUTING FACTOR VEHICLE 2'],
)

# Chi-squared test of independence on the table.
chi2, p, dof, ex = chi2_contingency(contingency_table)

# Cramér's V = sqrt(chi2 / (n * (min(rows, cols) - 1))), n being the table total.
n = contingency_table.to_numpy().sum()
smaller_dimension = min(contingency_table.shape)
cramer_v = np.sqrt(chi2 / (n * (smaller_dimension - 1)))

print(f"Cramér's V: {cramer_v}")


In [None]:
# Frequency of each primary contributing factor, most common first.
factor_counts = dfspecified['CONTRIBUTING FACTOR VEHICLE 1'].value_counts()

# Bar chart of the counts on an explicit axes object.
fig, ax = plt.subplots(figsize=(10, 6))
factor_counts.plot(kind='bar', ax=ax)
ax.set_title('Frequency of Contributing Factors in NYC Car Accidents')
ax.set_xlabel('Contributing Factor')
ax.set_ylabel('Frequency')
plt.show()

**Based on the identified causes, preventive measures can be determined. For example, if "Driver Inattention/Distraction" is a common cause, measures can be taken to reduce distractions while driving, such as implementing stricter laws on using mobile phones while driving or providing education on the dangers of distracted driving.**

In [None]:
# Total injured persons for each hour of the day.
hour_of_day = dfspecified['CRASH TIME'].dt.hour
df_hourly = dfspecified.groupby(hour_of_day)['NUMBER OF PERSONS INJURED'].sum().reset_index()

# Line chart of the hourly totals.
fig, ax = plt.subplots()
ax.plot(df_hourly['CRASH TIME'], df_hourly['NUMBER OF PERSONS INJURED'])
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Number of Injured Persons')
ax.set_title('Number of Injured Persons by Hour of Day')
plt.show()

In [None]:
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Crashes per numeric weekday (0=Monday ... 6=Sunday). Reindexing to 0..6
# guarantees seven entries in calendar order, so the labels below always line
# up even if some weekday has no crashes in the data.
weekday_counts = (
    dfspecified.groupby('DAY_OF_WEEK')['CRASH DATE']
    .count()
    .reindex(range(7), fill_value=0)
)
weekday_counts.index = weekday_names

plt.bar(weekday_counts.index, weekday_counts.values)
plt.title('Number of Accidents by Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Number of Accidents')
plt.show()

In [None]:
# Crashes per (on street, cross street) pair, ordered from most to fewest.
dangerous_intersections = (
    dfspecified
    .groupby(['ON STREET NAME', 'CROSS STREET NAME'])
    .size()
    .reset_index(name='ACCIDENT_COUNT')
    .sort_values('ACCIDENT_COUNT', ascending=False)
)

# Show the ten intersections with the most crashes.
dangerous_intersections.head(10)


In [None]:
# Restrict to crashes where both street names are present.
street_columns = ['ON STREET NAME', 'CROSS STREET NAME']
has_both_streets = dfspecified[street_columns].notnull().all(axis=1)
df_intersections = dfspecified[has_both_streets]

# Crash count per intersection, largest first.
df_intersection_counts = (
    df_intersections
    .groupby(street_columns)
    .size()
    .reset_index(name='Crash Count')
    .sort_values('Crash Count', ascending=False)
)

# Display the top 10 most dangerous intersections
top_intersections = df_intersection_counts.head(10)
print(top_intersections)

# For each of those intersections, list its five most frequent primary factors.
for on_street, cross_street in zip(top_intersections['ON STREET NAME'], top_intersections['CROSS STREET NAME']):
    print('\nIntersection:', on_street + ' and ' + cross_street)

    # Crashes recorded at exactly this pair of streets.
    at_this_intersection = (
        (df_intersections['ON STREET NAME'] == on_street)
        & (df_intersections['CROSS STREET NAME'] == cross_street)
    )
    df_intersection_crashes = df_intersections[at_this_intersection]

    # Count crashes per contributing factor, most frequent first.
    df_intersection_contributing_factors = (
        df_intersection_crashes
        .groupby('CONTRIBUTING FACTOR VEHICLE 1')
        .size()
        .reset_index(name='Crash Count')
        .sort_values('Crash Count', ascending=False)
    )

    print('Top Contributing Factors:')
    print(df_intersection_contributing_factors.head(5))


In [None]:
# Ten intersections with the highest crash counts.
top_10 = df_intersection_counts.head(10)
intersection_labels = top_10['ON STREET NAME'] + ' & ' + top_10['CROSS STREET NAME']

# Horizontal bar chart, one bar per intersection.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(intersection_labels, top_10['Crash Count'])

ax.set_title('Top 10 Most Dangerous Intersections in NYC', fontsize=16)
ax.set_xlabel('Number of Crashes')
ax.set_ylabel('Intersection')

plt.show()

In [None]:
# Horizontal bar chart of contributing factors for a single intersection.
def plot_top_intersection(intersection_name):
    """Plot how often each primary contributing factor occurs at one intersection.

    ``intersection_name`` is indexed as (on street name, cross street name).
    Crashes are looked up in the module-level ``df_intersections`` frame and the
    factors are drawn as horizontal bars, most frequent at the top.
    """
    on_street = intersection_name[0]
    cross_street = intersection_name[1]

    # Crashes recorded at exactly this pair of streets.
    at_this_intersection = (
        (df_intersections['ON STREET NAME'] == on_street)
        & (df_intersections['CROSS STREET NAME'] == cross_street)
    )
    df_intersection = df_intersections[at_this_intersection]

    # Crash count per contributing factor, largest first.
    df_factor_counts = (
        df_intersection
        .groupby('CONTRIBUTING FACTOR VEHICLE 1')
        .size()
        .reset_index(name='Count')
        .sort_values('Count', ascending=False)
    )

    fig, ax = plt.subplots(figsize=(8,6))
    bar_positions = np.arange(len(df_factor_counts))
    ax.barh(bar_positions, df_factor_counts['Count'], align='center', color='steelblue')
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(df_factor_counts['CONTRIBUTING FACTOR VEHICLE 1'])
    ax.invert_yaxis()  # put the first (most frequent) factor at the top
    ax.set_xlabel('Number of Accidents')
    ax.set_title('Contributing Factors for Accidents at '+on_street+' and '+cross_street)
    plt.show()

# Draw one contributing-factor chart for each of the ten busiest intersections.
for on_street, cross_street, _crash_count in df_intersection_counts.head(10).itertuples(index=False, name=None):
    plot_top_intersection((on_street, cross_street))


In [None]:
# Crashes that have a first vehicle type recorded.
has_vehicle_type = dfspecified['VEHICLE TYPE CODE 1'].notnull()
df_vehicle_types = dfspecified[has_vehicle_type]

# Crash count per vehicle type, most frequently involved first.
df_vehicle_counts = (
    df_vehicle_types
    .groupby('VEHICLE TYPE CODE 1')
    .size()
    .reset_index(name='Crash Count')
    .sort_values('Crash Count', ascending=False)
)

# Display the top 10 most commonly involved vehicle types
print(df_vehicle_counts.head(10))


In [None]:
# Bar chart of the ten vehicle types with the most crashes.
top_vehicle_types = df_vehicle_counts.head(10)
plt.bar(top_vehicle_types['VEHICLE TYPE CODE 1'], top_vehicle_types['Crash Count'])
plt.xticks(rotation=90)
plt.xlabel('Vehicle Type')
plt.ylabel('Number of Crashes')
plt.title('Top 10 Most Commonly Involved Vehicle Types')
plt.show()


In [None]:
# Calendar year of each crash.
dfspecified['Year'] = dfspecified['CRASH DATE'].dt.year

# Number of crashes recorded in each year.
crashes_by_year = dfspecified.groupby('Year').size()

# Line graph of the yearly crash counts.
fig, ax = plt.subplots()
ax.plot(crashes_by_year.index, crashes_by_year.values)
ax.set_xlabel('Year')
ax.set_ylabel('Number of Crashes')
ax.set_title('Crashes by Year in NYC')
plt.show()


In [None]:
# Group the data by year and total the casualty columns for each year.
casualty_columns = [
    'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
    'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
    'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
    'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED',
]
yearly_groups = dfspecified.groupby(dfspecified['CRASH DATE'].dt.year)
df_yearly = yearly_groups[casualty_columns].sum()
crashes_per_year = yearly_groups.size()

# Keep the year as the index while dividing: both operands are then indexed by
# year and align correctly. Dividing after reset_index() would pair a 0..n-1
# index with year labels and yield NaN for every rate.
# Note: 'Total Crashes' holds injured + killed, i.e. total casualties.
df_yearly['Total Crashes'] = df_yearly['NUMBER OF PERSONS INJURED'] + df_yearly['NUMBER OF PERSONS KILLED']
df_yearly['Crash Rate'] = df_yearly['Total Crashes'] / crashes_per_year
df_yearly['Injury Rate'] = df_yearly['NUMBER OF PERSONS INJURED'] / crashes_per_year

# Calculate the fatal rate for each year (share of casualties that were deaths)
df_yearly['Fatal Rate'] = df_yearly['NUMBER OF PERSONS KILLED'] / df_yearly['Total Crashes']

# Expose the year as the 'CRASH DATE' column, as the plot below expects.
df_yearly = df_yearly.reset_index()

# Plot the fatal rate by year
plt.plot(df_yearly['CRASH DATE'], df_yearly['Fatal Rate'])
plt.title('Fatal Rate by Year')
plt.xlabel('Year')
plt.ylabel('Fatal Rate')
plt.show()


**How has the implementation of Vision Zero policies in NYC affected the frequency and severity of car accidents, and what more can be done to improve road safety in the city?**

In [None]:
# Filter the dataset to only include crashes that occurred after the implementation of Vision Zero policies
df_vz = dfspecified[dfspecified['CRASH DATE'].dt.year >= 2014]

# One pass over the yearly groups: crash count plus injury and fatality totals.
df_vz_yearly = df_vz.groupby(df_vz['CRASH DATE'].dt.year).agg(
    Crashes=('CRASH DATE', 'count'),
    Injuries=('NUMBER OF PERSONS INJURED', 'sum'),
    Fatalities=('NUMBER OF PERSONS KILLED', 'sum'),
)

# Calculate the crash, injury, and fatality rates per capita for each year.
# The figures are applied positionally to the years in ascending order.
# NOTE(review): confirm each value belongs to the year it lines up with.
population = [ 8550405, 8622698, 8669675, 8622698, 8537673, 8468181, 8398748, 8326699, 8253213, 8175133]
if len(population) != len(df_vz_yearly):
    raise ValueError(
        f"population has {len(population)} entries but the data covers "
        f"{len(df_vz_yearly)} years ({list(df_vz_yearly.index)}); "
        "provide exactly one population figure per year"
    )
df_vz_yearly['Crash Rate'] = df_vz_yearly['Crashes'] / population
df_vz_yearly['Injury Rate'] = df_vz_yearly['Injuries'] / population
df_vz_yearly['Fatality Rate'] = df_vz_yearly['Fatalities'] / population

# Plot the trends for the crash, injury, and fatality rates over time
fig, ax = plt.subplots(figsize=(10, 6))
for rate_column in ('Crash Rate', 'Injury Rate', 'Fatality Rate'):
    ax.plot(df_vz_yearly.index, df_vz_yearly[rate_column], label=rate_column)
ax.set_xlabel('Year')
ax.set_ylabel('Rate per capita')
ax.set_title('Trends in crash, injury, and fatality rates after Vision Zero policies')
ax.legend()
plt.show()


In [None]:
# Crash counts per borough (rows) and injury category (columns).
severity_by_borough = dfspecified.groupby(['BOROUGH', 'INJURY_CATEGORY']).size()
grouped_data = severity_by_borough.unstack()

# Stacked bars: one bar per borough, one segment per injury category.
ax = grouped_data.plot(kind='bar', stacked=True, figsize=(10, 6))
ax.set(
    title='Number of Accidents by Borough and Severity',
    xlabel='Borough',
    ylabel='Number of Accidents',
)

# Horizontal borough names on the x axis.
ax.set_xticklabels(grouped_data.index, rotation=0)

ax.legend(title='Injury Severity', loc='upper left')

plt.show()

In [None]:
# Number of crashes recorded in each season.
season_counts = dfspecified.groupby('SEASON')['SEASON'].count()

# Bar graph with one bar per season.
ax = sns.barplot(x=season_counts.index, y=season_counts.values)
ax.set_title('Number of Accidents per Season')
ax.set_xlabel('Season')
ax.set_ylabel('Number of Accidents')
plt.show()

In [None]:
# Crash counts per (season, injury category) pair in long form.
seasonal_injuries = (
    dfspecified
    .groupby(['SEASON', 'INJURY_CATEGORY'])
    .size()
    .reset_index(name='count')
)

# Wide table: seasons as rows, injury categories as columns.
seasonal_injuries_pivot = seasonal_injuries.pivot(index='SEASON', columns='INJURY_CATEGORY', values='count')

# Turn each row into proportions by dividing by that season's total.
season_totals = seasonal_injuries_pivot.sum(axis=1)
seasonal_injuries_pivot_norm = seasonal_injuries_pivot.div(season_totals, axis=0)

# Stacked bar chart of the per-season proportions.
sns.set_style('whitegrid')
ax = seasonal_injuries_pivot_norm.plot(kind='bar', stacked=True, figsize=(10, 6))
ax.set(
    xlabel='Season',
    ylabel='Proportion of injuries',
    title='Distribution of injuries across seasons',
)
plt.show()

In [None]:
# Month number of each crash, then the two columns needed for this chart.
dfspecified['MONTH'] = dfspecified['CRASH DATE'].dt.month
df_injury = dfspecified[['MONTH', 'INJURY_CATEGORY']]

# Occurrences of each injury category within each month.
df_injury_counts = (
    df_injury
    .groupby(['MONTH', 'INJURY_CATEGORY'])
    .size()
    .reset_index(name='COUNTS')
)

# Month-by-category matrix; combinations that never occur become 0.
df_injury_pivot = (
    df_injury_counts
    .pivot(index='MONTH', columns='INJURY_CATEGORY', values='COUNTS')
    .fillna(0)
)

# Stacked bar plot, one bar per month.
sns.set_style("whitegrid")
ax = df_injury_pivot.plot(kind='bar', stacked=True, figsize=(12,6))
ax.set(title="Injury Categories by Month", xlabel="Month", ylabel="Count")
plt.show()


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Select the modelling columns. `.copy()` gives an independent frame, so the
# encoded columns assigned below do not raise SettingWithCopyWarning.
data = dfspecified[['BOROUGH', 'DAY_OF_WEEK', 'SECONDS_SINCE_MIDNIGHT', 'SEASON', 'ACCIDENT_COUNT', 'CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2','VEHICLE TYPE CODE 1','VEHICLE TYPE CODE 2','INJURY_CATEGORY']].copy()

# Convert categorical columns to integer codes using LabelEncoder. The encoder
# is refit for every column, so after this cell `le` only remembers the classes
# of the last column fitted (INJURY_CATEGORY).
le = LabelEncoder()

columns_to_encode = [
    'CONTRIBUTING FACTOR VEHICLE 1',
    'CONTRIBUTING FACTOR VEHICLE 2',
    'VEHICLE TYPE CODE 1',
    'VEHICLE TYPE CODE 2',
    'BOROUGH',
    'SEASON',
    'DAY_OF_WEEK',
]
for column in columns_to_encode:
    data[column] = le.fit_transform(dfspecified[column])

# Target: LabelEncoder numbers the classes in sorted (alphabetical) order, so
# the codes do not follow injury severity.
data['INJURY_CATEGORY_ENCODED'] = le.fit_transform(data['INJURY_CATEGORY'])



In [None]:
# Features are every column except the encoded target and its text form.
target_column = 'INJURY_CATEGORY_ENCODED'
X = data.drop(columns=[target_column, 'INJURY_CATEGORY'])
y = data[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise features with statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
def model_performance_analysis(number, model, X_test, y_test, class_names=None):
    """Print evaluation metrics for a fitted classifier and plot its confusion matrix.

    Parameters
    ----------
    number : int
        Matplotlib figure number to draw the confusion matrix on.
    model : fitted estimator exposing ``predict``.
    X_test, y_test : held-out features and integer-encoded labels.
    class_names : sequence of str, optional
        Display name for each encoded class, in code order (code 0 first).
        Defaults to the five injury categories in alphabetical order, which is
        the order LabelEncoder assigns codes in.
        NOTE(review): the default assumes all five categories occur in the
        data; pass the encoder's ``classes_`` explicitly if that may not hold.
    """
    if class_names is None:
        class_names = ("Fatal", "Mild Injury", "Moderate Injury", "No Injury", "Severe Injury")
    class_names = list(class_names)
    class_codes = list(range(len(class_names)))

    # Predict labels for test data
    y_pred = model.predict(X_test)

    # NOTE(review): MSE treats the class codes as numbers; confirm it is a
    # meaningful score for the label encoding in use.
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean squared error: {mse}")

    # Generate the classification report
    report = classification_report(y_test, y_pred, zero_division=1)
    print(report)

    # Fix the label set so the matrix always has one row/column per class,
    # matching the tick labels below even if a class is absent from y_test.
    cm = confusion_matrix(y_test, y_pred, labels=class_codes)
    print(cm)

    # Use the caller-supplied figure number (not a global loop counter).
    plt.figure(number)

    # Plot the matrix
    plt.imshow(cm, cmap='Blues', interpolation='nearest')
    plt.title("Confusion Matrix")
    plt.colorbar()
    plt.xticks(class_codes, class_names)
    plt.yticks(class_codes, class_names)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.show()

In [None]:
from sklearn.pipeline import make_pipeline

# Define the models to be evaluated (seeded where the estimator is stochastic
# so that a re-run reproduces the same numbers).
models = [
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
    LogisticRegression(max_iter=10000),
    MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', max_iter=500, random_state=42)
]

# Fit each model on the scaled training split, report hold-out metrics, then
# cross-validate.
for i, model in enumerate(models):
    model.fit(X_train, y_train)
    print(f"{model.__class__.__name__}")
    model_performance_analysis(i, model, X_test, y_test)

    # X is unscaled, so cross-validate a scaler+model pipeline: each fold is
    # standardised with its own training statistics, matching how the model
    # above was trained. cross_val_score clones the estimator, so the fitted
    # `model` is left untouched.
    cv_estimator = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(cv_estimator, X, y, cv=5, scoring='accuracy')
    print(f"{model.__class__.__name__} Accuracy: {scores.mean()} +/- {scores.std()}")