# NYC Crash Data Analysis
This project explores the [Motor-Vehicle-Collisions-Crashes](https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95) dataset and uses visualizations to find patterns in it.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import geoplot as gplt
import geopandas as gpd
import geoplot.crs as gcrs

In [None]:
# Please proceed only after changing the root directory and input file name.

# Config for the project.
# Every backslash is escaped explicitly so that no path segment is read as an
# escape sequence (an unescaped "\n" in "maps\nybb.shp" would be a newline).
root_dir = 'G:\\Projects\\NYC-Crash-Data-Analytics\\'
# Input filename (raw download of the collisions data set)
filename = root_dir + 'Motor_Vehicle_Collisions_-_Crashes.csv'
# Cleaned filename (written by the data-cleaning cell, read by the analysis)
clean_filename = root_dir + 'Motor_Vehicle_Collisions_-_Crashes_Cleaned.csv'
# NY Borough Boundaries shapefile
nybb_shapefile = root_dir + 'maps\\nybb.shp'

# Index column
index_col = 'COLLISION_ID'

## Data Cleaning
The data cleaning involved the following processes:
* **Date Parsing:** The data has two fields for CRASH DATE and CRASH TIME. We combined these two fields into a DATETIME field to make filtering easier for the data investigation.
* **NA Value Filling:** The data contains NA values. To make the data more consistent, we filled them according to the type of data in each field. For example, NA values in the BOROUGH field were filled with 'UNKNOWN', and the LATITUDE/LONGITUDE fields were set to 0.

In [None]:
# Data types for the columns in the data.  The crash date and time are read
# as plain strings and combined into a single timestamp after loading.
dtypes = {
    'CRASH DATE' : 'str',
    'CRASH TIME' : 'str',
    'BOROUGH' : 'str',
    'ZIP CODE' : 'str',
    'LATITUDE' : 'float64',
    'LONGITUDE' : 'float64',
    'LOCATION' : 'object',
    'ON STREET NAME' : 'str',
    'CROSS STREET NAME' : 'str',
    'OFF STREET NAME' : 'str',
    'NUMBER OF PERSONS INJURED' : 'float64',
    'NUMBER OF PERSONS KILLED' : 'float64',
    'NUMBER OF PEDESTRIANS INJURED' : 'float64',
    'NUMBER OF PEDESTRIANS KILLED' : 'float64',
    'NUMBER OF CYCLIST INJURED' : 'float64',
    'NUMBER OF CYCLIST KILLED' : 'float64',
    'NUMBER OF MOTORIST INJURED' : 'float64',
    'NUMBER OF MOTORIST KILLED' : 'float64',
    'CONTRIBUTING FACTOR VEHICLE 1' : 'str',
    'CONTRIBUTING FACTOR VEHICLE 2' : 'str',
    'CONTRIBUTING FACTOR VEHICLE 3' : 'str',
    'CONTRIBUTING FACTOR VEHICLE 4' : 'str',
    'CONTRIBUTING FACTOR VEHICLE 5' : 'str',
    'COLLISION_ID' : 'int64',
    'VEHICLE TYPE CODE 1' : 'category',
    'VEHICLE TYPE CODE 2' : 'category',
    'VEHICLE TYPE CODE 3' : 'category',
    'VEHICLE TYPE CODE 4' : 'category',
    'VEHICLE TYPE CODE 5' : 'category'
}

# Column-wise replacement values for NA
na_replace = {
    'BOROUGH' : 'UNKNOWN',
    'ZIP CODE' : 'UNKNOWN',
    'LATITUDE' : 0,
    'LONGITUDE' : 0,
    'LOCATION' : '(0.0, 0.0)',
    'ON STREET NAME' : '',
    'CROSS STREET NAME' : '',
    'OFF STREET NAME' : '',
    'NUMBER OF PERSONS INJURED' : 0,
    'NUMBER OF PERSONS KILLED' : 0,
    'NUMBER OF PEDESTRIANS INJURED' : 0,
    'NUMBER OF PEDESTRIANS KILLED' : 0,
    'NUMBER OF CYCLIST INJURED' : 0,
    'NUMBER OF CYCLIST KILLED' : 0,
    'NUMBER OF MOTORIST INJURED' : 0,
    'NUMBER OF MOTORIST KILLED' : 0,
    'CONTRIBUTING FACTOR VEHICLE 1' : '',
    'CONTRIBUTING FACTOR VEHICLE 2' : '',
    'CONTRIBUTING FACTOR VEHICLE 3' : '',
    'CONTRIBUTING FACTOR VEHICLE 4' : '',
    'CONTRIBUTING FACTOR VEHICLE 5' : '',
    'VEHICLE TYPE CODE 1' : '',
    'VEHICLE TYPE CODE 2' : '',
    'VEHICLE TYPE CODE 3' : '',
    'VEHICLE TYPE CODE 4' : '',
    'VEHICLE TYPE CODE 5' : ''
}

print('Reading CSV file %s ...' % filename)

crash_data = pd.read_csv(filename,
        index_col=index_col,
        dtype=dtypes
    )

# Combine the date and time strings into one 'CRASH DATETIME' column and drop
# the two source columns.  This is done explicitly instead of through
# read_csv's nested parse_dates / infer_datetime_format options, which recent
# pandas releases deprecate.  The new column is placed first, where read_csv
# used to put the combined column.
crash_data.insert(
    0,
    'CRASH DATETIME',
    pd.to_datetime(crash_data['CRASH DATE'] + ' ' + crash_data['CRASH TIME'])
)
crash_data = crash_data.drop(columns=['CRASH DATE', 'CRASH TIME'])

print('Filling NaN values ...')
for key, val in na_replace.items():
    print('\t%s' % key)
    crash_data[key] = crash_data[key].replace(np.nan, val)

# NOTE(review): empty-string fills are presumably read back as NaN when the
# cleaned CSV is loaded with pandas' default NA handling -- confirm whether
# the '' replacements are meant to survive the round trip.
print("Saving cleaned file to %s ..." % clean_filename)
crash_data.to_csv(clean_filename)
print("Cleaning complete.")

## Read Clean Data

In [None]:
# Load the cleaned data set, parsing the combined timestamp column.
crash_data = pd.read_csv(clean_filename,
    index_col=index_col,
    parse_dates=['CRASH DATETIME'])

# Split into "before 2020" and "2020" subsets.  Later cells add columns
# (DAY, HOUR, MONTH, SEASON) to both subsets, so independent copies are taken
# instead of slices of crash_data; this avoids chained-assignment problems.
crash_year = crash_data['CRASH DATETIME'].dt.year
before_20 = crash_data[crash_year < 2020].copy()
during_20 = crash_data[crash_year == 2020].copy()

crash_data.head()

## Q1
For the years before 2020, which boroughs had the most accidents? Did this distribution change during 2020?

In [None]:
# Share of accidents per borough, one column per period.  The first column
# fixes the row index; the second period's counts are aligned to it before
# being normalised.
accidents = pd.DataFrame()
for period, crashes in (('Before 2020', before_20), ('2020', during_20)):
    accidents[period] = crashes['BOROUGH'].value_counts()
    accidents[period] = accidents[period] / accidents[period].sum()

# Side-by-side bars for the two distributions
accidents.plot.bar()
plt.show()

## Q2
For the years before 2020, which months had the most accidents? Students in the past have said they thought there were 10% fewer accidents in February than in January. Is this true or is this bogus? Did this distribution change during 2020?

In [None]:
# Share of accidents per calendar month, one column per period.
accidents = pd.DataFrame()
for period, crashes in (('Before 2020', before_20), ('2020', during_20)):
    accidents[period] = crashes['CRASH DATETIME'].dt.month.value_counts()
    accidents[period] = accidents[period] / accidents[period].sum()

# Relative drop from January (month 1) to February (month 2) before 2020
january_share = accidents['Before 2020'].loc[1]
february_share = accidents['Before 2020'].loc[2]
print('February had %.2f%% fewer accidents than January.'
      % ((1 - (february_share / january_share)) * 100))

# Order the rows by month number so the line plot runs January..December
accidents = accidents.sort_index()
accidents.plot()
plt.show()

## Q3
For the years before 2020, which types of accidents were most prevalent? Did this distribution change during 2020?

In [None]:
# Share of accidents per primary contributing factor, one column per period.
# Each period is normalised over its own total and the rows are the union of
# the factors seen in either period; previously 2020 counts were first aligned
# to the pre-2020 factors, which dropped 2020-only factors and inflated the
# remaining 2020 shares.
factor_col = 'CONTRIBUTING FACTOR VEHICLE 1'
accidents = pd.concat(
    [before_20[factor_col].value_counts(normalize=True),
     during_20[factor_col].value_counts(normalize=True)],
    axis=1,
    keys=['Before 2020', '2020'])
# A factor missing from one period has a share of zero there
accidents = accidents.fillna(0)
# Most frequent pre-2020 factors first
accidents = accidents.sort_values('Before 2020', ascending=False)

# Plot distributions
accidents.plot.bar(figsize=(22,8), fontsize=20)
plt.show()

## Q4
For the years before 2020, which days of the week had the most accidents? Did this distribution change during 2020?

In [None]:
# Name of the weekday on which each crash took place
before_20['DAY'] = before_20['CRASH DATETIME'].dt.day_name()
during_20['DAY'] = during_20['CRASH DATETIME'].dt.day_name()

# Share of crashes (in percent) per weekday, smallest share first
crash_day_before_20 = before_20['DAY'].value_counts(ascending=True,normalize=True)*100
crash_day_during_20 = during_20['DAY'].value_counts(ascending=True,normalize=True)*100


# The values are percentages (normalize=True * 100), so the printed headers
# and axis labels say "Percentage" rather than "No. of Crashes".
print('---------------------')
print('\nPercentage of crashes by day before 2020:\n\n', crash_day_before_20.to_string())
crash_day_before_20.plot(kind='barh', figsize=(12, 8))
plt.xlabel("Percentage of Crashes", labelpad=14)
plt.ylabel("Day of Week", labelpad=14)
plt.title("Percentage of Crashes by Day of Week for years before 2020", y=1.02)
plt.savefig('Results\\crashes_by_day_bef_20.png')
print('\nSaved the graph for Percentage of Crashes by Day of Week for '
      'years before 2020 to crashes_by_day_bef_20.png.')
plt.show()

print('---------------------')
print('\nPercentage of crashes by day during 2020:\n\n', crash_day_during_20.to_string())
crash_day_during_20.plot(kind='barh', figsize=(12, 8))
plt.xlabel("Percentage of Crashes", labelpad=14)
plt.ylabel("Day of Week", labelpad=14)
plt.title("Percentage of Crashes by Day of Week during 2020", y=1.02)
plt.savefig('Results\\crashes_by_day_in_20.png')
print('\nSaved the graph for Percentage of Crashes by Day of Week '
      'during 2020 to crashes_by_day_in_20.png.')
plt.show()
print('---------------------')

# Put both periods side by side (rows follow the pre-2020 ordering)
day_crashes = pd.concat([crash_day_before_20, crash_day_during_20], axis=1)

# Plot distributions
day_crashes.plot.bar(figsize=(12,9))

plt.ylabel("Percentage of Crashes", labelpad=6)
plt.title("Percentage of Crashes by Day of week", y=1.02)
# The second series holds the 2020 data, not data after 2020
plt.legend(['Before 2020','2020'])
plt.savefig('Results\\crashes_by_day.png')
print('\nSaved the graph for Percentage of Crashes by Day of Week to crashes_by_day.png.')

## Q5
For a typical year before 2020, given a seven-day calendar that starts at 12:01 AM on Saturday and goes until 11:59 PM on Sunday, when are accidents most likely to happen?  Which day of the week is most likely to have an accident?  What time of the day is most likely to have an accident?  Does this change in 2020?

In [None]:
# Weekday name and hour of day of each crash.  The HOUR column is also read
# by the per-borough analysis in the next question.
for frame in (before_20, during_20):
    frame['DAY'] = frame['CRASH DATETIME'].dt.day_name()
    frame['HOUR'] = frame['CRASH DATETIME'].dt.hour

# (data, wording used in the printed header, plot title, output file name)
hourly_plots = (
    (before_20, 'before 2020',
     "No. of Crashes by Time of day for years before 2020",
     'crashes_by_time_bef_20.png'),
    (during_20, 'in 2020',
     "No. of Crashes by Time of day during 2020",
     'crashes_by_time_in_20.png'),
)

print('---------------------')
for frame, period, title, file_name in hourly_plots:
    plt.figure(figsize=(10, 6), dpi=80)
    print('\nCrashes that happened every hour for each day %s:' % period)

    day_list = []
    # Group by a scalar key: a one-element list key makes newer pandas yield
    # 1-tuples such as ('Friday',), which would end up in the legend.
    for day_of_week, day_crashes in frame.groupby('DAY'):
        # count of crashes for every hour of this weekday, ordered by hour
        crashes_by_hour = day_crashes['HOUR'].value_counts().sort_index(ascending=True)
        print('\nFor ',day_of_week,'\nHOUR\tCrashes\n', crashes_by_hour)
        day_list.append(day_of_week)
        # one line per weekday on the shared figure
        crashes_by_hour.plot.line()

    # give labels and add legend
    plt.ylabel("No. of Crashes", labelpad=14)
    plt.xlabel("Time of day", labelpad=14)
    plt.legend(day_list)
    plt.title(title, y=1.02)
    # save the graph
    plt.savefig('Results\\' + file_name)
    print("Saved %s to %s" % (title, file_name))
    plt.show()
    print('---------------------')


## Q6
Does the timing of when accidents happen depend on the borough of NY City? Does the amount of change vary from year to year?

In [None]:
# Hourly crash counts per borough, for the years before 2020 and for 2020.
# Relies on the HOUR column that the previous question's cell adds to
# before_20 and during_20.

# (data, wording used in the printed header, plot title, saved-message text,
#  output path)
borough_plots = (
    (before_20, 'before 2020',
     "No. of Crashes for every Borough by Time of day for years before 2020",
     "No. of Crashes by Time of day for years before 2020",
     'Results\\crashes_by_borough_bef_20.png'),
    (during_20, 'in 2020',
     "No. of Crashes for every Borough by Time of day during 2020",
     "No. of Crashes by Time of day during 2020",
     'Results\\crashes_by_borough_in_20.png'),
)

print('---------------------')
for frame, period, title, saved_what, out_path in borough_plots:
    plt.figure(figsize=(10, 6), dpi=80)
    print('\nCrashes that happened every hour for each BOROUGH %s:' % period)

    borough_list = []
    # Group by a scalar key: a one-element list key makes newer pandas yield
    # 1-tuples such as ('BRONX',), which would end up in the legend.
    for borough, borough_crashes in frame.groupby('BOROUGH'):
        # count of crashes for every hour of day in this borough, by hour
        crashes_by_hour = borough_crashes['HOUR'].value_counts().sort_index(ascending=True)
        print('\nFor ',borough,'\nHOUR\tCrashes\n', crashes_by_hour)
        borough_list.append(borough)
        # one line per borough on the shared figure
        crashes_by_hour.plot.line()

    # give labels and add legend
    plt.ylabel("No. of Crashes", labelpad=14)
    plt.xlabel("Time of day", labelpad=14)
    plt.legend(borough_list)
    plt.title(title, y=1.02)
    # save the graph
    plt.savefig(out_path)
    print("Saved %s to %s" % (saved_what, out_path))
    plt.show()
    print('---------------------')

## Q7
For the years before 2020, given the entire region of NY City, which regions are “hot spots,” or places where accidents are most likely to occur?  You will need to do some Parzen density estimation on this.  You will need to use the GPS coordinates.  You may need to convert the GPS coordinates from degrees:minutes:seconds, to degrees.fractions of degrees  (fractional degrees).  You do not need to use the Haversine distance.  Assume that longitude and latitude is Euclidean for NY City.  Create a “heat map” of where accidents occur and overlay it on a map of the city.

Do the hot spot locations change in 2020?


In [None]:
# Borough outlines from geoplot's bundled 'nyc_boroughs' sample data set
nyc_boroughs_path = gplt.datasets.get_path('nyc_boroughs')
nyc = gpd.read_file(nyc_boroughs_path)
nyc.head()

In [None]:
# Keep only crashes with real coordinates (0 is the NA fill value) and take
# the first 4000 of them.
# NOTE(review): this uses crash_data (all years) and head(), i.e. the first
# rows rather than a random sample -- confirm that is the intended input for
# a "before 2020" hot-spot map.
has_coordinates = (crash_data['LATITUDE'] != 0) & (crash_data['LONGITUDE'] != 0)
accidents = crash_data.loc[has_coordinates, ['LATITUDE', 'LONGITUDE']].head(4000)

# Point geometry built from (longitude, latitude) pairs
crash_points = gpd.points_from_xy(accidents['LONGITUDE'], accidents['LATITUDE'])
gdf = gpd.GeoDataFrame(accidents, geometry=crash_points)

# Borough outlines as the base layer, density estimate drawn on top
ax = gplt.polyplot(nyc,
                   figsize=(12, 8),
                   facecolor="lightgray",
                   edgecolor="white")
gplt.kdeplot(gdf, ax=ax)

## Q8
Compare the number of car-only accidents (car and car or car and obstacle) with car-pedestrian accidents (car and person or car and bicycle).  Do these proportions change in 2020?  Do they change in any particular location?


In [None]:
# A crash counts as "car-pedestrian" when at least one pedestrian or cyclist
# was injured or killed; every other crash counts as "car-only".
vulnerable_cols = [
    'NUMBER OF PEDESTRIANS INJURED',
    'NUMBER OF PEDESTRIANS KILLED',
    'NUMBER OF CYCLIST INJURED',
    'NUMBER OF CYCLIST KILLED',
]

# The car-only subset is the complement of the car-pedestrian mask.  This
# selects the same rows as the former all-column outer merge filtered on
# 'left_only', without the cost of the merge and without the positional
# drop('_merge', 1) call that pandas 2 rejects.
before_20_total = len(before_20)
before_20_involved = (before_20[vulnerable_cols] > 0).any(axis=1)
before_20_car_ped = before_20[before_20_involved]
before_20_car_car = before_20[~before_20_involved]

during_20_total = len(during_20)
during_20_involved = (during_20[vulnerable_cols] > 0).any(axis=1)
during_20_car_ped = during_20[during_20_involved]
during_20_car_car = during_20[~during_20_involved]

# Proportion of each crash type within its period
accidents = pd.DataFrame(
    {'Before 2020': [len(before_20_car_ped), len(before_20_car_car)],
     '2020': [len(during_20_car_ped), len(during_20_car_car)] },
     index=['Car-Ped', 'Car-Car'])
accidents['Before 2020'] /= before_20_total
accidents['2020'] /= during_20_total
accidents.plot.bar()
plt.show()


In [None]:
# Borough outlines from geoplot's bundled 'nyc_boroughs' sample data set
nyc_boroughs_path = gplt.datasets.get_path('nyc_boroughs')
nyc = gpd.read_file(nyc_boroughs_path)

In [None]:
# Car-only crashes before 2020 with real coordinates (0 is the NA fill
# value); only the first 4000 rows are used for the density estimate.
has_coordinates = (before_20_car_car['LATITUDE'] != 0) & (before_20_car_car['LONGITUDE'] != 0)
accidents = before_20_car_car.loc[has_coordinates, ['LATITUDE', 'LONGITUDE']].head(4000)

# Point geometry built from (longitude, latitude) pairs
crash_points = gpd.points_from_xy(accidents['LONGITUDE'], accidents['LATITUDE'])
gdf = gpd.GeoDataFrame(accidents, geometry=crash_points)

# Borough outlines as the base layer, density estimate drawn on top
ax = gplt.polyplot(nyc,
                   figsize=(12, 8),
                   facecolor="lightgray",
                   edgecolor="white")
gplt.kdeplot(gdf, ax=ax)

In [None]:
# Car-pedestrian crashes before 2020 with real coordinates (0 is the NA fill
# value); only the first 4000 rows are used for the density estimate.
has_coordinates = (before_20_car_ped['LATITUDE'] != 0) & (before_20_car_ped['LONGITUDE'] != 0)
accidents = before_20_car_ped.loc[has_coordinates, ['LATITUDE', 'LONGITUDE']].head(4000)

# Point geometry built from (longitude, latitude) pairs
crash_points = gpd.points_from_xy(accidents['LONGITUDE'], accidents['LATITUDE'])
gdf = gpd.GeoDataFrame(accidents, geometry=crash_points)

# Borough outlines as the base layer, density estimate drawn on top
ax = gplt.polyplot(nyc,
                   figsize=(12, 8),
                   facecolor="lightgray",
                   edgecolor="white")
gplt.kdeplot(gdf, ax=ax)

## Q9
While working on the data, did you discover anything else you wanted to explore? 
Some students in the past found relationships between weather and numbers of accidents.
One team identified the surge in the number of accidents that happened after the change to/from daylight savings time.



In [None]:
# Month number -> season label, three consecutive months per label:
# Jan-Mar "Winter", Apr-Jun "Spring", Jul-Sep "Summer", Oct-Dec "Fall".
# NOTE(review): these are calendar quarters, so December is labelled "Fall"
# and March "Winter" -- confirm this grouping is intended.
season_names = ["Winter", "Spring", "Summer", "Fall"]
season_of_month = {month: season_names[(month - 1) // 3] for month in range(1, 13)}

# Month and season in which each crash took place
for frame in (before_20, during_20):
    frame['MONTH'] = frame['CRASH DATETIME'].dt.month
    frame['SEASON'] = frame['MONTH'].map(season_of_month)

# Share of crashes (in percent) per season, ordered by season name
crash_season_before_20 = (before_20['SEASON'].value_counts(normalize=True) * 100).sort_index()
crash_season_during_20 = (during_20['SEASON'].value_counts(normalize=True) * 100).sort_index()

print('\nPercentage of crashes by Season before 2020:\n\n', crash_season_before_20.to_string())
print('\nPercentage of crashes by Season during 2020:\n\n', crash_season_during_20.to_string())

# Borough breakdown within each season, first before 2020 and then for 2020.
# Grouping by a scalar key keeps the group key a plain string on every pandas
# version.
for period, frame in (('before 2020', before_20), ('during 2020', during_20)):
    print('\nBorough shares by season %s:' % period)
    for season, this_season_crashes in frame.groupby('SEASON'):
        crash_by_boroughs = this_season_crashes['BOROUGH'].value_counts(normalize=True)*100
        print('\nPercentage of Crashes during %s season:'%season)
        print(crash_by_boroughs.to_string())


# ======== Plot the seasonal distribution of both periods ========

print('---------------------')

# Put both periods side by side
season_crashes = pd.concat([crash_season_before_20, crash_season_during_20], axis=1)

# Plot distributions
season_crashes.plot.bar(figsize=(9,8))

# The plotted values are percentages, and the second series is the 2020 data
plt.ylabel("Percentage of Crashes", labelpad=6)
plt.title("Percentage of Crashes by Season", y=1.02)
plt.legend(['Before 2020','2020'])

plt.savefig('Results\\crashes_by_season.png')

print('\nSaved the graph for Percentage of Crashes by Season '
      'to crashes_by_season.png.')

print('---------------------\n')




## Q10
Suppose I tell you that there is an accident in one of these certain locations:\
a.	Hope, or \
b.	Hunts Point, or \
c.	Central Brooklyn, \
d.	Briarwood, or \
e.	West Bronx

what else can you tell me about that accident just by the location – even before  we dispatch emergency vehicles?  How would you build a classifier for this?  Is it likely to be a car-car, or car-pedestrian, or car-bicycle?




In [None]:
# a. Hope,
# b. Hunts Point          40.813°N 73.884°W
# c. Central Brooklyn     40.697°N 73.917°W
# d. Briarwood            40.71°N 73.81°W
# e. West Bronx           40.850°N 73.900°W

# Casualty columns that identify who, other than the vehicles, was involved
ped_cols = ['NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED']
bike_cols = ['NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED']
motorist_cols = ['NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED']

crash_data_total = len(crash_data)

# One boolean mask per crash type: True when at least one person of that kind
# was injured or killed in the crash
ped_involved = (crash_data[ped_cols] > 0).any(axis=1)
bike_involved = (crash_data[bike_cols] > 0).any(axis=1)
motorist_involved = (crash_data[motorist_cols] > 0).any(axis=1)
# Both motorist columns are tested here; the former condition checked
# 'NUMBER OF MOTORIST INJURED' twice and never 'NUMBER OF MOTORIST KILLED'.
any_involved = ped_involved | bike_involved | motorist_involved

crash_data_car_other = crash_data[any_involved]

crash_data_car_ped = crash_data[ped_involved]

crash_data_car_bike = crash_data[bike_involved]

crash_data_car_motorist = crash_data[motorist_involved]

# Crashes with no pedestrian, cyclist or motorist casualty.  The complement
# of the mask selects the same rows as an all-column outer merge filtered on
# 'left_only', without the merge cost and without the positional
# drop('_merge', 1) call that pandas 2 rejects.
crash_data_car_car = crash_data[~any_involved]

In [None]:
# value_counts() is used without normalize, so these series hold crash COUNTS
# per borough (ordered by borough name); the messages say "Number"
# accordingly.

# number of car-car crashes in each borough
car_car_by_boroughs = crash_data_car_car['BOROUGH'].value_counts().sort_index()
print("Number of car-car crashes happened in each borough:\n",car_car_by_boroughs)

# number of car-ped crashes in each borough
car_ped_by_boroughs = crash_data_car_ped['BOROUGH'].value_counts().sort_index()
print("Number of car-ped crashes happened in each borough:\n",car_ped_by_boroughs)

# number of car-bike crashes in each borough
car_bike_by_boroughs = crash_data_car_bike['BOROUGH'].value_counts().sort_index()
print("Number of car-bike crashes happened in each borough:\n",car_bike_by_boroughs)

# number of car-motorist crashes in each borough
car_motorist_by_boroughs = crash_data_car_motorist['BOROUGH'].value_counts().sort_index()
print("Number of car-motorist crashes happened in each borough:\n",car_motorist_by_boroughs)



In [None]:
def plot_type_of_crashes(crashes_by_borough,type):
    """Print, plot and save the per-borough counts of one crash type.

    crashes_by_borough -- pandas Series of crash counts indexed by borough
    type               -- crash-type label (string) used in the printed
                          text, the plot title and the output file name

    The horizontal bar chart is written to 'Results\\<type>_by_borough.png'
    and then shown.
    """
    separator = '---------------------'

    plt.figure(figsize=(4, 4), dpi=80)
    print(separator)

    # Text fragments shared by the header, the title and the saved message
    headline = "No. of " + type + " Crashes"
    file_name = type + "_by_borough.png"

    print(headline + " in every Borough:\n", crashes_by_borough)

    crashes_by_borough.plot.barh()

    # axis label, tick orientation and title
    plt.xlabel("No. of Crashes", labelpad=4)
    plt.yticks(rotation=45)
    plt.title(headline + " for every Borough", y=1.02)

    # save the graph
    plt.savefig('Results\\' + file_name)
    print("\nSaved " + headline + " for every Borough to " + file_name)
    plt.show()
    print(separator)


# One chart per crash type, in the order car-car, car-ped, car-bike,
# car-motorist
for borough_counts, crash_type in (
        (car_car_by_boroughs, 'car_car'),
        (car_ped_by_boroughs, 'car_ped'),
        (car_bike_by_boroughs, 'car_bike'),
        (car_motorist_by_boroughs, 'car_motorist')):
    plot_type_of_crashes(borough_counts, crash_type)

In [None]:
# Group by a scalar key so each group key is the borough name itself; a
# one-element list key makes newer pandas yield 1-tuples, which would break
# the string concatenations below.
grp_by_borough = crash_data.groupby('BOROUGH')

cols = ['Car-Car','Car-Ped','Car-Bike','Car-Motorist']

# Casualty columns that identify who, other than the vehicles, was involved
ped_cols = ['NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED']
bike_cols = ['NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED']
motorist_cols = ['NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED']

for borough, data in grp_by_borough:

    # True when at least one person of that kind was injured or killed
    ped_involved = (data[ped_cols] > 0).any(axis=1)
    bike_involved = (data[bike_cols] > 0).any(axis=1)
    # Both motorist columns are tested; the former "car_other" condition
    # checked 'NUMBER OF MOTORIST INJURED' twice and never '... KILLED'.
    motorist_involved = (data[motorist_cols] > 0).any(axis=1)

    car_ped = data[ped_involved]
    car_bike = data[bike_involved]
    car_motorist = data[motorist_involved]
    # Crashes with no casualty of any kind: the complement of the combined
    # mask, which selects the same rows as an all-column outer merge filtered
    # on 'left_only' (and avoids the positional drop('_merge', 1) call that
    # pandas 2 rejects).
    car_car = data[~(ped_involved | bike_involved | motorist_involved)]

    print('---------------------')

    types = [len(car_car),len(car_ped),len(car_bike),len(car_motorist)]
    print(types)
    print(cols)
    df = pd.DataFrame(types, index = cols, columns = ['count'])
    df.plot.barh()

    print(borough)
    print(df)

    # give labels and title
    plt.xlabel("No. of Crashes", labelpad=2)

    plt.title("No of types of crashes for "+borough, y=1.02)
    # save the graph
    plt.savefig('Results\\'+borough+'_crash_types.png')
    print("Saved No. of Crashes by "+borough+"_crash_types.png")
    plt.show()
    print('---------------------')