In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# Any results you write to the current directory are saved as output.

## In this project I tried to analyse the Toronto Traffic Data [Toronto_Link](https://www.kaggle.com/jrmistry/killed-or-seriously-injured-ksi-toronto-clean). Primarily we did following:
#### Added new attributes
#### Removed few columns to clean up the data
#### Merged a few columns to create more meaningful data

#### Disclaimer - This data set is taken from kaggle and as a newbie I try to analyse it and clean it. If you find any error or suggestion, please DM me. I would try to resolve it.

## Motivation
#### The increase in traffic accidents, injuries and fatalities since 2007 was the motivation for using this dataset. This is one of the first steps towards making Toronto's streets safe.
#### Toronto is ranked as one of the most visited cities in North America [link](https://www.blogto.com/city/2018/01/toronto-ranked-one-most-visited-cities-north-america/) and has a large population of about 3.0 million [Link](https://canadapopulation2018.com/population-of-toronto-2018.html). Reducing the risk of accidents is essential to keep Torontonians safe and to maintain Toronto's reputation as a desirable tourist destination.

## Content

#### The data set is taken from Kaggle which was originally taken from Toronto Police [website](https://data.torontopolice.on.ca/datasets/9f05c21dea4c40458264cb3f1e2362b8_0) in csv format.

* ## Column Definition 
__ACCNUM__ Accident Number<br>
__YEAR__ Year of Accident<br>
__MONTH__ Month of Accident<br>
__DAY__ Day of Accident<br>
__HOUR__ Hour of Accident (24hrs)<br>
__MINUTES__ Minute of Accident<br>
__WEEKDAY__ Weekday of Accident (0 is Monday)<br>
__LATITUDE__ Latitude<br>
__LONGITUDE__ Longitude<br>
__Ward_Name__ City Ward<br>
__Ward_ID__ City Ward ID<br>
__Hood_Name__ Neighbourhood Name<br>
__Hood_ID__ Neighbourhood ID<br>
__Division__ Police Division<br>
__District__ City District<br>
__STREET1__ Street of Accident<br>
__STREET2__ Street of Accident<br>
__OFFSET__ Distance and direction of the accident<br>
__ROAD_CLASS__ Road Classification<br>
__LOCCOORD__ Location Coordinate<br>
__ACCLOC__ Accident Location<br>
__TRAFFCTL__ Traffic Control Type<br>
__VISIBILITY__ Environment Condition<br>
__LIGHT__ Light Condition<br>
__RDSFCOND__ Road Surface Condition<br>
__ACCLASS__ Classification of Accident<br>
__IMPACTYPE__ Initial Impact Type<br>
__INVTYPE__ Involvement Type<br>
__INVAGE__ Age of Involved Party<br>
__INJURY__ Severity of Injury<br>
__FATAL_NO__ Sequential Number<br>
__INITDIR__ Initial Direction of Travel<br>
__VEHTYPE__ Type of Vehicle<br>
__MANOEUVER__ Vehicle Manoeuvre<br>
__DRIVACT__ Apparent Driver Action<br>
__DRIVCOND__ Driver Condition<br>
__PEDTYPE__ Pedestrian Crash Type<br>
__PEDACT__ Pedestrian Action<br>
__PEDCOND__ Condition of Pedestrian<br>
__CYCLISTYPE__ Cyclist Crash Type<br>
__CYCACT__ Cyclist Action<br>
__CYCCOND__ Cyclist Condition<br>
__PEDESTRIAN__ Pedestrian Involved In Collision<br>
__CYCLIST__ Cyclists Involved in Collision<br>
__AUTOMOBILE__ Driver Involved in Collision<br>
__MOTORCYCLE__ Motorcyclist Involved in Collision<br>
__TRUCK__ Truck Driver Involved in Collision<br>
__TRSN_CITY_VEH__ Transit or City Vehicle Involved in Collision<br>
__EMERG_VEH__ Emergency Vehicle Involved in Collision<br>
__PASSENGER__ Passenger Involved in Collision<br>
__SPEEDING__ Speeding Related Collision<br>
__AG_DRIV__ Aggressive and Distracted Driving Collision<br>
__REDLIGHT__ Red Light Related Collision<br>
__ALCOHOL__ Alcohol Related Collision<br>
__DISABILITY__ Medical or Physical Disability Related Collision<br>
__FATAL__ Fatal Injury in Collision

## Analysis

#### We tried to answer below questions and make some predictions after analysing it
- Total number of accidents in the City of Toronto in percentage
- What type of vehicle involved in those accidents - Year wise and Total
- Visualization of above questions<br>
- What time of the day has the most accidents involved - Daylight, Early Eve, Late Eve, Night - added new attribute<br>
- Location wise accident data categorized in last 10 years<br>
- Visualization of above location data
- TIME SERIES - Calculate the month-over-month percentage change in accidents over the period using the "shift trick"
-------

In [None]:
import numpy as np
import pandas as pd
# Raise pandas' display limits (rows, columns, line width) so that the wide
# frames shown in this notebook are printed with less truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from pandas.plotting import autocorrelation_plot, scatter_matrix

#visualization 
import matplotlib.pyplot as plt
import seaborn as sea

from pandas import DataFrame, Series
import statsmodels.formula.api as sm
from sklearn.linear_model import LinearRegression
import scipy, scipy.stats

import seaborn as sns
%matplotlib inline

#### Create a dataframe from the data in csv

In [None]:
# Load the KSI collision records; `df` wraps the same data as a second DataFrame handle.
train_data = pd.read_csv(filepath_or_buffer="/kaggle/input/KSI_CLEAN.csv")
df = pd.DataFrame(data=train_data)

#### Cleaning of data
- <font size="3">Adding a TIMESTAMP</font>

In [None]:
# Build `data`: the loaded records plus a YEAR_MONTH datetime column.
#
# The previous version re-read the CSV with the nested-list form of
# `parse_dates`, which is deprecated in pandas 2.x, and then had to restore the
# YEAR / MONTH columns that the parser consumed. Here the same "YEAR MONTH"
# string is parsed directly from the frame that is already in memory, so the
# file is read only once and YEAR / MONTH are never lost.
year = train_data['YEAR']
month = train_data['MONTH']

data = train_data.copy()
data.insert(0, 'YEAR_MONTH', pd.to_datetime(year.astype(str) + ' ' + month.astype(str)))

In [None]:
# TIMESTAMP = YEAR_MONTH date + hour offset + minute offset.
# NOTE(review): the day of the month is not added here, so the timestamp only
# carries whatever day YEAR_MONTH already holds.
month_start = pd.to_datetime(data['YEAR_MONTH'])
hour_offset = pd.to_timedelta(data['HOUR'], unit='h')
minute_offset = pd.to_timedelta(data['MINUTES'], unit='m')
data['TIMESTAMP'] = month_start + hour_offset + minute_offset

- <font size="3">Adding NaN to all empty values</font>

In [None]:
# Treat cells that contain exactly one blank character as missing values.
df1 = data.replace(to_replace=' ', value=np.nan, regex=False)

- <font size="3">Finding the missing percentage to find out which column to drop</font>
- <font size="3">If missing percentage is more than 80% of values, then columns are __dropped__</font>

In [None]:
# Fraction of missing values per column of df1 (multiply by 100 for a percentage).
# The mean of the boolean mask uses df1's own row count as the denominator,
# instead of borrowing the length of a different frame.
missing_percent = df1.isna().mean()

In [None]:
# Drop every column that has fewer than 3000 non-missing values.
# `thresh` alone expresses this rule; recent pandas versions refuse to accept
# `how` and `thresh` together. The .copy() makes data_clean an independent
# frame so that columns can be added to it later without
# SettingWithCopyWarning.
data_clean = df1.dropna(axis=1, thresh=3000).copy()

- <font size="3">Creating a pivot table</font>

In [None]:
# Per-year sums of each collision flag, plus a final 'TOTAL' row (margins)
# that sums every year. The string 'sum' replaces np.sum, which pandas has
# deprecated as an aggfunc argument.
factor_columns = ['ALCOHOL', 'PEDESTRIAN', 'CYCLIST', 'TRSN_CITY_VEH', 'MOTORCYCLE', 'TRUCK',
                  'EMERG_VEH', 'AG_DRIV', 'REDLIGHT', 'DISABILITY', 'FATAL', 'SPEEDING']
pivot = data_clean.pivot_table(index='YEAR', values=factor_columns, aggfunc='sum',
                               margins=True, margins_name='TOTAL')
pivot

## Analysis 1. Total number of accidents in the City of Toronto in percentage?

### Analysis 

- Overall, Aggressive and Distracted Driving collisions (AG_DRIV) make the largest contribution, almost a quarter of the total number of accidents, which works out to about 2 AG_DRIV crashes every day. To us it is clear that Toronto Police should take action against distracted driving.
- The second most common type is pedestrian accidents. The data shows almost 1.4 accidents involving pedestrians every day.
- Emergency vehicles were involved in the fewest accidents. This is still not a good thing, but it is better than the other accident types. The reason may be that there are fewer emergency vehicles on the road.
- The total number of fatalities is large, more than 8% of the total, which means that roughly every two days one individual dies in a road accident in the City of Toronto.

In [None]:
# Pie chart of the all-years 'TOTAL' row of `pivot`.
# The row is selected by label rather than by position, so the chart does not
# depend on how many years the data contains. The caption names Toronto,
# which is the city this data set covers.
fig, ax = plt.subplots(figsize=(8, 8))
pivot.loc['TOTAL'].plot(kind='pie', ax=ax, autopct='%3.1f%%', fontsize=15)
ax.set_ylabel('')
ax.set_xlabel('Total Accidents in Toronto in last 10 years(%age)', fontsize=20);

In [None]:
# Per-year sums of each collision flag, without a totals row.
# The string 'sum' replaces np.sum, which pandas has deprecated as an aggfunc argument.
pivot1 = data_clean.pivot_table(
    index='YEAR',
    values=['ALCOHOL', 'PEDESTRIAN', 'CYCLIST', 'TRSN_CITY_VEH', 'MOTORCYCLE', 'TRUCK',
            'EMERG_VEH', 'AG_DRIV', 'REDLIGHT', 'DISABILITY', 'FATAL', 'SPEEDING'],
    aggfunc='sum',
    margins=False,
)

### Year wise distribution is given below

#### In 2007

In [None]:
# Pie chart of the 2007 row of `pivot`, selected by year label rather than by
# row position. The caption names Toronto, the city this data set covers.
fig, ax = plt.subplots(figsize=(8, 8))
pivot.loc[2007].plot(kind='pie', ax=ax, autopct='%3.1f%%', fontsize=10)
ax.set_ylabel('')
ax.set_xlabel('Total Accidents in Toronto in 2007 (%age)', fontsize=10);

#### In 2008

In [None]:
# Pie chart of the 2008 row of `pivot`, selected by year label rather than by
# row position. The caption names Toronto, the city this data set covers.
fig, ax = plt.subplots(figsize=(8, 8))
pivot.loc[2008].plot(kind='pie', ax=ax, autopct='%3.1f%%', fontsize=10)
ax.set_ylabel('')
ax.set_xlabel('Total Accidents in Toronto in 2008 (%age)', fontsize=10);

#### In 2009

In [None]:
# Pie chart of the 2009 row of `pivot`, selected by year label rather than by
# row position. The caption names Toronto, the city this data set covers.
fig, ax = plt.subplots(figsize=(8, 8))
pivot.loc[2009].plot(kind='pie', ax=ax, autopct='%3.1f%%', fontsize=10)
ax.set_ylabel('')
ax.set_xlabel('Total Accidents in Toronto in 2009 (%age)', fontsize=10);

#### In 2010

In [None]:
# Pie chart of the 2010 row of `pivot`, selected by year label rather than by
# row position. The caption names Toronto, the city this data set covers.
fig, ax = plt.subplots(figsize=(8, 8))
pivot.loc[2010].plot(kind='pie', ax=ax, autopct='%3.1f%%', fontsize=10)
ax.set_ylabel('')
ax.set_xlabel('Total Accidents in Toronto in 2010 (%age)', fontsize=10);

#### In 2011

In [None]:
# Pie chart of the 2011 row of `pivot`, selected by year label rather than by
# row position. The caption names Toronto, the city this data set covers.
fig, ax = plt.subplots(figsize=(8, 8))
pivot.loc[2011].plot(kind='pie', ax=ax, autopct='%3.1f%%', fontsize=10)
ax.set_ylabel('')
ax.set_xlabel('Total Accidents in Toronto in 2011 (%age)', fontsize=10);

#### In 2012

In [None]:
# Pie chart of the 2012 row of `pivot`, selected by year label rather than by
# row position. The caption names Toronto, the city this data set covers.
fig, ax = plt.subplots(figsize=(8, 8))
pivot.loc[2012].plot(kind='pie', ax=ax, autopct='%3.1f%%', fontsize=10)
ax.set_ylabel('')
ax.set_xlabel('Total Accidents in Toronto in 2012 (%age)', fontsize=10);

#### In 2013

In [None]:
# Pie chart of the 2013 row of `pivot`, selected by year label rather than by
# row position. The caption names Toronto, the city this data set covers.
fig, ax = plt.subplots(figsize=(8, 8))
pivot.loc[2013].plot(kind='pie', ax=ax, autopct='%3.1f%%', fontsize=10)
ax.set_ylabel('')
ax.set_xlabel('Total Accidents in Toronto in 2013 (%age)', fontsize=10);

#### In 2014

In [None]:
# Pie chart of the 2014 row of `pivot`, selected by year label rather than by
# row position. The caption names Toronto, the city this data set covers.
fig, ax = plt.subplots(figsize=(8, 8))
pivot.loc[2014].plot(kind='pie', ax=ax, autopct='%3.1f%%', fontsize=10)
ax.set_ylabel('')
ax.set_xlabel('Total Accidents in Toronto in 2014 (%age)', fontsize=10);

#### In 2015

In [None]:
# Pie chart of the 2015 row of `pivot`, selected by year label rather than by
# row position. The caption names Toronto, the city this data set covers.
fig, ax = plt.subplots(figsize=(8, 8))
pivot.loc[2015].plot(kind='pie', ax=ax, autopct='%3.1f%%', fontsize=10)
ax.set_ylabel('')
ax.set_xlabel('Total Accidents in Toronto in 2015 (%age)', fontsize=10);

#### In 2016

In [None]:
# Pie chart of the 2016 row of `pivot`, selected by year label rather than by
# row position. The caption names Toronto, the city this data set covers.
fig, ax = plt.subplots(figsize=(8, 8))
pivot.loc[2016].plot(kind='pie', ax=ax, autopct='%3.1f%%', fontsize=10)
ax.set_ylabel('')
ax.set_xlabel('Total Accidents in Toronto in 2016 (%age)', fontsize=10);

#### In 2017

In [None]:
# Pie chart of the 2017 row of `pivot`, selected by year label rather than by
# row position. The caption names Toronto, the city this data set covers.
fig, ax = plt.subplots(figsize=(8, 8))
pivot.loc[2017].plot(kind='pie', ax=ax, autopct='%3.1f%%', fontsize=10)
ax.set_ylabel('')
ax.set_xlabel('Total Accidents in Toronto in 2017 (%age)', fontsize=10);

### Some more facts: [Reference](https://en.wikipedia.org/wiki/Cycling_in_Toronto)

During the period of 2010 to 2014 government in the City of Toronto, Bike lanes were proposed to remove from some of the streets in Toronto. In 2012 they were removed and by looking at the data we found in 2012 and 2013, cyclist accidents were the maximum since 2007. The removal of cyclist lanes clearly impacted the safety of cyclist in Toronto.

Comparing the effect of aggressive driving vs. driving under the effect of alcohol, and speeding:

In [None]:
# Yearly sums of the AG_DRIV, ALCOHOL and SPEEDING flags, drawn as one line each.
# 'sum' replaces the deprecated np.sum aggfunc, and the y-label is set on the
# axes returned by the plot call instead of through pyplot's implicit state.
aggressive1 = data_clean.pivot_table(index='YEAR', values=['ALCOHOL', 'AG_DRIV', 'SPEEDING'],
                                     aggfunc='sum', margins=False)
ax = aggressive1.plot(
    figsize=(10, 8),
    title="Accidents caused by aggressive driving vs. speeding vs. driving under the influence",
    grid=True,
)
ax.set_ylabel('Accidents');

## Analysis 2. What type of vehicle involved in those accidents - Year wise and Total

In [None]:
# Yearly sums of the vehicle-type flags.
# vehicle_data carries an extra 'TOTAL' row (margins); vehicle_data1 does not
# and is the frame that is plotted as yearly lines.
# 'sum' replaces the deprecated np.sum aggfunc.
vehicle_columns = ['CYCLIST', 'TRSN_CITY_VEH', 'MOTORCYCLE', 'TRUCK', 'EMERG_VEH']
vehicle_data = data_clean.pivot_table(index='YEAR', values=vehicle_columns, aggfunc='sum',
                                      margins=True, margins_name='TOTAL')
vehicle_data1 = data_clean.pivot_table(index='YEAR', values=vehicle_columns, aggfunc='sum',
                                       margins=False)

ax = vehicle_data1.plot(figsize=(10, 8), title="Type of vehicles involved in the accidents per year", grid=True)
ax.set_ylabel('Accidents');

In [None]:
# Yearly sum of the CYCLIST flag, drawn as a single line.
# 'sum' replaces the deprecated np.sum aggfunc.
Cyclist_data1 = data_clean.pivot_table(index='YEAR', values=['CYCLIST'], aggfunc='sum', margins=False)

ax = Cyclist_data1.plot(figsize=(10, 8), title="Number of accidents involving cyclists", grid=True)
ax.set_ylabel('Accidents');

### Analysis 

- Bicycles were involved in the most accidents, followed by motorcycles
- Trucks and transit/city vehicles have almost the same share, after motorcycles

In [None]:
# Pie chart of the all-years 'TOTAL' row of `vehicle_data`, selected by label
# so the chart does not depend on how many years the data contains.
fig, ax = plt.subplots(figsize=(8, 8))
vehicle_data.loc['TOTAL'].plot(kind='pie', ax=ax, autopct='%3.1f%%', fontsize=10)
ax.set_ylabel('')
ax.set_xlabel('Vehicle involved in accidents in last 10 years(%age)', fontsize=20);

## Analysis 3. What time of the day has the most accidents occurred - Daylight, Early Eve, Late Eve, Night

- First we will define the new attribute "TIMEOFDAY". We define labels or buckets as 
 - 12AM-4AM - [00 to 4 hours]
 - 4AM-8AM - [4 to 8 hours]
 - 8AM-12PM - [8 to 12 hours]
 - 12PM-4PM - [12 to 16 hours]
 - 4PM-8PM - [16 to 20 hours]
 - 8PM-12PM - [20 to Midnight] (i.e. 8PM to 12AM; the bucket label `8PM-12PM` is the one used in the code)

In [None]:
# Bucket the hour of each record into six 4-hour windows.
# right=False makes every bin closed on the left and open on the right:
# [0, 4), [4, 8), ... [20, 24). With the default right-closed bins, HOUR == 0
# fell outside every bin (NaN) and each boundary hour (4, 8, ...) landed in the
# earlier bucket.
bins = [0, 4, 8, 12, 16, 20, 24]
labels = ['12AM-4AM', '4AM-8AM', '8AM-12PM', '12PM-4PM', '4PM-8PM', '8PM-12PM']
# assign() returns a new frame, so no column is written into a possible view.
data_clean = data_clean.assign(
    TIMEOFDAY=pd.cut(data_clean["HOUR"], bins, labels=labels, right=False)
)

#### Preparing a new data frame name - time_day

In [None]:
# Records per year (rows) for each TIMEOFDAY bucket (columns), plus a 'Total' row.
# Building the frame from a dict of Series takes the union of all years, so a
# year that is absent from the first bucket is no longer dropped, and
# sort_index() puts the rows in chronological order instead of the frequency
# order that value_counts() returns.
time_buckets = ['12AM-4AM', '4AM-8AM', '8AM-12PM', '12PM-4PM', '4PM-8PM', '8PM-12PM']
time_day = pd.DataFrame({
    bucket: data_clean.loc[data_clean['TIMEOFDAY'] == bucket, 'YEAR'].value_counts()
    for bucket in time_buckets
})
# A year with no record in some bucket shows up as NaN after alignment; count it as 0.
time_day = time_day.fillna(0).astype(int).sort_index()
time_day.loc['Total'] = time_day.sum()

In [None]:
# Display the year x time-of-day count table.
time_day

### Analysis

- It is clear that most of the accidents occurred between 4PM and 8PM, which is the time when people travel home after work.
- Another point to note is that 12PM to 4PM has the second highest number of accidents, from around lunch time into the afternoon.
- Most of the accidents occurred in daylight, from 8AM to 8PM, which is mainly office hours.

### Visualization

In [None]:
# Pie chart of the 'Total' row of `time_day`, selected by label so the chart
# does not depend on how many year rows precede it.
fig, ax = plt.subplots(figsize=(8, 8))
time_day.loc['Total'].plot(kind='pie', ax=ax, autopct='%3.1f%%', fontsize=10)
ax.set_ylabel('')
ax.set_xlabel('Time of the day vs total number accidents (%age)', fontsize=20);

#### Another way to visualize the above "TIMEOFDAY" data - using Seaborn plot

In [None]:
# Number of records per TIMEOFDAY bucket, exposed as a single 'TOTAL' column.
# The count is taken on YEAR_MONTH, i.e. it counts rows whose YEAR_MONTH is not missing.
time_of_day = (
    data_clean
    .groupby('TIMEOFDAY')[['YEAR_MONTH']]
    .count()
    .rename(columns={'YEAR_MONTH': 'TOTAL'})
)

time_of_day

In [None]:
# Bar chart of record counts per time-of-day bucket.
# The title is set on the returned axes. Assigning a string to `plt.title`
# replaces pyplot's title function and breaks every later plt.title(...) call.
ax = sea.barplot(x="TIMEOFDAY", y="TOTAL", data=time_of_day.reset_index())
ax.set_title('Number of accidents by time of day')
plt.show()

## Analysis 4. Location wise accident data categorized in last 10 years

#### Prepare a new Data frame name "location"

In [None]:
# Records per year (rows) for each city district (columns), plus a 'Total' row.
# All four columns are read from data_clean (previously two came from df1).
# Building the frame from a dict of Series takes the union of all years, and
# sort_index() puts the rows in chronological order instead of the frequency
# order that value_counts() returns.
district_columns = {
    'Etobicoke': 'Etobicoke York',
    'NorthYork': 'North York',
    'Scarborough': 'Scarborough',
    'EastYork': 'Toronto East York',
}
location = pd.DataFrame({
    column: data_clean.loc[data_clean['District'] == district, 'YEAR'].value_counts()
    for column, district in district_columns.items()
})
# A year with no record in some district shows up as NaN after alignment; count it as 0.
location = location.fillna(0).astype(int).sort_index()
location.loc['Total'] = location.sum()

In [None]:
# Display the year x district count table.
location

### Analysis

- The data shows that more accidents happened in the Toronto East York district than in the Etobicoke York district.
- This clearly implies where toronto police should focus more.

In [None]:
# Records per year for each city district (no totals row), plotted as yearly lines.
# All four columns are read from data_clean (previously two came from df1).
# The rows are sorted by year with a plain sort_index(); the earlier
# `result = ...sort_index(inplace=True)` always stored None.
location1 = pd.DataFrame({
    'Etobicoke': data_clean.loc[data_clean['District'] == 'Etobicoke York', 'YEAR'].value_counts(),
    'NorthYork': data_clean.loc[data_clean['District'] == 'North York', 'YEAR'].value_counts(),
    'Scarborough': data_clean.loc[data_clean['District'] == 'Scarborough', 'YEAR'].value_counts(),
    'EastYork': data_clean.loc[data_clean['District'] == 'Toronto East York', 'YEAR'].value_counts(),
})
# A year with no record in some district shows up as NaN after alignment; count it as 0.
location1 = location1.fillna(0).astype(int).sort_index()

ax = location1.plot(figsize=(10, 8), title="Number of accidents in regions of GTA", grid=True)
ax.set_ylabel('Accidents');

### Visualization

In [None]:
# Pie chart of the 'Total' row of `location`, selected by label so the chart
# does not depend on how many year rows precede it.
fig, ax = plt.subplots(figsize=(10, 10))
location.loc['Total'].plot(kind='pie', ax=ax, autopct='%3.1f%%', fontsize=15)
ax.set_ylabel('')
ax.set_xlabel('Total Accidents in Toronto region wise data in last 10 years(%age)', fontsize=20);

### Year-wise data for 2012 to 2017 - 2017 first

In [None]:
# Pie chart of the district split for 2017.
# The row is selected by its year label: a frame built from value_counts() is
# ordered by frequency, so a positional row is not guaranteed to be this year.
fig, ax = plt.subplots(figsize=(10, 10))
location.loc[2017].plot(kind='pie', ax=ax, autopct='%3.1f%%', fontsize=15)
ax.set_ylabel('')
ax.set_xlabel('Total Accidents in Toronto region wise data in 2017(%age)', fontsize=20);

#### In 2016

In [None]:
# Pie chart of the district split for 2016.
# The row is selected by its year label: a frame built from value_counts() is
# ordered by frequency, so a positional row is not guaranteed to be this year.
fig, ax = plt.subplots(figsize=(10, 10))
location.loc[2016].plot(kind='pie', ax=ax, autopct='%3.1f%%', fontsize=15)
ax.set_ylabel('')
ax.set_xlabel('Total Accidents in Toronto region wise data in 2016(%age)', fontsize=20);

#### In 2015

In [None]:
# Pie chart of the district split for 2015.
# The row is selected by its year label: a frame built from value_counts() is
# ordered by frequency, so a positional row is not guaranteed to be this year.
fig, ax = plt.subplots(figsize=(10, 10))
location.loc[2015].plot(kind='pie', ax=ax, autopct='%3.1f%%', fontsize=15)
ax.set_ylabel('')
ax.set_xlabel('Total Accidents in Toronto region wise data in 2015(%age)', fontsize=20);

#### In 2014

In [None]:
# Pie chart of the district split for 2014.
# The row is selected by its year label: a frame built from value_counts() is
# ordered by frequency, so a positional row is not guaranteed to be this year.
fig, ax = plt.subplots(figsize=(10, 10))
location.loc[2014].plot(kind='pie', ax=ax, autopct='%3.1f%%', fontsize=15)
ax.set_ylabel('')
ax.set_xlabel('Total Accidents in Toronto region wise data in 2014(%age)', fontsize=20);

#### In 2013

In [None]:
# Pie chart of the district split for 2013.
# The row is selected by its year label: a frame built from value_counts() is
# ordered by frequency, so a positional row is not guaranteed to be this year.
fig, ax = plt.subplots(figsize=(10, 10))
location.loc[2013].plot(kind='pie', ax=ax, autopct='%3.1f%%', fontsize=15)
ax.set_ylabel('')
ax.set_xlabel('Total Accidents in Toronto region wise data in 2013(%age)', fontsize=20);

#### In 2012

In [None]:
# Pie chart of the district split for 2012.
# The row is selected by its year label: a frame built from value_counts() is
# ordered by frequency, so a positional row is not guaranteed to be this year.
fig, ax = plt.subplots(figsize=(10, 10))
location.loc[2012].plot(kind='pie', ax=ax, autopct='%3.1f%%', fontsize=15)
ax.set_ylabel('')
ax.set_xlabel('Total Accidents in Toronto region wise data in 2012(%age)', fontsize=20);

## Analysis 5. TIME SERIES - Calculate the monthly change and autocorrelation over the period for accidents using the "shift trick"

#### We use pivot data created above for this time series analysis. We convert the pivot data into time series data by using time stamp we created above as a new attribute. Timestamp is monthly

In [None]:
# Monthly sums of each collision flag, indexed by YEAR_MONTH.
# No margins row is requested: this frame is used as a time series, and a
# 'TOTAL' row appended after the last month would be treated as one more
# monthly observation by shift-based calculations.
# 'sum' replaces the deprecated np.sum aggfunc.
pivot_time = data_clean.pivot_table(
    index='YEAR_MONTH',
    values=['ALCOHOL', 'PEDESTRIAN', 'CYCLIST', 'TRSN_CITY_VEH', 'MOTORCYCLE', 'TRUCK',
            'EMERG_VEH', 'AG_DRIV', 'REDLIGHT', 'DISABILITY', 'FATAL', 'SPEEDING'],
    aggfunc='sum',
    margins=False,
)

#### Prepare a new DataFrame which has present as well as shifted data and change in the data
#### Here, we analysed AG_DRIV column from the data set because that caused the maximum number of accidents in last 10 years 

In [None]:
# Month-over-month percentage change of AG_DRIV collisions ("shift trick").
# Columns: 'present' (this month), 'shift' (previous month), 'change' (% change).
ag_driv_monthly = pivot_time['AG_DRIV']
# Leave out a 'TOTAL' margin row if the pivot has one: it is not a month, and
# it would add a bogus final data point to the change series.
is_month_row = np.array([label != 'TOTAL' for label in ag_driv_monthly.index], dtype=bool)
ag_driv_monthly = ag_driv_monthly.loc[is_month_row]
ag_driv_previous = ag_driv_monthly.shift(1)

time_series_ad = pd.DataFrame({'present': ag_driv_monthly, 'shift': ag_driv_previous})
# A previous month with zero collisions gives +/-inf; store NaN so that a
# later dropna() removes those points.
time_series_ad['change'] = (
    (ag_driv_monthly - ag_driv_previous) * 100 / ag_driv_previous
).replace([np.inf, -np.inf], np.nan)

### Visualization of Autocorrelation

In [None]:
# Autocorrelation plot of the monthly AG_DRIV percentage change.
# The title is set on the axes. Assigning a string to `plt.title` replaces
# pyplot's title function and leaves the figure without a title.
fig, ax = plt.subplots()
autocorrelation_plot(time_series_ad['change'].dropna(), ax=ax)
ax.set_title('AG_DRIV');

## Analysis

- There are about 132 data points because the data is 11 years * 12 months. 
- We shifted the data by one month. 
- The autocorrelation graph above shows correlation values close to zero, which means there is no correlation between the present value and the shifted value. It would therefore be hard to predict the 133rd data point from the 132 given points.

## Lets analyze another data set of 'Pedestrian' because that contributes to the second highest number of accidents in last 10 years

### Prepare a new Data frame

In [None]:
# Month-over-month percentage change of PEDESTRIAN collisions ("shift trick").
# Columns: 'present' (this month), 'shift' (previous month), 'change' (% change).
pedestrian_monthly = pivot_time['PEDESTRIAN']
# Leave out a 'TOTAL' margin row if the pivot has one: it is not a month, and
# it would add a bogus final data point to the change series.
is_month_row = np.array([label != 'TOTAL' for label in pedestrian_monthly.index], dtype=bool)
pedestrian_monthly = pedestrian_monthly.loc[is_month_row]
pedestrian_previous = pedestrian_monthly.shift(1)

time_series_pd = pd.DataFrame({'present': pedestrian_monthly, 'shift': pedestrian_previous})
# A previous month with zero collisions gives +/-inf; store NaN so that a
# later dropna() removes those points.
time_series_pd['change'] = (
    (pedestrian_monthly - pedestrian_previous) * 100 / pedestrian_previous
).replace([np.inf, -np.inf], np.nan)

### Visualization

In [None]:
# Autocorrelation plot of the monthly PEDESTRIAN percentage change.
# The title is set on the axes. Assigning a string to `plt.title` replaces
# pyplot's title function and leaves the figure without a title.
fig, ax = plt.subplots()
autocorrelation_plot(time_series_pd['change'].dropna(), ax=ax)
ax.set_title('PEDESTRIAN');

## Analysis

- Clearly it shows the same trend as above that there is no correlation between the data sets. 

## Some more analysis to find correlation - We try to find the correlation among attributes and below is the plot

In [None]:
# Pairwise scatter plots (with per-column distributions on the diagonal) of the yearly factor sums.
sns.pairplot(data=pivot1)