# Imports

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime as dt
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Any results you write to the current directory are saved as output.

In [None]:
# All raw inputs come from the same Kaggle dataset folder.
DATA_DIR = "/kaggle/input/uk-accidents-10-years-history-with-many-variables"

casualties = pd.read_csv(DATA_DIR + "/Casualties0514.csv")
accidents = pd.read_csv(DATA_DIR + "/Accidents0514.csv")
vehicles = pd.read_csv(DATA_DIR + "/Vehicles0514.csv")
# The data guide workbook is opened once; individual sheets are read from it later.
lookup = pd.ExcelFile(DATA_DIR + "/Road-Accident-Safety-Data-Guide.xls")

In [None]:
# Preprocessing: numeric coordinates, parsed dates, and the point sample for the heatmap.
accidents['Latitude'] = accidents['Latitude'].astype(float)
accidents['Longitude'] = accidents['Longitude'].astype(float)

# The notebook parses this same column with '%d/%m/%Y' further down, so the raw
# strings are assumed to be day-first. Passing the format explicitly prevents
# day/month being guessed (and swapped) for ambiguous dates such as 03/04/2005.
# Re-running the cell is safe: an already-parsed datetime column is returned as is.
accidents['Date'] = pd.to_datetime(accidents['Date'], format='%d/%m/%Y')

# Keep only rows that have both coordinates.
heat_df = accidents[['Latitude', 'Longitude']].dropna(axis=0, subset=['Latitude', 'Longitude'])

# Heatmap input: a random sample of at most 250k points. The sample is seeded so
# the map is reproducible, and capped so a smaller frame does not raise.
HEATMAP_SAMPLE_SIZE = 250000
heat_data = heat_df.sample(n=min(HEATMAP_SAMPLE_SIZE, len(heat_df)), random_state=42).values


# Heatmap of accidents around the UK

In [None]:
# Heatmap of the sampled accident coordinates.
import folium
from folium.plugins import HeatMap

m = folium.Map(
    location=[54.251186, -4.463196],
    width=800,
    height=700,
    min_zoom=5,
    max_zoom=18,
    zoom_start=6,
    # The latitude/longitude limits below are only enforced when max_bounds is
    # True; without it they were silently ignored.
    max_bounds=True,
    min_lat=48,
    max_lat=60,
    min_lon=-13,
    max_lon=4,
    control_scale=False,
    zoom_control=False,
)
HeatMap(heat_data, radius=9.5).add_to(m)
m

In [None]:
# Accident counts per weather code, labelled through the data guide's "Weather" sheet.
MIN_ACCIDENTS_PER_WEATHER = 5000  # weather codes with fewer accidents are left out

# Pin the Series name and clear the index name: recent pandas versions name the
# value_counts() result "count", which would break the column lookups below and
# in the plotting cell that reads merged_weather["Weather_Conditions"].
weather = (
    accidents["Weather_Conditions"]
    .value_counts(dropna=False)
    .rename("Weather_Conditions")
    .rename_axis(None)
)
# Read the sheet from the already opened workbook instead of re-opening the file by path.
weather_lookup = pd.read_excel(lookup, sheet_name="Weather")
merged_weather = pd.merge(weather, weather_lookup, left_index=True, right_on='code')
merged_weather = merged_weather[merged_weather["Weather_Conditions"] > MIN_ACCIDENTS_PER_WEATHER]
merged_weather

In [None]:
# Bar chart: number of accidents for each retained weather condition.
plt.figure(figsize=(15, 10))
sns.barplot(x="label", y="Weather_Conditions", data=merged_weather)
plt.xlabel('Weather Condition')
plt.ylabel('Number of Accidents')
plt.xticks(rotation=45)
plt.title('Number of Accidents Given Weather Condition')

Here we find that the accidents are not correlated with the weather, as most of the accidents happen in very good weather.

In [None]:
# Correlation of the weather code with every other numeric accident column.
# The matrix is computed once (the frame was previously correlated twice) and
# only on numeric columns, which is what corr() can work with.
corr_matrix = accidents.select_dtypes(include="number").corr()

# Remove the self-correlation by label rather than by comparing floats with 1.
weather_corr = corr_matrix["Weather_Conditions"].drop("Weather_Conditions").sort_values()
weather_corr

In [None]:
# Line plot of the sorted correlations, one point per feature, in the order given.
plt.figure(figsize=(20, 7))
ax = sns.lineplot(data=weather_corr, sort=False)
ax.set_xlabel('Feature')
ax.set_ylabel('Coorelatation with Weather')
plt.xticks(rotation=90)
plt.title('The relationship of weather with other features')

As we can tell from this plot, only the Road Surface Conditions and Light Conditions are correlated with weather, which makes sense since weather affects both of them. Based on this piece of information and our other plot, we can come to the conclusion that the weather doesn't really affect the accident conditions, so we would rather invest in other areas to help reduce accidents.

In [None]:
# Vehicle records per driver age band, labelled through the "Age Band" sheet.
# Pin the Series name and clear the index name: recent pandas versions name the
# value_counts() result "count", which would break the plotting cell that reads
# merged_age_band["Age_Band_of_Driver"].
age_band = (
    vehicles["Age_Band_of_Driver"]
    .value_counts(dropna=False)
    .rename("Age_Band_of_Driver")
    .rename_axis(None)
)
# Read the sheet from the already opened workbook instead of re-opening the file by path.
age_band_lookup = pd.read_excel(lookup, sheet_name="Age Band")
merged_age_band = pd.merge(age_band, age_band_lookup, left_index=True, right_on='code')
# Code -1 is excluded (presumably the "missing / unknown" marker; confirm in the data guide).
merged_age_band = merged_age_band[merged_age_band["code"] != -1]
merged_age_band

In [None]:
# Bar chart: record count per driver age band (counts come from the vehicles table).
plt.figure(figsize=(15, 10))
sns.barplot(x="label", y="Age_Band_of_Driver", data=merged_age_band)
plt.xlabel('Age band')
plt.ylabel('Number of Accidents')
plt.xticks(rotation=90)
plt.title('Number of Accidents Given Driver Age band')

As expected, the younger age bands appear to be the most likely to be involved in accidents, probably because they are generally more careless, or possibly because they are the age bands that spend the most time driving and are therefore the most exposed to accidents.
We can also tell that older people (66+) seem unlikely to be involved in accidents, probably because they are either very careful or simply don't drive that much.

In [None]:
# Casualty records per age band, labelled with the age band lookup loaded earlier.
# Pin the Series name and clear the index name: recent pandas versions name the
# value_counts() result "count", which would break the plotting cell that reads
# merged_age_band_caus["Age_Band_of_Casualty"].
age_band_caus = (
    casualties["Age_Band_of_Casualty"]
    .value_counts(dropna=False)
    .rename("Age_Band_of_Casualty")
    .rename_axis(None)
)
merged_age_band_caus = pd.merge(age_band_caus, age_band_lookup, left_index=True, right_on='code')
# Code -1 is excluded (presumably the "missing / unknown" marker; confirm in the data guide).
merged_age_band_caus = merged_age_band_caus[merged_age_band_caus["code"] != -1]
merged_age_band_caus

In [None]:
# Bar chart: record count per casualty age band (counts come from the casualties table).
plt.figure(figsize=(15, 10))
sns.barplot(x="label", y="Age_Band_of_Casualty", data=merged_age_band_caus)
plt.xlabel('Age band')
plt.ylabel('Number of Accidents')
plt.xticks(rotation=90)
plt.title('Number of Accidents Given Casualty Age band')

Interestingly, the top 2 casualty age bands happen to be the same as the top 2 driver age bands, which could strengthen the supposition that younger people tend to spend more time on the streets, thus effectively increasing their chances of getting into accidents, whether they are the drivers or the pedestrians.

In [None]:
from statsmodels.tsa.arima_model import ARIMA

# Daily number of accidents, restricted to rows with Accident_Severity == 3.
# NOTE(review): 3 is presumably the "Slight" code in the data guide, so the chart
# below does not cover every severity; confirm that this filter is intended.
dates = accidents[accidents["Accident_Severity"] == 3].copy()
dates["numb"] = np.ones(len(dates))  # per-row counter column, kept from the original cell

dates["Date"] = pd.to_datetime(dates["Date"], format='%d/%m/%Y')
dates = dates.sort_values("Date")

# One row per date. groupby().size() keeps each date next to its own count,
# instead of pairing Date.unique() with groupby().count() by position, which
# breaks (length mismatch) as soon as a date is missing.
timeser = (
    dates.groupby("Date")
    .size()
    .rename("accidents")
    .reset_index()
)
print(timeser.head())

plt.figure(figsize=(15,10))
sns.set(color_codes=True)

sns.lineplot(x="Date",y="accidents",data=timeser)

plt.xticks(rotation= 45)
plt.xlabel('Date')
plt.ylabel('Number of Accidents')
plt.title('Number of Accidents Given time')
plt.show()



**The trend initially peaks and then decreases until mid-2013, after which it increases at a lower rate**


In [None]:
# Zoom in on the first 370 rows of the daily series (one row per date).
plt.figure(figsize=(15, 10))
sns.set(color_codes=True)

first_370_rows = timeser.iloc[:370]
sns.lineplot(x="Date", y="accidents", data=first_370_rows)

plt.xlabel('Date')
plt.ylabel('Number of Accidents')
plt.xticks(rotation=45)
plt.title('Number of Accidents Given time')

**Seasonally, in December of every year the number of accidents decreases drastically, probably due to snowfall; it starts increasing again slightly but then decreases until April. During the summer the accidents increase slightly again, and they peak at the beginning of winter, before the snowfall.**

In [None]:
# One figure per calendar month of the first year in the daily series.
# The previous fixed 30-row windows drifted away from real month boundaries
# (and never reached the last days of the year), and all twelve figures carried
# the same title.
sns.set(color_codes=True)  # does not depend on the loop, so it is set once

first_year = timeser["Date"].dt.year.min()
first_year_daily = timeser[timeser["Date"].dt.year == first_year]

for month, month_daily in first_year_daily.groupby(first_year_daily["Date"].dt.month):
    plt.figure(figsize=(15,10))

    sns.lineplot(x="Date",y="accidents",data=month_daily)

    plt.xticks(rotation= 45)
    plt.xlabel('Date')
    plt.ylabel('Number of Accidents')
    # The year-month suffix tells the twelve figures apart.
    plt.title('Number of Accidents Given time (%d-%02d)' % (first_year, month))

**On a daily basis, there is no visible pattern in the number of accidents within a month**

In [None]:
# Count plot of vehicle records per label, after joining the vehicles table to a lookup sheet.
cars = vehicles.copy()

# NOTE(review): Vehicle_Type is decoded with the "Casualty Type" sheet, so the
# labels (e.g. 'Car occupant') describe casualties rather than vehicles. The
# workbook presumably has a dedicated vehicle-type sheet; confirm, and if the
# sheet is switched, any downstream filter on these label strings must follow.
df2 = pd.read_excel(lookup,'Casualty Type')
order = df2.iloc[:,1]
# Same sheet as df2: reuse it instead of parsing the workbook a second time.
case_lookup = df2
merged_vehicle = pd.merge(cars, case_lookup, left_on='Vehicle_Type', right_on='code')

plt.figure(figsize=(15,10))
sns.countplot(x= 'label' ,data=merged_vehicle)
plt.xticks(rotation= 90)
plt.xlabel('Vehicle Type - Need Look up')
plt.ylabel('Number of Accidents')
plt.title('Number of Accidents Given Vehicle Type')

In [None]:
# Show the copy of the vehicles table for inspection.
cars

**The names of the vehicles were too long to include, so 9 is obviously normal car accidents, and since it blows the entire bar plot out of proportion, it will be excluded in the next bar plot.**

In [None]:
# Same count plot without cars, which otherwise dominate the scale.
# Cars are vehicle code 9. Filtering on the numeric code rather than on the
# label text keeps this cell correct whichever lookup sheet supplied the labels.
plt.figure(figsize=(15,10))
sns.countplot(x="label",data=merged_vehicle[merged_vehicle['Vehicle_Type']!=9])
plt.xticks(rotation= 90)
plt.xlabel('Vehicle Type - Need Look up')
plt.ylabel('Number of Accidents')
plt.title('Number of Accidents Given Vehicle Type')

# Print the lookup table so the labels on the axis can be matched to their codes.
print(df2)

**So here we find that cyclists are the second category by number of accidents, and unlike cars, which are very abundant, bikes are rare, so this means they are really dangerous.
Next up are trucks, which aren't rare to come by; according to studies, this is due to the fact that they are really hard to control in emergencies.
Third are motorcycles, which are also very dangerous but are safer on highroads than bikes.
After that come buses and heavier trucks.
Cars are the vehicles you'll see most often on the roads, which explains why they have 10x as many accidents as any other vehicle.**

In [None]:
# Number of casualty records for each age value (missing ages get their own row).
people_age = casualties.loc[:, "Age_of_Casualty"].value_counts(dropna=False)
people_age.head()

In [None]:
# Divide the distinct casualty ages into 5 groups so they can be drawn against
# the casualty type.
def _age_group(age):
    """Return the 0-based group index for an age.

    Groups are: under 20, 20-41, 42-62, 63-83, and 84 or older. The boundary
    ages 20, 42, 63 and 84 used to fail every strict `a < age < b` test and
    ended up in the last group; half-open ranges put them where they belong.
    Values for which every comparison is False (e.g. NaN) still fall into the
    last group, as before.
    """
    if age < 20:
        return 0
    if age < 42:
        return 1
    if age < 63:
        return 2
    if age < 84:
        return 3
    return 4


# NOTE(review): a negative age such as -1 (presumably a "missing" marker) still
# lands in the first group, exactly as before; confirm whether it should be dropped.
_age_groups = [[] for _ in range(5)]
for age in people_age.index:
    _age_groups[_age_group(age)].append(age)

(people_age_first,
 people_age_second,
 people_age_third,
 people_age_fourth,
 people_age_fifth) = _age_groups

In [None]:
# Number of distinct age values that fell into the first (youngest) group.
len(people_age_first)

In [None]:
# Bar chart: number of casualty records for each individual age value.
plt.figure(figsize=(25, 25))
sns.barplot(x=people_age.index, y=people_age)
plt.xlabel('Casualities Age')
plt.ylabel('Casualities Number')
plt.xticks(rotation=45)
plt.title('Number of Casualities given the Age')

# Author's observation: the maximum number of casualties occurs at ages 18-20.

In [None]:
# Number of casualty records per casualty class (missing values get their own row).
casuality_class = casualties.loc[:, "Casualty_Class"].value_counts(dropna=False)
casuality_class.head()

In [None]:
# Number of casualty records per severity code (missing values get their own row).
casualty_severity = casualties.loc[:, "Casualty_Severity"].value_counts(dropna=False)
casualty_severity.head()

In [None]:
# Casualty severity broken down by casualty class.
# The previous plot drew casuality_class.index against casualty_severity.index,
# i.e. it paired the n-th most frequent class with the n-th most frequent
# severity, which says nothing about how the two columns relate. A
# cross-tabulation counts the actual (severity, class) combinations per record.
class_vs_severity = pd.crosstab(casualties["Casualty_Severity"], casualties["Casualty_Class"])

plt.figure(figsize=(15,15))
sns.heatmap(class_vs_severity, annot=True, fmt="d", linewidths=.5)
plt.xticks(rotation= 45)
plt.xlabel('Casualities Class')
plt.ylabel('Casualities Severity')
plt.title('Casualities Class Vs Casuality Severity')

# NOTE(review): the earlier reading (driver = fatal, passenger = serious,
# pedestrian = slight) was an artifact of the index pairing described above;
# re-interpret the relationship from the counts in this heatmap.


In [None]:
# Number of casualty records per casualty type code (missing values get their own row).
casuality_type = casualties.loc[:, "Casualty_Type"].value_counts(dropna=False)
casuality_type.head()

In [None]:
# Casualty type by age for the youngest age group (the ages in people_age_first).
# The previous plot drew the list of ages against casuality_type.index, pairing
# two unrelated orderings position by position. Counting the actual
# (type, age) combinations per casualty record gives a meaningful picture.
young_casualties = casualties[casualties["Age_of_Casualty"].isin(people_age_first)]
type_by_age = pd.crosstab(young_casualties["Casualty_Type"], young_casualties["Age_of_Casualty"])

plt.figure(figsize=(15,15))
sns.heatmap(type_by_age, linewidths=.5)
plt.xticks(rotation= 45)
plt.xlabel('Casualities Age')
plt.ylabel('Casualities Type')
plt.title('Casualities Age Vs Casuality Type')

# NOTE(review): the earlier per-age readings (e.g. "age 0 -> electric
# motorcycle") came from that positional pairing, not from the data; re-read
# them from the heatmap. A negative age column, if present, is presumably the
# "missing age" marker; confirm.

In [None]:
# Drop every accident row that has a missing value in any column.
# NOTE(review): this rebinds `accidents`, so all later cells that read it
# (correlations, count plots, the forecasting series) only see complete rows;
# confirm that this is intended.
# Only the last expression of a cell is displayed, so the intermediate checks
# are printed explicitly instead of being silently discarded.
print(accidents.shape)  # size before dropping
accidents = accidents.dropna()
print(accidents.isnull().any())  # every entry should now be False
accidents.head()

In [None]:
# Code-to-label table for weather conditions, read from the opened data guide workbook.
weather_lookup = pd.read_excel(lookup, sheet_name="Weather")
weather_lookup

In [None]:
# Code-to-label table for light conditions, read from the opened data guide workbook.
light_lookup = pd.read_excel(lookup, sheet_name="Light Conditions")
light_lookup

In [None]:
# Single-column frames for the count plots, plus the frequency of each casualty count.
weather_conditions = accidents.loc[:, ['Weather_Conditions']]
light_conditions = accidents.loc[:, ['Light_Conditions']]
no_of_casualties = accidents["Number_of_Casualties"].value_counts(dropna=False)
weather_conditions.isnull().any()  # result is not displayed; only the last line is
light_conditions.columns

In [None]:
# Count plot: number of accidents per weather condition code.
sns.set(style="darkgrid")
plt.figure(figsize=(15, 10))
plt.xticks(rotation=0)
weatherplot = sns.countplot(data=weather_conditions, x='Weather_Conditions')
weatherplot.set_xlabel('Weather conditions')
weatherplot.set_ylabel('Number of Accidents')
plt.show()

In [None]:
# Count plot: number of accidents per light condition code.
sns.set(style="darkgrid")
plt.figure(figsize=(15, 10))
plt.xticks(rotation=0)
weatherplot = sns.countplot(data=light_conditions, x='Light_Conditions')
weatherplot.set_xlabel('Light conditions')
weatherplot.set_ylabel('Number of Accidents')
plt.show()

In [None]:
# Two-column frame pairing each accident's light condition with its weather condition.
data_frame_conditions = accidents.loc[:, ['Light_Conditions', 'Weather_Conditions']]
data_frame_conditions.columns

In [None]:
# Count plot: accidents per light condition code, split by weather condition code.
sns.set(style="darkgrid")
plt.figure(figsize=(20, 25))
plt.xticks(rotation=0)
weatherplot = sns.countplot(
    data=data_frame_conditions,
    x='Light_Conditions',
    hue="Weather_Conditions",
)
weatherplot.set_xlabel('Light conditions')
weatherplot.set_ylabel('Number of Accidents')
plt.show()

In [None]:
# Heatmap of the noticeable pairwise correlations between numeric accident columns.
# corr() is restricted to numeric columns, which is what it can work with.
corr = accidents.select_dtypes(include="number").corr()
# Blank out weak correlations (|r| <= 0.1) so only the stronger ones are annotated.
corr = corr.where(corr.abs() > 0.1)

# Hide the upper triangle and the diagonal (the matrix is symmetric).
# The np.bool alias was removed from NumPy; the builtin bool is the supported dtype.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

plt.figure(figsize=(25,15))
sns.heatmap(corr, linewidths=.5,annot=True,mask=mask)

In [None]:
# Columns needed for the urban/rural breakdown.
urban_rural = accidents.loc[:, ['Urban_or_Rural_Area', 'Speed_limit', 'Road_Surface_Conditions']]
urban_rural.head()

In [None]:
# Attach the urban/rural label to each accident row through its area code.
area_lookup = pd.read_excel(lookup, sheet_name="Urban Rural")
merged_urban = urban_rural.merge(
    area_lookup,
    left_on='Urban_or_Rural_Area',
    right_on='code',
)
merged_urban

In [None]:
# Count plot: accidents per urban/rural label, split by speed limit.
sns.set(style="darkgrid")
plt.figure(figsize=(20, 10))
plt.xticks(rotation=0)
weatherplot = sns.countplot(data=merged_urban, x='label', hue="Speed_limit")
weatherplot.set_xlabel('Urban or Rural')
weatherplot.set_ylabel('Number of Accidents')
plt.show()

In [None]:
from datetime import datetime

# Accidents per date as a two-column frame: 'ds' (the date) and 'y' (the count),
# ordered by date.
ts = accidents['Date']
daily_counts = ts.groupby(ts).size()
ts_df = daily_counts.rename_axis('ds').reset_index(name='y')

# Model Testing

# Model

In [None]:
# Fit a Prophet model with default settings on the daily accident counts.
# The package was renamed from `fbprophet` to `prophet`; try the current name
# first and fall back to the old one so the cell runs in either environment.
try:
    from prophet import Prophet
except ImportError:
    from fbprophet import Prophet

m = Prophet()
m.fit(ts_df)

In [None]:
# Predict 365 periods beyond the end of the fitted series (one year if the
# series is daily) and show the last few predictions with their bounds.
future = m.make_future_dataframe(periods=365)
forecast = m.predict(future)
forecast.loc[:, ['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

In [None]:
# Plot the forecast produced by the fitted model.
fig1 =m.plot(forecast)

In [None]:
# Plot the individual components of the forecast.
fig2 = m.plot_components(forecast)

# Model Accuracy Metrics

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


# Train Set

In [None]:
# In-sample (training) error of the model fitted on the whole series.
# `forecast` was predicted on the history followed by the future rows, so its
# first len(ts_df) rows line up with ts_df. Deriving the length from ts_df
# replaces the hard-coded 4017-365, which only worked for one exact series length.
train_pred = forecast.iloc[:len(ts_df)].yhat

print("Train MSE: %0.2f"%mean_squared_error(ts_df.y,train_pred))
print("Train R2: %0.2f"%r2_score(ts_df.y,train_pred))
print("Train MAE: %0.2f"%mean_absolute_error(ts_df.y,train_pred))

# Test Set

In [None]:
# Hold-out evaluation: fit on everything except the last 2000 rows, then
# predict 2000 periods past the end of that training part.
train_part = ts_df.iloc[:-2000]

m_test = Prophet()
m_test.fit(train_part)

future_test = m_test.make_future_dataframe(periods=2000)
forecast_test = m_test.predict(future_test)

In [None]:
# Out-of-sample error on the last 2000 observed rows.
# Observed values and predictions are matched on the date column instead of by
# position, so a gap in the observed dates cannot shift the two series against
# each other; dates without a prediction are left out of the comparison.
holdout = ts_df.iloc[-2000:].merge(forecast_test[['ds', 'yhat']], on='ds', how='inner')

print("Test MSE: %0.2f"%mean_squared_error(holdout.y,holdout.yhat))
print("Test R2: %0.2f"%r2_score(holdout.y,holdout.yhat))
print("Test MAE: %0.2f"%mean_absolute_error(holdout.y,holdout.yhat))

In [None]:
# Mean and population standard deviation of the daily counts per calendar year.
# Fixed 365-row slices ignored leap years (so the windows drifted) and produced
# a tiny leftover 11th "year"; grouping on the actual year avoids both.
years = pd.to_datetime(ts_df["ds"]).dt.year
for year, daily_counts in ts_df["y"].groupby(years):
    values = daily_counts.to_numpy()
    print(year, np.average(values), np.std(values))
    