# Visual analysis of traffic accidents on Brazilian federal highways
<img src = 'https://user-images.githubusercontent.com/48328204/53973771-59c4fa00-40cf-11e9-8bc5-b9ac37ef915e.gif'>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from glob import glob
import matplotlib.pyplot as plt
import plotly.express as px
import datetime as dt
import os

# Read data

In [None]:
# Folder that holds the CSV exports to load.
path = r'/kaggle/input/brazil-highway-traffic-accidents/por_ocorrencias/'
# glob() returns matches in arbitrary, OS-dependent order; sorting makes the row
# order of the concatenated frame reproducible between runs.
all_files = sorted(glob(os.path.join(path, "*.csv")))

# Every file is read as ';'-separated, Latin-1 encoded, with ',' as decimal mark.
# The generator keeps only one per-file frame alive at a time until concat consumes it.
df_from_each_file = (pd.read_csv(f, sep=';', encoding='latin1', decimal=',') for f in all_files)
concatenated_df = pd.concat(df_from_each_file, ignore_index=True)

In [None]:
# Preview the first rows of the combined frame.
concatenated_df.head()

In [None]:
# List the column names of the combined frame.
concatenated_df.columns

In [None]:
# Keep an untouched copy before the frame is modified by the following cells.
concatenated_df_original = concatenated_df.copy()

In [None]:
# Convert the `data_inversa` column to pandas datetimes.
# NOTE(review): the textual date format in the input files is not visible here;
# confirm every file uses a format this cast parses as intended.
concatenated_df.data_inversa = concatenated_df.data_inversa.astype('datetime64[ns]')

In [None]:
# Use the `data_inversa` column as the row index.
concatenated_df.index = concatenated_df.data_inversa

In [None]:
# Remove the `data_inversa` column in place.
concatenated_df.drop('data_inversa', axis =1 , inplace = True)

In [None]:
# Number of records (non-null `id` values) for each `data_inversa` date.
time_series_full_day = concatenated_df.groupby('data_inversa')['id'].count()

# Graphs
<img src = 'https://thumbs.gfycat.com/GaseousIndelibleIndianhare-size_restricted.gif'>

In [None]:
# Line chart of the daily counts held in `time_series_full_day`.
figura = px.line()
figura.add_scatter(x=time_series_full_day.index, y=time_series_full_day)
figura.update_layout(
    title="#Acc on Brazilian federal highways per day",
    xaxis_title="",
    yaxis_title="#Acc",
    legend_title="",
    font={"family": "Courier New, monospace", "size": 18, "color": "black"},
    template="plotly_white",
)

In [None]:
# Count the records that fall in each calendar month of the date index.
monthly_periods = concatenated_df.index.to_period("M")
time_series_full_month = concatenated_df.groupby(monthly_periods)['id'].count()

# Plot the monthly counts; the period index is converted to timestamps for the x axis.
figura = px.line()
figura.add_scatter(
    x=time_series_full_month.index.astype('datetime64[ns]'),
    y=time_series_full_month,
)
figura.update_layout(
    title="#Acc on Brazilian federal highways per month",
    xaxis_title="",
    yaxis_title="#Acc",
    legend_title="",
    font={"family": "Courier New, monospace", "size": 18, "color": "black"},
    template="plotly_white",
)

In [None]:
# Monthly totals of the four per-accident people counters (dead, lightly injured,
# seriously injured, unharmed). The columns are selected *before* summing so the
# aggregation never runs over the frame's other columns, which include text
# fields that are slow or invalid to sum.
severity_columns = ['mortos', 'feridos_leves', 'feridos_graves', 'ilesos']
time_series_full_month_sev = concatenated_df.groupby(concatenated_df.index.to_period("M"))[severity_columns].sum()

figura = px.line()
# The x axis is the same for every trace, so convert the period index once.
month_starts = time_series_full_month_sev.index.astype('datetime64[ns]')
for i in time_series_full_month_sev.columns:
    figura.add_scatter(x=month_starts, y=time_series_full_month_sev[i], name=i)
figura.update_layout(
    title="#Acc on Brazilian federal highways per month and severity",
    xaxis_title="",
    yaxis_title="#Acc",
    legend_title="",
    font=dict(
        family="Courier New, monospace",
        size=14,
        color="black"
    ),
    template='plotly_white'
)
figura.show()

In [None]:
# Inspect the distinct values of the `tipo_acidente` column.
concatenated_df.tipo_acidente.unique()

In [None]:
# Variant labels, grouped under the single label each of them is folded into.
_ACCIDENT_TYPE_VARIANTS = {
    'Colisão com objeto': (
        'Colisão com objeto estático',
        'Colisão com objeto em movimento',
        'Colisão com objeto fixo',
        'Colisão com objeto móvel',
    ),
    'Atropelamento de animal': ('Atropelamento de Animal',),
    'Atropelamento de pessoa': ('Atropelamento de Pedestre',),
    'Queda de ocupante de veículo': ('Queda de motocicleta / bicicleta / veículo',),
    'Saída de Pista': ('Saída de leito carroçável',),
    'Colisão transversal': ('Colisão Transversal',),
    'Derramamento de carga': ('Derramamento de Carga',),
    'Danos eventuais': ('Danos Eventuais',),
}

# Flat lookup: variant label -> canonical label.
_ACCIDENT_TYPE_ALIASES = {
    variant: canonical
    for canonical, variants in _ACCIDENT_TYPE_VARIANTS.items()
    for variant in variants
}


def correct_type(text):
    """Return the canonical accident-type label for *text*.

    Known variant spellings are mapped to one shared label; any other value
    (including labels that are already canonical) is returned unchanged.
    """
    return _ACCIDENT_TYPE_ALIASES.get(text, text)

In [None]:
# Normalise the accident-type labels, one-hot encode them, and sum per month so
# each column holds the monthly number of accidents of that type. Only the
# `tipo_acidente` column is transformed; the full frame is not deep-copied.
tipo_corrigido = concatenated_df['tipo_acidente'].apply(correct_type).to_frame()

# Empty prefix/separator keep the dummy column names equal to the type labels.
time_series_full_month_tipo = pd.get_dummies(tipo_corrigido, sparse=False, drop_first=False, prefix='', prefix_sep='')
time_series_full_month_tipo = time_series_full_month_tipo.groupby(concatenated_df.index.to_period("M")).agg('sum')

figura = px.line()
# The x axis is the same for every trace, so convert the period index once.
month_starts = time_series_full_month_tipo.index.astype('datetime64[ns]')
for i in time_series_full_month_tipo.columns:
    figura.add_scatter(x=month_starts, y=time_series_full_month_tipo[i], name=i)
figura.update_layout(
    title="#Acc on Brazilian federal highways per month and type",
    xaxis_title="",
    yaxis_title="#Acc",
    legend_title="Type",
    font=dict(
        family="Courier New, monospace",
        size=14,
        color="black"
    ),
    template='plotly_white',
    legend=dict(font=dict(size=12),
                orientation="h")
)
figura.show()

In [None]:
# One-hot encode the state code (`uf`) and sum per month: one column per state.
uf_dummies = pd.get_dummies(concatenated_df[['uf']], sparse=False, drop_first=False, prefix='', prefix_sep='')
# Discard the "(null)" placeholder state; errors='ignore' keeps the cell working
# when the loaded files contain no such placeholder.
time_series_full_month_uf = uf_dummies.drop(columns=['(null)'], errors='ignore')
time_series_full_month_uf = time_series_full_month_uf.groupby(concatenated_df.index.to_period("M")).agg('sum')

figura = px.line()
# The x axis is the same for every trace, so convert the period index once.
month_starts = time_series_full_month_uf.index.astype('datetime64[ns]')
for i in time_series_full_month_uf.columns:
    figura.add_scatter(x=month_starts, y=time_series_full_month_uf[i], name=i)
figura.update_layout(
    title="#Acc on Brazilian federal highways per month and states",
    xaxis_title="",
    yaxis_title="#Acc",
    legend_title="",
    font=dict(
        family="Courier New, monospace",
        size=14,
        color="black"
    ),
    template='plotly_white'
)
figura.show()

In [None]:
# Per-state record counts, without the "(null)" placeholder state.
counts_by_uf = concatenated_df.groupby(concatenated_df.uf).count().reset_index()
time_series_full_month_uf = counts_by_uf[counts_by_uf['uf'] != "(null)"]

# Bar chart: one bar per state, height = number of non-null `id` values.
figura = px.bar(
    time_series_full_month_uf,
    x=time_series_full_month_uf['uf'],
    y=time_series_full_month_uf['id'],
)
figura.update_layout(
    title="#Acc on Brazilian federal highways per states",
    xaxis_title="States",
    yaxis_title="#Acc",
    legend_title="",
    font={"family": "Courier New, monospace", "size": 18, "color": "black"},
    template="plotly_white",
)
figura.show()

In [None]:
import plotly.graph_objects as go

nan = np.nan  # no longer needed by the filter; kept so the name stays defined

# Keep only 2019 accidents that have coordinates and a severity class.
# The year is taken with `.dt.year` because comparing the datetime column with
# the integer 2019 does not select by year, and missing values are tested with
# notna() because `value != nan` is true for every value, NaN included.
accidents = concatenated_df.reset_index()
in_2019 = accidents['data_inversa'].dt.year == 2019
map_fields = ['latitude', 'longitude', 'classificacao_acidente']
has_map_fields = accidents[map_fields].notna().all(axis=1)
concatenated_df_map = accidents[in_2019 & has_map_fields]

# One marker per accident; the hover text shows `classificacao_acidente`.
fig = go.Figure(data=go.Scattergeo(
        lon = concatenated_df_map['longitude'],
        lat = concatenated_df_map['latitude'],
        text = concatenated_df_map['classificacao_acidente'],
        mode = 'markers'
        ))

fig.update_layout(
        title = 'Acc on Brazilian federal highways (2019)<br>(Hover for severity)',
        geo_scope='south america'
    )
fig.show()

In [None]:
# Record counts grouped by the weekday name of the date index.
weekday_names = concatenated_df.index.day_name()
time_series_full_month_dia_semana = concatenated_df.groupby(weekday_names).count().reset_index()

# Fixed Monday-to-Sunday order for the bars.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
figura = px.bar(
    time_series_full_month_dia_semana,
    x=time_series_full_month_dia_semana['data_inversa'],
    y=time_series_full_month_dia_semana['id'],
    category_orders={"data_inversa": weekday_order},
)
figura.update_layout(
    title="#Acc on Brazilian federal highways per days of the week",
    xaxis_title="Days",
    yaxis_title="#Acc",
    legend_title="",
    font={"family": "Courier New, monospace", "size": 14, "color": "black"},
    template="plotly_white",
)
figura.show()

In [None]:
time_series_full_month_dia_horario = concatenated_df.copy()

# Hour of day (0-23) parsed from the HH:MM:SS `horario` strings.
hour_of_day = pd.to_datetime(concatenated_df['horario'], format='%H:%M:%S').dt.hour
# Three-hour buckets [0, 3), [3, 6), ..., [21, 24). right=False is required:
# with the default right-closed bins (0, 3] hour 0 falls outside every bin,
# becomes NaN and is silently dropped by the groupby below.
time_series_full_month_dia_horario['horario'] = pd.cut(hour_of_day, bins=list(range(0, 25, 3)), right=False)
time_series_full_month_dia_horario = time_series_full_month_dia_horario.groupby('horario').agg('count').reset_index()

# Interval labels are converted to strings so they can be used as bar categories.
figura = px.bar(time_series_full_month_dia_horario, x=time_series_full_month_dia_horario.horario.astype('str'), y=time_series_full_month_dia_horario.id)
figura.update_layout(
    title="#Acc on Brazilian federal highways per hour",
    xaxis_title="Hour",
    yaxis_title="#Acc",
    legend_title="",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="black"
    ),
    template='plotly_white'
)

figura.show()

In [None]:
# Distinct month names present in the date index.
concatenated_df.index.month_name().unique()

In [None]:
# Record counts grouped by the month name of the date index (all years pooled).
month_names = concatenated_df.index.month_name()
time_series_full_month_dia_mes = concatenated_df.groupby(month_names).count().reset_index()

# Fixed January-to-December order for the bars.
month_order = [
    'January', 'February', 'March', 'April', 'May', 'June', 'July',
    'August', 'September', 'October', 'November', 'December',
]
figura = px.bar(
    time_series_full_month_dia_mes,
    x=time_series_full_month_dia_mes['data_inversa'],
    y=time_series_full_month_dia_mes['id'],
    category_orders={"data_inversa": month_order},
)
figura.update_layout(
    title="#Acc on Brazilian federal highways per month",
    xaxis_title="Month",
    yaxis_title="#Acc",
    legend_title="",
    font={"family": "Courier New, monospace", "size": 14, "color": "black"},
    template="plotly_white",
)
figura.show()

# Time series
<img src ='https://i.pinimg.com/originals/97/0b/1c/970b1c6d874c8942c2a2fe29e17a07c5.gif'>

In [None]:
# Install pmdarima, the package `auto_arima` is imported from in the next cell.
!pip install pmdarima

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima.arima import auto_arima
from plotly.subplots import make_subplots
from fbprophet import Prophet

Only accidents with at least one death or one serious injury are used for the time-series analysis.

In [None]:
# Keep only accidents with at least one death or one serious injury.
severe_accidents = concatenated_df.query('(mortos > 0) or (feridos_graves > 0)').copy()

# Monthly count of those accidents, named 'acc' with index name 'datetimeindex'.
monthly_severe = severe_accidents.groupby(severe_accidents.index.to_period("M"))['id'].count()
time_series_full_month = monthly_severe.rename('acc').rename_axis('datetimeindex')

# Show both ends of the series.
time_series_full_month.head(), time_series_full_month.tail()

In [None]:
# Additive decomposition of the monthly series into trend, seasonal and residual
# components, using a cycle length of 12 observations.
# NOTE(review): newer statsmodels releases replace the `freq` argument with
# `period`; confirm against the installed version before upgrading.
decomposition = seasonal_decompose(time_series_full_month, model='additive', extrapolate_trend='freq', freq=12)

In [None]:
# Pull the four component series out of the decomposition result.
observed, trend, seasonal, resid = (
    decomposition.observed,
    decomposition.trend,
    decomposition.seasonal,
    decomposition.resid,
)

figura = make_subplots(rows=4, cols=1, shared_xaxes=True)

# One stacked panel per component, top to bottom, sharing the x axis.
panels = [("Observed", observed), ("Trend", trend), ("Seasonal", seasonal), ("Resid", resid)]
for panel_row, (label, component) in enumerate(panels, start=1):
    figura.add_trace(
        go.Scatter(x=component.index.astype('datetime64[ns]'), y=component, name=label),
        row=panel_row,
        col=1,
    )

figura.update_layout(
    title="#Acc on Brazilian federal highways per month",
    xaxis_title="",
    yaxis_title="",
    legend_title="",
    font={"family": "Courier New, monospace", "size": 18, "color": "black"},
    template="plotly_white",
)

## ARIMA

In [None]:
# Ordered 70/30 split: the first 70% of the monthly series is used for training
# and the remainder is held out for testing.
n_months = len(time_series_full_month)
train_size = int(n_months * 0.70)
train = time_series_full_month.iloc[:train_size]
test = time_series_full_month.iloc[train_size:]
train.head(), test.head()

In [None]:
# Let auto_arima choose and fit an ARIMA model on the training split, with
# warnings suppressed and error_action set to 'ignore'.
model = auto_arima(train, suppress_warnings=True, error_action='ignore')

In [None]:
# Show the order of the fitted model.
model.order

In [None]:
# Forecast as many periods as the test split contains, with confidence intervals.
predictions, conf_int = model.predict(n_periods=len(test), return_conf_int=True)

In [None]:
# Split the interval array into its two columns: column 0 is plotted as the
# lower bound and column 1 as the upper bound.
bounds = np.array(conf_int)
conf_int_inf, conf_int_sup = bounds[:, 0], bounds[:, 1]

In [None]:
# Observed series over the whole period.
figura = px.line()
figura.add_scatter(x=time_series_full_month.index.astype('datetime64[ns]'), y=time_series_full_month, name='Observed')

# All forecast traces share the test-period x axis; convert the index once.
test_dates = test.index.astype('datetime64[ns]')

# Label the forecast with the order of the fitted model rather than a hard-coded
# one, so the legend stays correct when the data or the model search changes.
figura.add_scatter(x=test_dates, y=predictions, name='Predictions - Arima {}'.format(model.order))

# Confidence band: an invisible upper line, then an invisible lower line that is
# filled up to the previous trace ('tonexty').
figura.add_scatter(name='Upper Bound', x=test_dates, y=conf_int_sup, marker=dict(color="#444"),
        line=dict(width=0), mode='lines',
        showlegend=False)
figura.add_scatter(name='Lower Bound', x=test_dates, y=conf_int_inf, marker=dict(color="#444"),
        line=dict(width=0),
        mode='lines',
        fillcolor='rgba(68, 68, 68, 0.1)',
        fill='tonexty',
        showlegend=False)
figura.update_layout(
    title="#Acc on Brazilian federal highways per month",
    xaxis_title="",
    yaxis_title="#Acc",
    legend_title="",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="black"
    ),
    template='plotly_white'
)

In [None]:
# Mean absolute error between the ARIMA forecast and the held-out test values.
from sklearn.metrics import mean_absolute_error
mean_absolute_error(test, predictions)

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Same ordered 70/30 split of the monthly series: leading part for training,
# trailing part for testing.
split_at = int(len(time_series_full_month) * 0.70)
train_size = split_at
train = time_series_full_month.iloc[:split_at]
test = time_series_full_month.iloc[split_at:]
train.head(), test.head()

In [None]:
# Feature table with one column per series: observed values, trend and seasonal
# component (built row-wise, then transposed).
# NOTE(review): the decomposition was computed on the full series, so trend and
# seasonal values for the test period were derived using test data — confirm
# this leakage is acceptable for the evaluation.
X = pd.DataFrame([decomposition.observed,decomposition.trend,decomposition.seasonal]).transpose()

In [None]:
# Add the 'acc' column shifted by 3, 4 and 5 rows as columns lag_3 .. lag_5.
for lag in (3, 4, 5):
    X["lag_{}".format(lag)] = X['acc'].shift(lag)

# Discard rows containing missing values (the first rows have no lagged value).
X.dropna(inplace=True)

In [None]:
# Ordered 70/30 split of the feature table.
train_size = int(len(X) * 0.70)
X_train = X.iloc[:train_size]
X_test = X.iloc[train_size:]

In [None]:
# Separate the target column ('acc') from the predictors.
y_train = X_train['acc']
y_test = X_test['acc']
# X_train / X_test are slices of X; dropping in place on a slice triggers
# pandas' SettingWithCopyWarning, so rebind to new frames without the target.
X_train = X_train.drop(columns=['acc'])
X_test = X_test.drop(columns=['acc'])

In [None]:
# Fit a linear regression of the target on the training features.
linearModel = LinearRegression()
linearModel.fit(X_train,y_train)

In [None]:
# Predict the target for the test rows (overwrites the earlier ARIMA `predictions`).
predictions = linearModel.predict(X_test)

In [None]:
# Observed monthly series with the linear-regression predictions for the test rows.
figura = px.line()
figura.add_scatter(
    x=time_series_full_month.index.astype('datetime64[ns]'),
    y=time_series_full_month,
    name='Observed',
)
figura.add_scatter(
    x=X_test.index.astype('datetime64[ns]'),
    y=predictions,
    name='Predictions - LR',
)

figura.update_layout(
    title="#Acc on Brazilian federal highways per month",
    xaxis_title="",
    yaxis_title="#Acc",
    legend_title="",
    font={"family": "Courier New, monospace", "size": 18, "color": "black"},
    template="plotly_white",
)

In [None]:
# Mean absolute error of the linear-regression predictions on the test rows.
mean_absolute_error(y_test, predictions)