<a href="https://colab.research.google.com/github/sahilfatima/Road-Accidents-Prediction/blob/main/Road_Accident.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<URL-encoded download URL>" entries
# (a single entry here). The download loop splits each entry on ':' and
# URL-decodes the second part.
# NOTE(review): the URL is a signed link carrying X-Goog-Date / X-Goog-Expires
# parameters, so it presumably stops working after some time - confirm before
# relying on it.
DATA_SOURCE_MAPPING = 'road-accident-casualties:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4229738%2F7292741%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240202%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240202T083532Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D48eb18357f02df0794bba713854474ab61bb66419849068f902c8a4a2b4ff55dc6e527455261fc601f0eea465e5bca2cd93601dbcd34356f4f586f820f29836051347a1899cc1a81d4ca05ca1dd9936c25f103c0623abfcca52f69b555f2c2b7dba7e0c02eabfda4b1b58f735d516a689c4db2cfb9745367ccb00be14187bd984e198b9292fe0292ff0bbaf5f834d3a4d963da28ac8f116efc65fbef65fcf2fcc9cea61fafb0f3d7ba7b7c6c81950582e64ed0139495fceea103995a66a8c853a10852a48e8c86e5509af28cd8d0d9a77028280e0e664e401aba74ec162091593b668945fdd493821e6b49b064d5606ef1e33805d158f1083671362ef8ac50d0'

# Directories created further down to hold downloaded data and working files.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced anywhere in the visible part of this file.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
# Wipe whatever is left under /kaggle/input (the shell line above tries to
# unmount it first); failures are ignored.
shutil.rmtree('/kaggle/input', ignore_errors=True)

# (Re)create the input and working directories with mode 0o777.
for kaggle_dir in (KAGGLE_INPUT_PATH, KAGGLE_WORKING_PATH):
    os.makedirs(kaggle_dir, 0o777, exist_ok=True)

# Create ../input and ../working as symlinks to the Kaggle directories.
for link_target, link_name in (
    (KAGGLE_INPUT_PATH, 'input'),
    (KAGGLE_WORKING_PATH, 'working'),
):
    try:
        os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
    except FileExistsError:
        # The link (or another entry with that name) is already there.
        pass

# Download every "<directory>:<URL-encoded URL>" entry of DATA_SOURCE_MAPPING
# and unpack the archive into KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray ':' in the URL part cannot break
    # the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header lookup yields None when the server sends no
            # Content-Length; treat that (or a malformed value) as "size
            # unknown" instead of crashing inside the progress computation.
            try:
                total_bytes = int(total_length)
            except (TypeError, ValueError):
                total_bytes = 0
            dl = 0
            # Stream the body in CHUNK_SIZE pieces until read() returns b''.
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                # 50-character progress bar; it stays empty when the total
                # size is unknown and is capped so it can never overflow.
                done = min(50, int(50 * dl / total_bytes)) if total_bytes > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # The tar branch reopens the temp file by name, so buffered bytes
            # must reach the disk before any archive reader touches it.
            tfile.flush()
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; only use this with trusted download sources.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here
                # would shadow the module and break every later tar download.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must precede OSError: HTTPError is one of its subclasses.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found below /kaggle/input.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime as dt
import plotly_express as px

In [None]:
# Read the accident CSV and reduce it to one row per distinct Accident_Date.
df = pd.read_csv('/kaggle/input/road-accident-casualties/data_accident.csv')
df['Accident_Date'] = pd.to_datetime(df['Accident_Date'], yearfirst=True)

# Calendar features derived from the parsed date (month and weekday as names).
accident_date = df['Accident_Date'].dt
df = df.assign(
    Year=accident_date.year,
    Month=accident_date.strftime('%B'),
    Day=accident_date.day,
    Week_Day=accident_date.strftime('%A'),
)

# 'Count' holds the number of rows recorded for each date.
day_keys = ['Accident_Date', 'Year', 'Month', 'Week_Day', 'Day']
rows_per_day = df.groupby(day_keys)['Accident_Date'].count()
df = rows_per_day.reset_index(name='Count')
df.head(3)

In [None]:
# Visualising data: how the per-day accident count is distributed.
plt.figure(figsize=(15, 4))

# Left panel: histogram of the daily counts.
plt.subplot(121)
# The frame is passed as data= (not positionally) so the call does not depend
# on the seaborn version's positional signature and matches the other cells.
sns.histplot(data=df, x='Count')
plt.title('Distribution of the number of accidents per day')

# Right panel: one box of daily counts per year.
plt.subplot(122)
sns.boxplot(data=df, x='Year', y='Count')
plt.title('Statistical Distribution of the number of accidents per day')
plt.show()

In [None]:
# Total number of accidents per weekday, per month and per year.
# The aggregations are computed up front so the plotting calls stay short;
# weekday and month totals are ordered from highest to lowest.
by_week_day = (
    df.groupby('Week_Day')['Count'].sum()
    .reset_index(name='Count')
    .sort_values(by='Count', ascending=False)
)
by_month = (
    df.groupby('Month')['Count'].sum()
    .reset_index(name='Count')
    .sort_values(by='Count', ascending=False)
)
by_year = df.groupby('Year')['Count'].sum().reset_index(name='Count')

plt.figure(figsize=(20, 5))

# Frames are passed as data= (not positionally) so the calls do not depend on
# the seaborn version's positional signature and match the other cells.
plt.subplot(131)
sns.barplot(data=by_week_day, x='Week_Day', y='Count')
plt.title('Accident Number by Day of Week Day')
plt.xticks(rotation=45)

plt.subplot(132)
sns.barplot(data=by_month, x='Month', y='Count')
plt.title('Accident Number by Month')
plt.xticks(rotation=45)

plt.subplot(133)
sns.barplot(data=by_year, x='Year', y='Count')
plt.title('Accident Number by Year')
plt.xticks(rotation=45)
plt.show()

In [None]:
# One figure per month: accident count by day of month, one line per year.
list_month = df['Month'].unique()
for month in list_month:
    month_rows = df.query(f"Month=='{month}'")
    plt.figure(figsize=(15, 4))
    sns.lineplot(data=month_rows, x='Day', y='Count', hue='Year')
    # Faint horizontal grid lines only.
    plt.grid(axis='y', color='black', linestyle='-', linewidth=0.1)
    plt.title(f"Accident Number by Month: {month}")
    # Same fixed y-range on every figure.
    plt.ylim(0, 700)
    plt.show()

In [None]:
# Load the latitude table (df2) and the longitude table (df3), then print
# their shapes and column labels.
df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/road-accident-casualties/latitutde_accident.csv',
        '/kaggle/input/road-accident-casualties/longitutde_accident.csv',
    )
)

coordinate_frames = (df2, df3)
print(*(frame.shape for frame in coordinate_frames))
print(*(frame.columns for frame in coordinate_frames))

In [None]:
# Count how often each (Latitude, Longitude) pair occurs, most frequent first.
# Rows of df2 and df3 are paired by position; zip stops at the shorter column.
coordinate_columns = ['Latitude', 'Longitude']
lat_long_data = list(zip(df2['Latitude'], df3['Longitude']))
# 'Local' is a constant 1 per row, so its sum per group is the row count.
lat_long = pd.DataFrame(lat_long_data, columns=coordinate_columns).assign(Local=1)
local_totals = lat_long.groupby(coordinate_columns)['Local'].sum()
df_lat_long = local_totals.sort_values(ascending=False)
display(df_lat_long)

In [None]:
# Number of coordinate pairs whose total is greater than 5.
above_5_mask = df_lat_long.values > 5
locais_mais_5 = df_lat_long[above_5_mask].count()
locais_mais_5

In [None]:
# Coordinate pairs with a total greater than 5, flattened so that Latitude
# and Longitude become regular columns next to 'Local'.
data_mais_5 = df_lat_long[df_lat_long.values > 5].reset_index()
data_mais_5

In [None]:
# Map of the coordinate pairs with a total above 5, coloured by that total.
fig = px.scatter_mapbox(
    data_mais_5,
    lat="Latitude",
    lon="Longitude",
    color='Local',
    zoom=3,
    height=500,
)
# OpenStreetMap tiles, no margin around the map.
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r":0,"t":0,"l":0,"b":0},
)
fig.show()

In [None]:
# Same map as before, restricted to coordinate pairs with a total above 15.
above_15 = df_lat_long[df_lat_long.values > 15]
locais_mais_15 = above_15.count()
data_mais_15 = above_15.reset_index()

fig = px.scatter_mapbox(
    data_mais_15,
    lat="Latitude",
    lon="Longitude",
    color='Local',
    zoom=3,
    height=500,
)
# OpenStreetMap tiles, no margin around the map.
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r":0,"t":0,"l":0,"b":0},
)
fig.show()

In [None]:
# Relative frequency of each Light_Conditions value.
# NOTE: this rebinds df3, which held the longitude table in an earlier cell.
df3 = pd.read_csv('/kaggle/input/road-accident-casualties/dark-light_accident.csv')
light_share = df3['Light_Conditions'].value_counts(normalize=True)
light_share.reset_index(name='Accident %')

In [None]:
# Relative frequency of each Urban_or_Rural_Area value.
df4 = pd.read_csv('/kaggle/input/road-accident-casualties/ubar_rural_area.csv')
area_share = df4['Urban_or_Rural_Area'].value_counts(normalize=True)
area_share.reset_index(name='%')

In [None]:
# Relative frequency of each Accident_Severity value.
df5 = pd.read_csv('/kaggle/input/road-accident-casualties/accident_fatal.csv')
severity_share = df5['Accident_Severity'].value_counts(normalize=True)
severity_share.reset_index(name='%')

In [None]:
# Districts with the most and the fewest recorded accidents (10 each).
df6 = pd.read_csv('/kaggle/input/road-accident-casualties/disctrit-area_accident.csv')

# Count once and reuse for both panels. rename_axis pins the label column to
# 'District_Area': without it, older pandas versions name that column 'index'
# after reset_index and the x='District_Area' lookups below fail.
district_counts = (
    df6['District_Area']
    .value_counts()
    .rename_axis('District_Area')
    .reset_index(name='Count')
)

plt.figure(figsize=(16, 4))

# value_counts sorts descending, so head/tail give the two extremes.
plt.subplot(121)
sns.barplot(data=district_counts.head(10), x='District_Area', y='Count')
plt.title('Districts with the most accidents')
plt.xticks(rotation=45)

plt.subplot(122)
sns.barplot(data=district_counts.tail(10), x='District_Area', y='Count')
plt.title('Districts with the fewer accidents')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Relative frequency of each Road_Surface_Conditions value.
df7 = pd.read_csv('/kaggle/input/road-accident-casualties/road-surface-condictions_accident.csv')
surface_share = df7['Road_Surface_Conditions'].value_counts(normalize=True)
surface_share.reset_index(name='%')

In [None]:
# Relative frequency of each Road_Type value.
df8 = pd.read_csv('/kaggle/input/road-accident-casualties/road_type- corrigir G-sheets.csv')
road_type_share = df8['Road_Type'].value_counts(normalize=True)
road_type_share.reset_index(name='%')

In [None]:
# Relative frequency of each Weather_Conditions value.
df9 = pd.read_csv('/kaggle/input/road-accident-casualties/wheather_condictions_accident.csv')
weather_share = df9['Weather_Conditions'].value_counts(normalize=True)
weather_share.reset_index(name='%')

In [None]:
# Number of records per Vehicle_Type, drawn as a horizontal bar chart.
df10 = pd.read_csv('/kaggle/input/road-accident-casualties/vehicule_type.csv')

# rename_axis pins the label column to 'Vehicle_Type': without it, older
# pandas versions name that column 'index' after reset_index and the
# y='Vehicle_Type' lookup below fails.
vehicle_counts = (
    df10['Vehicle_Type']
    .value_counts()
    .rename_axis('Vehicle_Type')
    .reset_index(name='Count')
)

plt.figure(figsize=(8, 4))
sns.barplot(data=vehicle_counts, y='Vehicle_Type', x='Count')
plt.title('Road-accident-casualties/vehicule_type')
plt.show()