In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file below the Kaggle input directory,
# then print them one per line (same order as os.walk yields them).
input_files = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_path in input_files:
    print(input_path)

# Any results you write to the current directory are saved as output.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# US Accidents Analysis 

## 1. About Data

This is a countrywide traffic accident dataset, which covers 49 states of the United States. The data is continuously being collected from February 2016, using several data providers, including two APIs which provide streaming traffic event data.

## 2. Columns Description 

**ID** => This is a unique identifier of the accident record.<br>
**Source** => Indicates source of the accident report (i.e. the API which reported the accident.). <br>
**TMC** => A traffic accident may have a Traffic Message Channel (TMC) code which provides a more detailed description of the event. <br>
**Severity** => Shows the severity of the accident, a number between 1 and 4, where 1 indicates the least impact on traffic (i.e., short delay as a result of the accident) and 4 indicates a significant impact on traffic (i.e., long delay).<br>
**Start_Time** => Shows start time of the accident in local time zone.<br>
**End_Time** => Shows end time of the accident in local time zone.<br>
**Start_Lat** => Shows latitude in GPS coordinate of the start point.<br>
**Start_Lng** => Shows longitude in GPS coordinate of the start point.<br>
**End_Lat** => Shows latitude in GPS coordinate of the end point.<br>
**End_Lng** => Shows longitude in GPS coordinate of the end point.<br>
**Distance(mi)** => The length of the road extent affected by the accident.<br>
**Description** => Shows natural language description of the accident.<br>
**Number** => Shows the street number in address field.<br>
**Street** => Shows the street name in address field.<br>
**Side** => Shows the relative side of the street (Right/Left) in address field.<br>
**City** => Shows the city in address field.<br>
**County** => Shows the county in address field.<br>
**State** => Shows the state in address field.<br>
**Zipcode** => Shows the zipcode in address field.<br>
**Country** => Shows the country in address field.<br>
**Timezone** => Shows timezone based on the location of the accident (eastern, central, etc.).<br>
**Airport_Code** => Denotes an airport-based weather station which is the closest one to location of the accident.<br>
**Weather_Timestamp** => Shows the time-stamp of weather observation record (in local time).<br>
**Temperature(F)** => Shows the temperature (in Fahrenheit).<br>
**Wind_Chill(F)** => Shows the wind chill (in Fahrenheit).<br>
**Humidity(%)** => Shows the humidity (in percentage).<br>
**Pressure(in)** => Shows the air pressure (in inches).<br>
**Visibility(mi)** => Shows visibility (in miles).<br>
**Wind_Direction** => Shows wind direction.<br>
**Wind_Speed(mph)** => Shows wind speed (in miles per hour).<br>
**Precipitation(in)** => Shows precipitation amount in inches, if there is any.<br>
**Weather_Condition** => Shows the weather condition (rain, snow, thunderstorm, fog, etc.)<br>
**Amenity** => A POI annotation which indicates presence of amenity in a nearby location.<br>
**Bump** => A POI annotation which indicates presence of speed bump or hump in a nearby location.<br>
**Crossing** => A POI annotation which indicates presence of crossing in a nearby location.<br>
**Give_Way** => A POI annotation which indicates presence of give_way in a nearby location.<br>
**Junction** => A POI annotation which indicates presence of junction in a nearby location.<br>
**No_Exit** => A POI annotation which indicates presence of no_exit in a nearby location.<br>
**Railway** => A POI annotation which indicates presence of railway in a nearby location.<br>
**Roundabout** => A POI annotation which indicates presence of roundabout in a nearby location.<br>
**Station** => A POI annotation which indicates presence of station in a nearby location.<br>
**Stop** => A POI annotation which indicates presence of stop in a nearby location.<br>
**Traffic_Calming** => A POI annotation which indicates presence of traffic_calming in a nearby location.<br>
**Traffic_Signal** => A POI annotation which indicates presence of traffic_signal in a nearby location.<br>
**Turning_Loop** => A POI annotation which indicates presence of turning_loop in a nearby location.<br>
**Sunrise_Sunset** => Shows the period of day (i.e. day or night) based on sunrise/sunset.<br>
**Civil_Twilight** => Shows the period of day (i.e. day or night) based on civil twilight.<br>
**Nautical_Twilight** => Shows the period of day (i.e. day or night) based on nautical twilight.<br>
**Astronomical_Twilight** => Shows the period of day (i.e. day or night) based on astronomical twilight.<br>

## 3. Data Exploration

In [None]:
# Load the May 2019 snapshot of the US Accidents dataset.
df = pd.read_csv(filepath_or_buffer='../input/us-accidents/US_Accidents_May19.csv')

In [None]:
# Let pandas render up to 500 rows and 500 columns so wide frames are not truncated.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, 500)

In [None]:
# Preview the first five records.
df.head(n=5)

In [None]:
# Summary statistics for the columns pandas treats as numeric.
df.describe()

In [None]:
# One summary row per column: name, dtype, number of nulls, percentage of nulls
# (rounded to 2 decimals), number of distinct values and the first two distinct
# values as a sample.
listItem = [
    [
        name,
        df[name].dtype,
        df[name].isna().sum(),
        round((df[name].isna().sum() / len(df[name])) * 100, 2),
        df[name].nunique(),
        list(df[name].unique()[:2]),
    ]
    for name in df.columns
]

dfDesc = pd.DataFrame(
    data=listItem,
    columns=['dataFeatures', 'dataType', 'null', 'nullPct', 'unique', 'uniqueSample'],
)
dfDesc

## 4. Data Preprocessing 

### 4.1 Handle Missing Values

-  **TMC**

In [None]:
# Which sources do the records with a missing TMC code come from?
tmc_missing = df['TMC'].isnull()
df.loc[tmc_missing, 'Source'].value_counts()

My assumption is that the records with a missing TMC value are the ones whose Source is Bing.

In [None]:
# Total number of records reported by Bing, to compare with the null count above.
df.loc[df['Source'] == 'Bing'].shape[0]

Yes, that is true: none of the records from Bing has a TMC value, so I decided to fill the gaps with a placeholder such as "Unknown". Ideally we would investigate why the Bing data arrives with NaN in TMC, but I am skipping that for now because it would take a long time.

In [None]:
# Replace missing TMC codes with the placeholder string 'Unknown'.
tmc_filled = df['TMC'].fillna(value='Unknown')
df['TMC'] = tmc_filled

- **End_Lat , End_Lng**

About 76% of the values in these two columns are missing, so I decided to drop both columns, because it would be very hard to fill them with meaningful values.

In [None]:
# Remove the end-point coordinate columns from df in place.
df.drop(columns=['End_Lat', 'End_Lng'], inplace=True)

- **Description**

I decided to fill the missing values with "Unknown".

In [None]:
# Replace missing descriptions with the placeholder string 'Unknown'.
description_filled = df['Description'].fillna(value='Unknown')
df['Description'] = description_filled

- **Number**

I will not use this column, and too many of its values are null, so I decided to drop it.

In [None]:
# Remove the street-number column from df in place.
df.drop(columns='Number', inplace=True)

- **City**

Only a very small number of values are missing, so I decided to delete the affected rows.

In [None]:
# Drop, in place, every row whose City value is missing.
df.dropna(axis='index', subset=['City'], inplace=True)

- **Lazy Cleaning**

To speed this work up, I decided to drop every column in which more than 50% of the values are missing and, for the remaining columns, to drop the rows where the value is missing.

In [None]:
def remove_column_or_row(name_col, frame=None, threshold=50):
    """Drop a sparse column, or drop the rows where that column is missing.

    The percentage of missing values in ``name_col`` is compared with
    ``threshold``. Strictly above it, the whole column is removed; otherwise
    only the rows whose value in that column is missing are removed.
    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    name_col : hashable
        Label of the column to inspect.
    frame : pandas.DataFrame, optional
        Frame to clean in place. When omitted, the notebook-level ``df`` is
        used, which is the original behaviour.
    threshold : float, default 50
        Missing-value percentage above which the column itself is dropped.

    Raises
    ------
    KeyError
        If ``name_col`` is not a column of the frame.
    """
    target = df if frame is None else frame
    # The mean of the boolean null mask is the missing fraction (null count / row count).
    missing_pct = target[name_col].isnull().mean() * 100
    if missing_pct > threshold:
        target.drop(columns=name_col, inplace=True)
    else:
        target.dropna(subset=[name_col], inplace=True)

In [None]:
# Apply the drop-column-or-rows rule to every column, working from a snapshot
# of the column labels taken before any of them is removed.
column_names = list(df.columns)
for column_name in column_names:
    remove_column_or_row(column_name)

In [None]:
# Remaining null count per column after cleaning.
df.isna().sum()

Boom, like magic, our data is clean. Note, however, that we may lose a lot of information by cleaning the data this way: before deleting a row or a column, you should first investigate why the data is missing.

### 4.2 Feature Engineering

In [None]:
# Show the column labels that remain after the cleaning steps above.
df.columns

> Drop Column ID

In [None]:
# Remove the record-identifier column from df in place.
df.drop(columns='ID', inplace=True)

> Make Minutes Column based Start_time and End_time

In [None]:
# Duration of each accident: End_Time minus Start_Time, expressed in minutes.
end_time = pd.to_datetime(df['End_Time'])
start_time = pd.to_datetime(df['Start_Time'])
diff = end_time - start_time
# astype(int) discards the fractional part of the minute count.
df['Minutes'] = (diff.dt.total_seconds() / 60).astype(int)

## 5. EDA

In [None]:
# Column labels available for the exploratory analysis below.
df.columns

> Count Severity

In [None]:
# Left panel: accident counts for every severity level.
# Right panel: only severity codes 0 and 1, drawn on their own scale.
# NOTE(review): the column description says Severity ranges from 1 to 4;
# confirm that code 0 actually occurs in this snapshot.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))

# Pass the data through the `x` keyword: handing the vector over positionally
# is deprecated in seaborn 0.11 and no longer means `x` in later releases.
sns.countplot(x=df['Severity'], ax=ax[0])
ax[0].set_title('Accidents by severity')

low_severity = df.loc[df['Severity'].isin([0, 1]), 'Severity']
sns.countplot(x=low_severity, ax=ax[1])
ax[1].set_title('Accidents with severity 0 or 1');

In [None]:
# Share of records per severity level (normalize=True returns fractions instead of counts).
df['Severity'].value_counts(normalize=True)

About 65% of the accidents fall under Severity 2, i.e. ordinary accidents whose impact on traffic is not too severe.

> Top Ten State With the most Accidents in US

In [None]:
import plotly.graph_objects as go

# Pie chart of the ten states with the most accident records.
top_states = df['State'].value_counts().head(10)
labels = top_states.index
values = top_states.values

state_pie = go.Pie(labels=labels, values=values)
fig = go.Figure(data=[state_pie])
fig.show()

In [None]:
# import plotly.graph_objects as go

# Pie chart of the ten most frequent TMC values.
top_tmc = df['TMC'].value_counts().head(10)
labels = top_tmc.index
values = top_tmc.values

tmc_pie = go.Pie(labels=labels, values=values)
fig = go.Figure(data=[tmc_pie])
fig.show()

> The Longest Accident in Minutes

In [None]:
# Ten records with the largest Minutes value, restricted to the columns of interest.
report_columns = ['Severity', 'County', 'Description', 'Distance(mi)', 'Minutes']
df[report_columns].sort_values(by='Minutes', ascending=False).head(10)

In [None]:
# Distribution of accident duration in minutes.
# NOTE(review): distplot is deprecated in newer seaborn releases; consider
# histplot/displot once the environment's seaborn version is confirmed.
minutes = df['Minutes']
sns.distplot(a=minutes)
plt.tight_layout()

> The Longest Accident in Distance(mi)

In [None]:
# Ten records with the largest Distance(mi) value, restricted to the columns of interest.
report_columns = ['Severity', 'County', 'Description', 'Distance(mi)', 'Minutes']
df[report_columns].sort_values(by='Distance(mi)', ascending=False).head(10)

In [None]:
# Distribution of the affected road length in miles.
# NOTE(review): distplot is deprecated in newer seaborn releases; consider
# histplot/displot once the environment's seaborn version is confirmed.
distance = df['Distance(mi)']
sns.distplot(a=distance)
plt.tight_layout()

> Relationship Between Distance and Minutes

In [None]:
import plotly.express as px
# Scatter plot of duration (x axis) against affected road length (y axis).
minutes = df['Minutes']
distance = df['Distance(mi)']
fig = px.scatter(x=minutes, y=distance)
fig.show()

In [None]:
# Ten states with the highest mean affected distance.
# The two numeric columns are selected BEFORE aggregating: calling mean() on
# the whole grouped frame also tries to average the text columns, which raises
# a TypeError in pandas >= 2.0 and wastes work in older versions.
(
    df.groupby('State')[['Distance(mi)', 'Minutes']]
    .mean()
    .sort_values(by='Distance(mi)', ascending=False)
    .head(10)
)

In [None]:
# Ten states with the highest mean accident duration.
# The two numeric columns are selected BEFORE aggregating: calling mean() on
# the whole grouped frame also tries to average the text columns, which raises
# a TypeError in pandas >= 2.0 and wastes work in older versions.
(
    df.groupby('State')[['Distance(mi)', 'Minutes']]
    .mean()
    .sort_values(by='Minutes', ascending=False)
    .head(10)
)