## Import Required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

In [None]:
import folium

## Load the Dataset

In [None]:
# Read the raw US-Accidents CSV into a DataFrame.
accidents_csv = "../input/us-accidents/US_Accidents_Dec20_Updated.csv"
df = pd.read_csv(accidents_csv)

In [None]:
# Preview the first five rows to sanity-check the load.
df.head()

In [None]:
# List every column name available in the dataset.
df.columns

## Data Preparation and Cleaning

In [None]:
# Summarise column dtypes and non-null counts. memory_usage="deep" asks
# pandas to measure the real size of object (string) columns as well.
df.info(memory_usage="deep")

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Parse the three timestamp columns into datetime64 values, one column at a time.
time = ["Start_Time", "End_Time", "Weather_Timestamp"]
df[time] = df[time].apply(pd.to_datetime)

In [None]:
# Collect the names of all columns whose dtype is the generic "object" dtype.
column_dtypes = df.dtypes
object_col = column_dtypes[column_dtypes == "object"].index
object_col

In [None]:
# Remove four columns from the list so that they are not cast to category
# by the conversion cell that follows.
not_categorical = ["Description", "Street", "Zipcode", "Country"]
object_col = object_col.drop(not_categorical)
object_col

In [None]:
# Convert the selected object columns to the category dtype to reduce memory usage.
# DataFrame.astype("category") builds the categories separately for each column.
df[object_col] = df[object_col].astype("category")

In [None]:
# Re-check dtypes and deep memory usage after the dtype conversions above.
df.info(memory_usage="deep")

In [None]:
# Count missing values per column and express each count as a percentage of all rows.
missing_values = df.isna().sum()
total_rows = len(df)
percentage = missing_values.div(total_rows).mul(100)

In [None]:
# Combine the counts and percentages into one table, worst columns first.
missing_percent = missing_values.to_frame(name="Number_of_missing_values").assign(
    Percentage=percentage
)
missing_percent.sort_values(by="Percentage", ascending=False)

In [None]:
# Remove columns that carry little or no information (per the author's notes):
#   - columns with more than 40% missing values
#   - Country, which is US for the whole dataset
#   - Turning_Loop, which is False for the whole dataset
columns_to_drop = [
    "Country",
    "Number",
    "Turning_Loop",
    "End_Lat",
    "End_Lng",
    "Precipitation(in)",
    "Wind_Chill(F)",
]
df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Drop every row that has 10 or more missing values.
is_sparse_row = df.isna().sum(axis=1) >= 10
row_drop = df.index[is_sparse_row.to_numpy()]
df.drop(index=row_drop, inplace=True)

In [None]:
# Numeric weather columns whose gaps are later filled with the column mean.
float_missing = [
    "Wind_Speed(mph)",
    "Visibility(mi)",
    "Humidity(%)",
    "Temperature(F)",
    "Pressure(in)",
]
# Label-like columns whose gaps are later filled with the most frequent value.
# Each column is listed once ("Wind_Direction" used to appear twice, which
# made the fill loop process it a second time for no effect).
category_missing = [
    "Weather_Condition",
    "Wind_Direction",
    "Airport_Code",
    "Timezone",
    "Zipcode",
    "Sunrise_Sunset",
    "Civil_Twilight",
    "Nautical_Twilight",
    "Astronomical_Twilight",
    "City",
]

In [None]:
# Fill missing values in the numeric columns with each column's mean.
# The result is assigned back to the frame rather than calling
# fillna(inplace=True) on df[col]: that chained in-place form operates on an
# intermediate Series, emits a FutureWarning in pandas 2.x, and leaves df
# unchanged once Copy-on-Write is enabled.
for col in float_missing:
    df[col] = df[col].fillna(df[col].mean())

In [None]:
# Fill missing values in the categorical columns with the most frequent value.
# value_counts() sorts by descending frequency, so index[0] is the mode.
# The result is assigned back to the frame rather than using
# fillna(inplace=True) on df[col], which does not update df under pandas
# Copy-on-Write.
for col in category_missing:
    most_frequent = df[col].value_counts().index[0]
    df[col] = df[col].fillna(most_frequent)

In [None]:
# Fill missing Weather_Timestamp values with the Unix epoch (1970-01-01),
# the datetime equivalent of the sentinel 0. Using a Timestamp rather than
# the integer 0 keeps the column a homogeneous datetime64 column, and
# assigning the result back avoids the chained fillna(inplace=True) call,
# which does not update df under pandas Copy-on-Write.
df["Weather_Timestamp"] = df["Weather_Timestamp"].fillna(pd.Timestamp(0))

In [None]:
# Number of columns that still contain at least one missing value.
df.isna().any().sum()

## Exploratory Analyses and Visualization

### At What Time Do Accidents Occur in the US

In [None]:
# Start a separate DataFrame holding an independent copy of the Start_Time column.
df2 = df.loc[:, ["Start_Time"]].copy()

In [None]:
# Split Start_Time into individual calendar components, one column each.
start_time = df["Start_Time"].dt
df2["Year"] = start_time.year
df2["Month"] = start_time.month
df2["Day"] = start_time.day
df2["WeekDay"] = start_time.weekday  # 0 = Monday ... 6 = Sunday
df2["Hour"] = start_time.hour

In [None]:
# Display the derived time-feature frame (Start_Time plus the columns added to df2).
df2

In [None]:
# Accident count for each day of the month, most frequent first.
df2["Day"].value_counts()

In [None]:
# Bar chart of accident counts by day of the month.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x=df2["Day"], ax=ax)
ax.set_title("Number of Accidents Day-wise");

In [None]:
# Switch to the ggplot style; this is a global setting and also affects later figures.
plt.style.use("ggplot")

# Accident counts per year, with one bar per month inside each year.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x=df2["Year"], hue=df2["Month"], ax=ax)
ax.set_title("Number of Accident per year with months")
ax.set_ylabel("Count")
ax.legend(title="Months", loc="upper left", shadow=True);

In [None]:
# Accident count for each calendar month, most frequent first.
df2["Month"].value_counts()

In [None]:
# Bar chart of accident counts by calendar month.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x=df2["Month"], ax=ax)
ax.set_title("Number of Accident per month");

In [None]:
# Accident count for each hour of the day, most frequent first.
df2["Hour"].value_counts()

In [None]:
# Bar chart of accident counts by hour of the day.
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(x=df2["Hour"], ax=ax)
ax.set_title("Number of Accidents per Hour");

In [None]:
# Accident count for each weekday value, most frequent first.
df2["WeekDay"].value_counts()

In [None]:
# Replace the numeric weekday codes (0 = Monday ... 6 = Sunday) with English
# day names. The names are derived directly from Start_Time and assigned back,
# so the cell gives the same result when re-run and does not rely on a chained
# replace(inplace=True), which does not update df2 under pandas Copy-on-Write.
df2["WeekDay"] = df2["Start_Time"].dt.day_name()

In [None]:
# Bar chart of accident counts by day of the week, shown Sunday to Saturday.
# The title says "per Weekday" because each bar is one day of the week,
# not one calendar week.
weekday_order = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(x=df2["WeekDay"], order=weekday_order, ax=ax)
ax.set_title("Number of Accidents per Weekday");

### States with the Highest Number of Accidents

In [None]:
# Accident count per state, most frequent first.
df["State"].value_counts()

In [None]:
# Bar chart of accident counts per state, ordered from most to fewest accidents.
states_by_count = df["State"].value_counts().index
fig, ax = plt.subplots(figsize=(15, 6))
sns.countplot(x=df["State"], order=states_by_count, palette="Spectral", ax=ax)
ax.set_title("States that has highest number of Accidents");

### Location with the Most Accidents

In [None]:
# Group accidents by exact start coordinates; count() gives, for every
# remaining column, the number of non-null entries in each coordinate pair.
coordinate_keys = ["Start_Lat", "Start_Lng"]
lat_lng = df.groupby(by=coordinate_keys).count()

In [None]:
# The coordinate pair with the largest Severity count, i.e. the most accidents.
lat_lng["Severity"].nlargest(n=1)

In [None]:
# Coordinates shared by the map centre, the circle marker and the pin.
# NOTE(review): presumably the top location reported by the previous cell - confirm.
bridge_location = [37.808498, -122.366852]

# Base map of San Francisco centred on that location.
San_Francisco = folium.Map(width=500, height=300, location=bridge_location, zoom_start=12)

# Red circle marker, held in a feature group that is attached to the map.
Bridge = folium.map.FeatureGroup()
Bridge.add_child(
    folium.features.CircleMarker(bridge_location, radius=5, color="red", fill_color="red")
)
San_Francisco.add_child(Bridge)

# Pin with a popup label naming the location.
folium.Marker(bridge_location, popup="San Francisco-Oakland Bay Brg E").add_to(San_Francisco)

# Last expression of the cell: renders the map.
San_Francisco

## Factors Affecting Accident Severity

### Accidents in Day & Night

In [None]:
# Bar chart of accident counts for each Sunrise_Sunset value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x=df["Sunrise_Sunset"], ax=ax)
ax.set_title("Accident in Day & night");

### Top 10 Temperatures (°F) at Which the Most Accidents Occurred

In [None]:
# Bar chart of the ten most frequent temperature readings.
# value_counts() is computed once and reused for x, y and the bar order,
# instead of being recomputed three times on the full frame.
# NOTE(review): missing temperatures were filled with the column mean earlier,
# so that mean value may appear here as an artificial spike - confirm.
top_temperatures = df["Temperature(F)"].value_counts().head(10)
fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(
    x=top_temperatures.index,
    y=top_temperatures.values,
    order=top_temperatures.index,
    palette="Spectral",
    ax=ax,
)
ax.set_title("Top 10 Temperatures(F) which causes Maximum Accidents");

- Most accidents happened at temperatures between 60 °F and 80 °F.

### Accidents by Weather Condition

In [None]:
# Bar chart of the fifteen most frequent weather conditions.
# value_counts() is computed once and reused for x, y and the bar order,
# instead of being recomputed three times on the full frame.
# NOTE(review): missing conditions were filled with the most frequent value
# earlier, which inflates the top bar - confirm the size of the effect.
top_conditions = df["Weather_Condition"].value_counts().head(15)
fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(
    x=top_conditions.index,
    y=top_conditions.values,
    order=top_conditions.index,
    ax=ax,
)
plt.xticks(rotation=90)
ax.set_title("Accidents due to Weather Condition");

- Most accidents happened in fair weather.