### import Libraries:

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### import data-set

In [None]:
df=pd.read_csv("/kaggle/input/us-accidents/US_Accidents_Dec20_Updated.csv")

### Get the basic info. about the data-set:

In [None]:
df.columns

In [None]:
df.info()

In [None]:
df.describe()

## Clean the data:

In [None]:
df.head(3)

In [None]:
len(df.columns)

In [None]:
# Share of missing entries per column, expressed as a percentage of all rows.
missing_counts = df.isna().sum()
percentage_of_missing_values = missing_counts / len(df) * 100

# Only columns that actually contain missing values are worth plotting.
has_missing = percentage_of_missing_values != 0
percentage_of_missing_values[has_missing].plot(kind="bar")

#### As the chart shows, about 60% of the values in the `Number` column are missing.
#### About 45% of the values in the `Precipitation(in)` column are missing as well.
#### About 40% of the values in the `Wind_Chill(F)` column are missing.
#### With this much data missing, it is better to drop these columns than to impute them:

# Drop the columns judged too incomplete to impute. A single call replaces
# the three separate in-place drops.
df.drop(columns=["Number", "Precipitation(in)", "Wind_Chill(F)"], inplace=True)

#### Also, some columns are not needed for this analysis, so it is better to remove them.
#### For example, the `Country` column is redundant because the whole data set covers a single country (the US):

# Columns the analysis below never reads. They are removed with one drop call
# instead of one in-place drop per column, so the frame is only rebuilt once.
unused_columns = [
    "Start_Lat",
    "Start_Lng",
    "End_Lat",
    "End_Lng",
    "Airport_Code",
    "Zipcode",
    "Country",
    "Timezone",
    "Pressure(in)",
    "Wind_Direction",
    "Wind_Speed(mph)",
    "Amenity",
    "Give_Way",
    "No_Exit",
    "Roundabout",
    "Stop",
    "Traffic_Calming",
    "Turning_Loop",
    "Astronomical_Twilight",
    "Nautical_Twilight",
    "Civil_Twilight",
    "Sunrise_Sunset",
    "Weather_Timestamp",
]
df.drop(columns=unused_columns, inplace=True)


In [None]:
df.columns

In [None]:
df.head()

#### NOW IMPUTE THE MISSING VALUES:

#### 01. impute the missing values of numerical columns:

In [None]:
# Fill the gaps in each numeric weather reading with that column's own median.
for numeric_col in ("Temperature(F)", "Humidity(%)", "Visibility(mi)"):
    column_median = df[numeric_col].median()
    df[numeric_col] = df[numeric_col].fillna(column_median)


#### 02. Impute the missing values of categorical column:

##### For the City column, drop the rows with missing values:


In [None]:
df.dropna(subset=["City"],inplace=True)

In [None]:
# Replace missing weather conditions with the most frequent condition.
most_common_weather = df["Weather_Condition"].mode()[0]
df["Weather_Condition"] = df["Weather_Condition"].fillna(most_common_weather)

In [None]:
df.isna().sum()


#### Now our data is clean :

### LET'S DO SOME ANALYSIS:

In [None]:
df.head(3)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Number of accidents per state, drawn on a logarithmic count axis.
state_fig, state_ax = plt.subplots(figsize=(20, 7))
sns.countplot(x="State", data=df, ax=state_ax)
state_ax.set_yscale("log")
state_ax.set_title("STATES WITH NUMBER OF ACCIDENTS", fontsize=20)
plt.show()

In [None]:
# The 20 cities with the most accidents, kept in ascending order of count.
city_counts = df["City"].value_counts().sort_values()
top_cities = city_counts.tail(20).reset_index()
top_cities.columns = ["city", "number_of_accidents"]


In [None]:
# Bar chart of the cities collected in `top_cities`. That table holds the
# 20 highest-count cities, so the title says 20 (it previously claimed 10).
plt.figure(figsize=(10,7))
sns.barplot(x="city",y="number_of_accidents",data=top_cities)
plt.title("TOP 20 CITIES WITH HIGHEST NUMBER OF ACCIDENTS",fontsize=20)
plt.xticks(rotation=40)
plt.show()

In [None]:
# The 20 streets with the most accidents, kept in ascending order of count.
street_counts = df["Street"].value_counts().sort_values()
top_streets = street_counts.tail(20).reset_index()
top_streets.columns = ["street_name", "number_of_accidents"]

In [None]:
# Bar chart of the streets collected in `top_streets`; the long street names
# are rotated to vertical so they do not overlap.
plt.figure(figsize=(10, 6))
sns.barplot(x="street_name", y="number_of_accidents", data=top_streets)
plt.xticks(rotation=90)
plt.title("TOP 20 STREETS WITH MAXIMUM NUMBER OF ACCIDENTS ", fontsize=20)
plt.show()


In [None]:
df["Severity"].value_counts()

In [None]:
# Number of accidents per severity level, drawn on a logarithmic count axis.
severity_fig, severity_ax = plt.subplots(figsize=(10, 6))
sns.countplot(x="Severity", data=df, ax=severity_ax)
severity_ax.set_yscale("log")
severity_ax.set_title("COUNT OF SEVERITY", fontsize=20)
plt.show()

In [None]:
df.columns

In [None]:
# Parse Start_Time once and derive every calendar feature from that single
# DatetimeIndex; re-parsing the whole column for each feature is wasted work.
start_times = pd.DatetimeIndex(df["Start_Time"])
df["year"] = start_times.year
df["month"] = start_times.month
df["dayofweek"] = start_times.dayofweek  # pandas convention: Monday=0 ... Sunday=6
df["day"] = start_times.day

In [None]:
# Accident count per year as a two-column table.
accidents_by_year = df["year"].value_counts()
yearly_accidents = accidents_by_year.reset_index()
yearly_accidents.columns = ["year", "accidents"]

In [None]:
import plotly.express as px

In [None]:
px.bar(x=yearly_accidents["year"],y=yearly_accidents["accidents"])

In [None]:
df["month"].unique()

In [None]:
# Replace month numbers (1-12) with lowercase month names, which are used as
# plot labels. "january" and "february" were previously misspelled.
month_names = {
    1: "january",
    2: "february",
    3: "march",
    4: "april",
    5: "may",
    6: "june",
    7: "july",
    8: "august",
    9: "september",
    10: "october",
    11: "november",
    12: "december",
}
df["month"] = df["month"].map(month_names)

In [None]:
# Accident count for every (month, year) pair. A month-only table used to be
# assigned to the same name first, but it was overwritten before anything
# read it, so only this grouping is built.
monthly_accidents = df[["month", "year"]].value_counts().reset_index()
monthly_accidents.columns = ["month", "year", "number_of_accidents"]

In [None]:
# Accidents per month, with one bar colour per year.
monthly_fig, monthly_ax = plt.subplots(figsize=(15, 8))
sns.barplot(
    x="month",
    y="number_of_accidents",
    hue="year",
    data=monthly_accidents,
    ax=monthly_ax,
)

In [None]:
df["day"].unique()

In [None]:
# Accident count for every (day-of-week, year) pair.
weekday_year_counts = df[["dayofweek", "year"]].value_counts()
weekly_accidents = weekday_year_counts.reset_index()
weekly_accidents.columns = ["day_of_the_week", "year", "number_of_accidents"]


In [None]:
df["dayofweek"].unique()

In [None]:
# pandas numbers weekdays from Monday=0 to Sunday=6, so the names must start
# at "monday". Starting the mapping at "sunday" labels every day one day off.
weekday_names = {
    0: "monday",
    1: "tuesday",
    2: "wednesday",
    3: "thursday",
    4: "friday",
    5: "saturday",
    6: "sunday",
}
df["dayofweek"] = df["dayofweek"].map(weekday_names)

In [None]:

# Accidents per day of the week, with one bar colour per year.
weekly_fig, weekly_ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x="day_of_the_week",
    y="number_of_accidents",
    hue="year",
    data=weekly_accidents,
    ax=weekly_ax,
)

In [None]:
df.columns

In [None]:
df["Severity"]

In [None]:







df[["Severity","year"]]

In [None]:
pd.crosstab(df["year"],df["Severity"]).plot(kind="bar")


### weather:

In [None]:
df.columns

In [None]:
range_temp = pd.cut(df['Temperature(F)'], 5)
range_temp.unique()

In [None]:
df["Temperature(F)"].value_counts()

In [None]:
# Bin temperatures (degrees F) into five ordinal codes in a single pass:
#   1: <= 10   2: (10, 50]   3: (50, 90]   4: (90, 130]   5: > 130
# pd.cut compares every row against its original reading, so the outcome no
# longer relies on the order of a chain of in-place overwrites.
# The codes are stored as floats because the label lookup further down is
# keyed on 1.0 ... 5.0.
temperature_edges = [-np.inf, 10, 50, 90, 130, np.inf]
df["Temperature(F)"] = pd.cut(
    df["Temperature(F)"],
    bins=temperature_edges,
    labels=[1, 2, 3, 4, 5],
    right=True,  # intervals are closed on the right, matching the <= tests
).astype(float)

In [None]:
# Number of accidents in each temperature bin, shown as a two-column table.
temperature_bin_counts = df['Temperature(F)'].value_counts()
weather_dist = temperature_bin_counts.reset_index()
weather_dist.columns = ["range_temp", "number_of_accidents"]
weather_dist

In [None]:
# Human-readable label for each temperature bin code.
temperature_range_labels = {
    1.0: "below 10 degree F",
    2.0: "from 10 to 50 degree F",
    3.0: "from 50 to 90 degree F",
    4.0: "from 90 to 130 degree F",
    5.0: "above 130 degree F",
}
weather_dist["range_temp"] = weather_dist["range_temp"].map(temperature_range_labels)

In [None]:

px.bar(x=weather_dist["range_temp"],y=weather_dist["number_of_accidents"])


In [None]:
df.columns

In [None]:
# Bin relative humidity (%) into six ordinal codes in a single pass:
#   1: <= 20   2: (20, 40]   3: (40, 60]   4: (60, 80]   5: (80, 100]   6: > 100
# pd.cut compares every row against its original reading, so the outcome no
# longer relies on the order of a chain of in-place overwrites.
humidity_edges = [-np.inf, 20, 40, 60, 80, 100, np.inf]
df["Humidity(%)"] = pd.cut(
    df["Humidity(%)"],
    bins=humidity_edges,
    labels=[1, 2, 3, 4, 5, 6],
    right=True,  # intervals are closed on the right, matching the <= tests
).astype(float)  # keep float codes, as the masked assignments produced



In [None]:
df["Humidity(%)"].unique()

In [None]:
# Accident count per humidity bin, with the count written above each bar.
# The column is passed by keyword (x=..., data=...) like the other count plots
# in this notebook; newer seaborn versions read a positional Series as `data`.
ax = sns.countplot(x="Humidity(%)", data=df)
for bar in ax.patches:
    count = '{:,.1f}'.format(bar.get_height())
    # Place the label near the horizontal middle of the bar, slightly above it.
    label_x = bar.get_x() + bar.get_width() - 0.60
    label_y = bar.get_height() + 10000
    ax.annotate(count, (label_x, label_y))
plt.show()

In [None]:
len(df["Weather_Condition"].unique())

In [None]:
df["Weather_Condition"].value_counts()

In [None]:
df[["Weather_Condition","Severity"]].value_counts().sort_values()[-10:]

In [None]:
np.arange(1,5)

In [None]:
# One figure per severity level (1-4): the ten most frequent weather
# conditions among accidents of that severity, in ascending order.
for severity_level in range(1, 5):
    plt.subplots(figsize=(10, 6))
    weather_at_level = df.loc[df["Severity"] == severity_level, 'Weather_Condition']
    condition_counts = weather_at_level.value_counts().sort_values()
    condition_counts[-10:].plot(kind="bar")

In [None]:
# One figure per severity level (1-4): the twenty most frequent weather
# conditions among accidents of that severity, most frequent first.
bar_style = dict(width=0.5, color='y', edgecolor='k', align='center', linewidth=1)
for severity_level in np.arange(1, 5):
    plt.subplots(figsize=(12, 5))
    weather_at_level = df.loc[df["Severity"] == severity_level, 'Weather_Condition']
    top_conditions = weather_at_level.value_counts().sort_values(ascending=False).head(20)
    top_conditions.plot.bar(**bar_style)
    plt.xlabel('Weather Condition', fontsize=16)
    plt.ylabel('Accident Count', fontsize=16)
    plt.title(f'20 of The Main Weather Conditions for Accidents of Severity {severity_level}', fontsize=16)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)

# For each selected weather condition, draw the severity distribution twice
# side by side: as a bar chart of counts and as a pie chart of shares.
for s in ["Fog","Light Rain","Rain","Heavy Rain","Snow"]:
    # Filter and count once per condition; both charts read the same Series.
    severity_counts = df.loc[df["Weather_Condition"] == s, 'Severity'].value_counts()

    # Draw on the axes returned by plt.subplots instead of re-selecting them
    # with plt.subplot.
    fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(12, 5))
    fig.suptitle('Accident Severity Under ' + s, fontsize=16)

    severity_counts.plot.bar(
        ax=bar_ax, width=0.5, color='y', edgecolor='k', align='center', linewidth=1
    )
    bar_ax.set_xlabel('Severity', fontsize=16)
    bar_ax.set_ylabel('Accident Count', fontsize=16)
    bar_ax.tick_params(axis='both', labelsize=16)

    severity_counts.plot.pie(ax=pie_ax, autopct='%1.0f%%', fontsize=16)

In [None]:
values=["Fog","Light Rain","Rain","Heavy Rain","Snow"]

In [None]:
# For each selected weather condition, show how its accidents split across
# severity levels: a bar chart of counts (ascending) next to a pie chart.
for x in ["Fog","Light Rain","Rain","Heavy Rain","Snow"]:
    # Filter and count once per condition; both charts read the same Series.
    severity_counts = df.loc[df["Weather_Condition"] == x, "Severity"].value_counts()

    plt.subplots(1,2,figsize=(10,6))
    plt.subplot(1,2,1)
    severity_counts.sort_values().plot(kind="bar")
    plt.suptitle("Severity for "+str(x),fontsize=20)
    plt.xlabel("severity")
    # The bars count accidents; the data carries no death counts.
    plt.ylabel("number_of_accidents")
    plt.subplot(1,2,2)
    severity_counts.plot.pie(autopct='%1.0f%%',fontsize=16)

In [None]:
df.columns

In [None]:
df["Start_Time"]=pd.DatetimeIndex(df["Start_Time"])

In [None]:
# Convert End_Time to datetimes. Item assignment is used because attribute
# assignment only updates a column that already exists and otherwise silently
# creates a plain Python attribute on the frame.
df["End_Time"] = pd.DatetimeIndex(df["End_Time"])

In [None]:
df['Duration'] = df.End_Time - df.Start_Time 

In [None]:
df['Duration']

In [None]:
# Convert the End_Time - Start_Time timedeltas to whole minutes. The
# vectorised .dt accessor replaces a Python-level apply() over every row;
# Series.round() rounds halves to even, like the built-in round() did.
duration_minutes = df['Duration'].dt.total_seconds() / 60
df['Duration'] = duration_minutes.round().astype('int64')
print("The overall mean duration is: ", (round(df['Duration'].mean(),3)), 'min')

In [None]:
len(df['Duration'].unique())

# Count plot of the "event" column of `train_df`, with each bar annotated by
# its share of all rows.
# NOTE(review): `train_df` and an "event" column are not defined anywhere in
# the visible part of this notebook (every other cell works on `df`), so this
# cell raises NameError unless `train_df` is created elsewhere. It looks like
# it was pasted from another notebook; confirm the intended frame and column.
sns.set(style="whitegrid")
plt.figure(figsize=(8,5))
total = float(len(train_df))  # row count, used as the percentage denominator
ax = sns.countplot(x="event", hue="event", data=train_df)
plt.title('Data provided for each event', fontsize=20)
for p in ax.patches:
    # The height of a count-plot bar is the number of rows in that category.
    percentage = '{:.1f}%'.format(100 * p.get_height()/total)
    x = p.get_x() + p.get_width()  # right edge of the bar
    y = p.get_height()  # top of the bar
    ax.annotate(percentage, (x, y),ha='center')
plt.show()