In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the US Accidents (Dec 2020 update) CSV into a single DataFrame.
df = pd.read_csv(
    filepath_or_buffer='../input/us-accidents/US_Accidents_Dec20_Updated.csv'
)

In [None]:
# Preview the first five rows to get a feel for the columns.
df.head(n=5)

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
##### Checking duplicate rows
# `duplicated()` flags every row that repeats an earlier row across all
# columns (it checks rows, not columns).
is_repeat = df.duplicated()
df_duplicate = df.loc[is_repeat]
df_duplicate

In [None]:
# Per-column non-null counts among the duplicated rows; an empty selection
# yields zero for every column.
df_duplicate = df.loc[df.duplicated(keep="first")]
df_duplicate.count()

In [None]:
### Finding categorical variables
# Categorical here means columns stored with the object ("O") dtype.
cat = [name for name, dtype in df.dtypes.items() if dtype == "O"]
print(len(cat))
cat

In [None]:
### Finding boolean variables
# Columns whose dtype is bool. The list gets a name that says what it holds;
# it used to be stored as `num`, which is the name of the numeric column list.
bool_cols = [col for col in df.columns if df[col].dtype == "bool"]
print(len(bool_cols))
bool_cols

In [None]:
### Finding numerical variables
# Every column that is neither boolean nor object-typed.
num = [
    name
    for name, dtype in df.dtypes.items()
    if not (dtype == "bool" or dtype == "O")
]
print(len(num))
num

In [None]:
### Are there any null values?
# Count of missing values in each column.
df.isna().sum()

In [None]:
### How many missing values are there in each column
# Missing-value count per column, largest first.
missing_per = df.isna().sum().sort_values(ascending=False)
missing_per

In [None]:
### Share of missing values per column
# Largest first. NOTE: the values are fractions of the row count (0 to 1);
# they are not multiplied by 100.
missing_per = df.isna().sum().sort_values(ascending=False).div(len(df))
missing_per

In [None]:
# Horizontal bar chart of the missing share for every column.
plt.figure(figsize=(10, 15))
missing_per.plot.barh()
plt.title('Missing values')

In [None]:
# Keep only the columns that actually have missing values.
missing_per.loc[missing_per.ne(0)]

In [None]:
# Same chart restricted to columns with missing values, sorted ascending
# before plotting.
plt.figure(figsize=(8, 12))
has_missing = missing_per[missing_per != 0]
has_missing.sort_values(ascending=True).plot.barh()

In [None]:
## Analyse CITY
# Number of distinct cities. `nunique()` ignores missing values, whereas
# `len(unique())` counts NaN as one extra "city" when any City is missing.
df["City"].nunique()

In [None]:
# Accident count per city; `value_counts` returns the largest counts first.
cities_by_accidents = df["City"].value_counts()
cities_by_accidents.head(n=50)

# Optional follow-up (left disabled): number of cities with fewer than 100
# accidents.
# cities_by_accidents[cities_by_accidents < 100].count()

In [None]:
#### Top 100 cities by number of accidents
# Lift the row cap so all 100 entries are shown.
# NOTE(review): this display option is global and stays in effect for every
# later cell; consider scoping it if that is not intended.
pd.options.display.max_rows = None
cities_by_accidents.iloc[:100]


In [None]:
#### Is New York present in the data?
# Look the city up once; `get` returns None when the label is absent.
ny_accidents = cities_by_accidents.get("New York")
if ny_accidents is not None:
    print("New York:", ny_accidents)

In [None]:
# First few accident records whose city is New York.
df.loc[df["City"].eq("New York")].head()

In [None]:
# Horizontal bar chart of the first 60 entries of the per-city counts.
plt.figure(figsize=(12, 10))
cities_by_accidents.head(60).plot.barh()

In [None]:
#### Number of accidents per state (log scale)
# `size()` counts every row in each state. Counting the "City" column instead
# would silently drop accidents whose City value is missing.
plt.figure(figsize=(15, 8))
df.groupby("State").size().plot(kind="bar")
plt.yscale("log")

In [None]:
### Split cities at the 1000-accident mark
# "High" means 1000 or more accidents; everything below that is "low".
at_least_1000 = cities_by_accidents >= 1000
high_accident = cities_by_accidents[at_least_1000]
low_accident = cities_by_accidents[~at_least_1000]

In [None]:
# Report how the cities split around the 1000-accident threshold.
# `high_accident` is selected with >= 1000, so the label says "at least".
# The shares are scaled by 100 so the printed value really is a percentage.
total_cities = len(cities_by_accidents)
print("Cities with at least 1000 accidents:", len(high_accident))
print(f"Percentage >= 1000: {100 * len(high_accident) / total_cities:.2f}%")
print("Cities with less than 1000 accidents:", len(low_accident))
print(f"Percentage < 1000: {100 * len(low_accident) / total_cities:.2f}%")

In [None]:
# Distribution of per-city accident counts among the high-accident cities,
# drawn on a log-scaled axis.
ax = sns.histplot(high_accident, log_scale=True)
ax.set_title('Number of Cities with highest accidents')

In [None]:
# Distribution of per-city accident counts among the low-accident cities,
# drawn on a log-scaled axis.
ax = sns.histplot(low_accident, log_scale=True)
ax.set_title("Number of Cities with lower accidents")

In [None]:
## Time conversion to Timestamp
# Look at the first Start_Time value and report its Python type.
first_start = df.Start_Time[0]
type(first_start)

In [None]:
# Parse the Start_Time column into pandas datetimes (replaces the column).
df["Start_Time"] = pd.to_datetime(df["Start_Time"])

In [None]:
# First Start_Time value, read by row label 0.
df.loc[0, "Start_Time"]

In [None]:
# Apply seaborn's default theme to the plots drawn from here on.
sns.set_theme()

In [None]:
### Share of accidents by hour of day
# `sns.distplot` is deprecated, so this uses `histplot` instead.
# `discrete=True` draws one bar per integer hour, centred on that hour, and
# `stat="density"` keeps the normalised bar heights that `norm_hist=True` gave.
ax = sns.histplot(x=df.Start_Time.dt.hour, discrete=True, stat="density")
ax.set(xlabel="Hour of day", ylabel="Density")

In [None]:
#### Share of accidents by day of week
# `sns.distplot` is deprecated, so this uses `histplot` instead.
# `discrete=True` draws one bar per weekday code (pandas: 0 = Monday,
# 6 = Sunday); `stat="density"` keeps the bars normalised.
ax = sns.histplot(x=df.Start_Time.dt.dayofweek, discrete=True, stat="density")
ax.set(xlabel="Day of week (0 = Monday)", ylabel="Density")

In [None]:
#### Share of Sunday accidents by hour of day
# Sunday is dayofweek == 6 in pandas.
sunday = df.Start_Time[df.Start_Time.dt.dayofweek == 6]
# `histplot` replaces the deprecated `distplot`; `discrete=True` gives one bar
# per integer hour and `stat="density"` keeps the bars normalised.
ax = sns.histplot(x=sunday.dt.hour, discrete=True, stat="density")
ax.set(xlabel="Hour of day (Sundays)", ylabel="Density")

In [None]:
### Share of Monday accidents by hour of day
# Monday is dayofweek == 0 in pandas.
monday = df.Start_Time[df.Start_Time.dt.dayofweek == 0]
# `histplot` replaces the deprecated `distplot`; `discrete=True` gives one bar
# per integer hour and `stat="density"` keeps the bars normalised.
ax = sns.histplot(x=monday.dt.hour, discrete=True, stat="density")
ax.set(xlabel="Hour of day (Mondays)", ylabel="Density")

In [None]:
#### Share of accidents by month
# `histplot` replaces the deprecated `distplot`; `discrete=True` gives one bar
# per month number and `stat="density"` keeps the bars normalised.
ax = sns.histplot(x=df.Start_Time.dt.month, discrete=True, stat="density")
ax.set(xlabel="Month", ylabel="Density")

In [None]:
#### Monthly share of accidents, one figure per year
# `histplot` replaces the deprecated `distplot`. `discrete=True` pins each bar
# to its month number; with `bins=12` the bins were spread evenly over the
# observed range, so a year lacking data for some months would get bars that
# do not line up with the months.
for year in [2016, 2017, 2018, 2019, 2020]:
    year_wise = df[df.Start_Time.dt.year == year]
    plt.figure(figsize=(7, 7))
    ax = sns.histplot(
        x=year_wise.Start_Time.dt.month, discrete=True, stat="density"
    )
    # Show all twelve month ticks so the yearly figures are comparable.
    ax.set_xticks(range(1, 13))
    ax.set(xlabel="Month", ylabel="Density")
    plt.title(year)

In [None]:
#### Which areas are most prone to accidents?
# Scatter of start longitude (x) against start latitude (y) for every record.
plt.figure(figsize=(10, 10))
sns.scatterplot(data=df, x="Start_Lng", y="Start_Lat")

In [None]:
# Count of records per state, drawn with a log-scaled y-axis.
plt.figure(figsize=(20, 7))
ax = sns.countplot(data=df, x="State")
ax.set_yscale("log")
ax.set_title("STATES WITH NUMBER OF ACCIDENTS", fontsize=20)
plt.show()

In [None]:
# The 20 cities with the most accidents, in ascending order of count.
city_counts = df["City"].value_counts().sort_values()
top_cities = city_counts.tail(20).reset_index()
top_cities.columns = ["city", "number_of_accidents"]
top_cities

In [None]:
# Rebuild the 20-city table (ascending by count) and add each city's share.
top_cities = df["City"].value_counts().sort_values().iloc[-20:].reset_index()
top_cities.columns = ["city", "number_of_accidents"]
# The share is relative to the accidents in these 20 cities only.
accident_total = top_cities["number_of_accidents"].sum()
top_cities["prob_acc"] = top_cities["number_of_accidents"] / accident_total

In [None]:
# Display the top-cities table.
top_cities


In [None]:
# Bar chart of `prob_acc` for each city in `top_cities`.
# The table is built from the 20 largest counts, so the title says 20
# (it previously said 10).
plt.figure(figsize=(10, 7))
sns.barplot(x="city", y="prob_acc", data=top_cities)
plt.title("TOP 20 CITIES WITH HIGHEST NUMBER OF ACCIDENTS", fontsize=20)
plt.xticks(rotation=40)

In [None]:
# The 20 streets with the most accidents, in ascending order of count.
street_counts = df["Street"].value_counts().sort_values()
top_streets = street_counts.tail(20).reset_index()
top_streets.columns = ["street_name", "number_of_accidents"]

In [None]:
# Bar chart of accident counts for the streets in `top_streets`.
plt.figure(figsize=(10, 6))
sns.barplot(data=top_streets, x="street_name", y="number_of_accidents")
plt.xticks(rotation=90)
plt.title("TOP 20 STREETS WITH MAXIMUM NUMBER OF ACCIDENTS ", fontsize=20)