In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the US Accidents CSV, given relative to the notebook's working directory.
data_filename = '../input/us-accidents/US_Accidents_Dec20_updated.csv'

## Data preparation and cleaning
- Load the file using Pandas
- Look at some information about the data and the columns
- Fix any missing or incorrect values

In [None]:
# Read the whole CSV into a DataFrame; the cells below all work from `df`.
df = pd.read_csv(data_filename)

In [None]:
# Preview the first rows to get a feel for the columns and their values.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded data.
df.shape

In [None]:
# Names of all columns in the loaded data.
df.columns

In [None]:
# Per-column dtype and non-null count.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Count the numeric columns. The "number" selector matches every numeric dtype
# (int8, unsigned ints, ...), which the previous hand-written list of six dtype
# names did not cover. `numerics` stays a list so it can still be passed to
# `select_dtypes(include=...)` elsewhere.
numerics = ['number']
numeric_df = df.select_dtypes(include=numerics)
len(numeric_df.columns)

### Percentage of missing values per column

In [None]:
# Number of missing entries in each column, largest first.
df.isna().sum().sort_values(ascending=False)

In [None]:
# Fraction of missing entries per column (largest first), then scaled to percent.
null_counts = df.isnull().sum()
missing_value = null_counts.sort_values(ascending=False).div(len(df))
missing_percentages = missing_value.mul(100)


In [None]:
# Print the per-column missing-value percentages under a plain-text heading.
print("Percentages of missing values per columns: \n",missing_percentages)

In [None]:
# Keep only the columns that actually have missing values.
missing_percentages[missing_percentages.ne(0)]

## Plotting the percentages

In [None]:
# Horizontal bar chart of the columns that have missing values.
missing_percentages[missing_percentages.ne(0)].plot.barh()

### Remove columns that are missing more than 50% of their values, because columns that are mostly empty are not useful for model preparation.

### We see that the `Number` column is missing close to 70% of its values, so I drop it.

In [None]:
# Drop the mostly-empty `Number` column. `errors='ignore'` makes the cell safe to
# re-run: a second execution no longer raises KeyError once the column is gone.
df = df.drop(columns='Number', errors='ignore')

In [None]:
# Shape after dropping the `Number` column.
df.shape

## Exploratory Analysis and Visualization

The columns we'll analyze:
1. City
2. Start time
3. Start Lat, Start Lng
4. Temperature
5. Weather Condition

In [None]:
# Columns still available for the analysis below.
df.columns

In [None]:
# City recorded for each accident.
df['City']

In [None]:
# Distinct values of the City column, and how many there are.
cities = df['City'].unique()
len(cities)

In [None]:
# Accident count per city, most accidents first.
cities_by_accident = df['City'].value_counts()
cities_by_accident

In [None]:
# The 30 cities with the most accidents
cities_by_accident.head(30)

In [None]:
# Horizontal bar chart of the 25 cities with the most accidents.
cities_by_accident.head(25).plot.barh()

In [None]:
import seaborn as sns
# Switch seaborn to its "darkgrid" style for the plots that follow.
sns.set_style("darkgrid")

In [None]:
# Distribution of accident counts for the 100 cities with the most accidents.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay and density
# normalisation gives the same kind of picture and matches the other
# histograms in this notebook.
sns.histplot(cities_by_accident[:100], kde=True, stat="density")


In [None]:
# Split the cities at the 1000-accident mark.
high_accident_cities = cities_by_accident[cities_by_accident.ge(1000)]
low_accident_cities = cities_by_accident[cities_by_accident.lt(1000)]

In [None]:
# Percentage of cities with at least 1000 accidents. The denominator is the number
# of cities that actually have a count: `cities` comes from `unique()` and can
# contain a NaN entry for rows without a city, which `value_counts()` leaves out,
# so dividing by `len(cities)` would slightly understate the share.
(len(high_accident_cities) / len(cities_by_accident)) * 100

In [None]:
# Histogram of per-city counts for the high-accident cities, on a log-scaled axis.
sns.histplot(data=high_accident_cities, log_scale=True)

In [None]:
# Percentage of cities with fewer than 1000 accidents. The denominator is the number
# of cities that actually have a count: `cities` comes from `unique()` and can
# contain a NaN entry for rows without a city, which `value_counts()` leaves out,
# so dividing by `len(cities)` would slightly understate the share.
(len(low_accident_cities) / len(cities_by_accident)) * 100

In [None]:
# Histogram of per-city counts for the low-accident cities, on a log-scaled axis.
sns.histplot(data=low_accident_cities, log_scale=True)

In [None]:
# Cities that appear exactly once in the data.
cities_by_accident[cities_by_accident.eq(1)]

###  Start time

In [None]:
# Look at one raw Start_Time value (the row labelled 0) before converting the column.
df['Start_Time'][0]

In [None]:
# Convert Start_Time to datetimes so the `.dt` accessors used below are available.
df['Start_Time'] = pd.to_datetime(df['Start_Time'])

In [None]:
# The Start_Time column after conversion.
df['Start_Time']

In [None]:
# Hour of day (0-23) taken from each accident's start timestamp
df['Start_Time'].dt.hour

In [None]:
# Accidents by hour of day, using 24 bins.
accident_hours = df['Start_Time'].dt.hour
sns.histplot(accident_hours, bins=24)

In [None]:
# Accidents by day of week (pandas numbers Monday as 0 and Sunday as 6), using 7 bins.
accident_weekdays = df['Start_Time'].dt.dayofweek
sns.histplot(accident_weekdays, bins=7)

### Is the distribution of accidents by hour the same on weekdays as on weekends?

In [None]:
# Hourly pattern on Sundays (dayofweek == 6).
is_sunday = df.Start_Time.dt.dayofweek == 6
sunday_start_time = df.Start_Time[is_sunday]
sns.histplot(sunday_start_time.dt.hour, bins=24)

On Sundays, peak accidents occur mostly between 12pm and 11pm.

In [None]:
# Hourly pattern on Mondays (dayofweek == 0).
is_monday = df.Start_Time.dt.dayofweek == 0
monday_start_time = df.Start_Time[is_monday]
sns.histplot(monday_start_time.dt.hour, bins=24)

### Month

In [None]:
# Accidents by calendar month across all years in the data, using 12 bins.
accident_months = df.Start_Time.dt.month
sns.histplot(accident_months, bins=12)

In [None]:
# Accidents per month, restricted to rows whose start year is 2019
in_2019 = df.Start_Time.dt.year == 2019
df_2019 = df[in_2019]
sns.histplot(df_2019.Start_Time.dt.month, bins=12)

Can you explain the month-wise trend of accidents?
    - Much of the data is missing before 2016

### Start_lat and Start_lng

In [None]:
# Plot every accident location. `s` is the marker area passed through to
# matplotlib; the previous `size=0.001` argument is seaborn's semantic grouping
# variable, so it added a one-entry legend instead of shrinking the markers.
sns.scatterplot(x=df.Start_Lng, y=df.Start_Lat, s=1)

In [None]:
# Take a 10% random sample of the rows. The fixed `random_state` makes the sample,
# and every plot built from it, identical after a kernel restart.
sample_df = df.sample(n=int(0.1 * len(df)), random_state=42)

In [None]:
# Plot the sampled accident locations. `s` is the marker area passed through to
# matplotlib; the previous `size=0.005` argument is seaborn's semantic grouping
# variable, so it added a one-entry legend instead of shrinking the markers.
sns.scatterplot(x=sample_df.Start_Lng, y=sample_df.Start_Lat, s=3)

In [None]:
# plot that data into map using folium
import folium

In [None]:
# Coordinates of the row labelled 0, used as the example marker position.
lat, lon = df.at[0, 'Start_Lat'], df.at[0, 'Start_Lng']
lat, lon

In [None]:
# Put a single marker at (lat, lon) on an interactive folium map and display it.
# The variable is called `accident_map` rather than `map`, which shadowed
# Python's built-in `map()` for every cell executed afterwards.
accident_map = folium.Map()
marker = folium.Marker((lat, lon))
marker.add_to(accident_map)
accident_map

## Ask Questions and Prepare Answers
1. Are there more accidents in warmer or colder areas?
2. Which 5 states have the highest number of accidents?
3. Does New York show up in the data? If yes, then why is its count lower even though it is the most populated city in the US?
4. Among the top 100 cities by number of accidents, which state do they belong to most frequently?
5. What time of the day are accidents most frequent in?
    - 6 to 10am and 12pm to 8pm
6. Which months have the most accidents?
7. What is the trend of accidents year over year (decreasing / increasing)?

## Summary and Conclusion

Insights:
- No data from New York
- Less than 3% of cities have at least 1000 accidents in the dataset.
- Nearly 1200 cities have reported just 1 accident.
