## Import Required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib notebook
import seaborn as sb
%pylab inline
from datetime import datetime as dt


In [None]:
# Raise Agg's path chunk size so very long line paths are split into chunks
# when rendered. Uses the explicitly imported `matplotlib` module instead of
# `mpl`, a name that only exists as a side effect of the `%pylab` star import.
matplotlib.rcParams['agg.path.chunksize'] = 10000

In [None]:
import folium

## Load the Dataset

In [None]:
# Read the accidents CSV from the relative input directory into a DataFrame
df = pd.read_csv("../input/us-accidents/US_Accidents_Dec20_Updated.csv")

In [None]:
# NOTE: plain assignment binds df1 to the same DataFrame object as df (no copy
# is made), so in-place changes made through either name are visible via both.
df1 = df

In [None]:
df1.columns

## Data Preparation and Cleaning

In [None]:
# Look at some Brief Information about the Data
df.info(memory_usage="deep")

In [None]:
#look at statistical info about the data
df.describe()

In [None]:
# Parse the timestamp columns into datetime64 values
time = ["Start_Time","End_Time","Weather_Timestamp"]
for column_name in time:
    df[column_name] = pd.to_datetime(df[column_name])

In [None]:
# Collect the names of all columns stored with the generic "object" dtype
object_col = df.select_dtypes(include="object").columns
object_col

In [None]:
# Columns that are left out of the category conversion below
excluded_columns = ['Description', 'Street', "Zipcode", "Country"]
object_col = object_col.drop(excluded_columns)
object_col

In [None]:
# Convert the selected object columns to the category dtype to reduce memory usage
for column_name in object_col:
    df[column_name] = df[column_name].astype("category")

In [None]:
df.info(memory_usage="deep")

In [None]:
# Count missing values per column and express each count as a percentage of all rows
missing_values = df.isnull().sum()
percentage = missing_values.div(df.shape[0]).mul(100)

In [None]:
# Put the counts and percentages side by side, worst columns first
missing_percent = pd.concat(
    [missing_values, percentage],
    axis=1,
    keys=["Number_of_missing_values", "Percentage"],
)
missing_percent.sort_values("Percentage", ascending=False)

In [None]:
# Columns removed from the analysis (author's notes):
#   - columns with more than 40% missing values
#   - Country: it is US in the whole dataset
#   - Turning_Loop: it is False in the whole dataset
columns_to_drop = [
    "Country",
    "Number",
    "Turning_Loop",
    "End_Lat",
    "End_Lng",
    "Precipitation(in)",
    "Wind_Chill(F)",
]
# Dropped in place so that every name bound to this DataFrame sees the change
df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Drop rows that have 10 or more missing values
is_sparse_row = df.isna().sum(axis=1) >= 10
row_drop = df.index[is_sparse_row]
df.drop(index=row_drop, inplace=True)

In [None]:
# Numeric columns whose gaps are filled with the column mean
float_missing = ["Wind_Speed(mph)","Visibility(mi)","Humidity(%)","Temperature(F)","Pressure(in)"]
# Columns whose gaps are filled with the most frequent value.
# "Wind_Direction" used to be listed twice; the second pass was a no-op.
category_missing = ["Weather_Condition","Wind_Direction","Airport_Code","Timezone","Zipcode","Sunrise_Sunset","Civil_Twilight","Nautical_Twilight","Astronomical_Twilight","City"]

In [None]:
# Fill missing values in the numeric columns with each column's mean.
# The filled column is assigned back explicitly: calling
# `df[col].fillna(..., inplace=True)` acts on an intermediate Series, which is
# deprecated and leaves `df` unchanged under pandas Copy-on-Write.
for col in float_missing:
    df[col] = df[col].fillna(df[col].mean())

In [None]:
# Fill missing values in the categorical columns with each column's most
# frequent value (first entry of value_counts, which sorts by count descending).
# Assigned back explicitly instead of using a chained in-place fillna, which is
# deprecated and leaves `df` unchanged under pandas Copy-on-Write.
for col in category_missing:
    most_frequent = df[col].value_counts().index[0]
    df[col] = df[col].fillna(most_frequent)

In [None]:
# Fill missing Weather_Timestamp values with the epoch (1970-01-01) as a sentinel.
# A Timestamp is used rather than the integer 0 so the column, converted to
# datetime earlier in the notebook, keeps a datetime dtype instead of mixing
# integers with timestamps. Assigned back explicitly instead of using a chained
# in-place fillna.
df["Weather_Timestamp"] = df["Weather_Timestamp"].fillna(pd.Timestamp(0))

In [None]:
    # Number of columns that still contain at least one missing value
    df.isna().any().sum()

## Exploratory Analyses and Visualization

### At What Time Do Accidents Occur in the US

In [None]:
# Independent one-column frame holding only the accident start times
df2 = df.loc[:, ["Start_Time"]].copy()

In [None]:
df2

In [None]:
# Break the start time into calendar components, one new column per component
start_time_parts = df["Start_Time"].dt
for new_column, component in [
    ("Year", "year"),
    ("Month", "month"),
    ("Day", "day"),
    ("WeekDay", "weekday"),
    ("Hour", "hour"),
]:
    df2[new_column] = getattr(start_time_parts, component)

In [None]:
df2
# 21.1.2017 Washington

In [None]:
# Attach the calendar columns of df2 to the accident records row by row.
# df2 was created as a copy of df's "Start_Time" column and df1 is bound to the
# same object as df, so the two frames share one index. Joining on that index
# keeps exactly one output row per accident. Merging on 'Start_Time' instead
# emits one row for every pair of records that share a timestamp, which
# multiplies rows whenever start times are not unique.
dfResult = (
    df1.join(df2.drop(columns=['Start_Time']), how='inner')
    .reset_index(drop=True)  # plain 0..n-1 index, as pd.merge would produce
)

In [None]:
dfResult

In [None]:
# Dates and cities to flag, as (year, month, day, city).
# A day of None flags the whole month for that city.
flagged_events = [
    (2017, 1, 21, 'Washington'),
    (2018, 5, 22, 'Washington'),
    (2019, 9, 21, 'New York'),
    (2019, 9, 22, 'New York'),
    (2019, 9, 23, 'New York'),
    (2019, 9, 24, 'New York'),
    (2019, 9, 25, 'New York'),
    (2019, 9, 26, 'New York'),
    (2019, 9, 27, 'New York'),
    (2017, 4, 22, 'Washington'),
    (2017, 4, 22, 'Los Angeles'),
    (2017, 4, 22, 'Chicago'),
    (2017, 4, 22, 'Boston'),
    (2018, 3, 24, 'Washington'),
    # City name corrected from the misspelt 'Minniapolis'
    (2020, 6, None, 'Minneapolis'),
    (2020, 6, None, 'Los Angeles'),
    (2020, 6, None, 'Washington'),
    (2020, 6, None, 'New York'),
    (2020, 7, None, 'Minneapolis'),
    (2020, 7, None, 'Los Angeles'),
    (2020, 7, None, 'Washington'),
    (2020, 7, None, 'New York'),
]


def _event_mask(frame, year, month, day, city):
    """Return a boolean Series marking rows of `frame` that match one event.

    Rows match when 'Year', 'Month' and 'City' equal the given values and,
    unless `day` is None, 'Day' equals `day` as well.
    """
    mask = (frame['Year'] == year) & (frame['Month'] == month) & (frame['City'] == city)
    if day is not None:
        mask = mask & (frame['Day'] == day)
    return mask


conditions = [_event_mask(dfResult, *event) for event in flagged_events]

# Every event maps to the same flag value
values = [1] * len(conditions)

# 1 where a row matches any listed event, 0 (np.select's default) otherwise
dfResult['Факт ЧП'] = np.select(conditions, values)

In [None]:
dfResult

In [None]:
from pandas import read_csv, DataFrame
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score


In [None]:
import statsmodels.api as sm

In [None]:
dfResult.head()

In [None]:
def reg_m(y, x):
    """Fit an ordinary least squares regression of `y` on one or more regressors.

    Parameters
    ----------
    y : array-like
        Response values, passed unchanged to ``sm.OLS``.
    x : sequence of array-likes, or a single 1-D array-like
        Regressors of equal length. A single 1-D input (for example one
        pandas Series) is treated as one regressor.

    Returns
    -------
    The fitted results object returned by ``sm.OLS(y, X).fit()``.

    Notes
    -----
    The design matrix columns are the regressors in reverse input order,
    followed by a column of ones for the intercept.
    """
    # Accept one bare regressor as well as a list of them; the original code
    # indexed x[0] and so could not handle a plain Series or 1-D array.
    regressors = [x] if np.ndim(x) == 1 else x

    intercept = np.ones(len(regressors[0]))

    # Each later regressor goes in front of the earlier ones, reproducing the
    # column order of the original implementation.
    columns = [regressors[0]]
    for extra in regressors[1:]:
        columns.insert(0, extra)

    # The explicit ones column is the constant term. The former
    # sm.add_constant calls left the matrix unchanged because a constant
    # column was already present.
    X = np.column_stack([*columns, intercept])
    results = sm.OLS(y, X).fit()
    return results

In [None]:
dfResult.head()

In [None]:
# Single-regressor OLS with "Severity" as the explanatory variable and "ID" as the response.
# NOTE(review): "ID" looks like a record identifier rather than a measured quantity —
# confirm it is a meaningful numeric response before relying on this fit.
y = dfResult["ID"]
X = sm.add_constant(dfResult["Severity"])  # adds the intercept (beta_0) column

# statsmodels takes the response first and the design matrix second
model = sm.OLS(y, X).fit()
predictions = model.predict(X)

# Display the fit statistics
model.summary()

In [None]:
# Python 3 print call (the statement form is a syntax error). The single
# regressor is wrapped in a list because reg_m expects a sequence of
# regressors, and the fitted model's summary table is printed.
# NOTE(review): 'ID' looks like a record identifier — confirm it is the intended response.
print(reg_m(dfResult['ID'], [dfResult['Severity']]).summary())

In [None]:
# Correlate only the numeric and boolean columns. DataFrame.corr() no longer
# drops non-numeric columns silently and raises on them in pandas >= 2.0.
dfResult.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Daily mean of the numeric columns. numeric_only=True keeps text, category and
# datetime columns out of the aggregation, which would otherwise raise in
# pandas >= 2.0.
df.set_index('Start_Time').groupby(pd.Grouper(freq='D')).mean(numeric_only=True)


In [None]:
dfResult.head()

In [None]:
dfResult.head()

In [None]:
# Correlate only the numeric and boolean columns. DataFrame.corr() no longer
# drops non-numeric columns silently and raises on them in pandas >= 2.0.
df1.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Correlate only the numeric and boolean columns. DataFrame.corr() no longer
# drops non-numeric columns silently and raises on them in pandas >= 2.0.
df.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Print dtype and memory details for the calendar frame
# (a bare `df2` that is not the last line of a cell displays nothing)
df2.info(memory_usage="deep")

In [None]:
df2.Day.value_counts()

In [None]:
# Accident counts for each day of the month
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x=df2["Day"], ax=ax)
ax.set_title("Количество ДТП по дням месяца");

In [None]:
df2.Day.value_counts()

In [None]:
# Accident counts per year, with one bar colour per month
plt.style.use("ggplot")
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x=df2["Year"], hue=df2["Month"], ax=ax)
ax.set_title("Количество ДТП по годам и месяцам")
ax.set_ylabel("Count")
ax.legend(title="Months", loc="upper left", shadow=True);

In [None]:
# Keep only the rows from September 2019
# (the variable names say 2020, but the filter selects the year 2019)
df2020ver1 = df2.loc[df2['Year'] == 2019]
df2020 = df2020ver1.loc[df2020ver1['Month'] == 9]
df2020

In [None]:
# Accident counts for each day in the September 2019 subset.
# The former plt.legend(...) call is removed: this count plot has no hue and
# therefore no labelled artists, so there was nothing for a legend to show.
plt.style.use("ggplot")
plt.figure(figsize=(12,6))
sns.countplot(x=df2020.Day)
plt.title("Количество ДТП в сентябре 2019-го года")
plt.ylabel("Count");

In [None]:
df2.Month.value_counts()

In [None]:
# Accident counts for each calendar month
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x=df2["Month"], ax=ax)
ax.set_title("Количество ДТП по месяцам");

In [None]:
df2.Hour.value_counts()

In [None]:
# Accident counts for each hour of the day
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(x=df2["Hour"], ax=ax)
ax.set_title("Количество ДТП по часам");

In [None]:
df2.WeekDay.value_counts()

In [None]:
# Map the weekday numbers (0-6) to day names.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `df2.WeekDay` acts on an intermediate Series, which is deprecated and
# leaves df2 unchanged under pandas Copy-on-Write.
weekday_names = {
    0: "Monday",
    1: "Tuesday",
    2: "Wednesday",
    3: "Thursday",
    4: "Friday",
    5: "Saturday",
    6: "Sunday",
}
df2["WeekDay"] = df2["WeekDay"].replace(weekday_names)

In [None]:
# Accident counts per weekday, with the bars ordered Sunday through Saturday
weekday_order = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(x=df2["WeekDay"], order=weekday_order, ax=ax)
ax.set_title("Number of Accidents per Week");

### States with the Highest Number of Accidents

In [None]:
df.State.value_counts()

In [None]:
# Accident counts per state, with the bars sorted from most to fewest accidents
states_by_count = df["State"].value_counts().index
fig, ax = plt.subplots(figsize=(15, 6))
sns.countplot(x=df["State"], order=states_by_count, palette="Spectral", ax=ax)
ax.set_title("Штаты по количеству ДТП");

### Location Where the Most Accidents Happened

In [None]:
# Per-coordinate counts of non-missing values in every remaining column
location_keys = ["Start_Lat", "Start_Lng"]
lat_lng = df.groupby(by=location_keys).count()

In [None]:
# Coordinate pair with the largest record count
lat_lng["Severity"].nlargest(n=1)

In [None]:
# Coordinates used both as the map centre and as the marker position
bridge_location = [37.808498, -122.366852]

# Base map centred on the location
San_Francisco = folium.Map(location=bridge_location, width=500, height=300, zoom_start=12)

# Feature group holding a small red circle at the location
Bridge = folium.map.FeatureGroup()
Bridge.add_child(
    folium.features.CircleMarker(bridge_location, radius=5, color="red", fill_color="red")
)
San_Francisco.add_child(Bridge)

# Labelled pin whose popup names the location
folium.Marker(bridge_location, popup="San Francisco-Oakland Bay Brg E").add_to(San_Francisco)

# Display the map
San_Francisco

## Factors Affecting Accidents Severity

### Accidents in Day & Night

In [None]:
# Accident counts split by the Sunrise_Sunset indicator
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x=df["Sunrise_Sunset"], ax=ax)
ax.set_title("Количество ДТП по показателю День/Ночь");

### Top 10 Temperatures (F) at Which the Most Accidents Occurred

In [None]:
# The ten most frequent temperature readings, computed once and reused for x, y and bar order
top_temperatures = df["Temperature(F)"].value_counts().iloc[:10]

plt.figure(figsize=(15,6))
sns.barplot(
    x=top_temperatures.index,
    y=top_temperatures.values,
    order=top_temperatures.index,
    palette="Spectral",
)
plt.title("Top 10 Temperatures(F) which causes Maximum Accidents");

- Most accidents happened at temperatures of 60-80 °F

### Accidents by Weather Condition

In [None]:
# The fifteen most frequent weather conditions, computed once and reused for x, y and bar order
top_conditions = df["Weather_Condition"].value_counts().iloc[:15]

plt.figure(figsize=(15,6))
sns.barplot(
    x=top_conditions.index,
    y=top_conditions.values,
    order=top_conditions.index,
)
plt.xticks(rotation=90)  # vertical tick labels
plt.title("Accidents due to Weather Condition");

- Most accidents happened in fair weather