In [None]:
import numpy as np 
import pandas as pd 
import json
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm
from datetime import datetime
import glob
import seaborn as sns
import re
import os
import io
from scipy.stats import boxcox

In [None]:
# Read the "Updated" Dec-2020 accidents CSV; the earlier
# '../input/us-accidents/US_Accidents_Dec20.csv' file is no longer loaded.
raw_df = pd.read_csv(
    '../input/us-accidents/US_Accidents_Dec20_Updated.csv'
)
print("The shape of data is:", raw_df.shape)
display(raw_df.head(n=3))

In [None]:
# Keep only the rows whose 'City' is one of the eight cities studied here.
# The result is an independent copy, so later edits never touch raw_df.
cities_of_interest = ['Orlando', 'Jacksonville', 'Chicago', 'Houston',
                      'Miami', 'Tampa', 'New York', 'Los Angeles']
df = raw_df[raw_df['City'].isin(cities_of_interest)].copy()

print(len(df))
print(df['City'].value_counts())
df.head()

In [None]:
# Write the eight-city subset to a CSV in the working directory (index column omitted).
df.to_csv('8_Cities_historical_accidents.csv',index=False)

In [None]:
# Number of rows per 'Severity' value in the eight-city subset.
df['Severity'].value_counts()

In [None]:
# fix datetime type
df['Start_Time'] = pd.to_datetime(df['Start_Time'])
df['End_Time'] = pd.to_datetime(df['End_Time'])
df['Weather_Timestamp'] = pd.to_datetime(df['Weather_Timestamp'])

# calculate duration as the difference between end time and start time in minute
df['Duration'] = df['End_Time'] - df['Start_Time'] 
df['Duration'] = df['Duration'].apply(lambda x:round(x.total_seconds() / 60) )
print("The overall mean duration is: ", (round(df['Duration'].mean(),3)), 'min')

In [None]:
# Remove these seven columns from the working frame.
unused_columns = ['ID', 'Description', 'Distance(mi)', 'End_Time', 'Duration',
                  'End_Lat', 'End_Lng']
df = df.drop(columns=unused_columns)

In [None]:
# Preview the first rows after the column drop above.
df.head()

In [None]:
# Columns treated as categorical; print how many distinct values each one
# holds (a missing value counts as one distinct value).
cat_names = [
    'Side', 'Country', 'Timezone', 'Amenity', 'Bump', 'Crossing',
    'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
    'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop', 'Sunrise_Sunset',
    'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight',
]
print("Unique count of categorical features:")
for col in cat_names:
    print(col, df[col].nunique(dropna=False))

In [None]:
# Remove the 'Country' and 'Turning_Loop' columns.
df = df.drop(columns=['Country', 'Turning_Loop'])

In [None]:
# List the distinct raw 'Wind_Direction' labels before they are simplified.
print("Wind Direction: ", df['Wind_Direction'].unique())

In [None]:
# Collapse spelled-out and intercardinal wind labels onto a short code.
# Each key is the replacement; its list holds the raw labels it absorbs.
wind_groups = {
    'CALM': ['Calm'],
    'W': ['West', 'WSW', 'WNW'],
    'S': ['South', 'SSW', 'SSE'],
    'N': ['North', 'NNW', 'NNE'],
    'E': ['East', 'ESE', 'ENE'],
    'VAR': ['Variable'],
}
for short_code, raw_labels in wind_groups.items():
    df.loc[df['Wind_Direction'].isin(raw_labels), 'Wind_Direction'] = short_code
print("Wind Direction after simplification: ", df['Wind_Direction'].unique())

In [None]:
# show distinctive weather conditions 
weather ='!'.join(df['Weather_Condition'].dropna().unique().tolist())
weather = np.unique(np.array(re.split(
    "!|\s/\s|\sand\s|\swith\s|Partly\s|Mostly\s|Blowing\s|Freezing\s", weather))).tolist()
print("Weather Conditions: ", weather)

In [None]:
# One boolean flag column per weather family. A flag is True when the
# case-insensitive regex matches 'Weather_Condition'; a missing condition
# gives False.
weather_patterns = {
    'Clear': 'Clear',
    'Cloud': 'Cloud|Overcast',
    'Rain': 'Rain|storm',
    'Heavy_Rain': 'Heavy Rain|Rain Shower|Heavy T-Storm|Heavy Thunderstorms',
    'Snow': 'Snow|Sleet|Ice',
    'Heavy_Snow': 'Heavy Snow|Heavy Sleet|Heavy Ice Pellets|Snow Showers|Squalls',
    'Fog': 'Fog',
}
for flag_name, pattern in weather_patterns.items():
    matches = df['Weather_Condition'].str.contains(pattern, case=False, na=False)
    df[flag_name] = matches.astype(bool)

In [None]:
# Assign NA to created weather features where 'Weather_Condition' is null.
weather = ['Clear','Cloud','Rain','Heavy_Rain','Snow','Heavy_Snow','Fog']
for i in weather:
    df.loc[df['Weather_Condition'].isnull(),i] = df.loc[df['Weather_Condition'].isnull(),'Weather_Condition']
    df[i] = df[i].astype('bool')

df.loc[:,['Weather_Condition'] + weather]

df = df.drop(['Weather_Condition'], axis=1)

In [None]:
# average difference between weather time and start time
print("Mean difference between 'Start_Time' and 'Weather_Timestamp': ", 
(df.Weather_Timestamp - df.Start_Time).mean())

In [None]:
# Derive calendar features from 'Start_Time'; 'Weather_Timestamp' is dropped first.
df = df.drop(["Weather_Timestamp"], axis=1)

df['Year'] = df['Start_Time'].dt.year

nmonth = df['Start_Time'].dt.month
df['Month'] = nmonth

df['Weekday']= df['Start_Time'].dt.weekday

# Day of the year. The previous hand-rolled cumulative table assumed a 28-day
# February, so in leap years (2016 and 2020 fall inside the plotted range)
# 29 Feb and 1 Mar both mapped to 60 and every later day was one too small.
nday = df['Start_Time'].dt.dayofyear
df['Day'] = nday

df['Hour'] = df['Start_Time'].dt.hour

# Minutes since midnight, stored as float because of the 60.0 factor.
df['Minute']=df['Hour']*60.0+df["Start_Time"].dt.minute

# head() previews by position. `.loc[:4]` slices by index *label*, and df
# still carries raw_df's labels after the city filter, so it would only show
# rows whose original label is <= 4.
df[['Start_Time', 'Year', 'Month', 'Weekday', 'Day', 'Hour', 'Minute']].head()

In [None]:
# Percentage of missing values per column, shown only for columns that have any.
missing = (
    (df.isnull().sum() / df.shape[0] * 100)
    .rename('Missing_Percent(%)')
    .rename_axis('Feature')
    .reset_index()
)
missing[missing['Missing_Percent(%)'] > 0]

In [None]:
# Remove the 'Number' and 'Wind_Chill(F)' columns.
df = df.drop(columns=['Number', 'Wind_Chill(F)'])

In [None]:
# Flag rows whose precipitation reading is missing (1 = missing, 0 = present),
# then fill the missing readings with the column median.
precip_missing = df['Precipitation(in)'].isnull()
df['Precipitation_NA'] = precip_missing.astype('int64')
df['Precipitation(in)'] = df['Precipitation(in)'].fillna(df['Precipitation(in)'].median())
# Preview the first six rows by position. `.loc[:5]` slices by index label,
# which after the city filter is no longer a 0..n-1 range; on a default
# RangeIndex both forms show the same six rows.
df[['Precipitation(in)','Precipitation_NA']].head(6)

In [None]:
# Discard rows that miss any of these location or period-of-day fields.
required_columns = [
    'City', 'Zipcode', 'Airport_Code',
    'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight',
]
df = df.dropna(subset=required_columns)

In [None]:
# group data by 'Airport_Code' and 'Start_Month' then fill NAs with median value
Weather_data=['Temperature(F)','Humidity(%)','Pressure(in)','Visibility(mi)','Wind_Speed(mph)']
print("The number of remaining missing values: ")
for i in Weather_data:
  df[i] = df.groupby(['Airport_Code','Month'])[i].apply(lambda x: x.fillna(x.median()))
  print( i + " : " + df[i].isnull().sum().astype(str))

In [None]:
# Drop rows that still miss any of the numeric weather columns listed in Weather_data.
df = df.dropna(subset=Weather_data)

In [None]:
# group data by 'Airport_Code' and 'Start_Month' then fill NAs with majority value
from collections import Counter
weather_cat = ['Wind_Direction'] + weather
print("Count of missing values that will be dropped: ")
for i in weather_cat:
  df[i] = df.groupby(['Airport_Code','Month'])[i].apply(lambda x: x.fillna(Counter(x).most_common()[0][0]) if all(x.isnull())==False else x)
  print(i + " : " + df[i].isnull().sum().astype(str))

# drop na
df = df.dropna(subset=weather_cat)

In [None]:
# Rows per 'Severity' value after the imputation and dropna steps above.
df.Severity.value_counts()

In [None]:
# def resample(dat, col, n):
#     return pd.concat([dat[dat[col]==1].sample(n, replace = True),
#                    dat[dat[col]==0].sample(n)], axis=0)

In [None]:
# Resampling is disabled; the two lines below are kept for reference only.
# df_bl = resample(df, 'Severity', 1000)
# print('resampled data:', df_bl.Severity.value_counts())
# df_bl is an independent copy of df as it stands at this cell: columns added
# to or removed from df afterwards do not appear in df_bl.
df_bl = df.copy()

In [None]:
# Accident counts per year, split by severity. 'Year' is cast to str first.
df_bl['Year'] = df_bl['Year'].astype(str)
(sns.countplot(data=df_bl, x='Year', hue='Severity', palette="Set2")
    .set_title('Count of Accidents by Year', size=15, y=1.05))
plt.show()

In [None]:
# create a dataframe used to plot heatmap
df_date = df.loc[:,['Start_Time','Severity']]         # create a new dateframe only containing time and severity
df_date['date'] = df_date['Start_Time'].dt.normalize() # keep only the date part of start time
df_date = df_date.drop(['Start_Time'], axis = 1)
df_date = df_date.groupby('date').sum()                # sum the number of accidents with severity by date
df_date = df_date.reset_index().drop_duplicates()

# join the dataframe with full range of date from 2016 to 2020
full_date = pd.DataFrame(pd.date_range(start="2016-01-02",end="2020-12-31"))    
df_date = full_date.merge(df_date, how = 'left',left_on = 0, right_on = 'date')
df_date['date'] = df_date.iloc[:,0]
df_date = df_date.fillna(0)
df_date = df_date.iloc[:,1:].set_index('date')

# group by date
groups = df_date['Severity'].groupby(pd.Grouper(freq='A'))
years = pd.DataFrame()
for name, group in groups:
    if name.year != 2020:
        years[name.year] = np.append(group.values,0)
    else:
        years[name.year] = group.values
  

# plot
years = years.T
plt.matshow(years, interpolation=None, aspect='auto')
plt.title('Time Heatmap of Accident with Severity Levels (raw data)', y=1.2, fontsize=15)
plt.show()

In [None]:
# Keep accidents that started after 2019-03-10, then drop the two time
# columns and show the severity distribution of what is left.
started_after_cutoff = df['Start_Time'] > "2019-03-10"
df = df[started_after_cutoff].drop(columns=['Year', 'Start_Time'])
df['Severity'].value_counts()

In [None]:
# Accident counts per month from the df_bl snapshot, one bar per severity level.
plt.figure(figsize=(10, 5))
(sns.countplot(data=df_bl, x='Month', hue='Severity', palette="Set2")
    .set_title('Count of Accidents by Month', size=15, y=1.05))
plt.show()

In [None]:
# Accident counts per weekday from the df_bl snapshot, one bar per severity level.
plt.figure(figsize=(10, 5))
(sns.countplot(data=df_bl, x='Weekday', hue='Severity', palette="Set2")
    .set_title('Count of Accidents by Weekday', size=15, y=1.05))
plt.show()

In [None]:
# Accident counts per period-of-day label, one panel per twilight definition.
period_features = ['Sunrise_Sunset','Civil_Twilight','Nautical_Twilight','Astronomical_Twilight']
# One row with one column per feature. The grid used to be created as 4x1
# while the loop addressed it as 1x4 via plt.subplot, which drew new axes on
# top of the ones plt.subplots had already made.
fig, axs = plt.subplots(ncols=len(period_features), nrows=1, figsize=(13, 5))

plt.subplots_adjust(wspace = 0.5)
for ax, feature in zip(axs, period_features):
    sns.countplot(x=feature, hue='Severity', data=df_bl, palette="Set2", ax=ax)

    ax.set_xlabel('{}'.format(feature), size=12, labelpad=3)
    ax.set_ylabel('Accident Count', size=12, labelpad=3)
    ax.tick_params(axis='x', labelsize=12)
    ax.tick_params(axis='y', labelsize=12)

    # Reuse the labels seaborn derived from the data. A hard-coded ['0', '1']
    # list renames the first two severity levels and hides the others.
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, labels, title='Severity', loc='upper right', prop={'size': 10})
    ax.set_title('Count of Severity in\n{} Feature'.format(feature), size=13, y=1.05)
fig.suptitle('Count of Accidents by Period-of-Day',y=1.08, fontsize=16)
plt.show()

In [None]:
# Accident counts per hour of day from the df_bl snapshot, one bar per severity level.
plt.figure(figsize=(15, 5))
(sns.countplot(data=df_bl, x='Hour', hue='Severity', palette="Set2")
    .set_title('Count of Accidents by Hour', size=15, y=1.05))
plt.show()

In [None]:
# # frequence encoding and log-transform
# df['Minute_Freq'] = df.groupby(['Minute'])['Minute'].transform('count')
# df['Minute_Freq'] = df['Minute_Freq']/df.shape[0]*24*60
# df['Minute_Freq'] = df['Minute_Freq'].apply(lambda x: np.log(x+1))

# # resampling
# df_bl = resample(df, 'Severity', 20000)

# # plot
# df_bl['Severity4'] = df_bl['Severity'].astype('category')
# sns.violinplot(x='Minute_Freq', y="Severity4", data=df_bl, palette="Set2")    
# plt.xlabel('Minute_Fre', size=12, labelpad=3)
# plt.ylabel('Severity4', size=12, labelpad=3)    
# plt.tick_params(axis='x', labelsize=12)
# plt.tick_params(axis='y', labelsize=12)
# plt.title('Minute Frequency by Severity (resampled data)', size=16, y=1.05)
# plt.show()

In [None]:
# Accident counts per timezone from the df_bl snapshot, one bar per severity level.
plt.figure(figsize=(6, 5))
chart = sns.countplot(data=df_bl, x='Timezone', hue='Severity', palette="Set2")
chart.set_title("Count of Accidents by Timezone", size=15, y=1.05)
plt.show()

In [None]:
# Frequency-encode the location columns. For each column the new
# '<column>_Freq' feature is log(1 + share * n_unique), where `share` is the
# fraction of rows that have this row's value and `n_unique` is the number of
# distinct values in the column.
fre_list = ['Zipcode', 'Airport_Code','State']
for i in fre_list:
  newname = i + '_Freq'
  df[newname] = df.groupby([i])[i].transform('count')
  # `.size`, not `.sizea`: the misspelt attribute raised AttributeError.
  df[newname] = df[newname]/df.shape[0]*df[i].unique().size
  # Vectorised form of the former per-row lambda; the values are the same.
  df[newname] = np.log(df[newname] + 1)

In [None]:
# Show the type of the array that backs the 'Zipcode_Freq' column.
type(df.Zipcode_Freq.values)

In [None]:
# Violin plots of each '<column>_Freq' location feature against severity.
# resample again
# df_bl = resample(df, 'Severity4', 20000)

df_bl['Severity'] = df_bl['Severity'].astype('category')
# The *_Freq columns were added to `df` after `df_bl` was copied from it, so
# they exist only in `df`. Plot from a temporary view of `df` whose Severity
# is categorical (that makes seaborn draw one horizontal violin per level).
freq_plot_df = df.assign(Severity=df['Severity'].astype('category'))

# One row with one panel per feature. The grid used to be created as 3x2
# while the loop addressed it as 2x3 via plt.subplot.
fig, axs = plt.subplots(ncols=len(fre_list), nrows=1, figsize=(15, 5))
plt.subplots_adjust(hspace=0.4,wspace = 0.2)
fig.suptitle('Location Frequency by Severity', fontsize=16)
for ax, feature in zip(np.ravel(axs), fre_list):
    feature = feature + '_Freq'
    sns.violinplot(x=feature, y="Severity", data=freq_plot_df, palette="Set2", ax=ax)

    ax.set_xlabel('{}'.format(feature), size=12, labelpad=3)
    ax.set_ylabel('Severity', size=12, labelpad=3)
    ax.tick_params(axis='x', labelsize=12)
    ax.tick_params(axis='y', labelsize=12)

    ax.set_title('{}'.format(feature), size=16, y=1.05)
# Was `plt.show i'm`, a syntax error that stopped the whole cell from running.
plt.show()

In [None]:
# Drop the raw columns named in fre_list; their '<column>_Freq' encodings stay in df.
df = df.drop(fre_list, axis  = 1)

In [None]:
# Box-Cox transform, with a fixed lambda per column, of the +1-shifted
# pressure, visibility and wind-speed readings. The raw columns are dropped
# once all three '*_bc' columns exist.
boxcox_specs = [
    ('Pressure(in)', 'Pressure_bc', 6),
    ('Visibility(mi)', 'Visibility_bc', 0.1),
    ('Wind_Speed(mph)', 'Wind_Speed_bc', -0.2),
]
for raw_col, bc_col, lam in boxcox_specs:
    df[bc_col] = boxcox(df[raw_col] + 1, lmbda=lam)
df = df.drop(columns=[raw_col for raw_col, _bc, _lam in boxcox_specs])

In [None]:
# Violin plots of the numeric weather features against severity.
# resample again
# df_bl = resample(df, 'Severity', 20000)

df_bl['Severity'] = df_bl['Severity'].astype('category')
num_features = ['Temperature(F)', 'Humidity(%)', 'Pressure_bc', 'Visibility_bc', 'Wind_Speed_bc']
# The '*_bc' columns were added to `df` after `df_bl` was copied from it, so
# plot from a temporary view of `df` whose Severity is categorical.
weather_plot_df = df.assign(Severity=df['Severity'].astype('category'))

# 2 rows x 3 columns, matching the (2, 3, i) layout the loop intended. The
# grid used to be created as 3x2.
fig, axs = plt.subplots(ncols=3, nrows=2, figsize=(15, 10))
plt.subplots_adjust(hspace=0.4,wspace = 0.2)
panels = np.ravel(axs)
for ax, feature in zip(panels, num_features):
    sns.violinplot(x=feature, y="Severity", data=weather_plot_df, palette="Set2", ax=ax)

    ax.set_xlabel('{}'.format(feature), size=12, labelpad=3)
    ax.set_ylabel('Severity', size=12, labelpad=3)
    ax.tick_params(axis='x', labelsize=12)
    ax.tick_params(axis='y', labelsize=12)

    ax.set_title('{} Feature by Severity'.format(feature), size=14, y=1.05)
# Hide the panels that have no feature assigned (five features, six slots).
for ax in panels[len(num_features):]:
    ax.set_visible(False)
fig.suptitle('Density of Accidents by Weather Features', fontsize=18)
# Was `plt.show()p`, a syntax error that stopped the whole cell from running.
plt.show()

In [None]:
# Accident counts for each boolean weather flag, split by severity.
# 2 rows x 4 columns, matching the (2, 4, i) layout the loop intended. The
# grid used to be created as 4x2.
fig, axs = plt.subplots(ncols=4, nrows=2, figsize=(15, 10))
plt.subplots_adjust(hspace=0.4,wspace = 0.6)
panels = np.ravel(axs)
for ax, feature in zip(panels, weather):
    sns.countplot(x=feature, hue='Severity', data=df_bl, palette="Set2", ax=ax)

    ax.set_xlabel('{}'.format(feature), size=12, labelpad=3)
    ax.set_ylabel('Accident Count', size=12, labelpad=3)
    ax.tick_params(axis='x', labelsize=12)
    ax.tick_params(axis='y', labelsize=12)

    # Reuse the labels seaborn derived from the data. A hard-coded ['0', '1']
    # list renames the first two severity levels and hides the others.
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, labels, title='Severity', loc='upper right', prop={'size': 10})
    ax.set_title('Count of Severity in \n {} Feature'.format(feature), size=14, y=1.05)
# Hide the panels that have no feature assigned.
for ax in panels[len(weather):]:
    ax.set_visible(False)
fig.suptitle('Count of Accidents by Weather Features', fontsize=18)
plt.show()

In [None]:
# Drop the 'Heavy_Rain', 'Heavy_Snow' and 'Fog' flag columns from df (df_bl keeps them).
df = df.drop(['Heavy_Rain','Heavy_Snow','Fog'], axis  = 1)

In [None]:
# Accident counts per simplified wind direction, one bar per severity level.
plt.figure(figsize=(10, 5))
chart = sns.countplot(data=df_bl, x='Wind_Direction', hue='Severity', palette="Set2")
chart.set_title("Count of Accidents in Wind Direction", size=15, y=1.05)
plt.show()

In [None]:
# Drop 'Wind_Direction' from df (df_bl keeps it).
df = df.drop(['Wind_Direction'], axis=1)

In [None]:
# Accident counts for each point-of-interest flag, split by severity.
POI_features = ['Amenity','Bump','Crossing','Give_Way','Junction','No_Exit','Railway','Roundabout','Station','Stop','Traffic_Calming','Traffic_Signal']

# 3 rows x 4 columns, matching the (3, 4, i) layout the loop intended. The
# grid used to be created as 4x3.
fig, axs = plt.subplots(ncols=4, nrows=3, figsize=(15, 10))

plt.subplots_adjust(hspace=0.5,wspace = 0.5)
panels = np.ravel(axs)
for ax, feature in zip(panels, POI_features):
    sns.countplot(x=feature, hue='Severity', data=df_bl, palette="Set2", ax=ax)

    ax.set_xlabel('{}'.format(feature), size=12, labelpad=3)
    ax.set_ylabel('Accident Count', size=12, labelpad=3)
    ax.tick_params(axis='x', labelsize=12)
    ax.tick_params(axis='y', labelsize=12)

    # Reuse the labels seaborn derived from the data. A hard-coded ['0', '1']
    # list renames the first two severity levels and hides the others.
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, labels, title='Severity', loc='upper right', prop={'size': 10})
    ax.set_title('Count of Severity in {}'.format(feature), size=14, y=1.05)
# Hide any panel that has no feature assigned.
for ax in panels[len(POI_features):]:
    ax.set_visible(False)
fig.suptitle('Count of Accidents in POI Features',y=1.02, fontsize=16)
plt.show()

In [None]:
# Drop six of the point-of-interest flag columns from df (df_bl keeps them).
df= df.drop(['Amenity','Bump','Give_Way','No_Exit','Roundabout','Traffic_Calming'], axis=1)

Correlation analysis

In [None]:
# Preview df after the feature drops above.
df.head()

Modelling

In [None]:
# one-hot encoding
df[period_features] = df[period_features].astype('category')
df = pd.get_dummies(df, columns=period_features, drop_first=True)

In [None]:
# resample again
# df_bl = resample(df, 'Severity4', 20000)

# plot correlation
df_bl['Severity'] = df_bl['Severity'].astype(int)
plt.figure(figsize=(25,25))
cmap = sns.diverging_palette(220, 20, sep=20, as_cmap=True)
# Correlate only the numeric and boolean columns. df_bl still holds text
# columns, and DataFrame.corr() raises on those in current pandas instead of
# silently skipping them as older releases did.
numeric_bl = df_bl.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_bl.corr(), annot=True,cmap=cmap, center=0).set_title("Correlation Heatmap", fontsize=14)
plt.show()

In [None]:
# Columns to remove from df before modelling.
columns_to_drop = ['Temperature(F)', 'Humidity(%)', 'Precipitation(in)', 'Precipitation_NA','Visibility_bc', 'Wind_Speed_bc',
                   'Clear','Cloud','Snow','Crossing','Junction','Railway','Month',
                   'Hour', 'Day','Minute', 'City_Freq','County_Freq','Airport_Code_Freq','Zipcode_Freq',
                   'Sunrise_Sunset_Night', 'Civil_Twilight_Night', 'Nautical_Twilight_Night']
# 'City_Freq' and 'County_Freq' are never created by the frequency-encoding
# cell, so a plain drop raises KeyError. Drop what exists and report the rest
# instead of failing or silently ignoring a misspelt name.
not_present = [col for col in columns_to_drop if col not in df.columns]
if not_present:
    print("Columns not present in df (skipped):", not_present)
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

In [None]:
# resample again
# df_bl = resample(df, 'Severity', 20000)

# plot correlation
# NOTE(review): this still plots df_bl, which the column drop on df just
# above does not affect -- confirm which frame is meant here.
df_bl['Severity'] = df_bl['Severity'].astype(int)
plt.figure(figsize=(20,20))
cmap = sns.diverging_palette(220, 20, sep=20, as_cmap=True)
# Correlate only the numeric and boolean columns. DataFrame.corr() raises on
# text columns in current pandas instead of silently skipping them.
numeric_bl = df_bl.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_bl.corr(), annot=True,cmap=cmap, center=0).set_title("Correlation Heatmap", fontsize=14)
plt.show()

In [None]:
# Column names of df_bl, in frame order.
list(df_bl.columns)

In [None]:
# Column names of df_bl, sorted (iterating a DataFrame yields its column labels).
sorted(df_bl)

In [None]:
# Column names of df, sorted, for comparison with df_bl above.
sorted(df)

In [None]:
# df_b2: a new frame derived from df_bl without the columns listed below.
# drop() returns a new object, so df_bl itself is left untouched.
b2_excluded = [
    'Temperature(F)', 'Humidity(%)', 'Precipitation(in)', 'Precipitation_NA',
    'Clear', 'Cloud', 'Snow', 'Crossing', 'Junction', 'Railway', 'Month',
    'Hour', 'Day', 'Minute',
    'Astronomical_Twilight', 'Civil_Twilight', 'Nautical_Twilight',
]
df_b2 = df_bl.drop(columns=b2_excluded)

In [None]:
# Sorted column names that remain in df_b2.
sorted(df_b2)

In [None]:
# plot correlation
df_b2['Severity'] = df_b2['Severity'].astype(int)
plt.figure(figsize=(25,25))
cmap = sns.diverging_palette(220, 20, sep=20, as_cmap=True)
# Plot the reduced frame prepared on the first line of this cell. The heatmap
# used to be drawn from df_bl, which repeated the earlier plot unchanged.
# Only numeric and boolean columns are correlated, because DataFrame.corr()
# raises on text columns in current pandas.
numeric_b2 = df_b2.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_b2.corr(), annot=True,cmap=cmap, center=0).set_title("Correlation Heatmap", fontsize=14)
plt.show()

In [None]:
# Print the Pearson, Kendall and Spearman correlation matrices of df_b2.
# Only numeric and boolean columns are correlated: DataFrame.corr() raises on
# text columns in current pandas instead of silently skipping them.
numeric_b2 = df_b2.select_dtypes(include=['number', 'bool'])
correlation_reports = [
    ("pearson", "Pearson correlation coefficient:"),
    ("kendall", "Kendall Tau correlation coefficient:"),
    ("spearman", "Spearman rank correlation:"),
]
for method, heading in correlation_reports:
    # The (misspelt) name `corrrelation` is kept as a notebook global; after
    # the loop it holds the Spearman matrix, as before.
    corrrelation = numeric_b2.corr(method=method)
    print(heading)
    print(corrrelation)

In [None]:
# Sorted column names of df_b2, shown again before the next reduction.
sorted(df_b2)

In [None]:
# df_b3: a new frame derived from df_b2 without the columns listed below.
# drop() returns a new object, so df_b2 itself is left untouched.
b3_excluded = ['Airport_Code', 'Year', 'Amenity', 'Bump', 'City', 'County',
               'Fog', 'Give_Way', 'No_Exit', 'Roundabout', 'Traffic_Calming']
df_b3 = df_b2.drop(columns=b3_excluded)

In [None]:
# Sorted column names that remain in df_b3.
sorted(df_b3)

In [None]:
# Rows per 'Severity' value in df_b3.
df_b3.Severity.value_counts()

In [None]:
# Rows per 'Zipcode' value in df_b3, most frequent first.
df_b3.Zipcode.value_counts()

In [None]:
# Number of rows in df_b3.
len(df_b3['Severity'])

In [None]:
# 0.01% of the df_b3 row count.
# NOTE(review): presumably a candidate minimum-rows-per-zipcode threshold; the next cell uses a fixed 10 -- confirm.
0.0001 * len(df_b3['Severity'])

In [None]:
# Keep only the rows whose zipcode occurs more than 10 times in df_b3.
rows_per_zipcode = df_b3.groupby('Zipcode')['Zipcode'].transform('count')
sub_df = df_b3.loc[rows_per_zipcode > 10].copy()
sub_df['Zipcode'].value_counts()

In [None]:
# cor = sub_df.corr(method="pearson");

# print("Pearson correlation coefficient:");

# print(cor);

In [None]:
# #Correlation with output variable
# cor_target = abs(cor["Severity"])
# #Selecting highly correlated features
# relevant_features = cor_target[cor_target>0.001]
# relevant_features

In [None]:
# Sorted column names of sub_df.
sorted(sub_df)

In [None]:
# plot correlation
sub_df['Severity'] = sub_df['Severity'].astype(int)
plt.figure(figsize=(25,25))
cmap = sns.diverging_palette(220, 20, sep=20, as_cmap=True)
# Correlate only the numeric and boolean columns. DataFrame.corr() raises on
# text columns in current pandas instead of silently skipping them.
numeric_sub = sub_df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_sub.corr(), annot=True,cmap=cmap, center=0).set_title("Correlation Heatmap", fontsize=14)
plt.show()

In [None]:
# Column dtypes of sub_df before the Zipcode conversion in the next cell.
sub_df.dtypes

In [None]:
# Convert 'Zipcode' to a numeric column.
# Only the part before any '-' is kept. pd.to_numeric raises on a hyphenated
# value such as '12345-6789', so the extended part is cut off first.
# NOTE(review): assumes hyphenated ZIP+4 strings occur in this column -- confirm against the data.
# Values that are not numeric after that still raise, as before.
zip_prefix = sub_df["Zipcode"].astype(str).str.split('-').str[0]
sub_df["Zipcode"] = pd.to_numeric(zip_prefix)

sub_df.dtypes

Sample Real Time API

In [None]:
# Sketch of the planned real-time prediction API. No model is trained in this
# notebook, so the function is an explicit stub.

# Names of the fields a prediction request is expected to carry.
# NOTE(review): `input` shadows the Python builtin; the name is kept so any
# code that already refers to it keeps working.
input = ['weather_Event','time_of_day','day_ofweek', 'landmark', 'temp', 'zipcode']

# The severity levels a prediction may take.
Severity_type_value = (1,2,3,4)


def predict_severity_by_zipcode(features):
    """Return the predicted severity for one request (not implemented yet).

    The earlier line `predict_severity_by_zipcode(input) = Severity_type_value`
    was pseudo-code: assigning to a function call is a syntax error.

    Parameters
    ----------
    features : sequence
        Values for the fields named in `input`, in that order.

    Returns
    -------
    int
        Intended to be one of `Severity_type_value`.

    Raises
    ------
    NotImplementedError
        Always, until a trained model is wired in.
    """
    raise NotImplementedError(
        "predict_severity_by_zipcode is an API sketch; no model is attached yet"
    )