In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf

In [2]:
pd.options.display.max_columns = 47

In [3]:
data = pd.read_csv('US_Accidents_Dec20_Updated.csv')

In [15]:
def accidents_ETL(accidents):
    
    #drop unnecessary columns
    dropthese = ['ID', 'End_Lat', 'End_Lng', 'End_Time',    'Description', 'Distance(mi)', 'Number', 'Street', 'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp', 'Start_Lat', 'Start_Lng', 'Nautical_Twilight', 'Astronomical_Twilight', 'Wind_Chill(F)', 'Amenity', 'Sunrise_Sunset']
    accidents = accidents.drop(columns = dropthese, axis = 1)

    #Remname Civil Twilight
    accidents = accidents.rename(columns={'Civil_Twilight': 'Day/Night'})

    #modify Start_Time to datetime so we can extract year, month, day, hour
    accidents['Start_Time'] = pd.to_datetime(accidents['Start_Time'])

    #extracting year, month, day, hour
    accidents['Year'] = pd.DatetimeIndex(accidents['Start_Time']).year
    accidents['Month'] = pd.DatetimeIndex(accidents['Start_Time']).month
    accidents['Day'] = pd.DatetimeIndex(accidents['Start_Time']).day
    accidents['Hour'] = pd.DatetimeIndex(accidents['Start_Time']).hour
    
    #drop unnecessary Start_Time column
    accidents = accidents.drop(['Start_Time'], axis=1)

    #remove expanded zip code value
    accidents2 = accidents['Zipcode'].str.split('-', expand=True)
    accidents['Zipcode'] = accidents2[0]

    #Combining City/State and County/State to prevent wrong city or county aggregation
    accidents['City_State'] = accidents['City'].astype(str) + ',' + accidents['State'].astype(str)
    accidents['County_State'] = accidents['County'].astype(str) + ',' + accidents['State'].astype(str)
    accidents['City'] = accidents['City_State']
    accidents['County'] = accidents['County_State']
    accidents = accidents.drop(['City_State', 'County_State'], axis=1)

    #Replacing wind directions to cardinal 8 directions
    accidents['Wind_Direction'].replace('CALM', 'Calm', inplace=True)
    accidents['Wind_Direction'].replace(['ENE', 'NNE'], 'NE', inplace=True)
    accidents['Wind_Direction'].replace(['ESE', 'SSE'], 'SE', inplace=True)
    accidents['Wind_Direction'].replace(['WNW', 'NNW'], 'NW', inplace=True)
    accidents['Wind_Direction'].replace(['WSW', 'SSW'], 'SW', inplace=True)
    accidents['Wind_Direction'].replace('North', 'N', inplace=True)
    accidents['Wind_Direction'].replace('East', 'E', inplace=True)
    accidents['Wind_Direction'].replace('South', 'S', inplace=True)
    accidents['Wind_Direction'].replace('West', 'W', inplace=True)
    accidents['Wind_Direction'].replace('VAR', 'Variable', inplace=True)

    #Chaning Weather_Condition NaN to "Unknown"
    accidents['Weather_Condition'] = accidents['Weather_Condition'].fillna('Unkown')

    #Chaning Temperature NaN to "Unknown"
    accidents['Temperature(F)'] = accidents['Temperature(F)'].fillna('Unkown')

    #Chaning Humidity NaN to "Unknown"
    accidents['Humidity(%)'] = accidents['Humidity(%)'].fillna('Unkown')

    #Chaning Pressure NaN to "Unknown"
    accidents['Pressure(in)'] = accidents['Pressure(in)'].fillna('Unkown')

    #Dropping Day/Night NaN values - because daylight hours change and cannot be reliably determined by time
    noDayNight = accidents[accidents['Day/Night'].isnull()]
    accidents = accidents.drop(noDayNight.index)


    #Filling in missing Zipcodes
    accidents['Zipcode'] = np.where((accidents['Zipcode'].isnull()) & (accidents['City'] == 'St. Petersburg,FL'), '33713', accidents.Zipcode)
    accidents['Zipcode'] = np.where((accidents['Zipcode'].isnull()) & (accidents['City'] == 'District 4 Kent Island,MD'), '21666', accidents.Zipcode)
    accidents['Zipcode'] = np.where((accidents['Zipcode'].isnull()) & (accidents['City'] == 'Ross Valley,CA'), '94939', accidents.Zipcode)
    accidents['Zipcode'] = np.where((accidents['Zipcode'].isnull()) & (accidents['City'] == 'Springville-Mapleton,UT'), '84663', accidents.Zipcode)
    accidents['Zipcode'] = np.where((accidents['Zipcode'].isnull()) & (accidents['City'] == 'West Contra Costa,CA'), '94530', accidents.Zipcode)
    accidents['Zipcode'] = np.where((accidents['Zipcode'].isnull()) & (accidents['City'] == 'San Mateo,CA'), '94403', accidents.Zipcode)
    accidents['Zipcode'] = np.where((accidents['Zipcode'].isnull()) & (accidents['City'] == 'East Tehama,CA'), '96090', accidents.Zipcode)
    accidents['Zipcode'] = np.where((accidents['Zipcode'].isnull()) & (accidents['City'] == 'Southeast Marin,CA'), '94956', accidents.Zipcode)
    accidents['Zipcode'] = np.where((accidents['Zipcode'].isnull()) & (accidents['City'] == 'District 5,MD'), '20659', accidents.Zipcode)
    accidents['Zipcode'] = np.where((accidents['Zipcode'].isnull()) & (accidents['City'] == 'Avalon-Mulat,FL'), '32583', accidents.Zipcode)

    #Removing remaining NaN Zipcodes:
    no_zipcode = accidents[accidents['Zipcode'].isnull()]
    accidents = accidents.drop(no_zipcode.index)

    #Correcting Precipitation(in) NaN to 0 because if there was precipitation it would have likely been recorded. 
    accidents['Precipitation(in)'] = accidents['Precipitation(in)'].fillna(0.00)

    #Correcting Wind_Speed(mph) NaN to 0 because if there was wind it would have likely been recorded. 
    accidents['Wind_Speed(mph)'] = accidents['Wind_Speed(mph)'].fillna(0.0)

    #Changing unknown Wind_Direction with Wind_Speed(mi)>0 to "Variable"
    accidents['Wind_Direction'] = np.where((accidents['Wind_Direction'].isnull()) & (accidents['Wind_Speed(mph)'] > 0), 'Variable', accidents.Wind_Direction)

    #Changing Wind_Direction NaN with Wind_Speed(mi) of 0 to "Calm"
    accidents['Wind_Direction'] = np.where((accidents['Wind_Direction'].isnull()) & (accidents['Wind_Speed(mph)'] == 0), 'Calm', accidents.Wind_Direction)

    #Change Visibilty(mi) NaN values to the median of 10.0
    accidents['Visibility(mi)'] = accidents['Visibility(mi)'].fillna(10.00)


    #REMOVING OUTLIERS:

    #eliminating Wind_Speed outliers
    high_wind = accidents[accidents['Wind_Speed(mph)'] > 100]
    accidents = accidents.drop(high_wind.index)

    #eliminating Precipitation(in) outliers
    high_rain = accidents[accidents['Precipitation(in)'] > 13]
    accidents = accidents.drop(high_rain.index)


    #Converting Bool columns to integers to prepare for scaling and ML modeling
    bool_cols = accidents.dtypes[accidents.dtypes == 'bool'].index.tolist()
    bool_int = accidents[bool_cols].astype(int)
    encoding = accidents.copy()
    encoding = accidents.drop(bool_int,1)
    encoding = encoding.merge(bool_int, left_index=True, right_index=True)


    #Weather Consolidtation

    #Clear
    accidents['Weather_Condition'].replace('Clear', 'Fair', inplace=True)
    accidents['Weather_Condition'].replace('N/A Precipitation', 'Fair', inplace=True)

    #Unknown
    accidents['Weather_Condition'].replace('N/A Preciptiation', 'Unknown', inplace=True)

    #Cloudy
    accidents['Weather_Condition'].replace('Overcast', 'Mostly Cloudy', inplace=True)
    accidents['Weather_Condition'].replace('Scattered Clouds', 'Partly Cloudy', inplace=True)

    #Rain
    accidents['Weather_Condition'].replace('Light Rain Shower', 'Light Rain', inplace=True)
    accidents['Weather_Condition'].replace('Light Rain Showers', 'Light Rain', inplace=True)
    accidents['Weather_Condition'].replace('Light Drizzle', 'Light Rain', inplace=True)
    accidents['Weather_Condition'].replace('Light Freezing Drizzle', 'Light Rain', inplace=True)
    accidents['Weather_Condition'].replace('Light Freezing Rain', 'Light Rain', inplace=True)
    accidents['Weather_Condition'].replace('Drizzle', 'Light Rain', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Drizzle', 'Rain', inplace=True)
    accidents['Weather_Condition'].replace('Showers in the Vicinity', 'Rain', inplace=True)
    accidents['Weather_Condition'].replace('Rain Showers', 'Rain', inplace=True)
    accidents['Weather_Condition'].replace('Rain Shower', 'Rain', inplace=True)
    accidents['Weather_Condition'].replace('Freezing Rain', 'Rain', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Rain Shower', 'Heavy Rain', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Freezing Rain', 'Heavy Rain', inplace=True)
    accidents['Weather_Condition'].replace('Freezing Drizzle', 'Rain', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Rain Showers', 'Heavy Rain', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Freezing Drizzle', 'Rain', inplace=True)

    #Fog
    accidents['Weather_Condition'].replace('Patches of Fog', 'Fog', inplace=True)
    accidents['Weather_Condition'].replace('Light Freezing Fog', 'Fog', inplace=True)
    accidents['Weather_Condition'].replace('Partial Fog', 'Fog', inplace=True)
    accidents['Weather_Condition'].replace('Light Fog', 'Fog', inplace=True)

    #Smoke/Haze
    accidents['Weather_Condition'].replace('Smoke', 'Smoke/Haze', inplace=True)
    accidents['Weather_Condition'].replace('Haze', 'Smoke/Haze', inplace=True)
    accidents['Weather_Condition'].replace('Light Haze', 'Smoke/Haze', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Smoke', 'Smoke/Haze', inplace=True)

    #Thunderstorms
    accidents['Weather_Condition'].replace('T-Storm', 'Thunderstorm', inplace=True)
    accidents['Weather_Condition'].replace('Light Thunderstorms and Rain', 'Thunderstorm', inplace=True)
    accidents['Weather_Condition'].replace('Thunder in the Vicinity', 'Thunder', inplace=True)
    accidents['Weather_Condition'].replace('Light Rain with Thunder', 'Thunderstorm', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Thunderstorms and Rain', 'Thunderstorm', inplace=True)
    accidents['Weather_Condition'].replace('Heavy T-Storm', 'Thunderstorm', inplace=True)
    accidents['Weather_Condition'].replace('Thunderstorms and Rain', 'Thunderstorm', inplace=True)
    accidents['Weather_Condition'].replace('Light Thunderstorms and Snow', 'Thunderstorms/Snow', inplace=True)
    accidents['Weather_Condition'].replace('Thunderstorms and Snow', 'Thunderstorms/Snow', inplace=True)
    accidents['Weather_Condition'].replace('Light Thunderstorm', 'Thunderstorm', inplace=True)

    #Snow
    accidents['Weather_Condition'].replace('Light Snow Showers', 'Snow', inplace=True)
    accidents['Weather_Condition'].replace('Snow Grains', 'Snow', inplace=True)
    accidents['Weather_Condition'].replace('Light Blowing Snow', 'Snow/Windy', inplace=True)
    accidents['Weather_Condition'].replace('Light Snow Grains', 'Snow', inplace=True)
    accidents['Weather_Condition'].replace('Low Drifting Snow', 'Snow', inplace=True)
    accidents['Weather_Condition'].replace('Snow Showers', 'Snow', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Blowing Snow', 'Heavy Snow/Windy', inplace=True)
    accidents['Weather_Condition'].replace('Light Snow Shower', 'Snow', inplace=True)
    accidents['Weather_Condition'].replace('Drifting Snow', 'Snow/Windy', inplace=True)

    #Hail
    accidents['Weather_Condition'].replace('Light Ice Pellets', 'Hail', inplace=True)
    accidents['Weather_Condition'].replace('Ice Pellets', 'Hail', inplace=True)
    accidents['Weather_Condition'].replace('Small Hail', 'Hail', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Ice Pellets', 'Hail', inplace=True)
    accidents['Weather_Condition'].replace('Light Hail', 'Hail', inplace=True)

    #Sleet
    accidents['Weather_Condition'].replace('Light Sleet', 'Sleet', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Sleet', 'Sleet', inplace=True)

    #Dust
    accidents['Weather_Condition'].replace('Sand', 'Dust', inplace=True)
    accidents['Weather_Condition'].replace('Widespread Dust', 'Dust', inplace=True)

    #Tornado
    accidents['Weather_Condition'].replace('Tornado', 'Funnel Cloud', inplace=True)                          

    #Encoding Obj columns using OneHotEncoder


    return(accidents)

In [16]:
accidents = accidents_ETL(data)
accidents

Unnamed: 0,Severity,Side,City,County,State,Zipcode,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),Weather_Condition,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Day/Night,Year,Month,Day,Hour
0,2,R,"Greenville,SC","Greenville,SC",SC,29607,76.0,52.0,28.91,10.0,N,7.0,0.0,Fair,False,False,False,False,False,False,False,False,False,False,False,False,Day,2019,5,21,8
1,2,R,"Charlotte,NC","Mecklenburg,NC",NC,28270,76.0,62.0,29.3,10.0,Variable,3.0,0.0,Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,Day,2019,10,7,17
2,2,R,"Los Gatos,CA","Santa Clara,CA",CA,95033,51.0,80.0,30.17,10.0,W,6.0,0.0,Fair,False,False,False,False,False,False,False,False,False,False,False,False,Night,2020,12,13,21
3,2,R,"Carson City,NV","Douglas,NV",NV,89705,53.6,16.0,30.16,10.0,SW,4.6,0.0,Clear,False,False,False,False,False,False,False,False,False,False,True,False,Day,2018,4,17,16
4,3,R,"Fort Lauderdale,FL","Broward,FL",FL,33324,84.2,84.0,29.92,10.0,SE,13.8,0.0,Overcast,False,False,False,True,False,False,False,False,False,False,True,False,Day,2016,8,31,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2906605,2,L,"Houston,TX","Harris,TX",TX,77018,84.2,70.0,30.02,9.0,Variable,5.8,0.0,Clear,False,False,False,False,False,False,False,False,True,False,False,False,Day,2018,6,28,8
2906606,2,R,"Colton,CA","San Bernardino,CA",CA,92324,46.9,74.0,30.14,10.0,Calm,0.0,0.0,Clear,False,False,False,False,False,False,False,False,False,False,False,False,Night,2019,1,10,2
2906607,2,L,"Miami,FL","Miami-Dade,FL",FL,33173,76.0,85.0,30.0,10.0,NW,16.0,0.0,Mostly Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,Day,2020,11,23,12
2906608,2,R,"Salt Lake City,UT","Salt Lake,UT",UT,84129,27.0,81.0,25.81,10.0,SE,8.0,0.0,Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,Night,2019,12,29,22


In [17]:
pd.options.display.max_rows = None
print(accidents.dtypes)
pd.options.display.max_rows = 20

Severity               int64
Side                  object
City                  object
County                object
State                 object
Zipcode               object
Temperature(F)        object
Humidity(%)           object
Pressure(in)          object
Visibility(mi)       float64
Wind_Direction        object
Wind_Speed(mph)      float64
Precipitation(in)    float64
Weather_Condition     object
Bump                    bool
Crossing                bool
Give_Way                bool
Junction                bool
No_Exit                 bool
Railway                 bool
Roundabout              bool
Station                 bool
Stop                    bool
Traffic_Calming         bool
Traffic_Signal          bool
Turning_Loop            bool
Day/Night             object
Year                   int64
Month                  int64
Day                    int64
Hour                   int64
dtype: object


In [18]:
pd.options.display.max_rows = None
print(accidents.isnull().sum())
pd.options.display.max_rows = 20

Severity             0
Side                 0
City                 0
County               0
State                0
Zipcode              0
Temperature(F)       0
Humidity(%)          0
Pressure(in)         0
Visibility(mi)       0
Wind_Direction       0
Wind_Speed(mph)      0
Precipitation(in)    0
Weather_Condition    0
Bump                 0
Crossing             0
Give_Way             0
Junction             0
No_Exit              0
Railway              0
Roundabout           0
Station              0
Stop                 0
Traffic_Calming      0
Traffic_Signal       0
Turning_Loop         0
Day/Night            0
Year                 0
Month                0
Day                  0
Hour                 0
dtype: int64


In [22]:
pd.options.display.max_rows = None
print(encoding.dtypes)
pd.options.display.max_rows = 20

Severity               int64
Side                  object
City                  object
County                object
State                 object
Zipcode               object
Temperature(F)        object
Humidity(%)           object
Pressure(in)          object
Visibility(mi)       float64
Wind_Direction        object
Wind_Speed(mph)      float64
Precipitation(in)    float64
Weather_Condition     object
Day/Night             object
Year                   int64
Month                  int64
Day                    int64
Hour                   int64
Bump                   int64
Crossing               int64
Give_Way               int64
Junction               int64
No_Exit                int64
Railway                int64
Roundabout             int64
Station                int64
Stop                   int64
Traffic_Calming        int64
Traffic_Signal         int64
Turning_Loop           int64
dtype: object


In [68]:
pd.options.display.max_rows = None
test = accidents['Weather_Condition']

desc = test.describe()
nan = test.isnull().sum()
counts = test.value_counts(dropna=False)

print(f'Stats: {desc}')
print(f'NaN: {nan}')
print(f'Value Counts: {counts}')
pd.options.display.max_rows = 20

Stats: count     2906065
unique         77
top          Fair
freq      1192074
Name: Weather_Condition, dtype: object
NaN: 0
Value Counts: Fair                                   1192074
Mostly Cloudy                           623174
Partly Cloudy                           395915
Cloudy                                  245043
Light Rain                              154686
Unkown                                   71479
Light Snow                               39939
Smoke/Haze                               39033
Fog                                      36033
Rain                                     34130
Thunderstorm                             16437
Heavy Rain                               12342
Fair / Windy                              9099
Thunder                                   5143
Mostly Cloudy / Windy                     5098
Cloudy / Windy                            4769
Snow                                      4617
Partly Cloudy / Windy                     3053
Light Rain / Wi

In [69]:
obj_cols = accidents.dtypes[accidents.dtypes == 'object'].index.tolist()
obj_cols

['Side',
 'City',
 'County',
 'State',
 'Zipcode',
 'Temperature(F)',
 'Humidity(%)',
 'Pressure(in)',
 'Wind_Direction',
 'Weather_Condition',
 'Day/Night']

In [70]:
enc = OneHotEncoder(sparse=False)
encode_df = pd.DataFrame(enc.fit_transform(encoding[obj_cols]))
encode_df.columns = enc.get_feature_names(obj_cols)
encode_df.head()

TypeError: Encoders require their input to be uniformly strings or numbers. Got ['float', 'str']

In [72]:
accidents

Unnamed: 0,Severity,Side,City,County,State,Zipcode,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),Weather_Condition,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Day/Night,Year,Month,Day,Hour
0,2,R,"Greenville,SC","Greenville,SC",SC,29607,76,52,28.91,10.0,N,7.0,0.0,Fair,False,False,False,False,False,False,False,False,False,False,False,False,Day,2019,5,21,8
1,2,R,"Charlotte,NC","Mecklenburg,NC",NC,28270,76,62,29.3,10.0,Variable,3.0,0.0,Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,Day,2019,10,7,17
2,2,R,"Los Gatos,CA","Santa Clara,CA",CA,95033,51,80,30.17,10.0,W,6.0,0.0,Fair,False,False,False,False,False,False,False,False,False,False,False,False,Night,2020,12,13,21
3,2,R,"Carson City,NV","Douglas,NV",NV,89705,53.6,16,30.16,10.0,SW,4.6,0.0,Clear,False,False,False,False,False,False,False,False,False,False,True,False,Day,2018,4,17,16
4,3,R,"Fort Lauderdale,FL","Broward,FL",FL,33324,84.2,84,29.92,10.0,SE,13.8,0.0,Overcast,False,False,False,True,False,False,False,False,False,False,True,False,Day,2016,8,31,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2906605,2,L,"Houston,TX","Harris,TX",TX,77018,84.2,70,30.02,9.0,Variable,5.8,0.0,Clear,False,False,False,False,False,False,False,False,True,False,False,False,Day,2018,6,28,8
2906606,2,R,"Colton,CA","San Bernardino,CA",CA,92324,46.9,74,30.14,10.0,Calm,,0.0,Clear,False,False,False,False,False,False,False,False,False,False,False,False,Night,2019,1,10,2
2906607,2,L,"Miami,FL","Miami-Dade,FL",FL,33173,76,85,30,10.0,NW,16.0,0.0,Mostly Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,Day,2020,11,23,12
2906608,2,R,"Salt Lake City,UT","Salt Lake,UT",UT,84129,27,81,25.81,10.0,SE,8.0,0.0,Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,Night,2019,12,29,22
