Welcome! This notebook contains code that lets the user select options related to the samples they wish to run the ML models on. Cells with options are usually commented out until the desired options are selected and uncommented. These options include running the notebook in Colab or in a local notebook, with ways to import/export data in either environment.

endpoint: database-1.cguvsmompkjc.us-east-2.rds.amazonaws.com

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
import pandas as pd
import psycopg2
import sqlalchemy
import getpass
import os
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from collections import Counter
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler

  """)


In [None]:
# Ask for the AWS database endpoint without echoing it. The prompt text is
# passed to getpass itself so the user is not shown the default "Password: "
# prompt for a value that is not a password. Surrounding whitespace from a
# pasted value is stripped because the endpoint is embedded verbatim in the
# connection URL.
endpoint = getpass.getpass(prompt='Enter AWS Endpoint: ').strip()

Enter AWS Endpoint:
··········


In [None]:
# Ask for the database password without echoing it. The prompt text is handed
# to getpass directly so exactly one, accurate prompt is displayed (a separate
# print() would be followed by getpass's own default "Password: " prompt).
# The value is deliberately not stripped: whitespace may be part of a password.
aws_pass = getpass.getpass(prompt='Enter AWS Database Password: ')

Enter AWS Database Password: 
··········


In [None]:
# Database user name; interpolated into the connection string below.
postgres = 'postgres'

In [None]:
# Percent-encode the password before embedding it in the connection URL:
# characters such as '@', ':', '/' or '%' would otherwise be parsed as URL
# delimiters and the connection would fail or target the wrong host.
# safe='' makes quote() encode every reserved character, including '/'.
from urllib.parse import quote

db_string = (
    f"postgresql+psycopg2://{postgres}:{quote(aws_pass, safe='')}"
    f"@{endpoint}:5432/bootcampfinalproject_accidents"
)

# Create the engine and open a connection; `con` is left open because the
# following cells execute the query on it.
engine = sqlalchemy.create_engine(db_string)
con = engine.connect()

In [None]:
# Reflect the accidents table's definition from the database through the
# engine, then build a SELECT over that table.
table = sqlalchemy.Table(
    'US_Accidents_Dec20_Updated',
    sqlalchemy.MetaData(bind=None),
    autoload=True,
    autoload_with=engine,
)
statement = sqlalchemy.select([table])

In [None]:
# Run the SELECT on the open connection; `x` is the result object that the
# next cells read rows and column names from.
x = con.execute(statement)

In [None]:
# Fetch all rows of the result into memory.
data = x.fetchall()

In [None]:
# Column names of the result; used as the DataFrame's column labels.
cols = x.keys()

In [None]:
# Build the working DataFrame from the fetched rows, labelling the columns
# with the names reported by the query result.
accidents_df = pd.DataFrame(data, columns=cols)
#accidents_df.head()

In [None]:
#Local Data Entry
#data = pd.read_csv('US_Accidents_Dec20_Updated.csv')

In [None]:
def accidents_ETL(accidents):
    #drop unnecessary columns
    dropthese = ['ID', 'End_Lat', 'End_Lng', 'End_Time', 'Description', 'Distance(mi)', 'Number', 'Street', 'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp', 'Start_Lat', 'Start_Lng', 'Nautical_Twilight', 'Astronomical_Twilight', 'Wind_Chill(F)', 'Amenity', 'Sunrise_Sunset']
    accidents = accidents.drop(columns = dropthese, axis = 1)

    #-------------------------------------------------------#
    #-------------------------------------------------------#
 
    #TWILIGHT:

    #Remname Civil Twilight
    accidents = accidents.rename(columns={'Civil_Twilight': 'Day/Night'})

    #-------------------------------------------------------#
    #-------------------------------------------------------#
    #START TIME: 

    #modify Start_Time to datetime so we can extract year, month, day, hour
    accidents['Start_Time'] = pd.to_datetime(accidents['Start_Time'])

    #extracting year, month, day, hour
    accidents['Year'] = pd.DatetimeIndex(accidents['Start_Time']).year
    accidents['Month'] = pd.DatetimeIndex(accidents['Start_Time']).month
    accidents['Day'] = pd.DatetimeIndex(accidents['Start_Time']).day
    accidents['Hour'] = pd.DatetimeIndex(accidents['Start_Time']).hour

    #drop unnecessary Start_Time column
    accidents = accidents.drop(['Start_Time'], axis=1)

    #Set date/time columns as objects to prepare for encoding
    accidents['Year'] = accidents['Year'].astype(object)
    accidents['Month'] = accidents['Month'].astype(object)
    accidents['Day'] = accidents['Day'].astype(object)
    accidents['Hour'] = accidents['Hour'].astype(object)

    #-------------------------------------------------------#
    #-------------------------------------------------------#
 
    #SIDE: 
            
    #Changing Side to R=0 and L=1 and making it integers for ML purposes
    accidents['Side'].replace('R', '0', inplace=True)
    accidents['Side'].replace('L', '1', inplace=True)
    accidents['Side'] = np.where((accidents['Side'] == " "), '0', accidents['Side'])
    accidents['Side'] = pd.to_numeric(accidents['Side'], downcast='integer')

    #-------------------------------------------------------#
    #-------------------------------------------------------#
    #CITY AND COUNTY:

    #City and County were determined to be excessive and were dropped following feature testing. 
    accidents = accidents.drop(['City', 'County'], axis=1)
    
    #Combining City/State and County/State to prevent wrong city or county aggregation
    # accidents['City_State'] = accidents['City'].astype(str) + ',' + accidents['State'].astype(str)
    # accidents['County_State'] = accidents['County'].astype(str) + ',' + accidents['State'].astype(str)
    # accidents['City'] = accidents['City_State']
    # accidents['County'] = accidents['County_State']
    # accidents = accidents.drop(['City_State', 'County_State'], axis=1)

    #-------------------------------------------------------#
    #-------------------------------------------------------#
    
    #WIND DIRECTIONS: 

    #Correcting Wind_Speed(mph) NaN to 0 because if there was wind it would have likely been recorded. This is run with Wind Direction due to Direction NaN processing requiring having Wind_Speed NaN set to 0 first. 
    accidents['Wind_Speed(mph)'] = accidents['Wind_Speed(mph)'].fillna(0.0)

    #Replacing wind directions to cardinal 8 directions
    accidents['Wind_Direction'].replace('CALM', 'Calm', inplace=True)
    accidents['Wind_Direction'].replace(['ENE', 'NNE'], 'NE', inplace=True)
    accidents['Wind_Direction'].replace(['ESE', 'SSE'], 'SE', inplace=True)
    accidents['Wind_Direction'].replace(['WNW', 'NNW'], 'NW', inplace=True)
    accidents['Wind_Direction'].replace(['WSW', 'SSW'], 'SW', inplace=True)
    accidents['Wind_Direction'].replace('North', 'N', inplace=True)
    accidents['Wind_Direction'].replace('East', 'E', inplace=True)
    accidents['Wind_Direction'].replace('South', 'S', inplace=True)
    accidents['Wind_Direction'].replace('West', 'W', inplace=True)
    accidents['Wind_Direction'].replace('VAR', 'Variable', inplace=True)

    #Changing unknown Wind_Direction with Wind_Speed(mi)>0 to "Variable"
    accidents['Wind_Direction'] = np.where((accidents['Wind_Direction'].isnull()) & (accidents['Wind_Speed(mph)'] > 0), 'Variable', accidents.Wind_Direction)

    #Changing Wind_Direction NaN with Wind_Speed(mi) of 0 to "Calm"
    accidents['Wind_Direction'] = np.where((accidents['Wind_Direction'].isnull()) & (accidents['Wind_Speed(mph)'] == 0), 'Calm', accidents.Wind_Direction)

    #-------------------------------------------------------#
    #-------------------------------------------------------#
 
    #WIND SPEED:

    #eliminating Wind_Speed outliers
    high_wind = accidents[accidents['Wind_Speed(mph)'] > 100]
    accidents = accidents.drop(high_wind.index)

    #Binning Wind_Speed(mph)
    bins = [-1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 101]

    #create labels for the bins
    group_labels = ["0-9", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80-89", "90-100",] 

    #slice the Humidity data and place into bins
    accidents['Wind_Speed(mph)'] = pd.cut(accidents['Wind_Speed(mph)'], bins, labels=group_labels)

    #casting Temperature as object for encoding purposes
    accidents['Wind_Speed(mph)'] = accidents['Wind_Speed(mph)'].astype(object)

    #-------------------------------------------------------#
    #-------------------------------------------------------#
 
    #HUMIDITY:

    #Chaning Humidity NaN to "0"
    accidents['Humidity(%)'] = accidents['Humidity(%)'].fillna('101')

    #change column to numerical
    accidents['Humidity(%)'] = accidents['Humidity(%)'].astype(int)

    #Binning Humidity
    bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 101]

    #create labels for the bins
    group_labels = ["0-9%", "10-19%", "20-29%", "30-39%", "40-49%", "50-59%", "60-69%", "70-79%", "80-89%", "90-100%", "Unknown"] 

    #slice the Humidity data and place into bins
    accidents['Humidity(%)'] = pd.cut(accidents['Humidity(%)'], bins, labels=group_labels)

    #casting Humidity as object for encoding purposes
    accidents['Humidity(%)'] = accidents['Humidity(%)'].astype(object)

    #-------------------------------------------------------#
    #-------------------------------------------------------#
    
    #DAY/NIGHT:

    #Dropping Day/Night NaN values - because daylight hours change and cannot be reliably determined by time and it is 110 values.
    noDayNight = accidents[accidents['Day/Night'].isnull()]
    accidents = accidents.drop(noDayNight.index)

    #-------------------------------------------------------#
    #-------------------------------------------------------#
 
    #ZIPCODE:

    #Dropping Zipcodes because of an extremely large number of variables and no extra value added over city, county, state.
    accidents.drop('Zipcode', axis=1, inplace=True)

    #-------------------------------------------------------#
    #-------------------------------------------------------#
 
    #PRECIPITATION: 

    #Correcting Precipitation(in) NaN to 0 because if there was precipitation it would have likely been recorded. 
    accidents['Precipitation(in)'] = accidents['Precipitation(in)'].fillna(0.00)

    #eliminating Precipitation(in) outliers
    high_rain = accidents[accidents['Precipitation(in)'] > 13]
    accidents = accidents.drop(high_rain.index)

    # Precipitation Binning
    bins = [-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

    #create labels for the bins
    group_labels = ["0-1", "1-2", "2-3", "3-4", "4-5", "5-6", "6-7", "7-8", "8-9", "9-10", "10-11"] 

    #slice the Humidity data and place into bins
    accidents['Precipitation(in)'] = pd.cut(accidents['Precipitation(in)'], bins, labels=group_labels)

    #casting Temperature as object for encoding purposes
    accidents['Precipitation(in)'] = accidents['Precipitation(in)'].astype(object)

    #-------------------------------------------------------#
    #-------------------------------------------------------#
   
    #TEMPERATURE:

    #Changing Temperature NaN to "Unknown"
    accidents['Temperature(F)'] = accidents['Temperature(F)'].fillna('500')

    #converting Temperature(F) to float
    accidents['Temperature(F)'] = accidents['Temperature(F)'].astype(float)

    #dropping values below -30F
    dropping = accidents[accidents['Temperature(F)'] < -30]
    accidents.drop(dropping.index, inplace=True)

    #dropping over 120F
    dropping2 =  accidents[(accidents['Temperature(F)'] > 120) & (accidents['Temperature(F)'] <500)]
    accidents.drop(dropping2.index, inplace=True)

    #Binning Temperature
    bins = [-30, -20, -10, 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 121, 500]
    #create labels for the bins
    group_labels = ["-30F:-20F", "-19F:-10F", "-10F:0F", "0F-9F", "10F:19F", "20F:29F", "30F:39F", "40F:49F", "50F:59F", "60F:69F", "70F:79F", "80F:89F", "90F:99F", "100F:109F", "110F:120F", "Unknown"] 

    #slice the Humidity data and place into bins
    accidents['Temperature(F)'] = pd.cut(accidents['Temperature(F)'], bins, labels=group_labels)

    #casting Temperature as object for encoding purposes
    accidents['Temperature(F)'] = accidents['Temperature(F)'].astype(object)

    #-------------------------------------------------------#
    #-------------------------------------------------------#
 
    #PRESSURE: 

    #Changing Pressure NaN to "Unknown"
    accidents['Pressure(in)'] = accidents['Pressure(in)'].fillna('500')

    #Casting Pressure as float to ensure dropping works.
    accidents['Pressure(in)'] = accidents['Pressure(in)'].astype(float) 

    #dropping Pressure(in) values over 32in and removing a faulty row
    dropping3 = accidents[(accidents['Pressure(in)'] >32) & (accidents['Pressure(in)'] <500)]
    accidents.drop(dropping3.index, inplace=True)
    accidents.drop(index=879551, inplace=True)

    #Binning Pressure(in)
    bins = [-1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 35, 500]

    #create labels for the bins
    group_labels = ["0-2", "2-4", "4-6", "6-8", "8-10", "10-12", "12-14", "14-16", "16-18", "18-20", "20-22", "22-24", "24-26", "26-28", "28-30", "30-32", "32-34", "Unknown"] 

    #slice the Humidity data and place into bins
    accidents['Pressure(in)'] = pd.cut(accidents['Pressure(in)'], bins, labels=group_labels)

    #casting Temperature as object for encoding purposes
    accidents['Pressure(in)'] = accidents['Pressure(in)'].astype(object) 

    #-------------------------------------------------------#
    #-------------------------------------------------------#
 
    #VISIBILITY:

    #Change Visibilty(mi) NaN values to the median of 10.0
    accidents['Visibility(mi)'] = accidents['Visibility(mi)'].fillna(10.00)

    #Change anything above 10mi to 10mi because that is considered max visibility and nothing beyond that matters to driving.
    accidents['Visibility(mi)'] = np.where((accidents['Visibility(mi)'] > 10) , 10, accidents['Visibility(mi)'])

    #Binning Visibility(mi)
    bins = [-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11]

    #create labels for the bins
    group_labels = ["0-1", "1-2", "2-3", "3-4", "4-5", "5-6", "6-7", "7-8", "8-9", "9-10"] 

    #slice the Humidity data and place into bins
    accidents['Visibility(mi)'] = pd.cut(accidents['Visibility(mi)'], bins, labels=group_labels)

    #casting Temperature as object for encoding purposes
    accidents['Visibility(mi)'] = accidents['Visibility(mi)'].astype(object)

    #-------------------------------------------------------#
    #-------------------------------------------------------#
 
    #WEATHER CONDITION:

    #Consolidating Variables

    #Clear
    accidents['Weather_Condition'].replace('Clear', 'Fair', inplace=True)
    accidents['Weather_Condition'].replace('N/A Precipitation', 'Fair', inplace=True)

    #Chaning Weather_Condition NaN to "Unknown"
    accidents['Weather_Condition'] = accidents['Weather_Condition'].fillna('Fair')

    #Cloudy
    accidents['Weather_Condition'].replace('Mostly Cloudy', 'Mostly_Cloudy', inplace=True)
    accidents['Weather_Condition'].replace('Partly Cloudy', 'Partly_Cloudy', inplace=True)
    accidents['Weather_Condition'].replace('Overcast', 'Mostly_Cloudy', inplace=True)
    accidents['Weather_Condition'].replace('Scattered Clouds', 'Partly_Cloudy', inplace=True)

    #Rain
    accidents['Weather_Condition'].replace('Light Rain', 'Light_Rain', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Rain', 'Heavy_Rain', inplace=True)
    accidents['Weather_Condition'].replace('Light Rain Shower', 'Light_Rain', inplace=True)
    accidents['Weather_Condition'].replace('Light Rain Showers', 'Light_Rain', inplace=True)
    accidents['Weather_Condition'].replace('Light Drizzle', 'Light_Rain', inplace=True)
    accidents['Weather_Condition'].replace('Light Freezing Drizzle', 'Light_Rain', inplace=True)
    accidents['Weather_Condition'].replace('Light Freezing Rain', 'Light_Rain', inplace=True)
    accidents['Weather_Condition'].replace('Drizzle', 'Light_Rain', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Drizzle', 'Rain', inplace=True)
    accidents['Weather_Condition'].replace('Showers in the Vicinity', 'Rain', inplace=True)
    accidents['Weather_Condition'].replace('Rain Showers', 'Rain', inplace=True)
    accidents['Weather_Condition'].replace('Rain Shower', 'Rain', inplace=True)
    accidents['Weather_Condition'].replace('Freezing Rain', 'Rain', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Rain Shower', 'Heavy_Rain', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Freezing Rain', 'Heavy_Rain', inplace=True)
    accidents['Weather_Condition'].replace('Freezing Drizzle', 'Rain', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Rain Showers', 'Heavy_Rain', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Freezing Drizzle', 'Rain', inplace=True)

    #Fog
    accidents['Weather_Condition'].replace('Patches of Fog', 'Fog', inplace=True)
    accidents['Weather_Condition'].replace('Light Freezing Fog', 'Fog', inplace=True)
    accidents['Weather_Condition'].replace('Partial Fog', 'Fog', inplace=True)
    accidents['Weather_Condition'].replace('Light Fog', 'Fog', inplace=True)
    accidents['Weather_Condition'].replace('Shallow Fog', 'Shallow_Fog', inplace=True)

    #Smoke/Haze
    accidents['Weather_Condition'].replace('Smoke', 'Smoke/Haze', inplace=True)
    accidents['Weather_Condition'].replace('Haze', 'Smoke/Haze', inplace=True)
    accidents['Weather_Condition'].replace('Light Haze', 'Smoke/Haze', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Smoke', 'Smoke/Haze', inplace=True)

    #Thunderstorms
    accidents['Weather_Condition'].replace('T-Storm', 'Thunderstorm', inplace=True)
    accidents['Weather_Condition'].replace('Light Thunderstorms and Rain', 'Thunderstorm', inplace=True)
    accidents['Weather_Condition'].replace('Thunder in the Vicinity', 'Thunder', inplace=True)
    accidents['Weather_Condition'].replace('Light Rain with Thunder', 'Thunderstorm', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Thunderstorms and Rain', 'Thunderstorm', inplace=True)
    accidents['Weather_Condition'].replace('Heavy T-Storm', 'Thunderstorm', inplace=True)
    accidents['Weather_Condition'].replace('Thunderstorms and Rain', 'Thunderstorm', inplace=True)
    accidents['Weather_Condition'].replace('Light Thunderstorms and Snow', 'Thunderstorms/Snow', inplace=True)
    accidents['Weather_Condition'].replace('Thunderstorms and Snow', 'Thunderstorms/Snow', inplace=True)
    accidents['Weather_Condition'].replace('Light Thunderstorm', 'Thunderstorm', inplace=True)

    #Snow
    accidents['Weather_Condition'].replace('Light Snow', 'Light_Snow', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Snow', 'Heavy_Snow', inplace=True)
    accidents['Weather_Condition'].replace('Light Snow Showers', 'Snow', inplace=True)
    accidents['Weather_Condition'].replace('Snow Grains', 'Snow', inplace=True)
    accidents['Weather_Condition'].replace('Light Blowing Snow', 'Snow/Windy', inplace=True)
    accidents['Weather_Condition'].replace('Light Snow Grains', 'Snow', inplace=True)
    accidents['Weather_Condition'].replace('Low Drifting Snow', 'Snow', inplace=True)
    accidents['Weather_Condition'].replace('Snow Showers', 'Snow', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Blowing Snow', 'Heavy Snow/Windy', inplace=True)
    accidents['Weather_Condition'].replace('Light Snow Shower', 'Snow', inplace=True)
    accidents['Weather_Condition'].replace('Drifting Snow', 'Snow/Windy', inplace=True)

    #Hail
    accidents['Weather_Condition'].replace('Light Ice Pellets', 'Hail', inplace=True)
    accidents['Weather_Condition'].replace('Ice Pellets', 'Hail', inplace=True)
    accidents['Weather_Condition'].replace('Small Hail', 'Hail', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Ice Pellets', 'Hail', inplace=True)
    accidents['Weather_Condition'].replace('Light Hail', 'Hail', inplace=True)

    #Sleet
    accidents['Weather_Condition'].replace('Light Sleet', 'Sleet', inplace=True)
    accidents['Weather_Condition'].replace('Heavy Sleet', 'Sleet', inplace=True)

    #Dust
    accidents['Weather_Condition'].replace('Sand', 'Dust', inplace=True)
    accidents['Weather_Condition'].replace('Widespread Dust', 'Dust', inplace=True)

    #Tornado
    accidents['Weather_Condition'].replace('Tornado', 'Funnel_Cloud', inplace=True) 
    accidents['Weather_Condition'].replace('Funnel Cloud', 'Funnel_Cloud', inplace=True)    

    #Dust Whirls
    accidents['Weather_Condition'].replace('Dust Whirls', 'Dust_Whirls', inplace=True)

    #Wintry Mix
    accidents['Weather_Condition'].replace('Wintry Mix', 'Wintry_Mix', inplace=True) 

    #Wind
    accidents = accidents.assign(Weather_Condition_Wind = 0)

    #Volcanic Ash
    accidents['Weather_Condition'].replace('Volcanic Ash', 'Volcanic_Ash', inplace=True)


    #-------------------------------------------------------#
    #-------------------------------------------------------#

    accidents = pd.get_dummies(accidents, columns = ['Weather_Condition'])

    for index, row in accidents.iterrows():
        if accidents.loc[index, 'Weather_Condition_Smoke / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Wind'] = 1
            accidents.loc[index, 'Weather_Condition_Smoke/Haze'] = 1
        
        elif accidents.loc[index, 'Weather_Condition_Blowing Dust'] == 1:
            accidents.loc[index, 'Weather_Condition_Dust'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Blowing Dust / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Dust'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Blowing Sand'] == 1:
            accidents.loc[index, 'Weather_Condition_Dust'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1
        
        elif accidents.loc[index, 'Weather_Condition_Blowing Snow'] == 1:
            accidents.loc[index, 'Weather_Condition_Snow'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Blowing Snow / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Snow'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Cloudy / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Cloudy'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Drizzle / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Light_Rain'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Drizzle and Fog'] == 1:
            accidents.loc[index, 'Weather_Condition_Light_Rain'] = 1
            accidents.loc[index, 'Weather_Condition_Fog'] = 1

        elif accidents.loc[index, 'Weather_Condition_Fair / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Fair'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Fog / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Fog'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Freezing Rain / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Wind'] = 1
            accidents.loc[index, 'Weather_Condition_Rain'] = 1

        elif accidents.loc[index, 'Weather_Condition_Haze / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Smoke/Haze'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1
        
        elif accidents.loc[index, 'Weather_Condition_Heavy Rain / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Heavy_Rain'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1
        
        elif accidents.loc[index, 'Weather_Condition_Heavy Snow / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Heavy_Snow'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1
                
        elif accidents.loc[index, 'Weather_Condition_Heavy Snow with Thunder'] == 1:
            accidents.loc[index, 'Weather_Condition_Heavy_Snow'] = 1
            accidents.loc[index, 'Weather_Condition_Thunder'] = 1

        elif accidents.loc[index, 'Weather_Condition_Heavy Snow/Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Heavy_Snow'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Heavy T-Storm / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Wind'] = 1
            accidents.loc[index, 'Weather_Condition_Thunderstorm'] = 1

        elif accidents.loc[index, 'Weather_Condition_Heavy Thunderstorms and Snow'] == 1:
            accidents.loc[index, 'Weather_Condition_Snow'] = 1
            accidents.loc[index, 'Weather_Condition_Thunderstorm'] = 1

        elif accidents.loc[index, 'Weather_Condition_Heavy Thunderstorms with Small Hail'] == 1:
            accidents.loc[index, 'Weather_Condition_Thunderstorm'] = 1
            accidents.loc[index, 'Weather_Condition_Hail'] = 1

        elif accidents.loc[index, 'Weather_Condition_Light Drizzle / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Light_Rain'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Light Freezing Rain / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Wind'] = 1
            accidents.loc[index, 'Weather_Condition_Light_Rain'] = 1
        
        elif accidents.loc[index, 'Weather_Condition_Light Rain / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Light_Rain'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1
        
        elif accidents.loc[index, 'Weather_Condition_Light Rain Shower / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Light_Rain'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1
        
        elif accidents.loc[index, 'Weather_Condition_Light Sleet / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Sleet'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1
                        
        elif accidents.loc[index, 'Weather_Condition_Light Snow / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Light_Snow'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1
                        
        elif accidents.loc[index, 'Weather_Condition_Light Snow and Sleet'] == 1:
            accidents.loc[index, 'Weather_Condition_Light_Snow'] = 1
            accidents.loc[index, 'Weather_Condition_Sleet'] = 1
                        
        elif accidents.loc[index, 'Weather_Condition_Light Snow and Sleet / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Light_Snow'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1
                        
        elif accidents.loc[index, 'Weather_Condition_Light Snow with Thunder'] == 1:
            accidents.loc[index, 'Weather_Condition_Light_Snow'] = 1
            accidents.loc[index, 'Weather_Condition_Thunder'] = 1

        elif accidents.loc[index, 'Weather_Condition_Mist / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Mist'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Mostly Cloudy / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Mostly_Cloudy'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Partly Cloudy / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Partly_Cloudy'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Patches of Fog / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Fog'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Rain / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Rain'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Rain and Sleet'] == 1:
            accidents.loc[index, 'Weather_Condition_Rain'] = 1
            accidents.loc[index, 'Weather_Condition_Sleet'] = 1

        elif accidents.loc[index, 'Weather_Condition_Sand / Dust Whirls Nearby'] == 1:
            accidents.loc[index, 'Weather_Condition_Sand'] = 1
            accidents.loc[index, 'Weather_Condition_Dust_Whirls'] = 1

        elif accidents.loc[index, 'Weather_Condition_Sand / Dust Whirlwinds'] == 1:
            accidents.loc[index, 'Weather_Condition_Sand'] = 1
            accidents.loc[index, 'Weather_Condition_Dust_Whirls'] = 1

        elif accidents.loc[index, 'Weather_Condition_Sleet / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Sleet'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Smoke / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Smoke/Haze'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Snow / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Snow'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1
                
        elif accidents.loc[index, 'Weather_Condition_Snow and Sleet'] == 1:
            accidents.loc[index, 'Weather_Condition_Snow'] = 1
            accidents.loc[index, 'Weather_Condition_Sleet'] = 1

        elif accidents.loc[index, 'Weather_Condition_Snow and Sleet / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Snow'] = 1
            accidents.loc[index, 'Weather_Condition_Sleet'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Snow/Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Snow'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Squalls / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Squalls'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_T-Storm / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Thunderstorm'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Thunder / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Thunder'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Thunder / Wintry Mix'] == 1:
            accidents.loc[index, 'Weather_Condition_Thunder'] = 1
            accidents.loc[index, 'Weather_Condition_Wintry_Mix'] = 1

        elif accidents.loc[index, 'Weather_Condition_Thunder / Wintry Mix / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Wintry_Mix'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Thunder and Hail'] == 1:
            accidents.loc[index, 'Weather_Condition_Thunder'] = 1
            accidents.loc[index, 'Weather_Condition_Hail'] = 1

        elif accidents.loc[index, 'Weather_Condition_Thunder and Hail / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Hail'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Thunderstorms/Snow'] == 1:
            accidents.loc[index, 'Weather_Condition_Thunderstorm'] = 1
            accidents.loc[index, 'Weather_Condition_Snow'] = 1

        elif accidents.loc[index, 'Weather_Condition_Widespread Dust / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Dust'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1

        elif accidents.loc[index, 'Weather_Condition_Wintry Mix / Windy'] == 1:
            accidents.loc[index, 'Weather_Condition_Wintry_Mix'] = 1
            accidents.loc[index, 'Weather_Condition_Wind'] = 1    


    #-------------------------------------------------------#
    #-------------------------------------------------------#
  
    #Set columns to drop 
    cols_to_drop = ['Weather_Condition_Smoke / Windy', 'Weather_Condition_Blowing Dust',   'Weather_Condition_Blowing Dust / Windy', 'Weather_Condition_Blowing Sand', 'Weather_Condition_Blowing Snow', 'Weather_Condition_Blowing Snow / Windy', 'Weather_Condition_Cloudy / Windy', 'Weather_Condition_Drizzle / Windy', 'Weather_Condition_Drizzle and Fog', 'Weather_Condition_Fair / Windy', 'Weather_Condition_Fog / Windy', 'Weather_Condition_Freezing Rain / Windy', 'Weather_Condition_Haze / Windy', 'Weather_Condition_Heavy Rain / Windy', 'Weather_Condition_Heavy Snow / Windy', 'Weather_Condition_Heavy Snow with Thunder', 'Weather_Condition_Heavy Snow/Windy', 'Weather_Condition_Heavy T-Storm / Windy', 'Weather_Condition_Heavy Thunderstorms and Snow', 'Weather_Condition_Heavy Thunderstorms with Small Hail', 'Weather_Condition_Light Drizzle / Windy', 'Weather_Condition_Light Freezing Rain / Windy', 'Weather_Condition_Light Rain / Windy', 'Weather_Condition_Light Rain Shower / Windy', 'Weather_Condition_Light Sleet / Windy', 'Weather_Condition_Light Snow / Windy', 'Weather_Condition_Light Snow and Sleet', 'Weather_Condition_Light Snow and Sleet / Windy', 'Weather_Condition_Light Snow with Thunder', 'Weather_Condition_Mist / Windy', 'Weather_Condition_Mostly Cloudy / Windy', 'Weather_Condition_Partly Cloudy / Windy', 'Weather_Condition_Patches of Fog / Windy', 'Weather_Condition_Rain and Sleet', 'Weather_Condition_Sand', 'Weather_Condition_Sand / Dust Whirls Nearby', 'Weather_Condition_Sand / Dust Whirlwinds', 'Weather_Condition_Sleet / Windy', 'Weather_Condition_Smoke / Windy', 'Weather_Condition_Snow / Windy', 'Weather_Condition_Snow and Sleet', 'Weather_Condition_Snow and Sleet / Windy', 'Weather_Condition_Snow/Windy', 'Weather_Condition_Squalls / Windy', 'Weather_Condition_T-Storm / Windy', 'Weather_Condition_Thunder / Windy', 'Weather_Condition_Thunder / Wintry Mix', 'Weather_Condition_Thunder / Wintry Mix / Windy', 'Weather_Condition_Thunder and Hail', 
'Weather_Condition_Thunder and Hail / Windy', 'Weather_Condition_Thunderstorms/Snow', 'Weather_Condition_Widespread Dust / Windy', 'Weather_Condition_Wintry Mix / Windy', 'Weather_Condition_Rain / Windy']

    # Drop the columns
    accidents = accidents.drop(columns = cols_to_drop, axis=1)  

    #-------------------------------------------------------#
    #-------------------------------------------------------#
   
    #BOOLEAN COLUMN CONVERSIONS:

    #Converting Bool columns to integers to prepare for scaling and ML modeling
    bool_cols = accidents.dtypes[accidents.dtypes == 'bool'].index.tolist()
    bool_int = accidents[bool_cols].astype(int)
    encoding = accidents.copy()
    encoding = accidents.drop(bool_int,1)

    #encoding = encoding.merge(bool_int, left_index=True, right_index=True)
    accidents = encoding.merge(bool_int, left_index=True, right_index=True)

    #-------------------------------------------------------#
    #-------------------------------------------------------#



    return(accidents)


In [None]:
# Run the ETL function on accidents_df and rebind `accidents` to the DataFrame it returns.
accidents = accidents_ETL(accidents_df)
# Bare expression as the last line of the cell: the notebook renders the resulting DataFrame below.
accidents

Unnamed: 0,Severity,Side,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),Day/Night,Year,Month,Day,Hour,Weather_Condition_Wind,Weather_Condition_Cloudy,Weather_Condition_Dust,Weather_Condition_Dust_Whirls,Weather_Condition_Fair,Weather_Condition_Fog,Weather_Condition_Funnel_Cloud,Weather_Condition_Hail,Weather_Condition_Heavy_Rain,Weather_Condition_Heavy_Snow,Weather_Condition_Light_Rain,Weather_Condition_Light_Snow,Weather_Condition_Mist,Weather_Condition_Mostly_Cloudy,Weather_Condition_Partly_Cloudy,Weather_Condition_Rain,Weather_Condition_Shallow_Fog,Weather_Condition_Sleet,Weather_Condition_Smoke/Haze,Weather_Condition_Snow,Weather_Condition_Squalls,Weather_Condition_Thunder,Weather_Condition_Thunderstorm,Weather_Condition_Volcanic_Ash,Weather_Condition_Wintry_Mix,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop
0,2,0,CA,60F:69F,60-69%,28-30,9-10,Variable,0-9,0-1,Day,2019,5,13,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,2,0,LA,50F:59F,80-89%,30-32,9-10,SE,0-9,0-1,Night,2020,11,20,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,4,0,OH,60F:69F,70-79%,28-30,9-10,SW,0-9,0-1,Night,2019,6,26,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2,0,SC,30F:39F,80-89%,28-30,5-6,Calm,0-9,0-1,Day,2019,12,9,9,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,2,0,TN,70F:79F,20-29%,28-30,9-10,S,0-9,0-1,Day,2019,4,10,11,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2906605,2,1,PA,60F:69F,90-100%,30-32,7-8,Calm,0-9,0-1,Night,2017,8,2,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2906606,2,1,FL,80F:89F,60-69%,30-32,9-10,Variable,0-9,0-1,Day,2018,7,13,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2906607,3,0,CA,60F:69F,60-69%,28-30,9-10,SW,0-9,0-1,Day,2018,4,2,17,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2906608,3,0,NY,80F:89F,70-79%,30-32,9-10,S,10-19,0-1,Night,2018,9,6,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Save the processed data before encoding, for RAM management purposes.
# Workflow: save here, reload the instance, and then import this file, so that the
# memory-heavy variables left over from the ETL process are cleared.

#Local to Local
# output_data_file = "Accident_Data_ETL_Not_Encoded.csv"
# accidents.to_csv(output_data_file, index=False)

#Colabs to Local
# from google.colab import files
# accidents.to_csv('Accident_Data_ETL_Not_Encoded.csv')
# files.download('Accident_Data_ETL_Not_Encoded.csv')

#Colabs to G Drive
# from google.colab import drive
# drive.mount('/content/drive')
# accidents.to_csv('/content/drive/MyDrive/ML_data_and_code/Accident_Data_ETL_Not_Encoded.csv', index=False)

For the purposes of our model, encoding was deferred until after the sample variables were chosen, in order to improve processing time. The encoding code is left here, commented out, in case someone would like to encode the entire 2.9M rows.

In [None]:
# Create a sampled subset of the 2.9M rows of data, for ease of testing the code and proving the functionality of the ML model.
# sample_size = 100000
# accidents = accidents.sample(n = sample_size, replace=True, random_state=1)
# accidents

In [None]:
    #Encoding Obj columns using OneHotEncoder

    # #Identifying the 'object' columns to encode
    # type_cat = accidents.dtypes[accidents.dtypes == 'object'].index.tolist()
    # type_cat

    # #Create a OneHotEncoder instance
    # enc = OneHotEncoder(sparse=False)

    # # Fit and transform the OneHotEncoder using the categorical variable list
    # encode_df = pd.DataFrame(enc.fit_transform(accidents[type_cat].astype(str)))

    # # Add the encoded variable names to the dataframe
    # encode_df.columns = enc.get_feature_names(type_cat)
    # encode_df.head()

    # #Merge one-hot encoded features and drop the originals
    # accidents = accidents.merge(encode_df, left_index=True, right_index=True)
    # accidents = accidents.drop(type_cat,1)

In [None]:
#saving the encoded data for ML use. 
#Colabs to G Drive
# from google.colab import drive
# drive.mount('/content/drive')
# accidents.to_csv('/content/drive/MyDrive/ML_data_and_code/Accident_Data_ETL_FULL_Encoded.csv', index=False)