In [1]:
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}
new_states = {v:k for k,v in us_state_abbrev.items()}

In [2]:
len(new_states.items())

56

In [3]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Import Accident Dataset (1GB)

In [4]:
accidents = pd.read_csv('resources/US_Accidents_Dec20_Updated.csv')
dropthese = ['ID', 'End_Lat', 'End_Lng', 'End_Time', 'Description']
accidents = accidents.drop(columns = dropthese, axis = 1)
accidents['Start_Time'] = pd.to_datetime(accidents['Start_Time'])
accidents.head()

Unnamed: 0,Severity,Start_Time,Start_Lat,Start_Lng,Distance(mi),Number,Street,Side,City,County,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,2,2019-05-21 08:29:55,34.809,-82.269,0.0,439.0,Tanner Rd,R,Greenville,Greenville,...,False,False,False,False,False,False,Day,Day,Day,Day
1,2,2019-10-07 17:43:09,35.09,-80.746,0.0,3299.0,Providence Branch Ln,R,Charlotte,Mecklenburg,...,False,False,False,False,False,False,Day,Day,Day,Day
2,2,2020-12-13 21:53:00,37.146,-121.985,1.4,,Santa Cruz Hwy,R,Los Gatos,Santa Clara,...,False,False,False,False,False,False,Night,Night,Night,Night
3,2,2018-04-17 16:51:23,39.11,-119.774,0.0,,US Highway 395 S,R,Carson City,Douglas,...,False,False,False,False,True,False,Day,Day,Day,Day
4,3,2016-08-31 17:40:49,26.103,-80.265,0.0,,I-595 W,R,Fort Lauderdale,Broward,...,False,False,False,False,True,False,Day,Day,Day,Day


In [5]:
len(accidents.index)

2906610

# Function to make Datasets of Accidents by Year (Joined with Vehicle Registration Data of the same year)

In [6]:
def make_df_by_year(year):
    
    registrations = pd.read_excel(f'resources/{year}_vehicle_reg.xlsx', header = 7)
    registrations.columns = [str(i) for i in range(len(registrations.columns))]
    registrations = registrations.set_index('0')

    col1 = ['Automobiles','Buses','Trucks','Motorcycles','All Vehicles']
    col2 = ['PRIVATE/COMMERCIAL','PUBLICALLY_OWNED','TOTAL']
    col_tier = []
    for i in col1:
        for j in col2:
            col_tier.append((i,j))
    registrations.columns = pd.MultiIndex.from_tuples(col_tier)
    starting_point = registrations.index.to_list().index('Alabama')
    registrations = registrations[starting_point:].dropna()

    for i in registrations.columns:
        registrations[i] = pd.to_numeric(registrations[i])
    
    
    year_start = f"{year}-01-01"
    year_end = f"{year+1}-01-01"
    accidents_output = accidents[(accidents['Start_Time'] > year_start) & (accidents['Start_Time'] < year_end)]
    
    transform = accidents_output.groupby(['State','Severity']).count()['Start_Time'].reset_index().rename(columns = {'Start_Time':'Count'})

    count_crashes = accidents_output[['State','Severity']].groupby(['State']).count()
    count_crashes = count_crashes.reset_index()
    count_crashes['State'] = count_crashes['State'].apply(lambda x: new_states[x])
    
    #-- Make Severity
    severity_1 = []
    severity_2 = []
    severity_3 = []
    severity_4 = []

    states = list(transform['State'].unique())
    for i in states:
        state_only = transform[transform['State'] == i]
        sev_1 = state_only[state_only['Severity'] == 1]
        sev_2 = state_only[state_only['Severity'] == 2]
        sev_3 = state_only[state_only['Severity'] == 3]
        sev_4 = state_only[state_only['Severity'] == 4]

        if len(sev_1.index) > 0:
            severity_1.append(int(sev_1['Count']))
        else:
            severity_1.append(0)

        if len(sev_2.index) > 0:
            severity_2.append(int(sev_2['Count']))
        else:
            severity_2.append(0)

        if len(sev_3.index) > 0:
            severity_3.append(int(sev_3['Count']))
        else:
            severity_3.append(0)

        if len(sev_4.index) > 0:
            severity_4.append(int(sev_4['Count']))
        else:
            severity_4.append(0)
            
    state_by_severity = pd.DataFrame({'State': list(transform['State'].unique()),
             'Severity_1': severity_1,
             'Severity_2': severity_2,
             'Severity_3': severity_3,
             'Severity_4': severity_4})
    
    state_by_severity['State'] = state_by_severity['State'].apply(lambda x: new_states[x])
    count_crashes.columns = ['State','Crash Count']
    
    check = registrations['All Vehicles']['TOTAL'].to_dict()
    new_keys = []
    for k,v in check.items():
        if k.endswith(' '):
            new_keys.append(k[:-1])
        elif k.endswith(' (2)'):
            new_keys.append(k.replace(' (2)',''))
        else:
            new_keys.append(k)

    new_out = {}
    for i,j in enumerate(check.values()):
        new_out[new_keys[i]] = j
    new_out['District of Columbia'] = new_out.pop('Dist. of Col.')

    new_out.pop('Total')
    
    new_out = {
        'State': list(new_out.keys()),
        'Registered_Vehicles': list(new_out.values())
    }
    
    vehicle_reg = pd.DataFrame(new_out)
    crash_reg = count_crashes.merge(vehicle_reg, how = 'left', on = 'State').merge(state_by_severity, how = 'left', on ='State')
    crash_reg['Crash_Index'] = crash_reg['Crash Count']/ crash_reg['Registered_Vehicles'] * 10000
    return crash_reg

# Make DFs with the above function

In [7]:
seventeen = make_df_by_year(2017)
eighteen = make_df_by_year(2018)
nineteen = make_df_by_year(2019)

In [8]:
seventeen.head()

Unnamed: 0,State,Crash Count,Registered_Vehicles,Severity_1,Severity_2,Severity_3,Severity_4,Crash_Index
0,Alabama,1813,5056073.0,3,837,842,131,3.586
1,Arkansas,95,2833697.0,0,29,13,53,0.335
2,Arizona,7976,5964434.074,4,5706,1417,849,13.373
3,California,95651,30795141.0,39,62232,31791,1589,31.06
4,Colorado,4702,5259960.0,1,2848,1162,691,8.939


In [9]:
eighteen.head()

Unnamed: 0,State,Crash Count,Registered_Vehicles,Severity_1,Severity_2,Severity_3,Severity_4,Crash_Index
0,Alabama,7544,5300199.154,7,4529,2857,151,14.233
1,Arkansas,491,2817145.296,0,201,98,192,1.743
2,Arizona,13138,5806312.675,2,9479,2803,854,22.627
3,California,98862,31022327.877,23,63912,33351,1576,31.868
4,Colorado,8981,5356017.862,1,4842,3344,794,16.768


In [10]:
nineteen.head()

Unnamed: 0,State,Crash Count,Registered_Vehicles,Severity_1,Severity_2,Severity_3,Severity_4,Crash_Index
0,Alabama,10148,5288208.452,2,7525,2471,150,19.19
1,Arkansas,925,2902110.817,0,516,227,182,3.187
2,Arizona,13633,5982559.286,3,10400,2379,851,22.788
3,California,168526,31247270.046,9,140611,26024,1882,53.933
4,Colorado,11920,5412403.746,1,4254,6455,1210,22.023


In [11]:
avg_registered_vehicles = (seventeen['Registered_Vehicles'] + eighteen['Registered_Vehicles'] + nineteen['Registered_Vehicles']) / 3 

In [12]:
for i in list(seventeen.index):
    if i not in list(eighteen.index):
        print(f'not in eighteen: {i}')
    if i not in list(nineteen.index):
        print(f'not in nineteen: {i}')

# Combining Crash Indexes into one output

In [13]:
index_by_state = pd.DataFrame({'State':seventeen['State'].to_list(),
            '2017':seventeen['Crash_Index'].to_list(),
            '2018':eighteen['Crash_Index'].to_list(),
            '2019':nineteen['Crash_Index'].to_list(),
            'Registered_Vehicles_Avg':avg_registered_vehicles})
index_by_state.head()

Unnamed: 0,State,2017,2018,2019,Registered_Vehicles_Avg
0,Alabama,3.586,14.233,19.19,5214826.869
1,Arkansas,0.335,1.743,3.187,2850984.371
2,Arizona,13.373,22.627,22.788,5917768.678
3,California,31.06,31.868,53.933,31021579.641
4,Colorado,8.939,16.768,22.023,5342793.869


# Average Crash Index as a single Y value

In [14]:
index_by_state['Crash_Index_By_Registration'] = (index_by_state['2017'] + index_by_state['2018'] + index_by_state['2019']) / 3
index_by_state = index_by_state.drop(columns = ['2017','2018', '2019'])
index_by_state

Unnamed: 0,State,Registered_Vehicles_Avg,Crash_Index_By_Registration
0,Alabama,5214826.869,12.336
1,Arkansas,2850984.371,1.755
2,Arizona,5917768.678,19.596
3,California,31021579.641,38.954
4,Colorado,5342793.869,15.91
5,Connecticut,2861566.497,14.507
6,District of Columbia,349470.603,21.075
7,Delaware,998709.332,8.559
8,Florida,17429663.509,21.864
9,Georgia,8516480.615,17.148


# Importing Population Data

In [15]:
pop_data = pd.read_csv('resources/state_population_data_july2020.csv').drop(columns = ['SUMLEV', 'CENSUS2010POP', 'STATE','REGION','DIVISION'], axis = 1)
pop_data.head()

Unnamed: 0,NAME,SEX,ORIGIN,RACE,AGE,POPESTIMATE2019
0,Alabama,0,0,1,0,35594
1,Alabama,0,0,1,1,36679
2,Alabama,0,0,1,2,37904
3,Alabama,0,0,1,3,38510
4,Alabama,0,0,1,4,38476


# We only care about people over 18

## Please note: Sex "0" is total of Sex "1" and Sex "2", we do not need Sex "0"

We also have no crash data for Hawaii or Alaska, they will be dropped when the left merge happens

In [16]:
print(f"Length of Raw State Population Dataset: {len(pop_data.index)}")

Length of Raw State Population Dataset: 236844


In [17]:
pop_data = pop_data[(pop_data['AGE'] > 17) & (pop_data['SEX'] > 0)].rename(columns = {'NAME':'State'})
pop_data.head()

Unnamed: 0,State,SEX,ORIGIN,RACE,AGE,POPESTIMATE2019
1566,Alabama,1,0,1,18,20741
1567,Alabama,1,0,1,19,20398
1568,Alabama,1,0,1,20,20506
1569,Alabama,1,0,1,21,20750
1570,Alabama,1,0,1,22,20968


In [18]:
print(f"Length of Filtered State Population Dataset: {len(pop_data.index)}")

Length of Filtered State Population Dataset: 124848


# Total Pop per state by Sex

In [19]:
sex_by_state = pop_data.groupby(['State','SEX']).sum()['POPESTIMATE2019'].reset_index()
sex1 = sex_by_state[sex_by_state['SEX'] == 1][['State','POPESTIMATE2019']].rename(columns = {'POPESTIMATE2019':'Sex_1_Pop'})
sex2 = sex_by_state[sex_by_state['SEX'] == 2][['State','POPESTIMATE2019']].rename(columns = {'POPESTIMATE2019':'Sex_2_Pop'})
sexes = sex1.merge(sex2, how = 'left', on = 'State')
sexes.head()

Unnamed: 0,State,Sex_1_Pop,Sex_2_Pop
0,Alabama,3630624,3999134
1,Alaska,578074,525050
2,Arizona,5561612,5715350
3,Arkansas,2245960,2389338
4,California,30198162,31037002


# Total Pop per state by Race

In [20]:
race_by_state = pop_data.groupby(['RACE','State']).sum()['POPESTIMATE2019'].reset_index()

# making an output dataframe
out = {'State':race_by_state[race_by_state['RACE'] == 1]['State']}
for i in range(1, max(race_by_state['RACE'].unique()) + 1):
    title = f"Race_{i}_Pop"
    out[title] = race_by_state[race_by_state['RACE'] == i]['POPESTIMATE2019'].to_list()

race_by_state = pd.DataFrame(out)
race_by_state.head()

Unnamed: 0,State,Race_1_Pop,Race_2_Pop,Race_3_Pop,Race_4_Pop,Race_5_Pop,Race_6_Pop
0,Alabama,5372510,1988302,54772,114574,7414,92186
1,Alaska,758470,41374,154106,74990,14218,59966
2,Arizona,9481818,555130,535568,432444,30594,241408
3,Arkansas,3733716,692410,46008,77192,15056,70916
4,California,44041180,3994944,985052,10016564,305326,1892098


# Total Pop per state by Origin

In [21]:
origin_by_state = pop_data.groupby(['ORIGIN','State']).sum()['POPESTIMATE2019'].reset_index()

# making an output dataframe
out = {'State':origin_by_state[origin_by_state['ORIGIN'] == 1]['State']}
for i in range(0, max(origin_by_state['ORIGIN'].unique()) + 1):
    title = f"Origin_{i}_Pop"
    out[title] = origin_by_state[origin_by_state['ORIGIN'] == i]['POPESTIMATE2019'].to_list()

origin_by_state = pd.DataFrame(out)
origin_by_state.head()

Unnamed: 0,State,Origin_0_Pop,Origin_1_Pop,Origin_2_Pop
51,Alabama,3814879,3680839,134040
52,Alaska,551562,516211,35351
53,Arizona,5638481,4057303,1581178
54,Arkansas,2317649,2169720,147929
55,California,30617582,19670255,10947327


# Merging Population Data

In [22]:
index_by_state = index_by_state.merge(sexes, how = 'left', on = 'State').merge(race_by_state, how = 'left', on = 'State').merge(origin_by_state, how = 'left', on = 'State')
index_by_state

Unnamed: 0,State,Registered_Vehicles_Avg,Crash_Index_By_Registration,Sex_1_Pop,Sex_2_Pop,Race_1_Pop,Race_2_Pop,Race_3_Pop,Race_4_Pop,Race_5_Pop,Race_6_Pop,Origin_0_Pop,Origin_1_Pop,Origin_2_Pop
0,Alabama,5214826.869,12.336,3630624,3999134,5372510,1988302,54772,114574,7414,92186,3814879,3680839,134040
1,Arkansas,2850984.371,1.755,2245960,2389338,3733716,692410,46008,77192,15056,70916,2317649,2169720,147929
2,Arizona,5917768.678,19.596,5561612,5715350,9481818,555130,535568,432444,30594,241408,5638481,4057303,1581178
3,California,31021579.641,38.954,30198162,31037002,44041180,3994944,985052,10016564,305326,1892098,30617582,19670255,10947327
4,Colorado,5342793.869,15.91,4515428,4483006,7917570,396642,135742,320454,16828,211198,4499217,3638458,860759
5,Connecticut,2861566.497,14.507,2736330,2939364,4605256,660040,30176,275964,5972,98286,2837847,2420952,416895
6,District of Columbia,349470.603,21.075,539572,615590,557770,507098,6224,55950,1318,26802,577581,519959,57622
7,Delaware,998709.332,8.559,735386,804998,1098566,339728,9884,62892,1522,27792,770192,710390,59802
8,Florida,17429663.509,21.864,16675320,17820296,27301336,5406948,172132,1033870,37850,543480,17247808,12941652,4306156
9,Georgia,8516480.615,17.148,7772816,8454268,9998722,5162238,80770,720994,17238,247122,8113542,7440439,673103


# Making a Total_Population Column

In [23]:
population_columns = [i for i in index_by_state.columns if i.endswith('Pop')]

total_pop = [0 for i in range(len(index_by_state.index))]
for i in population_columns:
    total_pop += index_by_state[i]
    
index_by_state['Total_2019_Pop'] = total_pop

# Importing Calculated Road Distance Dataset

In [24]:
road_dist = pd.read_csv('road_distance_by_state.csv').rename(columns = {'state_name':'State'})
road_dist.head()

Unnamed: 0,State,sum_interstate_distance,sum_highway_distance,sum_other_distance
0,Alabama,36.236,84.315,2055.136
1,Arizona,39.706,37.897,1519.798
2,Arkansas,24.132,86.259,2121.401
3,California,81.545,46.537,4664.485
4,Colorado,34.758,105.26,2022.606


In [25]:
index_by_state = index_by_state.merge(road_dist, how = 'left', on = 'State')
index_by_state

Unnamed: 0,State,Registered_Vehicles_Avg,Crash_Index_By_Registration,Sex_1_Pop,Sex_2_Pop,Race_1_Pop,Race_2_Pop,Race_3_Pop,Race_4_Pop,Race_5_Pop,Race_6_Pop,Origin_0_Pop,Origin_1_Pop,Origin_2_Pop,Total_2019_Pop,sum_interstate_distance,sum_highway_distance,sum_other_distance
0,Alabama,5214826.869,12.336,3630624,3999134,5372510,1988302,54772,114574,7414,92186,3814879,3680839,134040,22889274,36.236,84.315,2055.136
1,Arkansas,2850984.371,1.755,2245960,2389338,3733716,692410,46008,77192,15056,70916,2317649,2169720,147929,13905894,24.132,86.259,2121.401
2,Arizona,5917768.678,19.596,5561612,5715350,9481818,555130,535568,432444,30594,241408,5638481,4057303,1581178,33830886,39.706,37.897,1519.798
3,California,31021579.641,38.954,30198162,31037002,44041180,3994944,985052,10016564,305326,1892098,30617582,19670255,10947327,183705492,81.545,46.537,4664.485
4,Colorado,5342793.869,15.91,4515428,4483006,7917570,396642,135742,320454,16828,211198,4499217,3638458,860759,26995302,34.758,105.26,2022.606
5,Connecticut,2861566.497,14.507,2736330,2939364,4605256,660040,30176,275964,5972,98286,2837847,2420952,416895,17027082,11.91,11.267,471.569
6,District of Columbia,349470.603,21.075,539572,615590,557770,507098,6224,55950,1318,26802,577581,519959,57622,3465486,0.396,0.822,20.553
7,Delaware,998709.332,8.559,735386,804998,1098566,339728,9884,62892,1522,27792,770192,710390,59802,4621152,1.39,7.388,207.438
8,Florida,17429663.509,21.864,16675320,17820296,27301336,5406948,172132,1033870,37850,543480,17247808,12941652,4306156,103486848,46.611,111.016,3012.123
9,Georgia,8516480.615,17.148,7772816,8454268,9998722,5162238,80770,720994,17238,247122,8113542,7440439,673103,48681252,39.59,124.397,2834.434


# Importing the Land Area Dataset made by the SrapingLandArea_byState.ipynb file

In [26]:
land_area = pd.read_csv('land_area_by_state.csv')
land_area.head()

Unnamed: 0,State,Total_SqMi,Total_SqKm,LandArea_SqMi,LandArea_SqKm
0,Alabama,52420,135767,50645,131171
1,Alaska,665384,1723337,570641,1477953
2,Arizona,113990,295234,113594,294207
3,Arkansas,53179,137732,52035,134771
4,California,163695,423967,155779,403466


# We only care about the LandArea_SqMi Column

In [27]:
land_area = land_area[['State','LandArea_SqMi']]
land_area.head()

Unnamed: 0,State,LandArea_SqMi
0,Alabama,50645
1,Alaska,570641
2,Arizona,113594
3,Arkansas,52035
4,California,155779


In [28]:
index_by_state = index_by_state.merge(land_area, how = 'left',on = 'State')
index_by_state

Unnamed: 0,State,Registered_Vehicles_Avg,Crash_Index_By_Registration,Sex_1_Pop,Sex_2_Pop,Race_1_Pop,Race_2_Pop,Race_3_Pop,Race_4_Pop,Race_5_Pop,Race_6_Pop,Origin_0_Pop,Origin_1_Pop,Origin_2_Pop,Total_2019_Pop,sum_interstate_distance,sum_highway_distance,sum_other_distance,LandArea_SqMi
0,Alabama,5214826.869,12.336,3630624,3999134,5372510,1988302,54772,114574,7414,92186,3814879,3680839,134040,22889274,36.236,84.315,2055.136,50645
1,Arkansas,2850984.371,1.755,2245960,2389338,3733716,692410,46008,77192,15056,70916,2317649,2169720,147929,13905894,24.132,86.259,2121.401,52035
2,Arizona,5917768.678,19.596,5561612,5715350,9481818,555130,535568,432444,30594,241408,5638481,4057303,1581178,33830886,39.706,37.897,1519.798,113594
3,California,31021579.641,38.954,30198162,31037002,44041180,3994944,985052,10016564,305326,1892098,30617582,19670255,10947327,183705492,81.545,46.537,4664.485,155779
4,Colorado,5342793.869,15.91,4515428,4483006,7917570,396642,135742,320454,16828,211198,4499217,3638458,860759,26995302,34.758,105.26,2022.606,103642
5,Connecticut,2861566.497,14.507,2736330,2939364,4605256,660040,30176,275964,5972,98286,2837847,2420952,416895,17027082,11.91,11.267,471.569,4842
6,District of Columbia,349470.603,21.075,539572,615590,557770,507098,6224,55950,1318,26802,577581,519959,57622,3465486,0.396,0.822,20.553,61
7,Delaware,998709.332,8.559,735386,804998,1098566,339728,9884,62892,1522,27792,770192,710390,59802,4621152,1.39,7.388,207.438,1949
8,Florida,17429663.509,21.864,16675320,17820296,27301336,5406948,172132,1033870,37850,543480,17247808,12941652,4306156,103486848,46.611,111.016,3012.123,53625
9,Georgia,8516480.615,17.148,7772816,8454268,9998722,5162238,80770,720994,17238,247122,8113542,7440439,673103,48681252,39.59,124.397,2834.434,57513


# Blue Laws
Source: https://worldpopulationreview.com/state-rankings/blue-laws-by-state
<br>
Binning by whether or not a state has restrictive measures against alcohol
<br>
### 0 = No Blue Laws, 1 = Some, 2 = Statewide

In [29]:
blue_df = pd.read_csv('resources/blue_laws.csv')
blue_df.blueLaws.value_counts()

No blue laws                       20
No vehicle sales                   11
Varies by County                   10
No hard liquor sales                5
No off-premise sales                3
No vehicle or hard liquor sales     1
Name: blueLaws, dtype: int64

In [30]:
blue_df

Unnamed: 0,State,blueLaws
0,Oklahoma,Varies by County
1,New Mexico,Varies by County
2,Mississippi,Varies by County
3,Maryland,Varies by County
4,Louisiana,Varies by County
5,Kentucky,Varies by County
6,Georgia,Varies by County
7,Florida,Varies by County
8,Arkansas,Varies by County
9,Alabama,Varies by County


In [31]:
zeros = ['No blue laws',
        'No vehicle sales']
blue_df['blueLaws'] = blue_df['blueLaws'].apply(lambda x: 0 if x in zeros else 1 if x == 'Varies by County' else 2)
blue_df = blue_df.sort_values('State')
blue_df

Unnamed: 0,State,blueLaws
9,Alabama,1
49,Alaska,0
48,Arizona,0
8,Arkansas,1
47,California,0
20,Colorado,0
46,Connecticut,0
45,Delaware,0
7,Florida,1
6,Georgia,1


In [32]:
index_by_state = index_by_state.merge(blue_df, how = 'left', on = 'State')
index_by_state['blueLaws'] = index_by_state['blueLaws'].fillna(0).apply(lambda x: int(x))
index_by_state

Unnamed: 0,State,Registered_Vehicles_Avg,Crash_Index_By_Registration,Sex_1_Pop,Sex_2_Pop,Race_1_Pop,Race_2_Pop,Race_3_Pop,Race_4_Pop,Race_5_Pop,Race_6_Pop,Origin_0_Pop,Origin_1_Pop,Origin_2_Pop,Total_2019_Pop,sum_interstate_distance,sum_highway_distance,sum_other_distance,LandArea_SqMi,blueLaws
0,Alabama,5214826.869,12.336,3630624,3999134,5372510,1988302,54772,114574,7414,92186,3814879,3680839,134040,22889274,36.236,84.315,2055.136,50645,1
1,Arkansas,2850984.371,1.755,2245960,2389338,3733716,692410,46008,77192,15056,70916,2317649,2169720,147929,13905894,24.132,86.259,2121.401,52035,1
2,Arizona,5917768.678,19.596,5561612,5715350,9481818,555130,535568,432444,30594,241408,5638481,4057303,1581178,33830886,39.706,37.897,1519.798,113594,0
3,California,31021579.641,38.954,30198162,31037002,44041180,3994944,985052,10016564,305326,1892098,30617582,19670255,10947327,183705492,81.545,46.537,4664.485,155779,0
4,Colorado,5342793.869,15.91,4515428,4483006,7917570,396642,135742,320454,16828,211198,4499217,3638458,860759,26995302,34.758,105.26,2022.606,103642,0
5,Connecticut,2861566.497,14.507,2736330,2939364,4605256,660040,30176,275964,5972,98286,2837847,2420952,416895,17027082,11.91,11.267,471.569,4842,0
6,District of Columbia,349470.603,21.075,539572,615590,557770,507098,6224,55950,1318,26802,577581,519959,57622,3465486,0.396,0.822,20.553,61,0
7,Delaware,998709.332,8.559,735386,804998,1098566,339728,9884,62892,1522,27792,770192,710390,59802,4621152,1.39,7.388,207.438,1949,0
8,Florida,17429663.509,21.864,16675320,17820296,27301336,5406948,172132,1033870,37850,543480,17247808,12941652,4306156,103486848,46.611,111.016,3012.123,53625,1
9,Georgia,8516480.615,17.148,7772816,8454268,9998722,5162238,80770,720994,17238,247122,8113542,7440439,673103,48681252,39.59,124.397,2834.434,57513,1
