In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
# County GEOID lookup table. Every column is read as text (dtype=str), and any
# row containing a missing value is discarded before the table is used.
COUNTY_GEOID_CSV = pd.read_csv('../data_files/COUNTY_GeoId.csv', dtype=str).dropna()
COUNTY_GEOID_CSV

Unnamed: 0,geoid,county
0,01001,"Autauga County, Alabama"
1,01003,"Baldwin County, Alabama"
2,01005,"Barbour County, Alabama"
3,01007,"Bibb County, Alabama"
4,01009,"Blount County, Alabama"
...,...,...
3137,56037,"Sweetwater County, Wyoming"
3138,56039,"Teton County, Wyoming"
3139,56041,"Uinta County, Wyoming"
3140,56043,"Washakie County, Wyoming"


In [5]:
# County demographics table, read with pandas' default dtype inference.
COUNTY_DEMOGRACHICS_CSV = pd.read_csv(filepath_or_buffer='../data_files/COUNTY_demographics.csv')
# Shape of the array of distinct values found in the `county` column.
COUNTY_DEMOGRACHICS_CSV.loc[:, 'county'].unique().shape

(3222,)

In [6]:
# County labor-force table, read with pandas' default dtype inference.
COUNTY_LABOR_CSV = pd.read_csv(filepath_or_buffer='../data_files/COUNTY_Labor.csv')
# Preview the first five rows (head() defaults to n=5).
COUNTY_LABOR_CSV.head()

Unnamed: 0,county,labor_force,employed,unemployed,unemployed_pct
0,"Autauga County, AL",25838,24576,1262,4.9
1,"Baldwin County, AL",96763,91338,5425,5.6
2,"Barbour County, AL",8587,7982,605,7.0
3,"Bibb County, AL",8640,8067,573,6.6
4,"Blount County, AL",24661,23653,1008,4.1


In [7]:
# Lookup table: two-letter postal abbreviation -> full state/territory name.
# The labor data carries abbreviations, while the tables it is joined with use
# the full name. Entries keep the order states, DC, then territories.
states = {
    "AL": "Alabama",
    "AK": "Alaska",
    "AZ": "Arizona",
    "AR": "Arkansas",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DE": "Delaware",
    "FL": "Florida",
    "GA": "Georgia",
    "HI": "Hawaii",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "IA": "Iowa",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "ME": "Maine",
    "MD": "Maryland",
    "MA": "Massachusetts",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MS": "Mississippi",
    "MO": "Missouri",
    "MT": "Montana",
    "NE": "Nebraska",
    "NV": "Nevada",
    "NH": "New Hampshire",
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NY": "New York",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VT": "Vermont",
    "VA": "Virginia",
    "WA": "Washington",
    "WV": "West Virginia",
    "WI": "Wisconsin",
    "WY": "Wyoming",
    "DC": "District of Columbia",
    "AS": "American Samoa",
    "GU": "Guam",
    "MP": "Northern Mariana Islands",
    "PR": "Puerto Rico",
    "UM": "United States Minor Outlying Islands",
    "VI": "U.S. Virgin Islands",
}


In [8]:
def _county_with_state_name(county):
    """Rewrite 'Name, ST' as 'Name, Full State Name'.

    Parameters
    ----------
    county : object
        One value from the labor table's `county` column.

    Returns
    -------
    str
        The part before the first ', ' followed by ', ' and the full name
        looked up in `states` for the second ', '-separated field. Returns ''
        when `county` is not a string (e.g. NaN), has no ', ' separator, or the
        second field is not a key of `states`.
    """
    if not isinstance(county, str):
        return ""
    parts = county.split(', ')
    if len(parts) < 2 or parts[1] not in states:
        return ""
    return parts[0] + ", " + states[parts[1]]


# Build the column in a single assignment instead of writing to the rows
# yielded by iterrows(): pandas does not guarantee that such writes reach the
# DataFrame (the yielded row may be a copy, depending on dtypes).
COUNTY_LABOR_CSV['abbr'] = COUNTY_LABOR_CSV['county'].map(_county_with_state_name)

# Report every row that could not be converted so it can be checked by hand.
for _, row in COUNTY_LABOR_CSV[COUNTY_LABOR_CSV['abbr'] == ""].iterrows():
    print('nope', row)

nope county            District of Columbia
labor_force                     409734
employed                        376839
unemployed                       32895
unemployed_pct                8.0     
abbr                                  
Name: 319, dtype: object
nope county            NaN
labor_force       NaN
employed          NaN
unemployed        NaN
unemployed_pct    NaN
abbr                 
Name: 3219, dtype: object


In [9]:
# Overwrite the original county labels with the full-state-name versions held
# in the helper column `abbr`.
COUNTY_LABOR_CSV['county'] = COUNTY_LABOR_CSV.loc[:, 'abbr']

In [10]:
# The helper column has been copied into `county`; remove it from the frame
# in place.
del COUNTY_LABOR_CSV['abbr']

In [11]:
# Display the labor table after `county` was replaced with the converted
# labels and the helper column was dropped.
COUNTY_LABOR_CSV

Unnamed: 0,county,labor_force,employed,unemployed,unemployed_pct
0,"Autauga County, Alabama",25838,24576,1262,4.9
1,"Baldwin County, Alabama",96763,91338,5425,5.6
2,"Barbour County, Alabama",8587,7982,605,7.0
3,"Bibb County, Alabama",8640,8067,573,6.6
4,"Blount County, Alabama",24661,23653,1008,4.1
...,...,...,...,...,...
3215,"Vieques Municipio, Puerto Rico",N.A.,N.A.,N.A.,N.A.
3216,"Villalba Municipio, Puerto Rico",N.A.,N.A.,N.A.,N.A.
3217,"Yabucoa Municipio, Puerto Rico",N.A.,N.A.,N.A.,N.A.
3218,"Yauco Municipio, Puerto Rico",N.A.,N.A.,N.A.,N.A.


## Merge County Labor with County Demographics

In [12]:
# Inner join of the labor and demographics tables on the county label: only
# labels present in both tables are kept.
merged_df = COUNTY_LABOR_CSV.merge(COUNTY_DEMOGRACHICS_CSV, how='inner', on='county')

## Merge County GeoId with Merged_df

In [13]:
# Attach the GEOID to each row by inner-joining on the county label; rows whose
# label is absent from the GEOID table are dropped.
merged_df = merged_df.merge(COUNTY_GEOID_CSV, how='inner', on='county')

In [14]:
# Drop rows where labor_force is null
merged_df = merged_df[pd.to_numeric(merged_df['labor_force'], errors='coerce').notnull()] 

In [15]:
# (rows, columns) of the merged frame after the numeric labor_force filter.
merged_df.shape

(3192, 14)

In [16]:
# Shape of the array of distinct county labels; a count lower than the row
# count means some labels occur on more than one row.
merged_df['county'].unique().shape

(3125,)

In [15]:
# Number of rows per county label, returned as a frame with `county` and
# `size` columns (as_index=False keeps the label as a regular column).
merged_df.groupby(merged_df['county'],as_index=False).size()

Unnamed: 0,county,size
0,"Abbeville County, South Carolina",1
1,"Acadia Parish, Louisiana",1
2,"Accomack County, Virginia",1
3,"Ada County, Idaho",1
4,"Adair County, Iowa",1
...,...,...
3120,"Yuma County, Arizona",1
3121,"Yuma County, Colorado",1
3122,"Zapata County, Texas",1
3123,"Zavala County, Texas",1


In [17]:
# Remove rows that repeat an earlier row in every column, keeping the first
# occurrence; the surviving rows keep their original index labels.
merged_df = merged_df.loc[~merged_df.duplicated()]
merged_df

Unnamed: 0,county,labor_force,employed,unemployed,unemployed_pct,population,race_white,race_black,race_native,race_asian,race_islander,race_other,race_two_or_more,geoid
0,"Autauga County, Alabama",25838,24576,1262,4.9,58805,42160,11445,217,881,35,910,3157,01001
2,"Baldwin County, Alabama",96763,91338,5425,5.6,231767,189399,18217,1582,2067,143,5335,15024,01003
4,"Barbour County, Alabama",8587,7982,605,7.0,25223,11317,11933,116,117,1,1039,700,01005
6,"Bibb County, Alabama",8640,8067,573,6.6,22293,16555,4413,60,32,9,465,759,01007
8,"Blount County, Alabama",24661,23653,1008,4.1,59134,50663,845,337,178,24,3431,3656,01009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3187,"Sweetwater County, Wyoming",20840,19308,1532,7.4,42272,34389,366,433,295,70,2508,4211,56037
3188,"Teton County, Wyoming",15204,14292,912,6.0,23331,19202,55,185,317,5,1415,2152,56039
3189,"Uinta County, Wyoming",9312,8730,582,6.3,20450,18007,66,191,114,27,864,1181,56041
3190,"Washakie County, Wyoming",4004,3793,211,5.3,7685,6600,12,77,33,3,388,572,56043


## Rearrange columns

In [18]:
# Rotate the column order by one position so the final column comes first and
# the remaining columns keep their relative order.
cols = list(merged_df.columns)
cols = [*cols[-1:], *cols[:-1]]
merged_df = merged_df[cols]
merged_df

Unnamed: 0,geoid,county,labor_force,employed,unemployed,unemployed_pct,population,race_white,race_black,race_native,race_asian,race_islander,race_other,race_two_or_more
0,01001,"Autauga County, Alabama",25838,24576,1262,4.9,58805,42160,11445,217,881,35,910,3157
2,01003,"Baldwin County, Alabama",96763,91338,5425,5.6,231767,189399,18217,1582,2067,143,5335,15024
4,01005,"Barbour County, Alabama",8587,7982,605,7.0,25223,11317,11933,116,117,1,1039,700
6,01007,"Bibb County, Alabama",8640,8067,573,6.6,22293,16555,4413,60,32,9,465,759
8,01009,"Blount County, Alabama",24661,23653,1008,4.1,59134,50663,845,337,178,24,3431,3656
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3187,56037,"Sweetwater County, Wyoming",20840,19308,1532,7.4,42272,34389,366,433,295,70,2508,4211
3188,56039,"Teton County, Wyoming",15204,14292,912,6.0,23331,19202,55,185,317,5,1415,2152
3189,56041,"Uinta County, Wyoming",9312,8730,582,6.3,20450,18007,66,191,114,27,864,1181
3190,56043,"Washakie County, Wyoming",4004,3793,211,5.3,7685,6600,12,77,33,3,388,572


In [20]:
# write to csv
# Persist the merged table. index=True (the pandas default) writes the frame's
# index labels as the first, unnamed column of the file.
merged_df.to_csv('../data_files/Employment_by_County.csv', index=True)

In [21]:
# Number of distinct geoid values in the written table, for comparison with
# the row and county-label counts.
merged_df['geoid'].unique().size

3125