# ETL to Retrieve Demographics Data from private API

## Create unique list

Testing process with 2018 and 2019 locations

Will pull from the combined csv

In [2]:
import pandas as pd

In [3]:
df_2019 = pd.read_csv('../data_files/bls_metro_2019_clean.csv', header=0)
df_2018 = pd.read_csv('../data_files/bls_metro_may2018_clean.csv', header=0)

In [4]:
area_title_2019 = df_2019[['area_title']]
unique_area_title_2019 = area_title_2019['area_title'].unique()
# unique_area_title_2019

In [5]:
area_title_2018 = df_2018[['AREA_NAME']]
unique_area_title_2018 = area_title_2018['AREA_NAME'].unique()
# unique_area_title_2018

In [6]:
metro_2018_2019_area_names = [area_title_2019[['area_title']], area_title_2018[['AREA_NAME']]]
# metro_2018_2019_area_names

In [7]:
joined_list = [*unique_area_title_2019, *unique_area_title_2018]
# joined_list

### Dataframes

In [8]:
columns = ['zipcode', 'state', 'city', 'population', 'race_asian', 'race_white', 'race_black', 'race_native', 'race_islander', 'race_other', 'race_two', 'race_hispanic', 'average_household_income'  ]
df = pd.DataFrame(columns = columns)
demographic_by_zipcode_df = pd.DataFrame(columns = columns)

In [9]:
# Lookup dict
states = {
    'AK': 'Alaska',
    'AL': 'Alabama',
    'AR': 'Arkansas',
    'AZ': 'Arizona',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DC': 'District of Columbia',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'HI': 'Hawaii',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts',
    'MD': 'Maryland',
    'ME': 'Maine',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MO': 'Missouri',
    'MS': 'Mississippi',
    'MT': 'Montana',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NV': 'Nevada',
    'NY': 'New York',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia',
    'VT': 'Vermont',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'WV': 'West Virginia',
    'WY': 'Wyoming'}

## Start request to private db

In [10]:
# Intentionally left out credentials
# will update with config next commit
from config import p
import requests
import mysql.connector
servername = "internal-db.s221289.gridserver.com"
username = "db221289_stp"
password = p
dbname = "db221289_stp"
cnx = mysql.connector.connect(user=username, password=password,host=servername,database=dbname)
cursor = cnx.cursor()

In [11]:
# stores any location with issues
# will circle back to locations with multiple city/state concatenated
location_issues_df = pd.DataFrame(columns=['city', 'state'])

### Call to database

In [12]:
def requestDemographicForCityState(_city, _state):
    # check if this state exists (seeing some strange state abbrs)
    if _state.upper() in states :
        search_state =  states[_state.upper()].upper()
        search_city =  _city.upper().replace("'", "''")
        query = (f"SELECT zip_zcta, state_name, city_name, age_total as population, race_and_ethnicity_asian, race_and_ethnicity_white, race_and_ethnicity_black, race_and_ethnicity_native, race_and_ethnicity_islander, race_and_ethnicity_other, race_and_ethnicity_two, race_and_ethnicity_hispanic, average_household_income FROM `zipcodeDemographic` WHERE state_name = '{search_state}' and city_name = '{search_city}' ")
        cursor.execute(query)
        # iterate throw rows and add to df
        for (zipcode) in cursor:
            demographic_by_zipcode_df.loc[len(demographic_by_zipcode_df.index)] = zipcode
    
# iterate through locations list
for location in joined_list:
    locationParts = location.split(', ')
    locationStateParts = locationParts[1].split('-')
    # temp check to make sure the stateParts array only contain 1 value
    # if state array is more than 1 value then add to location_issues_df for later parsing
    if len(locationStateParts) > 1:
        print (f"Issues with {location}")
        location_issues_df.loc[len(location_issues_df.index)] = [locationParts[0], locationParts[1]]
    # else lets make that request
    else:
        city = locationParts[0]
        state = locationParts[1]
        requestDemographicForCityState(city, state)
        

Issues with Allentown-Bethlehem-Easton, PA-NJ
Issues with Augusta-Richmond County, GA-SC
Issues with Cape Girardeau, MO-IL
Issues with Charlotte-Concord-Gastonia, NC-SC
Issues with Chattanooga, TN-GA
Issues with Chicago-Naperville-Elgin, IL-IN-WI
Issues with Cincinnati, OH-KY-IN
Issues with Clarksville, TN-KY
Issues with Columbus, GA-AL
Issues with Cumberland, MD-WV
Issues with Davenport-Moline-Rock Island, IA-IL
Issues with Duluth, MN-WI
Issues with Evansville, IN-KY
Issues with Fargo, ND-MN
Issues with Fayetteville-Springdale-Rogers, AR-MO
Issues with Fort Smith, AR-OK
Issues with Grand Forks, ND-MN
Issues with Hagerstown-Martinsburg, MD-WV
Issues with Huntington-Ashland, WV-KY-OH
Issues with Kansas City, MO-KS
Issues with Kingsport-Bristol-Bristol, TN-VA
Issues with La Crosse-Onalaska, WI-MN
Issues with Lewiston, ID-WA
Issues with Logan, UT-ID
Issues with Louisville/Jefferson County, KY-IN
Issues with Memphis, TN-MS-AR
Issues with Minneapolis-St. Paul-Bloomington, MN-WI
Issues with 

### Demographic by zipcode

In [13]:
demographic_by_zipcode_df

Unnamed: 0,zipcode,state,city,population,race_asian,race_white,race_black,race_native,race_islander,race_other,race_two,race_hispanic,average_household_income
0,79601,Texas,Abilene,28041,546,15155,3887,352,2,83,693,7323,52921
1,79605,Texas,Abilene,29672,569,19773,2168,150,13,111,467,6421,63713
2,79606,Texas,Abilene,23929,988,16483,1959,22,0,6,512,3959,77331
3,79607,Texas,Abilene,3309,150,1890,561,0,0,0,188,520,53954
4,79699,Texas,Abilene,124,0,84,22,0,0,0,0,18,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1691,53715,Wisconsin,Madison,14246,1697,10474,505,16,0,7,445,1102,48375
1692,53726,Wisconsin,Madison,5869,716,4592,194,27,0,0,101,239,85599
1693,25301,West Virginia,Charleston,2360,48,1601,452,0,0,38,156,65,51371
1694,25304,West Virginia,Charleston,8307,297,6800,921,4,0,15,127,143,92942


In [14]:
# demograhpics by zipcode to csv
demographic_by_zipcode_df.to_csv('../data_files/metro_demographic_by_zipcode.csv') 

In [15]:
demographic_by_zipcode_df.dtypes

zipcode                     object
state                       object
city                        object
population                  object
race_asian                  object
race_white                  object
race_black                  object
race_native                 object
race_islander               object
race_other                  object
race_two                    object
race_hispanic               object
average_household_income    object
dtype: object

In [16]:
# convert objects to int
convert_columns = ['population', 'race_asian', 'race_white', 'race_black', 'race_native', 'race_islander', 'race_other', 'race_two', 'race_hispanic', 'average_household_income']
demographic_by_zipcode_df[convert_columns] = demographic_by_zipcode_df[convert_columns].astype(str).astype(int)


In [17]:
demographic_by_zipcode_df.dtypes

zipcode                     object
state                       object
city                        object
population                   int64
race_asian                   int64
race_white                   int64
race_black                   int64
race_native                  int64
race_islander                int64
race_other                   int64
race_two                     int64
race_hispanic                int64
average_household_income     int64
dtype: object

### Demographic grouped by City and State

In [18]:
grouped_demographic = demographic_by_zipcode_df.groupby(['city', 'state'], as_index=False).agg(
    population = ('population', 'sum'),
    race_asian = ('race_asian', 'sum'),
    race_white = ('race_white', 'sum'),
    race_black = ('race_black', 'sum'),
    race_hispanic = ('race_hispanic', 'sum'),
    race_native = ('race_native', 'sum'),
    race_islander = ('race_islander', 'sum'),
    race_two = ('race_two', 'sum'),
    race_other = ('race_other', 'sum'),
    average_income = ('average_household_income', 'mean'))
grouped_demographic

Unnamed: 0,city,state,population,race_asian,race_white,race_black,race_hispanic,race_native,race_islander,race_two,race_other,average_income
0,Abilene,Texas,170150,4506,106770,17194,36482,1048,30,3720,400,49583.600000
1,Akron,Ohio,189788,4328,108980,62986,3522,268,44,9530,130,42943.888889
2,Albany,Georgia,129768,1896,41370,80200,3474,192,48,2482,106,59234.666667
3,Albuquerque,New Mexico,639982,16874,307760,19090,252998,27212,426,13590,2032,61589.000000
4,Alexandria,Louisiana,118368,3712,47766,61566,2988,412,14,1848,62,57892.333333
...,...,...,...,...,...,...,...,...,...,...,...,...
150,Wichita,Kansas,383468,10252,249662,35804,70972,2370,72,14130,206,70005.857143
151,Wichita Falls,Texas,131792,1926,75816,16896,32308,806,52,3360,628,56167.200000
152,Winston-Salem,North Carolina,66164,2702,45852,11472,4198,114,0,1692,134,34263.333333
153,Yakima,Washington,92516,570,41438,680,46548,988,20,2190,82,52787.000000


In [19]:
# write to csv
grouped_demographic.to_csv('../data_files/metro_demographic_by_city_state.csv') 

In [20]:
# shut it down
cnx.close()

#### Need to untangle the locations with multiple city and state

These locations are formatted by their metropolitan area name. Some are just the city, while others are made up of several cities. We should keep these locations together and all locations involved should be represented as a singular metropolitan area.
Need to 
1. parse into list
2. request each
3. add to temp df 
4. add to zipcodes df
5. groupby, sum(), mean() back together
4. add into groups df

Allentown-Bethlehem-Easton, PA-NJ
Wikipedia defines this as: The Lehigh Valley's principal cities are Allentown, Bethlehem and Easton, making up the Allentown–Bethlehem–Easton metropolitan area.

Philadelphia-Camden-Wilmington, PA-NJ-DE-MD

Can parse out to:

- Philadelphia PA
- Camden NJ
- Wilmington DE
- ??? MD 

