# ETL to Retrieve Demographics Data from a Private Database

In [1]:
import pandas as pd

In [2]:
# Load the merged metro dataset; header=0 takes the column names from the first row.
complete = pd.read_csv('../data_files/merged_metrodata_181920.csv', header=0)

## Unique metro points

In [3]:
# Distinct values of the 'citi_state' column, in order of first appearance.
# The trailing expression displays how many distinct labels there are.
complete_unique_citi_state = pd.unique(complete['citi_state'])
complete_unique_citi_state.shape

(396,)

### Dataframes

In [4]:
# Column layout of the per-zipcode demographics frame. Rows fetched by
# requestDemographicForCityState are assigned positionally, so this order has to
# match the order of the SELECT list used there.
columns = [
    'zipcode',
    'state',
    'city',
    'population',
    'median_age',
    'race_asian',
    'race_white',
    'race_black',
    'race_native',
    'race_islander',
    'race_other',
    'race_two',
    'race_hispanic',
    'average_household_income',
    'family_households_total',
    'family_poverty_pct',
    'educational_attainment_bachelors',
    'educational_attainment_graduate',
    'educational_attainment_high_school',
    'educational_attainment_no_diploma',
    'educational_attainment_some_college',
]
# Start with an empty frame; rows are appended as query results come back.
demographic_by_zipcode_df = pd.DataFrame(columns=columns)

In [5]:
# Abbreviation -> full state name lookup.
# The metro data carries two-letter abbreviations, while the database is queried
# by full state name.
states = {
    'AK': 'Alaska', 'AL': 'Alabama', 'AR': 'Arkansas',
    'AZ': 'Arizona', 'CA': 'California', 'CO': 'Colorado',
    'CT': 'Connecticut', 'DC': 'District of Columbia', 'DE': 'Delaware',
    'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii',
    'IA': 'Iowa', 'ID': 'Idaho', 'IL': 'Illinois',
    'IN': 'Indiana', 'KS': 'Kansas', 'KY': 'Kentucky',
    'LA': 'Louisiana', 'MA': 'Massachusetts', 'MD': 'Maryland',
    'ME': 'Maine', 'MI': 'Michigan', 'MN': 'Minnesota',
    'MO': 'Missouri', 'MS': 'Mississippi', 'MT': 'Montana',
    'NC': 'North Carolina', 'ND': 'North Dakota', 'NE': 'Nebraska',
    'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico',
    'NV': 'Nevada', 'NY': 'New York', 'OH': 'Ohio',
    'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania',
    'RI': 'Rhode Island', 'SC': 'South Carolina', 'SD': 'South Dakota',
    'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah',
    'VA': 'Virginia', 'VT': 'Vermont', 'WA': 'Washington',
    'WI': 'Wisconsin', 'WV': 'West Virginia', 'WY': 'Wyoming',
}

In [6]:
# Collects every location that cannot be requested as-is.
# Next phase of the project: circle back to the locations whose label
# concatenates several cities/states.
location_issues_df = pd.DataFrame(columns=['city', 'state'])

## Start request to private db

In [7]:
# The database password is imported from the local `config` module (as `p`)
# so that it is not written into the notebook itself.
from config import p
import requests
import mysql.connector

# Connection settings for the MySQL server.
servername = "internal-db.s221289.gridserver.com"
username = "db221289_stp"
password = p
dbname = "db221289_stp"

# Open the connection and create the cursor used by the query cells that follow.
cnx = mysql.connector.connect(
    host=servername,
    user=username,
    password=password,
    database=dbname,
)
cursor = cnx.cursor()

### Call to database

In [8]:
def requestDemographicForCityState(_city, _state):
    """Fetch zipcode-level demographics for one city and append them to the results frame.

    The full state name for ``_state`` is looked up in the module-level ``states``
    dict, the ``zipcodeDemographic`` table is queried through the module-level
    ``cursor``, and every returned row is appended to the module-level
    ``demographic_by_zipcode_df``.

    Args:
        _city: City name; upper-cased before it is compared with ``city_name``.
        _state: State abbreviation (any case, surrounding whitespace ignored).

    Returns:
        None. Abbreviations that are not keys of ``states`` are skipped silently
        (the source data contains some unexpected abbreviations).
    """
    state_abbr = _state.strip().upper()
    # check if this state exists (seeing some strange state abbrs)
    if state_abbr not in states:
        return

    search_state = states[state_abbr].upper()
    search_city = _city.upper()

    # Values are bound as parameters instead of being formatted into the SQL text,
    # so the driver handles quoting (e.g. the apostrophe in "COEUR D'ALENE") and the
    # city/state strings can never alter the statement itself.
    query = (
        "SELECT zip_zcta, state_name, city_name, age_total as population, median_age, "
        "race_and_ethnicity_asian, race_and_ethnicity_white, race_and_ethnicity_black, "
        "race_and_ethnicity_native, race_and_ethnicity_islander, race_and_ethnicity_other, "
        "race_and_ethnicity_two, race_and_ethnicity_hispanic, average_household_income, "
        "family_households_total, family_poverty_pct, educational_attainment_bachelors, "
        "educational_attainment_graduate, educational_attainment_high_school, "
        "educational_attainment_no_diploma, educational_attainment_some_college "
        "FROM zipcodeDemographic "
        "WHERE state_name = %s and city_name = %s"
    )
    cursor.execute(query, (search_state, search_city))

    # iterate through rows and add each one to the df (positional, so the SELECT
    # order must match the frame's column order)
    for row in cursor:
        demographic_by_zipcode_df.loc[len(demographic_by_zipcode_df.index)] = list(row)
    
# Iterate through the unique "City, ST" labels and request demographics for every
# label that names a single state.
for location in complete_unique_citi_state:
    # Missing values (NaN) and labels without a ", " separator have no state part.
    # Record them as issues instead of letting split()/indexing raise and abort the run.
    if not isinstance(location, str) or ', ' not in location:
        print(f"Issues with {location}")
        location_issues_df.loc[len(location_issues_df.index)] = [location, None]
        continue

    locationParts = location.split(', ')
    locationStateParts = locationParts[1].split('-')
    if len(locationStateParts) > 1:
        # More than one state in the label (e.g. "AR-MO"): set it aside in
        # location_issues_df for later parsing.
        print(f"Issues with {location}")
        location_issues_df.loc[len(location_issues_df.index)] = [locationParts[0], locationParts[1]]
    else:
        # Exactly one state: make the request.
        city = locationParts[0]
        state = locationParts[1]
        requestDemographicForCityState(city, state)
        

Issues with Fayetteville-Springdale-Rogers, AR-MO
Issues with Fort Smith, AR-OK
Issues with Norwich-New London-Westerly, CT-RI
Issues with Washington-Arlington-Alexandria, DC-VA-MD-WV
Issues with Augusta-Richmond County, GA-SC
Issues with Columbus, GA-AL
Issues with Davenport-Moline-Rock Island, IA-IL
Issues with Sioux City, IA-NE-SD
Issues with Lewiston, ID-WA
Issues with Chicago-Naperville-Elgin, IL-IN-WI
Issues with Evansville, IN-KY
Issues with South Bend-Mishawaka, IN-MI
Issues with Louisville/Jefferson County, KY-IN
Issues with Boston-Cambridge-Nashua, MA-NH
Issues with Springfield, MA-CT
Issues with Worcester, MA-CT
Issues with Cumberland, MD-WV
Issues with Hagerstown-Martinsburg, MD-WV
Issues with Salisbury, MD-DE
Issues with Duluth, MN-WI
Issues with Minneapolis-St. Paul-Bloomington, MN-WI
Issues with Cape Girardeau, MO-IL
Issues with Kansas City, MO-KS
Issues with St. Joseph, MO-KS
Issues with St. Louis, MO-IL
Issues with Charlotte-Concord-Gastonia, NC-SC
Issues with Fargo, N

### Demographic by zipcode

In [9]:
# Preview the first five per-zipcode rows collected from the database.
demographic_by_zipcode_df.head()

Unnamed: 0,zipcode,state,city,population,median_age,race_asian,race_white,race_black,race_native,race_islander,...,race_two,race_hispanic,average_household_income,family_households_total,family_poverty_pct,educational_attainment_bachelors,educational_attainment_graduate,educational_attainment_high_school,educational_attainment_no_diploma,educational_attainment_some_college
0,99501,Alaska,Anchorage,17084,34.7,999,9015,1412,2189,539,...,1459,1360,87441,3142,11.4%,20.8%,13.1%,23.7%,9.4%,33.1%
1,99502,Alaska,Anchorage,24505,32.4,2028,15271,857,1257,408,...,2014,2620,111653,5941,4.5%,23.3%,12.7%,24.5%,4.8%,34.8%
2,99503,Alaska,Anchorage,13668,34.5,2102,5944,1257,1548,476,...,961,1337,64578,2522,12.3%,15.6%,6.6%,30.5%,12.2%,35.1%
3,99504,Alaska,Anchorage,42161,32.7,4251,21416,4340,2936,1792,...,3861,3542,92815,10037,5.7%,19.2%,10.7%,25.9%,7.6%,36.5%
4,99505,Alaska,Anchorage,6477,21.5,188,3976,564,14,29,...,362,1284,78229,1069,4.1%,22.3%,12.1%,21.9%,2.2%,41.5%


In [10]:
# Inspect the column dtypes before any type conversion.
demographic_by_zipcode_df.dtypes

zipcode                                object
state                                  object
city                                   object
population                             object
median_age                             object
race_asian                             object
race_white                             object
race_black                             object
race_native                            object
race_islander                          object
race_other                             object
race_two                               object
race_hispanic                          object
average_household_income               object
family_households_total                object
family_poverty_pct                     object
educational_attainment_bachelors       object
educational_attainment_graduate        object
educational_attainment_high_school     object
educational_attainment_no_diploma      object
educational_attainment_some_college    object
dtype: object

### Strip the '%'

In [11]:
# The percentage columns hold strings such as '11.4%'. Remove the '%' from all of
# them the same way. The .str accessor leaves missing values as NaN, whereas calling
# x.strip('%') on a missing value would raise.
percent_columns = [
    'family_poverty_pct',
    'educational_attainment_bachelors',
    'educational_attainment_graduate',
    'educational_attainment_high_school',
    'educational_attainment_no_diploma',
    'educational_attainment_some_college',
]
for percent_column in percent_columns:
    demographic_by_zipcode_df[percent_column] = (
        demographic_by_zipcode_df[percent_column].str.replace('%', '', regex=False)
    )
demographic_by_zipcode_df.head(2)

Unnamed: 0,zipcode,state,city,population,median_age,race_asian,race_white,race_black,race_native,race_islander,...,race_two,race_hispanic,average_household_income,family_households_total,family_poverty_pct,educational_attainment_bachelors,educational_attainment_graduate,educational_attainment_high_school,educational_attainment_no_diploma,educational_attainment_some_college
0,99501,Alaska,Anchorage,17084,34.7,999,9015,1412,2189,539,...,1459,1360,87441,3142,11.4,20.8,13.1,23.7,9.4,33.1
1,99502,Alaska,Anchorage,24505,32.4,2028,15271,857,1257,408,...,2014,2620,111653,5941,4.5,23.3,12.7,24.5,4.8,34.8


### Convert objects to int

In [12]:
# Count-like columns that should hold integers.
convert_columns_int = [
    'population',
    'race_asian',
    'race_white',
    'race_black',
    'race_native',
    'race_islander',
    'race_other',
    'race_two',
    'race_hispanic',
    'average_household_income',
    'family_households_total',
]
# Go through str first, then int, for every listed column at once.
demographic_by_zipcode_df[convert_columns_int] = (
    demographic_by_zipcode_df[convert_columns_int]
    .astype(str)
    .astype(int)
)

### Convert percentage columns to counts using population

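Each percentage is converted to a whole-number count (percentage × population / 100). The family poverty count is added as a new column (`family_poverty`) so the calculation can be checked before the percentage column is dropped; the educational attainment percentages are replaced by their counts in place.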

In [13]:
# Whole-number versions of age and income (the fractional part is truncated).
for _whole_column in ('median_age', 'average_household_income'):
    demographic_by_zipcode_df[_whole_column] = (
        demographic_by_zipcode_df[_whole_column].astype(float).astype(int)
    )

# family_poverty is a NEW column: percentage * population / 100, truncated to int.
demographic_by_zipcode_df['family_poverty'] = (
    (demographic_by_zipcode_df['family_poverty_pct'].astype("float") * demographic_by_zipcode_df['population']) / 100
).astype(int)

# The educational attainment percentages are OVERWRITTEN by the matching counts,
# using the same formula. NOTE: re-running this cell would apply the conversion to
# the already-converted values.
for _education_column in (
    'educational_attainment_bachelors',
    'educational_attainment_graduate',
    'educational_attainment_high_school',
    'educational_attainment_no_diploma',
    'educational_attainment_some_college',
):
    demographic_by_zipcode_df[_education_column] = (
        (demographic_by_zipcode_df[_education_column].astype("float") * demographic_by_zipcode_df['population']) / 100
    ).astype(int)

In [14]:
# Spot-check the first two rows after the percentage-to-count conversion.
demographic_by_zipcode_df.head(2)

Unnamed: 0,zipcode,state,city,population,median_age,race_asian,race_white,race_black,race_native,race_islander,...,race_hispanic,average_household_income,family_households_total,family_poverty_pct,educational_attainment_bachelors,educational_attainment_graduate,educational_attainment_high_school,educational_attainment_no_diploma,educational_attainment_some_college,family_poverty
0,99501,Alaska,Anchorage,17084,34,999,9015,1412,2189,539,...,1360,87441,3142,11.4,3553,2238,4048,1605,5654,1947
1,99502,Alaska,Anchorage,24505,32,2028,15271,857,1257,408,...,2620,111653,5941,4.5,5709,3112,6003,1176,8527,1102


### Drop columns only used for calculations or no longer needed

In [15]:
# The percentage was only needed to compute the family_poverty count, so drop it.
# errors='ignore' keeps this cell re-runnable: a second run is a no-op instead of
# raising KeyError because the column is already gone.
demographic_by_zipcode_df = demographic_by_zipcode_df.drop(columns=['family_poverty_pct'], errors='ignore')

In [16]:
# Row and column counts of the per-zipcode frame at this point.
demographic_by_zipcode_df.shape

(848, 21)

### Remove areas with population 0

In [17]:
# Keep only the zipcodes whose population is greater than zero, then display the
# resulting shape.
has_residents = demographic_by_zipcode_df['population'] > 0
demographic_by_zipcode_df = demographic_by_zipcode_df.loc[has_residents]
demographic_by_zipcode_df.shape

(818, 21)

### Group zipcode demographics by city and state

**sum()**: `population`, `family_poverty`, `educational_attainment_*`, `race_*`

**mean()**: `median_age`, `average_household_income` (output as `average_income`)

In [18]:
# Columns that are simply summed across the zipcodes of a city, listed in the
# order they should appear in the output.
summed_columns = [
    'family_poverty',
    'educational_attainment_bachelors',
    'educational_attainment_graduate',
    'educational_attainment_high_school',
    'educational_attainment_no_diploma',
    'educational_attainment_some_college',
    'race_asian',
    'race_white',
    'race_black',
    'race_hispanic',
    'race_native',
    'race_islander',
    'race_two',
    'race_other',
]

# Named aggregations: output column -> (source column, function).
# median_age and average_income are unweighted means of the per-zipcode values.
named_aggregations = {
    'population': ('population', 'sum'),
    'median_age': ('median_age', 'mean'),
    'average_income': ('average_household_income', 'mean'),
}
named_aggregations.update({name: (name, 'sum') for name in summed_columns})

# One output row per (city, state); as_index=False keeps both as regular columns.
zipcodes_by_city_state = demographic_by_zipcode_df.groupby(['city', 'state'], as_index=False)
grouped_demographic = zipcodes_by_city_state.agg(**named_aggregations)

# The means come back as floats; truncate them to whole numbers.
grouped_demographic['average_income'] = grouped_demographic['average_income'].astype(float).astype(int)
grouped_demographic['median_age'] = grouped_demographic['median_age'].astype(int)

grouped_demographic

Unnamed: 0,city,state,population,median_age,average_income,family_poverty,educational_attainment_bachelors,educational_attainment_graduate,educational_attainment_high_school,educational_attainment_no_diploma,educational_attainment_some_college,race_asian,race_white,race_black,race_hispanic,race_native,race_islander,race_two,race_other
0,Abilene,Texas,85075,27,49583,9368,12687,6935,25576,10738,29001,2253,53385,8597,18241,524,15,1860,200
1,Akron,Ohio,94894,33,42943,25508,10769,5611,33755,13742,31013,2164,54490,31493,1761,134,22,4765,65
2,Albany,Georgia,64884,37,59234,13830,9365,7235,18004,9037,21217,948,20685,40100,1737,96,24,1241,53
3,Albuquerque,New Mexico,319991,35,61589,42306,66219,56998,66455,25103,105387,8437,153880,9545,126499,13606,213,6795,1016
4,Alexandria,Louisiana,59184,38,57892,10955,7768,5315,21019,9059,16030,1856,23883,30783,1494,206,7,924,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,Wichita,Kansas,191734,34,75391,28568,34328,15821,52966,26056,62473,5126,124831,17902,35486,1185,36,7065,103
151,Wichita Falls,Texas,65896,31,56167,13248,9631,4039,19443,9759,22998,963,37908,8448,16154,403,26,1680,314
152,Winston-Salem,North Carolina,33082,27,34263,1864,9344,7149,4379,1215,10993,1351,22926,5736,2099,57,0,846,67
153,Yakima,Washington,46258,32,52787,7262,4255,3145,12720,12443,13692,285,20719,340,23274,494,10,1095,41


In [19]:
# Write the per-zipcode demographics to CSV. to_csv is called with its defaults,
# so the DataFrame index is written as the first (unnamed) column.
demographic_by_zipcode_df.to_csv('../data_files/metro_demographic_by_zipcode.csv') 

In [20]:
# Write the city/state aggregates to CSV. to_csv is called with its defaults,
# so the DataFrame index is written as the first (unnamed) column.
grouped_demographic.to_csv('../data_files/metro_demographic_by_city_state.csv') 

In [21]:
# Close the database connection now that all queries have been made.
cnx.close()

#### Need to untangle the locations with multiple cities and states

These locations are formatted by their metropolitan area name. Some are just the city, while others are made up of several cities. We should keep these locations together and all locations involved should be represented as a singular metropolitan area.
Need to:
1. parse into list
2. request each
3. add to temp df
4. add to zipcodes df
5. groupby, sum(), mean() back together
6. add into groups df

Allentown-Bethlehem-Easton, PA-NJ
Wikipedia defines this as: The Lehigh Valley's principal cities are Allentown, Bethlehem and Easton, making up the Allentown–Bethlehem–Easton metropolitan area.

Philadelphia-Camden-Wilmington, PA-NJ-DE-MD

Can parse out to:

- Philadelphia PA
- Camden NJ
- Wilmington DE
- ??? MD 

