# Combine population & demographic data
---

#### _Documentation in separate file_

In [1]:
import pandas as pd, numpy as np
import re

In [2]:
# Identifier columns shared by the derived tables: the default `keys` of
# separate_by() and the index handed to GroupedDF further down.
INDEX = ['year', 'county', 'geo'] # constant

def head(dfs):
    """Print the shape of each frame and display its first 3 rows.

    dfs: a single DataFrame, or a list/tuple of DataFrames.

    Rendering uses the notebook-provided `display` function.
    """
    # isinstance instead of an exact type comparison: tuples and list
    # subclasses are treated as collections rather than being wrapped in a
    # list and then failing on `.shape`.
    if not isinstance(dfs, (list, tuple)):
        dfs = [dfs]
    for df in dfs:
        print(f'{df.shape[1]} cols x {df.shape[0]} rows')
        display(df.head(3))


def add_ordinal(self, col:str, order:list, replace=False) -> pd.DataFrame:
    """ ::pd.DataFrame
    Add an integer-coded companion column named '<col>_ord' for a categorical col.

    'order' lists the categories in ascending order; each one is coded with its
    1-based position, e.g. ['A', 'B', 'C'] -> {'A': 1, 'B': 2, 'C': 3}.
    Values of 'col' that are missing from 'order' get no code (NaN from map()).
    The new column is inserted in place, immediately before 'col'. With
    replace=True the returned frame no longer contains 'col'.
    """
    # NOTE: the leading '::pd.DataFrame' marker above is read at runtime by
    # _add_func_to_object to register this function as a DataFrame method.
    ranks = {category: rank for rank, category in enumerate(order, start=1)}
    self.insert_at(f'{col}_ord', col, self[col].map(ranks))
    return self.drop(columns=col) if replace else self


def insert_at(self, name, target, col:pd.Series) -> pd.DataFrame:
    """ ::pd.DataFrame
    Like df.insert(), but the location may be a column name instead of an int.

    name:   label of the new column
    target: an int position, or the label of an existing column; the new
            column is placed immediately before that column
    col:    values of the new column

    The insertion happens in place and the same frame is returned, so the call
    can be chained. Raises ValueError when 'target' is a label that is not
    among the columns.
    """
    # NOTE: the leading '::pd.DataFrame' marker above is read at runtime by
    # _add_func_to_object to register this function as a DataFrame method.
    idx = target if isinstance(target, int) else list(self.columns).index(target)
    self.insert(idx, name, col)
    return self


def add_binmax(self, name, cols, replace=False) -> pd.DataFrame:
    """ ::pd.DataFrame
    For each row, record which of 'cols' holds the largest value.

    Thin wrapper around df[cols].idxmax(axis=1): the resulting column of
    column names is inserted in place as 'name', immediately before cols[0].
    With replace=True the returned frame no longer contains 'cols'.
    """
    # NOTE: the leading '::pd.DataFrame' marker above is read at runtime by
    # _add_func_to_object to register this function as a DataFrame method.
    winners = self[cols].idxmax(axis=1)
    self.insert_at(name, cols[0], winners)
    if not replace:
        return self
    return self.drop(columns=cols)

def move_col(self, name, target) -> pd.DataFrame:
    """ ::pd.DataFrame
    Move named col to right before target col.

    name:   label of the column to move
    target: label of the column it should precede, or an int position
            (counted in the frame with 'name' already removed)

    Works in place and returns the same frame. Raises KeyError when 'name' is
    not a column and ValueError when a label 'target' is not a column; in both
    cases the frame is left untouched. Moving a column before itself is a no-op.
    """
    # NOTE: the leading '::pd.DataFrame' marker above is read at runtime by
    # _add_func_to_object to register this function as a DataFrame method.
    labels = list(self.columns)
    if name not in labels:
        raise KeyError(name)

    # Resolve every failure that can be detected up front BEFORE popping, so
    # a bad target can no longer silently delete the column from the frame.
    if not isinstance(target, int):
        if target == name:
            return self
        if target not in labels:
            raise ValueError(f'{target!r} is not in list')

    origin = labels.index(name)
    col = self.pop(name)
    idx = target if isinstance(target, int) else list(self.columns).index(target)
    try:
        self.insert(idx, name, col)
    except Exception:
        # e.g. an out-of-range int position: put the column back where it was
        self.insert(origin, name, col)
        raise
    return self


def separate_by(df, to_match, keys=None, keep=None, start=False, end=False, mode="") -> 'Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]':
    """
    Split the columns of df by whether their name matches a substring.

    Returns, in this order:
    1. include: keys + keep + every column whose name DOES match
    2. exclude: df without the 'keep' columns and without the matching columns

    to_match: one substring or a list/tuple of substrings
    keys:     identifier columns present in both results (defaults to INDEX)
    keep:     extra columns forced into 'include' (and removed from 'exclude')
    start:    match only at the start of the column name
    end:      match only at the end of the column name (ignored when start=True)
    mode:     'include' or 'exclude' returns just that frame; anything else
              returns the (include, exclude) pair

    Matched columns keep the order of 'to_match' (then df column order). A
    column matching several substrings is listed once, and key/keep columns are
    never counted as matches, so 'include' has no duplicated columns and
    'exclude' never loses a key column.
    """
    if not isinstance(to_match, (list, tuple)):
        to_match = [to_match]
    # None sentinels instead of list defaults; INDEX is resolved at call time.
    keys = list(INDEX if keys is None else keys)
    keep = [] if keep is None else list(keep)

    taken = set(keys) | set(keep)
    names = []
    for txt in to_match:
        for c in df.columns:
            if start:
                hit = c.startswith(txt)
            elif end:
                hit = c.endswith(txt)
            else:
                hit = txt in c
            if hit and c not in taken:
                taken.add(c)
                names.append(c)

    include = df[keys + keep + names].copy()
    exclude = df.drop(columns=keep + names)
    if mode == 'include':
        return include
    if mode == 'exclude':
        return exclude
    return include, exclude


def combine_cols(self, name=None, cols:list=None, items:dict=None, replace=True, func=sum) -> pd.DataFrame:
    """ ::pd.DataFrame
    Combine groups of columns into new columns (by default their sum).

    name, cols: label of the new column and the columns to combine
    items:      alternative to name/cols for several combinations at once:
                {new_name: [col, col, ...], ...}; takes precedence when given
    replace:    drop the source columns from the returned frame
    func:       called with a list of the source columns (Series); default sum

    Each new column is positioned before the first column of its group, so
    with replace=True it effectively takes over the group's original position.
    The first insertion always happens in place on the calling frame; with
    replace=True the returned frame is a new object.
    Raises ValueError when no combination is specified or a group is empty.
    """
    # NOTE: the leading '::pd.DataFrame' marker above is read at runtime by
    # _add_func_to_object to register this function as a DataFrame method.
    if not items:
        if name is None or cols is None or len(cols) == 0:
            raise ValueError(
                "combine_cols needs either 'items' or both 'name' and a non-empty 'cols'")
        items = {name: cols}

    for new_name, members in items.items():
        # list(): a tuple passed to drop(columns=...) would be read as ONE label
        members = list(members)
        if not members:
            raise ValueError(f'no columns given for {new_name!r}')
        combined = func([self[c] for c in members])
        self.insert_at(new_name, members[0], combined)
        if replace:
            self = self.drop(columns=members)
    return self


def _add_func_to_object(name, func=None):
    if not func:
        func = globals().get(name)
    doc = func.__doc__
    if doc:
        objects = re.findall(r'^\s*:{2}(\S+)', doc)
        if len(objects) > 0:
            objects = objects[0].split(',')
            for obj in objects:
                setattr(eval(obj), name, func)

def _add_all_funcs_to_objects(obj=None, name=None):
    """Offer every callable (or property) in this module's globals to
    _add_func_to_object, registering it under its global name with any
    trailing underscores stripped. The 'obj' and 'name' arguments are unused.
    """
    # Iterate over a snapshot of the globals, as the original did.
    for global_name, candidate in list(globals().items()):
        if not (callable(candidate) or isinstance(candidate, property)):
            continue
        _add_func_to_object(global_name.rstrip('_'), candidate)

# Register the helpers above whose docstring starts with a '::<object>' marker
# (e.g. '::pd.DataFrame') as attributes of those objects, so they can be
# called as DataFrame methods in the cells below.
_add_all_funcs_to_objects()

---
---
---

## Load data

In [3]:
# County geometries: keep the first row found per county and only the
# county name plus its geometry, renamed to 'geo'.
geo = (
    pd.read_csv('../working-data/geo_county_school.csv')
    .groupby('county')
    .agg('first')
    .reset_index()
    [['county', 'geo_county']]
    .rename(columns={'geo_county': 'geo'})
)
head(geo)

2 cols x 63 rows


Unnamed: 0,county,geo
0,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...
1,ALAMOSA,MULTIPOLYGON (((-105.59917426201822 37.7521648...
2,ARAPAHOE,MULTIPOLYGON (((-103.70653410023402 39.7398580...


In [4]:
# Population counts per year and county; 'year' is cast to int
pop_raw = pd.read_csv("../working-data/county_population.csv")
pop_raw['year'] = pop_raw['year'].astype(int)
head(pop_raw)

11 cols x 3904 rows


Unnamed: 0,year,county,total,male,female,over18,under19,under19_male,under19_female,over18_male,over18_female
0,1990,ADAMS,265709.0,131902.0,133807.0,184665.0,81044.0,41519.0,39525.0,90383.0,94282.0
1,1990,ALAMOSA,13617.0,6677.0,6940.0,9311.0,4306.0,2189.0,2117.0,4488.0,4823.0
2,1990,ARAPAHOE,393289.0,191722.0,201567.0,281301.0,111988.0,57241.0,54747.0,134481.0,146820.0


In [5]:
# Year -> dataset id; both are part of the input file name
# ('census_counties_<year>__<id>.csv').
census_county_file_ids = {
    '2012':'xwky-bmsn',
    '2013':'xymp-u28i',
    '2014':'f8ak-7nmp',
    '2015':'5yyk-mqmn',
    '2016':'sn6p-34bq',
    '2017':'ewkj-ipn7',
    '2018':'xum2-smvh',
    '2019':'8j3i-rjn4',
}
# Each census year comes in a separate dataset: load every file, tag its rows
# with the year, and stack them into one frame.
dem = (
    pd.concat(
        [
            pd.read_csv(f'../input-data/census_counties_{year}__{file_id}.csv')
            .drop(columns=['pop', 'geonum', 'the_geom'])
            .assign(year=year)
            .rename(columns={'civ_ni_':'civ_ni_p'})
            for year, file_id in census_county_file_ids.items()
        ],
        # Without this every yearly file contributes its own 0..n-1 row labels,
        # leaving repeated labels in the stacked frame.
        ignore_index=True,
    )
    .rename(columns={'geoname':'county'})
    .move_col('year', 0)
)

# "Adams County, Colorado" -> "ADAMS". regex=False: the pattern is a plain
# string, so do not depend on the pandas-version-specific default.
dem['county'] = (
    dem['county']
    .str.replace(" County, Colorado", "", regex=False)
    .str.upper()
)
# 'year' was assigned from the dict keys, which are strings
dem['year'] = dem['year'].astype(int)
dem

Unnamed: 0,year,county,hispanic,white_nh,black_nh,ntvam_nh,asian_nh,hawpi_nh,other_nh,twoplus_nh,...,civ_ni_pop,disabled,pop16_pls,laborforce,civ_lf,emp,unemp,armedfrcs,not_lf,civ_ni_p
0,2012,ARAPAHOE,105174.0,364766.0,55629.0,2211.0,28067.0,1166.0,1267.0,16077.0,...,568663.0,49870.0,444215.0,320199.0,318041.0,292089.0,25952.0,2158.0,124016.0,568663.0
1,2012,MINERAL,15.0,671.0,9.0,5.0,1.0,0.0,0.0,1.0,...,702.0,129.0,681.0,391.0,391.0,370.0,21.0,0.0,290.0,702.0
2,2012,MONTROSE,8037.0,31799.0,186.0,74.0,227.0,49.0,33.0,589.0,...,40552.0,5649.0,32334.0,20137.0,20124.0,18110.0,2014.0,13.0,12197.0,40552.0
3,2012,PARK,777.0,14818.0,5.0,89.0,137.0,24.0,0.0,318.0,...,16042.0,1420.0,13531.0,9599.0,9583.0,8796.0,787.0,16.0,3932.0,16042.0
4,2012,MORGAN,9557.0,17399.0,701.0,153.0,44.0,0.0,0.0,346.0,...,27544.0,2946.0,21273.0,13786.0,13786.0,12758.0,1028.0,0.0,7487.0,27544.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,2019,CONEJOS,4263.0,3622.0,40.0,72.0,13.0,27.0,20.0,71.0,...,8089.0,1642.0,6216.0,3383.0,3383.0,3029.0,354.0,0.0,2833.0,8089.0
60,2019,ADAMS,201784.0,252170.0,16139.0,2274.0,19493.0,536.0,722.0,10990.0,...,500418.0,52006.0,382633.0,273313.0,272810.0,261893.0,10917.0,503.0,109320.0,500418.0
61,2019,EAGLE,16179.0,36748.0,615.0,110.0,357.0,34.0,0.0,638.0,...,54502.0,2740.0,44064.0,34862.0,34775.0,34128.0,647.0,87.0,9202.0,54502.0
62,2019,MOFFAT,2044.0,10543.0,130.0,71.0,18.0,0.0,37.0,284.0,...,12968.0,1466.0,10130.0,6480.0,6445.0,6163.0,282.0,35.0,3650.0,12968.0


## Combine population data and census data
- Looking in the original demographic data, most population groups are present: gender, age, etc. So why not use those?
- A couple reasons.
  - The population dataset is likely more accurate, claiming to provide "actual" numbers, whereas the census data provides "estimates"
  - The population dataset is more precise: it reports a count for each individual year of age, allowing us to build our own aggregated bins (adult, minor). The census data has age groups defined already, but in 5-year increments, so the relevant group is "15 to 19", whereas we need 18 and under!
  - The population dataset offers sub-aggregations: we have `under19_female` and `under19_male`, for instance, whereas the census data only offers age populations and gender populations separately
- So instead, we will use population dataset first, and add in additional groups from census data
---

#### Select desired columns from census data

In [6]:
# Work on a copy of the census data, restricted to the columns used below.
df = dem.copy()[[
    'year', 'county', 'med_age',
    'households', 'avghhsize',
    'civ_lf', 'emp', 'unemp',
    # race / ethnicity counts
    'hispanic', 'white_nh', 'black_nh', 'asian_nh',
    'ntvam_nh', 'hawpi_nh', 'other_nh', 'twoplus_nh',
    'pop25plus', 'hsgrad_sc',
    'med_hh_inc', 'per_cap_in',
    'citz_birth', 'citz_nat', 'born_in_co',
    'pop_3pl', 'enrolled', 'undergrad',
    'gr_1_4', 'gr_5_8', 'gr_9_12',
    'med_hm_val', 'med_yr_blt',
    'housing_un', 'occ_hu',
    # owner-occupied units and their home-value bins
    'own_occ_hu',
    'v_l_50k', 'v50k_100k', 'v100k_150k', 'v150k_200k', 'v200k_250k',
    'v250k_300k', 'v300k_400k', 'v400k_500k', 'v500k_750k', 'v750k_1m',
    'v_1m_plus',
    # year-built bins
    'b2000_2009', 'b1990_1999', 'b1980_1989', 'b1970_1979',
    'b1960_1969', 'b1950_1959', 'b1940_1949', 'b1939_e',
    # poverty status
    'ps_uni', 'ps_below',
    'tot_l18', 'pov_l18',
]]

#### Group bins together

In [7]:
# New variable for total citizens (citz_birth + citz_nat), placed right before
# citz_birth; citz_nat itself is not needed afterwards.
df = df.insert_at('citz', 'citz_birth', df['citz_birth'] + df['citz_nat'])
df = df.drop(columns='citz_nat')

# Merge neighbouring bins into coarser ones. Each new column is the sum of the
# listed columns and takes the place of the first one; the originals are dropped.
df = df.combine_cols(items={
    'race_other': ['ntvam_nh', 'hawpi_nh', 'other_nh', 'twoplus_nh'],
    'b1949_e': ['b1939_e', 'b1940_1949'],
    'v50k_150k': ['v50k_100k', 'v100k_150k'],
    'v150k_250k': ['v150k_200k', 'v200k_250k'],
    'v250k_400k': ['v250k_300k', 'v300k_400k'],
    'v400k_750k': ['v400k_500k', 'v500k_750k'],
    'v750k_plus': ['v750k_1m', 'v_1m_plus'],
})
df

Unnamed: 0,year,county,med_age,households,avghhsize,civ_lf,emp,unemp,hispanic,white_nh,...,b1990_1999,b1980_1989,b1970_1979,b1960_1969,b1950_1959,b1949_e,ps_uni,ps_below,tot_l18,pov_l18
0,2012,ARAPAHOE,35.7,223747.0,2.55,318041.0,292089.0,25952.0,105174.0,364766.0,...,33989.0,56011.0,62253.0,22258.0,16519.0,7165.0,568999.0,66945.0,144576.0,23054.0
1,2012,MINERAL,60.3,363.0,1.83,391.0,370.0,21.0,15.0,671.0,...,232.0,239.0,203.0,100.0,75.0,240.0,702.0,47.0,26.0,0.0
2,2012,MONTROSE,42.6,16732.0,2.41,20124.0,18110.0,2014.0,8037.0,31799.0,...,3750.0,2106.0,3581.0,1298.0,920.0,2333.0,40368.0,5565.0,9788.0,1927.0
3,2012,PARK,47.0,6997.0,2.29,9583.0,8796.0,787.0,777.0,14818.0,...,3567.0,2374.0,2952.0,1051.0,693.0,939.0,16049.0,1355.0,3049.0,276.0
4,2012,MORGAN,36.0,10489.0,2.62,13786.0,12758.0,1028.0,9557.0,17399.0,...,1195.0,984.0,2282.0,1078.0,1858.0,2703.0,27416.0,4002.0,7670.0,1454.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,2019,CONEJOS,38.8,3183.0,2.53,3383.0,3029.0,354.0,4263.0,3622.0,...,694.0,565.0,665.0,342.0,265.0,1358.0,8089.0,1756.0,2164.0,636.0
60,2019,ADAMS,33.8,166450.0,3.00,272810.0,261893.0,10917.0,201784.0,252170.0,...,28459.0,22069.0,29342.0,17026.0,19373.0,6023.0,499315.0,54159.0,134212.0,19943.0
61,2019,EAGLE,37.0,18171.0,3.00,34775.0,34128.0,647.0,16179.0,36748.0,...,9377.0,7555.0,6022.0,907.0,318.0,934.0,54401.0,4354.0,11805.0,1184.0
62,2019,MOFFAT,36.6,5366.0,2.42,6445.0,6163.0,282.0,2044.0,10543.0,...,804.0,751.0,2168.0,567.0,472.0,761.0,13003.0,2206.0,3361.0,680.0


#### Create categorical and ordinal variables for housing price and housing age
- First, create a categorical variable whose values are the COLUMN NAME of the bin with the max value. For instance, if a given county has more houses in the `v50k_150k` range than any other range, the value at that row in the new column will be "v50k_150k"
- Next, create an ordinal column from that categorical column, ordered so that a lower number means less desirable. So for prices, "v_l_50k" -> 1, and for year built, "b1949_e" -> 1

In [8]:
# Bin columns listed from lowest to highest; the position in the list
# (1-based) becomes the ordinal code.
blt_ascending = [
    'b1949_e', 'b1950_1959', 'b1960_1969', 'b1970_1979',
    'b1980_1989', 'b1990_1999', 'b2000_2009',
]
prices_ascending = [
    'v_l_50k', 'v50k_150k', 'v150k_250k',
    'v250k_400k', 'v400k_750k', 'v750k_plus',
]

# Year built: name of the fullest bin per row, then its ordinal code
df = df.add_binmax('blt_freq_yr', blt_ascending)
df = df.add_ordinal('blt_freq_yr', blt_ascending)
# Home value: same two steps
df = df.add_binmax('hu_freq_val', prices_ascending)
df = df.add_ordinal('hu_freq_val', prices_ascending)

---

#### Rename everything, with a naming system that lets us easily select sub-groups of columns with a simple string match

In [9]:
# Why all the renaming? Every new name starts with the name of the group it
# belongs to (age_, gend_, race_, hu_, ...), so a whole group of columns can
# later be selected with a simple string match on the column name.
pop = pop_raw.rename(columns={
    'total': 'pop',

    'male': 'gend_m',
    'female': 'gend_f',

    'over18': 'age_over18',
    'under19': 'age_undr19',

    'over18_male': 'gend_m_age_over18',
    'over18_female': 'gend_f_age_over18',
    'under19_male': 'gend_m_age_undr19',
    'under19_female': 'gend_f_age_undr19',
})
df = df.rename(columns={
    # age
    'med_age': 'age_median',

    # income
    'per_cap_in': 'inc_per_cap',
    'med_hh_inc': 'inc_hh_median',

    # households
    'households': 'hh',
    'avghhsize': 'hh_size_avg',

    # high-school graduation
    'pop25plus': 'hsgrad_pool',
    'hsgrad_sc': 'hsgrad_graduated',

    # citizenship ('citz_birth' maps to itself, i.e. it keeps its name)
    'born_in_co': 'citz_co',
    'citz_birth': 'citz_birth',

    # civilian labor force
    'emp': 'civ_lf_employed',

    # race
    'hispanic': 'race_hispanic',
    'white_nh': 'race_white',
    'black_nh': 'race_black',
    'asian_nh': 'race_asian',

    # poverty status ('ps_below' maps to itself, i.e. it keeps its name)
    'ps_uni': 'ps_known',
    'ps_below': 'ps_below',
    'tot_l18': 'ps_undr18_known',
    'pov_l18': 'ps_undr18_below',

    # students
    'pop_3pl': 'stud_enroll_pool',
    'enrolled': 'stud_enrolled',
    'undergrad': 'stud_undergrad',
    'gr_1_4': 'stud_1_4',
    'gr_5_8': 'stud_5_8',
    'gr_9_12': 'stud_9_12',

    # housing units
    'housing_un': 'hu',
    'occ_hu': 'hu_occ',

    # housing units by year built
    'blt_freq_yr': 'hu_blt_freq_yr',
    'blt_freq_yr_ord': 'hu_blt_freq_yr_ord',
    'b1949_e': 'hu_blt_lt_1950',
    'b1950_1959': 'hu_blt_1950_1959',
    'b1960_1969': 'hu_blt_1960_1969',
    'b1970_1979': 'hu_blt_1970_1979',
    'b1980_1989': 'hu_blt_1980_1989',
    'b1990_1999': 'hu_blt_1990_1999',
    'b2000_2009': 'hu_blt_2000_plus',

    # owner-occupied housing units by value
    'own_occ_hu': 'hu_oo',
    'hu_freq_val': 'hu_oo_freq_val',
    'hu_freq_val_ord': 'hu_oo_freq_val_ord',
    'v_l_50k': 'hu_oo_lt_50',
    'v50k_150k': 'hu_oo_50_150',
    'v150k_250k': 'hu_oo_150_250',
    'v250k_400k': 'hu_oo_250_400',
    'v400k_750k': 'hu_oo_400_750',
    'v750k_plus': 'hu_oo_750_plus',
})

## Merge population and census data

In [10]:
# Preview both inputs of the merge (shape and first rows), then list the
# column dtypes of the census frame.
head([pop, df])
df.dtypes

11 cols x 3904 rows


Unnamed: 0,year,county,pop,gend_m,gend_f,age_over18,age_undr19,gend_m_age_undr19,gend_f_age_undr19,gend_m_age_over18,gend_f_age_over18
0,1990,ADAMS,265709.0,131902.0,133807.0,184665.0,81044.0,41519.0,39525.0,90383.0,94282.0
1,1990,ALAMOSA,13617.0,6677.0,6940.0,9311.0,4306.0,2189.0,2117.0,4488.0,4823.0
2,1990,ARAPAHOE,393289.0,191722.0,201567.0,281301.0,111988.0,57241.0,54747.0,134481.0,146820.0


52 cols x 512 rows


Unnamed: 0,year,county,age_median,hh,hh_size_avg,civ_lf,civ_lf_employed,unemp,race_hispanic,race_white,...,hu_blt_1970_1979,hu_blt_1960_1969,hu_blt_1950_1959,hu_blt_freq_yr_ord,hu_blt_freq_yr,hu_blt_lt_1950,ps_known,ps_below,ps_undr18_known,ps_undr18_below
0,2012,ARAPAHOE,35.7,223747.0,2.55,318041.0,292089.0,25952.0,105174.0,364766.0,...,62253.0,22258.0,16519.0,4,b1970_1979,7165.0,568999.0,66945.0,144576.0,23054.0
1,2012,MINERAL,60.3,363.0,1.83,391.0,370.0,21.0,15.0,671.0,...,203.0,100.0,75.0,1,b1949_e,240.0,702.0,47.0,26.0,0.0
2,2012,MONTROSE,42.6,16732.0,2.41,20124.0,18110.0,2014.0,8037.0,31799.0,...,3581.0,1298.0,920.0,7,b2000_2009,2333.0,40368.0,5565.0,9788.0,1927.0


year                    int64
county                 object
age_median            float64
hh                    float64
hh_size_avg           float64
civ_lf                float64
civ_lf_employed       float64
unemp                 float64
race_hispanic         float64
race_white            float64
race_black            float64
race_asian            float64
race_other            float64
hsgrad_pool           float64
hsgrad_graduated      float64
inc_hh_median         float64
inc_per_cap           float64
citz                  float64
citz_birth            float64
citz_co               float64
stud_enroll_pool      float64
stud_enrolled         float64
stud_undergrad        float64
stud_1_4              float64
stud_5_8              float64
stud_9_12             float64
med_hm_val            float64
med_yr_blt            float64
hu                    float64
hu_occ                float64
hu_oo                 float64
hu_oo_freq_val_ord      int64
hu_oo_freq_val         object
hu_oo_lt_5

In [11]:
# Population + census figures per (year, county), then the county geometry.
# NOTE(review): both merges use the default inner join. The previews above show
# 512 census rows going in, while the merged output below ends at row label
# 502, so a few year/county rows have no match and are dropped -- confirm this
# is intended.
main = pop.merge(df, on=['year', 'county'])
main = main.merge(geo, on='county')

# Column placement: 'geo' right before 'pop', 'age_median' right before
# 'age_over18'.
main = main.move_col('geo', 'pop')
main = main.move_col('age_median', 'age_over18')
main

Unnamed: 0,year,county,geo,pop,gend_m,gend_f,age_median,age_over18,age_undr19,gend_m_age_undr19,...,hu_blt_1970_1979,hu_blt_1960_1969,hu_blt_1950_1959,hu_blt_freq_yr_ord,hu_blt_freq_yr,hu_blt_lt_1950,ps_known,ps_below,ps_undr18_known,ps_undr18_below
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,460468.0,231571.0,228902.0,32.4,324757.0,135711.0,69462.0,...,30185.0,19615.0,20369.0,7,b2000_2009,6158.0,438171.0,62008.0,124375.0,25278.0
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,469878.0,236235.0,233638.0,32.6,332568.0,137310.0,70288.0,...,28972.0,20124.0,20131.0,7,b2000_2009,6473.0,447014.0,63540.0,126128.0,25247.0
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,479904.0,241181.0,238716.0,32.8,341149.0,138755.0,71006.0,...,29550.0,19704.0,19775.0,7,b2000_2009,6818.0,456829.0,64599.0,128235.0,25000.0
3,2015,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,490355.0,246361.0,243987.0,33.0,349808.0,140547.0,71903.0,...,29020.0,19119.0,19447.0,7,b2000_2009,6690.0,466690.0,64241.0,130178.0,24906.0
4,2016,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,497822.0,250012.0,247808.0,33.3,356927.0,140895.0,72109.0,...,29377.0,18354.0,19601.0,7,b2000_2009,6330.0,475099.0,61265.0,131457.0,23301.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499,2015,YUMA,MULTIPOLYGON (((-102.0515511138711 40.43985046...,10041.0,4977.0,5063.0,36.0,7304.0,2737.0,1341.0,...,795.0,602.0,409.0,1,b1949_e,1715.0,10030.0,1451.0,2705.0,607.0
500,2016,YUMA,MULTIPOLYGON (((-102.0515511138711 40.43985046...,10051.0,4984.0,5071.0,37.2,7303.0,2748.0,1342.0,...,845.0,666.0,436.0,1,b1949_e,1626.0,10003.0,1459.0,2718.0,604.0
501,2017,YUMA,MULTIPOLYGON (((-102.0515511138711 40.43985046...,9970.0,4936.0,5034.0,35.8,7230.0,2740.0,1338.0,...,845.0,674.0,586.0,1,b1949_e,1568.0,9967.0,1425.0,2696.0,586.0
502,2018,YUMA,MULTIPOLYGON (((-102.0515511138711 40.43985046...,9956.0,4936.0,5024.0,38.4,7211.0,2745.0,1342.0,...,792.0,478.0,552.0,1,b1949_e,1636.0,9925.0,1438.0,2677.0,557.0


## Calculations for groups
---

In [12]:
from grouped_df import GroupedDF
# Configure GroupedDF (project-local, see grouped_df): INDEX becomes its
# default index and the list below names the column groups.
# NOTE(review): judging by the per-group output below, a group name is matched
# against the start of the column names -- confirm in grouped_df.
GroupedDF.default_index = INDEX
GroupedDF.set_groups(['age', 'gend', 'race', 'inc', 'hh', 'citz', 'hsgrad', 'civ_lf', 'ps', 'stud', 'hu', 'hu_blt', 'hu_oo'])

In [13]:
# Grouped view of the merged table, showing 5 rows per group. The 'hu' group
# gets an explicit column list -- presumably because the bare 'hu' prefix would
# otherwise also match the hu_blt_* and hu_oo_* columns; confirm in grouped_df.
gd = GroupedDF(main, INDEX, custom={'hu': INDEX + ['hu', 'hu_occ']})
# gd.df
gd.display(5)

age: 


Unnamed: 0,year,county,geo,age_median,age_over18,age_undr19
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,32.4,324757.0,135711.0
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,32.6,332568.0,137310.0
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,32.8,341149.0,138755.0
3,2015,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,33.0,349808.0,140547.0
4,2016,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,33.3,356927.0,140895.0



gend: 


Unnamed: 0,year,county,geo,gend_m,gend_f,gend_m_age_undr19,gend_f_age_undr19,gend_m_age_over18,gend_f_age_over18
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,231571.0,228902.0,69462.0,66249.0,162109.0,162653.0
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,236235.0,233638.0,70288.0,67022.0,165947.0,166616.0
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,241181.0,238716.0,71006.0,67749.0,170175.0,170967.0
3,2015,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,246361.0,243987.0,71903.0,68640.0,174458.0,175347.0
4,2016,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,250012.0,247808.0,72109.0,68782.0,177903.0,179026.0



race: 


Unnamed: 0,year,county,geo,race_hispanic,race_white,race_black,race_asian,race_other
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,167556.0,235991.0,12970.0,15304.0,11175.0
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,172472.0,239023.0,13247.0,16152.0,11136.0
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,177234.0,242237.0,13753.0,16518.0,11816.0
3,2015,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,182114.0,245330.0,14234.0,17233.0,12295.0
4,2016,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,186852.0,247948.0,14393.0,17632.0,13152.0



inc: 


Unnamed: 0,year,county,geo,inc_hh_median,inc_per_cap
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,56633.0,24357.0
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,56270.0,24195.0
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,57421.0,24667.0
3,2015,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,58946.0,25039.0
4,2016,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,61444.0,26051.0



hh: 


Unnamed: 0,year,county,geo,hh,hh_size_avg
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,151034.0,2.91
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,152803.0,2.93
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,155047.0,2.95
3,2015,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,156628.0,2.98
4,2016,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,158748.0,3.0



citz: 


Unnamed: 0,year,county,geo,citz,citz_birth,citz_co
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,396172.0,376454.0,223907.0
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,404808.0,384839.0,230234.0
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,412749.0,391951.0,236818.0
3,2015,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,421902.0,400256.0,243654.0
4,2016,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,429036.0,406693.0,249825.0



hsgrad: 


Unnamed: 0,year,county,geo,hsgrad_pool,hsgrad_graduated
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,275628.0,166731.0
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,282256.0,170636.0
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,289449.0,174042.0
3,2015,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,296842.0,178231.0
4,2016,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,303509.0,181479.0



civ_lf: 


Unnamed: 0,year,county,geo,civ_lf,civ_lf_employed
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,236110.0,213794.0
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,238934.0,215131.0
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,243577.0,220941.0
3,2015,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,249542.0,229743.0
4,2016,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,254215.0,238687.0



ps: 


Unnamed: 0,year,county,geo,ps_known,ps_below,ps_undr18_known,ps_undr18_below
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,438171.0,62008.0,124375.0,25278.0
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,447014.0,63540.0,126128.0,25247.0
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,456829.0,64599.0,128235.0,25000.0
3,2015,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,466690.0,64241.0,130178.0,24906.0
4,2016,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,475099.0,61265.0,131457.0,23301.0



stud: 


Unnamed: 0,year,county,geo,stud_enroll_pool,stud_enrolled,stud_undergrad,stud_1_4,stud_5_8,stud_9_12
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,420756.0,117499.0,19299.0,28761.0,26645.0,24342.0
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,429458.0,121425.0,20427.0,28454.0,28230.0,25717.0
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,438753.0,124736.0,21352.0,28217.0,29562.0,26255.0
3,2015,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,448914.0,127655.0,21803.0,29421.0,29236.0,27120.0
4,2016,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,457942.0,129057.0,21302.0,29081.0,30456.0,28273.0



hu: 


Unnamed: 0,year,county,geo,hu,hu_occ
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,163245.0,151034.0
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,163512.0,152803.0
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,164384.0,155047.0
3,2015,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,165046.0,156628.0
4,2016,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,166058.0,158748.0



hu_blt: 


Unnamed: 0,year,county,geo,hu_blt_2000_plus,hu_blt_1990_1999,hu_blt_1980_1989,hu_blt_1970_1979,hu_blt_1960_1969,hu_blt_1950_1959,hu_blt_freq_yr_ord,hu_blt_freq_yr,hu_blt_lt_1950
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,38682.0,27598.0,20368.0,30185.0,19615.0,20369.0,7,b2000_2009,6158.0
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,38904.0,27310.0,20929.0,28972.0,20124.0,20131.0,7,b2000_2009,6473.0
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,39149.0,26953.0,20664.0,29550.0,19704.0,19775.0,7,b2000_2009,6818.0
3,2015,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,39300.0,27228.0,21165.0,29020.0,19119.0,19447.0,7,b2000_2009,6690.0
4,2016,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,39120.0,27797.0,20746.0,29377.0,18354.0,19601.0,7,b2000_2009,6330.0



hu_oo: 


Unnamed: 0,year,county,geo,hu_oo,hu_oo_freq_val_ord,hu_oo_freq_val,hu_oo_lt_50,hu_oo_50_150,hu_oo_150_250,hu_oo_250_400,hu_oo_400_750,hu_oo_750_plus
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,100108.0,3,v150k_250k,8578.0,19838.0,47583.0,17779.0,5427.0,903.0
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,100221.0,3,v150k_250k,8896.0,21037.0,46182.0,17989.0,5284.0,833.0
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,100071.0,3,v150k_250k,8640.0,19548.0,46688.0,18869.0,5375.0,951.0
3,2015,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,101043.0,3,v150k_250k,8599.0,17412.0,44850.0,22615.0,6520.0,1047.0
4,2016,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,102279.0,3,v150k_250k,8198.0,14500.0,41247.0,28561.0,8506.0,1267.0





## Calculations
---

- **age, and gend**
  - `age_median`: (Existing)
  - `age_undr19_prop`: What percent of the population is under 19?
  - `gend_m_prop`: What percent of the population is male?
  - `age_undr19_gend_m_prop`: What percent of under-19-year-olds are male? (divide `gend_m_age_undr19` by `age_undr19`)
- **inc**
  - `inc_hh_median`: (Existing) Median household income
  - `inc_per_cap`: (Existing) Per capita income
- **hh**
  - `hh_size_avg`: (Existing) Average household size
- **race**
  - `race_{x}_prop`: What percent of the population is race x?
  - `race_prop_stdev`: What is the standard deviation of the race proportions? We need to calculate the proportions first, to normalize for the population size, that way, we can compare the standard deviations across groups
- **hsgrad**
  - `hsgrad_graduated_prop`: What percent of adults (age 25+) have a high school diploma or equivalent?
- **civ_lf**
  - `civ_lf_prop`: What percent of the population is in the civilian labor force?
  - `civ_lf_employed_prop`: What percent of the civilian labor force is employed?
- **ps**
  - `ps_total_prop`: What percent of people whose poverty status is known are below the poverty line?
  - `ps_undr18_total_prop`: What percent of under-18 people whose poverty status is known are below the poverty line?
  - `ps_undr18_prop`: What percent of people below the poverty line are under 18?
- **stud**
  - `stud_enrolled_prop`: Percent of people who could be enrolled in school that actually are enrolled
  - `stud_hs_prop`: What percent of gradeschool students (1-12) are high schoolers? (lower number indicates dropouts, which may associate with crime)
  - `stud_undergrad_prop`: What percent of enrolled students are undergraduates?
- **citz**
  - `citz_per_cap`: What percent of the population is a US citizen? (citizens divided by population)
  - `citz_birth_prop`: What percent of US citizens were born in the US?
  - `citz_co_prop`: What percent of citizens were born in Colorado?
- **hu**
  - `hu_occ_prop`: Percent of homes which are occupied
  - `hu_blt_2000_plus_prop`: Percent of homes in the most recent year-built bin (`b2000_2009` in the census data)
  - `hu_blt_freq_yr_ord`: Ordinal version of `hu_blt_freq_yr` (the most common year-built bin), where the highest number corresponds to the most recent year range
- **hu_oo**
  - `hu_oo_prop`: Percent of occupied properties occupied by owner. The remaining percent is renter occupied
  - `hu_oo_lt_50_prop`: Percent of owner occupied properties worth less than $50,000
  - `hu_oo_750_plus_prop`: Percent of owner occupied properties worth $750,000 or more


In [14]:
df = main.copy()

# --- age and gender -------------------------------------------------------
# shares of the total population
df['age_over18_prop'] = df['age_over18'] / df['pop']
df['age_undr19_prop'] = df['age_undr19'] / df['pop']
df['gend_m_prop'] = df['gend_m'] / df['pop']
df['gend_f_prop'] = df['gend_f'] / df['pop']
# gender split within each age group
df['age_undr19_gend_m_prop'] = df['gend_m_age_undr19'] / df['age_undr19']
df['age_undr19_gend_f_prop'] = df['gend_f_age_undr19'] / df['age_undr19']
df['age_over18_gend_m_prop'] = df['gend_m_age_over18'] / df['age_over18']
df['age_over18_gend_f_prop'] = df['gend_f_age_over18'] / df['age_over18']

# age split within each gender
df['gend_m_age_undr19_prop'] = df['gend_m_age_undr19'] / df['gend_m']
df['gend_m_age_over18_prop'] = df['gend_m_age_over18'] / df['gend_m']
df['gend_f_age_undr19_prop'] = df['gend_f_age_undr19'] / df['gend_f']
df['gend_f_age_over18_prop'] = df['gend_f_age_over18'] / df['gend_f']

# --- race -------------------------------------------------------------------
# Each race count as a share of the population, plus the spread of those
# shares per row. The result is built in a side table keyed by INDEX and then
# merged back in.
race_base = GroupedDF(df, INDEX).race
race = df.copy()[INDEX]
for c in race_base.columns:
    if c in INDEX:
        continue
    race[f'{c}_prop'] = race_base[c] / df['pop']

race['race_prop_stdev'] = np.std(race.drop(columns=INDEX), axis=1)
df = df.merge(race, how='inner', on=INDEX)

# --- education, labor force, poverty ----------------------------------------
df['hsgrad_graduated_prop'] = df['hsgrad_graduated'] / df['hsgrad_pool']

df['civ_lf_prop'] = df['civ_lf'] / df['pop']
df['civ_lf_employed_prop'] = df['civ_lf_employed'] / df['civ_lf']

df['ps_total_prop'] = df['ps_below'] / df['ps_known']
df['ps_undr18_total_prop'] = df['ps_undr18_below'] / df['ps_undr18_known']
df['ps_undr18_prop'] = df['ps_undr18_below'] / df['ps_below']

# --- students ---------------------------------------------------------------
df['stud_enrolled_prop'] = df['stud_enrolled'] / df['stud_enroll_pool']
df['stud_hs_prop'] = df['stud_9_12'] / (df['stud_1_4'] + df['stud_5_8'] + df['stud_9_12'])
df['stud_undergrad_prop'] = df['stud_undergrad'] / df['stud_enrolled']

# --- citizenship ------------------------------------------------------------
df['citz_per_cap'] = df['citz'] / df['pop']
df['citz_birth_prop'] = df['citz_birth'] / df['citz']
df['citz_co_prop'] = df['citz_co'] / df['citz']

# --- housing units ----------------------------------------------------------
df['hu_per_cap'] = df['hu'] / df['pop']
df['hu_occ_prop'] = df['hu_occ'] / df['hu']
# Also recomputed (same formula) by the year-built loop below; assigning it
# here fixes the column's position in the frame.
df['hu_blt_2000_plus_prop'] = df['hu_blt_2000_plus'] / df['hu']

df['hu_oo_prop'] = df['hu_oo'] / df['hu_occ']

# value bins as a share of owner-occupied units
for hval in ['hu_oo_lt_50', 'hu_oo_50_150', 'hu_oo_150_250', 'hu_oo_250_400', 'hu_oo_400_750', 'hu_oo_750_plus']:
    df[f'{hval}_prop'] = df[hval] / df['hu_oo']

# year-built bins as a share of all housing units
for hyear in [
        'hu_blt_lt_1950', 'hu_blt_1950_1959', 'hu_blt_1960_1969',
        'hu_blt_1970_1979', 'hu_blt_1980_1989', 'hu_blt_1990_1999', 'hu_blt_2000_plus'
    ]:
    df[f'{hyear}_prop'] = df[hyear] / df['hu']

# --- split and save ---------------------------------------------------------
# separate_by returns (matching, non-matching): 'prop' receives the columns
# whose name contains one of the substrings, 'counts' the remaining ones.
prop, counts = separate_by(df, ['prop', 'per_cap', 'median', 'avg', 'freq', 'med_hm_val', 'med_yr_blt'])

prop.to_csv('../working-data/county_stats_normalized.csv', index=False)
counts.to_csv('../working-data/county_stats_counts.csv', index=False)
df.to_csv('../working-data/county_stats.csv', index=False)

gprop = GroupedDF(prop, INDEX, custom={'hu': INDEX + ['hu_per_cap', 'hu_occ_prop']})
gprop.display()

age: 


Unnamed: 0,year,county,geo,age_over18_prop,age_undr19_prop,age_undr19_gend_m_prop,age_undr19_gend_f_prop,age_over18_gend_m_prop,age_over18_gend_f_prop,age_median
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.705276,0.294724,0.511838,0.488162,0.49917,0.500845,32.4
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.707775,0.292225,0.511893,0.488107,0.498987,0.500998,32.6
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.710869,0.289131,0.511737,0.488263,0.498829,0.501151,32.8



gend: 


Unnamed: 0,year,county,geo,gend_m_prop,gend_f_prop,gend_m_age_undr19_prop,gend_m_age_over18_prop,gend_f_age_undr19_prop,gend_f_age_over18_prop
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.502904,0.497107,0.29996,0.70004,0.289421,0.710579
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.502758,0.497231,0.297534,0.702466,0.286863,0.713137
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.502561,0.497424,0.29441,0.70559,0.283806,0.716194



race: 


Unnamed: 0,year,county,geo,race_hispanic_prop,race_white_prop,race_black_prop,race_asian_prop,race_other_prop,race_prop_stdev
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.363882,0.512502,0.028167,0.033236,0.024269,0.206129
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.367057,0.508692,0.028192,0.034375,0.0237,0.205398
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.369311,0.504761,0.028658,0.034419,0.024622,0.204346



inc: 


Unnamed: 0,year,county,geo,inc_per_cap,inc_hh_median
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,24357.0,56633.0
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,24195.0,56270.0
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,24667.0,57421.0



hh: 


Unnamed: 0,year,county,geo,hh_size_avg
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,2.91
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,2.93
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,2.95



citz: 


Unnamed: 0,year,county,geo,citz_birth_prop,citz_co_prop,citz_per_cap
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.950229,0.565176,0.860368
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.95067,0.568749,0.861517
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.949611,0.573758,0.860066



hsgrad: 


Unnamed: 0,year,county,geo,hsgrad_graduated_prop
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.604913
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.604543
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.601287



civ_lf: 


Unnamed: 0,year,county,geo,civ_lf_prop,civ_lf_employed_prop
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.512761,0.905485
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.508502,0.900378
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.507554,0.907068



ps: 


Unnamed: 0,year,county,geo,ps_total_prop,ps_undr18_total_prop,ps_undr18_prop
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.141516,0.20324,0.407657
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.142143,0.20017,0.39734
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.141407,0.194955,0.387003



stud: 


Unnamed: 0,year,county,geo,stud_enrolled_prop,stud_hs_prop,stud_undergrad_prop
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.279257,0.305236,0.164248
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.28274,0.312096,0.168227
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.284297,0.312433,0.171178



hu: 


Unnamed: 0,year,county,geo,hu_per_cap,hu_occ_prop
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.35452,0.925198
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.347988,0.934506
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.342535,0.9432



hu_blt: 


Unnamed: 0,year,county,geo,hu_blt_2000_plus_prop,hu_blt_lt_1950_prop,hu_blt_1950_1959_prop,hu_blt_1960_1969_prop,hu_blt_1970_1979_prop,hu_blt_1980_1989_prop,hu_blt_1990_1999_prop,hu_blt_freq_yr_ord,hu_blt_freq_yr
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.236957,0.037722,0.124776,0.120157,0.184906,0.12477,0.169059,7,b2000_2009
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.237927,0.039587,0.123116,0.123074,0.177186,0.127997,0.167021,7,b2000_2009
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.238156,0.041476,0.120298,0.119866,0.179762,0.125706,0.163964,7,b2000_2009



hu_oo: 


Unnamed: 0,year,county,geo,hu_oo_prop,hu_oo_lt_50_prop,hu_oo_50_150_prop,hu_oo_150_250_prop,hu_oo_250_400_prop,hu_oo_400_750_prop,hu_oo_750_plus_prop,hu_oo_freq_val_ord,hu_oo_freq_val
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.662818,0.085687,0.198166,0.475317,0.177598,0.054211,0.00902,3,v150k_250k
1,2013,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.655884,0.088764,0.209906,0.460802,0.179493,0.052723,0.008312,3,v150k_250k
2,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.645424,0.086339,0.195341,0.466549,0.188556,0.053712,0.009503,3,v150k_250k



