## Population

> This data will supplement our census data in the next step of data prep. We're using the Population dataset because it's more accurate (census figures are just estimates), and it lets us create the age grouping ourselves. There are nearly 400,000 rows, because the dataset gives us population by year, county, and EACH individual age. In our case, we want an age grouping that separates school-age students from adults, so we chose < 19 and >= 19. The data spans 1990 through 2050, a 61-year timeframe, so number of rows ≈ 61 years * 64 counties * roughly 98 individual years of age.

In [11]:
import pandas as pd, numpy as np
# Load the raw county population extract and display it.
population_csv_path = '../input-data/county_population__eeah-cmy8.csv'
df_raw = pd.read_csv(population_csv_path)
df_raw

Unnamed: 0,county,fipscode,year,age,malepopulation,femalepopulation,totalpopulation
0,Adams,1.0,1990.0,0.0,2354.0,2404.0,4758.0
1,Adams,1.0,1990.0,1.0,2345.0,2375.0,4720.0
2,Adams,1.0,1990.0,2.0,2413.0,2219.0,4632.0
3,Adams,1.0,1990.0,3.0,2321.0,2261.0,4582.0
4,Adams,1.0,1990.0,4.0,2433.0,2302.0,4735.0
...,...,...,...,...,...,...,...
381499,Yuma,125.0,2030.0,64.0,53.0,51.0,104.0
381500,Yuma,125.0,2023.0,30.0,59.0,62.0,121.0
381501,Yuma,125.0,2049.0,0.0,73.0,69.0,142.0
381502,Yuma,125.0,2020.0,26.0,50.0,48.0,98.0


## Create age groups (< 19, >= 19)

In [12]:
# Work on a copy so df_raw stays untouched and this cell can be re-run safely.
df = df_raw.copy()
df['county'] = df['county'].str.upper()

# Label each row 'under19' when age <= 18 and 'over18' otherwise.
# NOTE(review): a missing age compares False against 18, so such a row would
# be labelled 'over18' — confirm the source has no missing ages.
age_labels = np.where(df['age'] <= 18, 'under19', 'over18')

# Place the new column at position 2, directly after county and fipscode.
df.insert(2, 'age_range', age_labels)

df_with_groups = df
df_with_groups

Unnamed: 0,county,fipscode,age_range,year,age,malepopulation,femalepopulation,totalpopulation
0,ADAMS,1.0,under19,1990.0,0.0,2354.0,2404.0,4758.0
1,ADAMS,1.0,under19,1990.0,1.0,2345.0,2375.0,4720.0
2,ADAMS,1.0,under19,1990.0,2.0,2413.0,2219.0,4632.0
3,ADAMS,1.0,under19,1990.0,3.0,2321.0,2261.0,4582.0
4,ADAMS,1.0,under19,1990.0,4.0,2433.0,2302.0,4735.0
...,...,...,...,...,...,...,...,...
381499,YUMA,125.0,over18,2030.0,64.0,53.0,51.0,104.0
381500,YUMA,125.0,over18,2023.0,30.0,59.0,62.0,121.0
381501,YUMA,125.0,under19,2049.0,0.0,73.0,69.0,142.0
381502,YUMA,125.0,over18,2020.0,26.0,50.0,48.0,98.0


In [13]:
# Collapse the per-age rows into one row per (year, county, age_range),
# summing the male / female / total population counts.
group_keys = ['year', 'county', 'age_range']
short_names = {
    'malepopulation': 'male',
    'femalepopulation': 'female',
    'totalpopulation': 'total',
}

# 'age' and 'fipscode' are dropped first so they are not summed.
trimmed = df_with_groups.drop(columns=['age', 'fipscode']).rename(columns=short_names)

# as_index=False keeps the group keys as regular columns.
df_grouped = trimmed.groupby(group_keys, as_index=False).sum()
df = df_grouped
df_grouped

Unnamed: 0,year,county,age_range,male,female,total
0,1990.0,ADAMS,over18,90383.0,94282.0,184665.0
1,1990.0,ADAMS,under19,41519.0,39525.0,81044.0
2,1990.0,ALAMOSA,over18,4488.0,4823.0,9311.0
3,1990.0,ALAMOSA,under19,2189.0,2117.0,4306.0
4,1990.0,ARAPAHOE,over18,134481.0,146820.0,281301.0
...,...,...,...,...,...,...
7803,2050.0,WASHINGTON,under19,353.0,334.0,686.0
7804,2050.0,WELD,over18,237224.0,236964.0,474189.0
7805,2050.0,WELD,under19,74010.0,71425.0,145435.0
7806,2050.0,YUMA,over18,3462.0,3572.0,7031.0


### Notice the `age_range` column. We should pivot those values out to their own columns, and mix with our existing columns
- First, pivot age_range into the male, female, and total columns
- We're left with a multilevel column index, so we drop a level and rename everything by hand.
- Lastly, restore the total, male, and female columns since they got split in half when pivoting.

In [14]:
# Pivot age_range out of the rows so each (year, county) pair becomes a single
# row holding the male / female / total population of both age groups.
df = (
    df_grouped
    .pivot(
        index=['year', 'county'],
        columns='age_range',
        values=['male', 'female', 'total'],
    )
    .reset_index()
)


def _flatten_pivot_column(measure, age_range):
    """Turn one (measure, age_range) column label into a flat column name.

    - Index columns restored by reset_index() carry an empty second level,
      e.g. ('year', '') -> 'year'.
    - Total columns are named after the age group alone,
      e.g. ('total', 'over18') -> 'over18'.
    - Everything else becomes '<age_range>_<measure>',
      e.g. ('male', 'under19') -> 'under19_male'.
    """
    if age_range == '':
        return measure
    if measure == 'total':
        return age_range
    return f'{age_range}_{measure}'


# Build every flat name from the column's own labels instead of assigning a
# hard-coded list by position: a positional rename would silently mislabel the
# data if pivot ever returned the columns in a different order.
df.columns = [
    _flatten_pivot_column(measure, age_range)
    for measure, age_range in df.columns
]

# Select by name to pin the column order; this raises a KeyError if an
# expected age group is missing from the data.
df = df[[
    'year', 'county',
    'over18_male', 'under19_male',
    'over18_female', 'under19_female',
    'over18', 'under19',
]]

df_pivoted = df
df_pivoted

Unnamed: 0,year,county,over18_male,under19_male,over18_female,under19_female,over18,under19
0,1990.0,ADAMS,90383.0,41519.0,94282.0,39525.0,184665.0,81044.0
1,1990.0,ALAMOSA,4488.0,2189.0,4823.0,2117.0,9311.0,4306.0
2,1990.0,ARAPAHOE,134481.0,57241.0,146820.0,54747.0,281301.0,111988.0
3,1990.0,ARCHULETA,1835.0,890.0,1874.0,753.0,3709.0,1643.0
4,1990.0,BACA,1647.0,597.0,1713.0,599.0,3360.0,1196.0
...,...,...,...,...,...,...,...,...
3899,2050.0,SUMMIT,16671.0,2541.0,15620.0,2428.0,32286.0,4969.0
3900,2050.0,TELLER,12072.0,2433.0,12349.0,2349.0,24411.0,4781.0
3901,2050.0,WASHINGTON,1848.0,353.0,1742.0,334.0,3591.0,686.0
3902,2050.0,WELD,237224.0,74010.0,236964.0,71425.0,474189.0,145435.0


In [15]:
# Rebuild the all-ages total / male / female columns by adding the two
# age-group halves back together.
df = df_pivoted.assign(
    total=lambda frame: frame['under19'] + frame['over18'],
    male=lambda frame: frame['under19_male'] + frame['over18_male'],
    female=lambda frame: frame['under19_female'] + frame['over18_female'],
)

# Overall figures first, then the age-group breakdowns.
column_order = [
    'year', 'county',
    'total', 'male', 'female',
    'over18', 'under19',
    'under19_male', 'under19_female',
    'over18_male', 'over18_female',
]
df = df[column_order]
df_with_calculated_cols = df
df_with_calculated_cols

Unnamed: 0,year,county,total,male,female,over18,under19,under19_male,under19_female,over18_male,over18_female
0,1990.0,ADAMS,265709.0,131902.0,133807.0,184665.0,81044.0,41519.0,39525.0,90383.0,94282.0
1,1990.0,ALAMOSA,13617.0,6677.0,6940.0,9311.0,4306.0,2189.0,2117.0,4488.0,4823.0
2,1990.0,ARAPAHOE,393289.0,191722.0,201567.0,281301.0,111988.0,57241.0,54747.0,134481.0,146820.0
3,1990.0,ARCHULETA,5352.0,2725.0,2627.0,3709.0,1643.0,890.0,753.0,1835.0,1874.0
4,1990.0,BACA,4556.0,2244.0,2312.0,3360.0,1196.0,597.0,599.0,1647.0,1713.0
...,...,...,...,...,...,...,...,...,...,...,...
3899,2050.0,SUMMIT,37255.0,19212.0,18048.0,32286.0,4969.0,2541.0,2428.0,16671.0,15620.0
3900,2050.0,TELLER,29192.0,14505.0,14698.0,24411.0,4781.0,2433.0,2349.0,12072.0,12349.0
3901,2050.0,WASHINGTON,4277.0,2201.0,2076.0,3591.0,686.0,353.0,334.0,1848.0,1742.0
3902,2050.0,WELD,619624.0,311234.0,308389.0,474189.0,145435.0,74010.0,71425.0,237224.0,236964.0


In [16]:
# Write the prepared county population table to the working-data folder;
# index=False leaves the row index out of the file.
output_csv_path = '../working-data/county_population.csv'
df_with_calculated_cols.to_csv(output_csv_path, index=False)