# In This Notebook

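The population analytics data set would benefit from some additional features. This notebook joins income, social, and education data onto the population data by zip code and year, and saves the combined table as `location-features.csv`.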

**N.B. BEFORE RUNNING THIS NOTEBOOK, THE LOCAL DATA DIRECTORY (`./data`, THE PATH THE CODE BELOW READS FROM) NEEDS TO BE RECREATED WITH THE NECESSARY DATA FILES COPIED FROM THE MASTER 'data' FOLDER, WHICH IS ON THE SAME LEVEL AS THE 'Notebooks' DIRECTORY.**


# Setup

In [5]:
import os
import sys

import numpy as np
import pandas as pd

In [27]:
# Raise pandas' display limits so long and wide frames print without truncation.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

# Load Data

In [28]:
datapath = './data'


def _read_named(filename, label, **read_kwargs):
    """Read one CSV from ``datapath`` and label the resulting frame.

    Returns a ``(path, frame)`` pair. ``label`` is stored as a plain ``name``
    attribute on the frame object; extra keyword arguments are passed
    straight through to ``pd.read_csv``.
    """
    path = os.path.join(datapath, filename)
    frame = pd.read_csv(path, **read_kwargs)
    frame.name = label
    return path, frame


# population data (pop_eda.csv) -- its first column becomes the index
pfile, pop = _read_named('pop_eda.csv', 'population-data', index_col=0)

# income data (income-data.csv)
ifile, inc = _read_named('income-data.csv', 'income-data')

# social data (social-data.csv)
sfile, soc = _read_named('social-data.csv', 'social-data')

# education data (education-data.csv)
efile, edu = _read_named('education-data.csv', 'education-data')

In [29]:
# Preview the first five rows of the population frame.
pop.head(n=5)

Unnamed: 0,year,total_m,total_f,under_18_m,x18_to_29_m,x30_to_49_m,x50_to_64_m,x65_plus_m,under_18_f,x18_to_29_f,x30_to_49_f,x50_to_64_f,x65_plus_f,post_code,loc_id,lat,long,city,state,zip_code,num_employees,num_breweries
1,2011,22989,24480,10460,7190,10544,4975,2297,9970,7044,10947,5934,2973,147.0,2737.0,18.180103,-66.74947,Adjuntas,PR,601,,
2,2012,23166,24426,10365,7208,10512,5070,2402,9867,7008,10901,5972,3058,147.0,2737.0,18.180103,-66.74947,Adjuntas,PR,601,,
3,2013,23001,24389,10279,7144,10557,5085,2157,9817,6971,10840,5994,3181,147.0,2737.0,18.180103,-66.74947,Adjuntas,PR,601,,
4,2014,22849,24194,10165,6924,10492,5118,2443,9713,6940,10772,5979,3170,147.0,2737.0,18.180103,-66.74947,Adjuntas,PR,601,,
5,2015,22815,24130,10071,6938,10421,5166,1685,9657,6837,10771,6005,3202,147.0,2737.0,18.180103,-66.74947,Adjuntas,PR,601,,


In [30]:
# For each loaded frame, print its label, a 70-character rule, and a numbered
# list of its column names.
frames = [pop, inc, soc, edu]
rule = '-' * 70
for frame in frames:
    print(frame.name)
    print(rule)
    # position next to name, one column per line
    for position, column in enumerate(frame.columns):
        print('{0}: {1}'.format(position, column))
    print('\n')

population-data
----------------------------------------------------------------------
0: year
1: total_m
2: total_f
3: under_18_m
4: x18_to_29_m
5: x30_to_49_m
6: x50_to_64_m
7: x65_plus_m
8: under_18_f
9: x18_to_29_f
10: x30_to_49_f
11: x50_to_64_f
12: x65_plus_f
13: post_code
14: loc_id
15: lat
16: long
17: city
18: state
19: zip_code
20: num_employees
21: num_breweries


income-data
----------------------------------------------------------------------
0: zip
1: num_households
2: median_household_income
3: pct_white_households
4: median_white_household_income
5: year


social-data
----------------------------------------------------------------------
0: zip
1: num_households
2: num_family
3: num_married_family
4: num_nonfamily
5: num_living_alone
6: avg_household_size
7: avg_family_size
8: never_married
9: married_but_separated
10: separated
11: widowed
12: divorced
13: foreign_born
14: us_citizen
15: english_speakers
16: other_language_speakers
17: american_ancestry
18: year


edu

# Transform Data

In [31]:
# Normalise the zip-code key on every frame, derive the extra feature columns,
# and trim each supplementary frame down to the columns that will be merged.
#
# NOTE: this cell rebinds `inc`, `soc` and `edu` to their trimmed versions, in
# which the raw `zip` column has been renamed to `zip_code`. Re-run the load
# cell before running this cell a second time.


def _zero_pad_zip(codes):
    """Return the zip codes as strings left-padded with zeros to 5 characters."""
    return codes.astype(str).str.zfill(5)


# population: only the key needs normalising; all other columns are kept
pop['zip_code'] = _zero_pad_zip(pop['zip_code'])

# income: add a white-household count, then keep the merge keys and features.
# NOTE(review): the product below assumes pct_white_households is a fraction
# (0-1) rather than a 0-100 percentage -- confirm against the source data.
inc = (
    inc
    .assign(
        zip=_zero_pad_zip(inc['zip']),
        # truncate toward zero but stay float so missing values remain NaN
        num_white_households=np.trunc(
            inc['num_households'] * inc['pct_white_households']
        ).astype(float),
    )
    # select by name rather than position so a reordered CSV cannot silently
    # pick the wrong columns
    .loc[:, ['zip', 'num_households', 'median_household_income',
             'year', 'num_white_households']]
    .rename(columns={'zip': 'zip_code'})
)

# social: keep a subset of the household/citizenship/language columns
soc = (
    soc
    .assign(zip=_zero_pad_zip(soc['zip']))
    .loc[:, ['zip', 'num_family', 'avg_household_size', 'us_citizen',
             'english_speakers', 'american_ancestry', 'year']]
    .rename(columns={
        'zip': 'zip_code',
        'num_family': 'num_family_households',
        'us_citizen': 'num_us_citizens',
        'english_speakers': 'num_english_speakers',
        'american_ancestry': 'num_with_american_ancestry',
    })
)

# education: collapse the attainment columns into three summary counts
edu = (
    edu
    .assign(
        zip=_zero_pad_zip(edu['zip']),
        num_hs_grads=edu['hs_diploma'] + edu['ged'] + edu['assoc_degree'],
        num_bach_degree=edu['bachelors'],
        num_post_grad=edu['masters'] + edu['professional'] + edu['doctorate'],
    )
    .loc[:, ['zip', 'year', 'num_hs_grads', 'num_bach_degree', 'num_post_grad']]
    .rename(columns={'zip': 'zip_code'})
)

In [35]:
# Spot-check: show the rows of the social frame whose zip code is 00727.
soc.loc[soc['zip_code'] == '00727']

Unnamed: 0,zip_code,num_family_households,avg_household_size,num_us_citizens,num_english_speakers,num_with_american_ancestry,year


In [32]:
# Left-join the income, social and education frames onto the population frame,
# one after another, matching on zip code and year.
#
# An explicit loop is used instead of `reduce`: `reduce` is not a builtin on
# Python 3 and is not imported by this notebook's import cell, so the previous
# one-liner raised NameError on a fresh kernel.
frames = [pop, inc, soc, edu]
merge_keys = ['zip_code', 'year']

df = frames[0]
for extra in frames[1:]:
    # how='left' keeps every row of the accumulated frame; keys with no match
    # in `extra` get NaN in the columns it contributes
    df = pd.merge(df, extra, how='left', on=merge_keys)

In [36]:
# Save the merged table into the data directory, omitting the index column.
out_file = os.path.join(datapath, 'location-features.csv')
df.to_csv(out_file, index=False)