In [1]:
import os
import pickle

import numpy as np
import pandas as pd

We are looking at three datasets from the 2011-2015 American Community Survey 5-Year Estimates:  

[MEDIAN INCOME IN THE PAST 12 MONTHS (IN 2015 INFLATION-ADJUSTED DOLLARS)](https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=ACS_15_5YR_S1903&prodType=table)   

[FIELD OF BACHELOR'S DEGREE FOR FIRST MAJOR](https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=ACS_15_5YR_S1502&prodType=table)  

[AGE AND SEX](https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=ACS_15_5YR_S0101&prodType=table)

In [14]:
# Root folder of the local project checkout. It defaults to the original
# author's location; set the BENSON_PROJECT_DIR environment variable to run the
# notebook from a different machine or folder without editing this cell.
# os.path.join(..., '') guarantees a trailing separator, which the
# `directory_filepath + <relative path>` concatenations in later cells rely on.
directory_filepath = os.path.join(
    os.environ.get('BENSON_PROJECT_DIR',
                   '/Users/Joe/Documents/Metis/Projects/metis-one-Benson/'),
    '')

# Locations of the three ACS csv exports, relative to directory_filepath.
age_sex_filepath = 'census-data-by-zip-code/ACS_15_5YR_S0101/ACS_15_5YR_S0101_with_ann.csv'
major_filepath = 'census-data-by-zip-code/ACS_15_5YR_S1502/ACS_15_5YR_S1502_with_ann.csv'
income_filepath = 'census-data-by-zip-code/ACS_15_5YR_S1903/ACS_15_5YR_S1903_with_ann.csv'

In [15]:
def census_data(filepath):
    """
    This function loads one ACS csv export and indexes it by zip code.

    The second row of the file is used as the header. The 'Geography' column is
    renamed to 'Zip Code', reduced to the second whitespace-separated token of
    each label, and converted to a number. The resulting values are used as the
    index, and the 'Zip Code' column itself is also kept in the frame.
    ---
    IN: csv file (path or file-like object accepted by pd.read_csv)
    OUT: pandas dataframe
    """
    raw = pd.read_csv(filepath, header=1)
    table = raw.rename(columns={'Geography': 'Zip Code'})

    # Keep only the token after the first run of whitespace, then make it numeric.
    second_token = table['Zip Code'].str.split().str[1]
    table['Zip Code'] = second_token.apply(pd.to_numeric)

    # Passing the Series (not the label) keeps 'Zip Code' as a column as well.
    return table.set_index(table['Zip Code'])

In [16]:
def parse_age_sex(filepath):
    """
    This function reads the ACS age-and-sex csv and derives one column per zip code.

    Rows whose total population equals 0, and rows whose female 20-24 value is
    the '-' placeholder, are removed before everything is converted to numbers.
    ---
    IN: csv file
    OUT: pandas dataframe indexed by zip code with the single column
         'Percent Female Aged 20 to 34'
    """
    total_col = 'Total; Estimate; Total population'
    female_cols = ['Female; Estimate; AGE - 20 to 24 years',
                   'Female; Estimate; AGE - 25 to 29 years',
                   'Female; Estimate; AGE - 30 to 34 years']

    estimates = census_data(filepath)[[total_col] + female_cols]

    # Both conditions are evaluated on the raw values: the population check is
    # against the number 0, the placeholder check against the string '-'.
    usable = (estimates[total_col] != 0) & (estimates[female_cols[0]] != '-')
    estimates = estimates[usable].apply(pd.to_numeric)

    # NOTE(review): the value below is 100 * (sum of the three female age-band
    # columns) / total population. If the age-band columns in this table are
    # already percentages rather than head counts, the result is not a true
    # percentage (which would explain values above 100) -- confirm against the
    # S0101 table definition before interpreting it as anything but a ranking.
    females_20_to_34 = (estimates[female_cols[0]] +
                        estimates[female_cols[1]] +
                        estimates[female_cols[2]])
    percent = 100*females_20_to_34/estimates[total_col]

    return percent.to_frame('Percent Female Aged 20 to 34')

In [17]:
def parse_major(filepath):
    """
    This function reads the ACS field-of-degree csv and returns the column
    'Percent Females; Estimate; Science and Engineering Related Fields' as numbers.

    Rows whose value is not numeric (such as the '-' placeholder) are dropped,
    matching how the other parse_* functions treat placeholders. Indexing the
    frame with a boolean dataframe, as was done before, only blanks the cell to
    NaN and leaves the row in place.
    ---
    IN: csv file
    OUT: pandas dataframe indexed by zip code with that single numeric column
    """
    column_to_keep = 'Percent Females; Estimate; Science and Engineering Related Fields'
    values = census_data(filepath)[column_to_keep]

    # errors='coerce' maps every unparseable entry to NaN so that dropna() can
    # remove those rows instead of pd.to_numeric raising on them.
    numeric = pd.to_numeric(values, errors='coerce')
    return numeric.dropna().to_frame()

In [18]:
def demographic():
    """
    This function combines the age/sex and field-of-degree results per zip code.
    ---
    IN: nothing (uses directory_filepath, age_sex_filepath and major_filepath)
    OUT: pandas dataframe indexed by zip code holding both source columns plus
         their product, scaled by 1/100, in a 'Demographic' column
    """
    age_sex = parse_age_sex(directory_filepath + age_sex_filepath)
    degree = parse_major(directory_filepath + major_filepath)

    # Inner join on the index: only zip codes present in both frames survive.
    combined = age_sex.merge(degree, how='inner', left_index=True, right_index=True)

    pct_age = combined['Percent Female Aged 20 to 34']
    pct_degree = combined['Percent Females; Estimate; Science and Engineering Related Fields']

    # Both inputs are expressed in percent, so the product is divided by 100
    # to be expressed in percent again.
    combined['Demographic'] = pct_age*pct_degree/100

    return combined

In [19]:
def parse_income(filepath):
    """
    This function reads the ACS median-income csv and returns median household
    income per zip code as numbers.

    Thousands separators are removed and a trailing '+' or '-' marker is
    stripped (e.g. '250,000+' becomes 250000). Rows that still cannot be read
    as a number afterwards -- the '-' and '(X)' placeholders, blanks -- are dropped.
    ---
    IN: csv file
    OUT: pandas dataframe indexed by zip code with the single numeric column
         'Median income (dollars); Estimate; Households'
    """
    column_to_keep = 'Median income (dollars); Estimate; Households'
    raw = census_data(filepath)[column_to_keep]

    # astype(str) makes the .str methods safe even when pandas parsed some or
    # all of the column as numbers. Working on a standalone Series also avoids
    # assigning into a filtered slice (SettingWithCopyWarning).
    # NOTE(review): a trailing '-' is assumed to be a bottom-coded value such
    # as '2,500-' -- confirm against the table notes.
    cleaned = (raw.astype(str)
                  .str.strip()
                  .str.replace(',', '')
                  .str.rstrip('+-'))

    # First pass only finds which rows are numeric; the second pass converts
    # just those rows so an all-integer column keeps an integer dtype.
    parseable = pd.to_numeric(cleaned, errors='coerce').notna()
    return pd.to_numeric(cleaned[parseable]).to_frame()

In [20]:
# Attach median household income to the demographic frame. The inner join on
# the index keeps only zip codes present in both.
census_dataframe = demographic().merge(
    parse_income(directory_filepath + income_filepath),
    how='inner',
    left_index=True,
    right_index=True,
)

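Income values of 250000 were originally reported as "$250,000+" (the `+` is stripped during parsing), so they are lower bounds rather than exact medians.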

In [21]:
# Five zip codes with the highest median household income
(census_dataframe
 .sort_values(by='Median income (dollars); Estimate; Households', ascending=False)
 .head(5))

Unnamed: 0_level_0,Percent Female Aged 20 to 34,Percent Females; Estimate; Science and Engineering Related Fields,Demographic,Median income (dollars); Estimate; Households
Zip Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10282,0.528796,5.2,0.027497,250000
10007,0.431937,2.5,0.010798,250000
10577,0.456368,7.8,0.035597,216250
10514,0.064359,4.9,0.003154,212528
11962,0.409639,0.0,0.0,209125


The census data is rather interesting. There are some zip codes that report over 100% of the population is women between 20 and 34. We have double-checked the census data, and it reports higher female population values than total population values. We have decided to keep this data because we don't really care about the true percentage; we only care about how a zip code ranks relative to other zip codes.

In [22]:
# Five zip codes with the highest 'Demographic' score
(census_dataframe
 .sort_values(by='Demographic', ascending=False)
 .head(5))

Unnamed: 0_level_0,Percent Female Aged 20 to 34,Percent Females; Estimate; Science and Engineering Related Fields,Demographic,Median income (dollars); Estimate; Households
Zip Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
13623,146.976744,50.0,73.488372,78750
13124,26.410256,100.0,26.410256,46094
14511,12.905983,100.0,12.905983,45769
14130,11.511628,100.0,11.511628,47679
12416,22.519685,45.5,10.246457,25050


Copy the zip codes from the index into a regular column (the index itself is kept)

In [23]:
# Add a 'zip code' column holding the index values. This modifies
# census_dataframe in place and leaves the index unchanged.
census_dataframe['zip code'] = census_dataframe.index

Save this data

In [24]:
def save_file(df, filename, directory=None):
    """
    This function pickles an object into a file.

    By default the file goes into the 'pickled-dataframes' folder under
    directory_filepath, which is the same location that used to be spelled out
    here as a second hard-coded literal. The target folder must already exist;
    an existing file with the same name is overwritten.
    ---
    IN: df - object to pickle (a pandas dataframe in this notebook)
        filename - name of the pickle file, e.g. 'census_pkl.pkl'
        directory - optional folder to write into instead of the default
    OUT: nothing (writes the file)
    """
    if directory is None:
        # Resolved at call time so the project root is defined in one place only.
        directory = os.path.join(directory_filepath, 'pickled-dataframes')

    # os.path.join works whether or not `directory` ends with a separator.
    with open(os.path.join(directory, filename), 'wb') as picklefile:
        pickle.dump(df, picklefile)

In [25]:
# Pickle the merged census dataframe as 'census_pkl.pkl'
save_file(df=census_dataframe, filename='census_pkl.pkl')