In [24]:
import os

import numpy as np
import pandas as pd

from census import Census

In [25]:
# One list per ACS data-profile topic; each collects one DataFrame per loaded file.
social, economic, housing = [], [], []

In [26]:
# Two-digit suffixes that appear in the ACS 1-year CSV file names to load.
files = [
    '12',
    '14',
    '16',
    '17',
]

In [27]:
# Read in files

# Directory holding the ACS CSV exports. It defaults to the original location
# and can be overridden with the ACS_DATA_DIR environment variable so the
# notebook does not depend on one user's home directory.
DATA_DIR = os.environ.get('ACS_DATA_DIR', '/Users/Stephen/project-mcnulty/data')


def read_acs_profile(suffix, table):
    """Load one ACS 1-year data-profile CSV.

    suffix -- two-digit year suffix used in the file name (e.g. '12')
    table  -- data-profile table id used in the file name (e.g. 'DP02')
    """
    file_name = 'ACS_{}_1YR_{}_with_ann.csv'.format(suffix, table)
    return pd.read_csv(os.path.join(DATA_DIR, file_name))


# The lists are rebuilt rather than appended to, so re-running this cell does
# not accumulate duplicate frames.
social = [read_acs_profile(file, 'DP02') for file in files]
economic = [read_acs_profile(file, 'DP03') for file in files]
housing = [read_acs_profile(file, 'DP04') for file in files]

In [28]:
# Reset column names

# The first data row of every file carries the descriptive column names:
# promote it to the header, remove it from the data, and renumber the rows.
# reset_index is called without drop=True, so the previous row labels are kept
# in a new 'index' column.
# For the economic and housing frames the row labelled 436 is removed as well.
# NOTE(review): what that row represents is not visible here — confirm.
for group, drop_row_436 in ((social, False), (economic, True), (housing, True)):
    for frame in group:
        frame.columns = frame.iloc[0]
        frame.drop(index=0, inplace=True)
        frame.reset_index(inplace=True)
        if drop_row_436:
            frame.drop(index=436, inplace=True)

In [29]:
# Year label for each loaded file, in load order: 2012, 2014, 2016, 2018.
# NOTE(review): the last file loaded has the '17' suffix but is labelled 2018
# here — confirm this is intentional.
years = list(range(2012, 2020, 2))

# The three per-topic lists of frames, grouped together.
frames = [social, economic, housing]

In [30]:
# Set year field

# Tag every frame with its topic and with the year that sits at the same
# position in `years`.
for label, group in (('Social', social), ('Economic', economic), ('Housing', housing)):
    for position, frame in enumerate(group):
        frame['Type'] = label
        frame['Year'] = years[position]

In [31]:
# Check the number of columns in each frame

# Prints the year label and the column count, social frames first, then
# economic, then housing.
for frame in social + economic + housing:
    print(frame['Year'].iloc[1], len(frame.columns))

2012 602
2014 614
2016 614
2018 614
2012 554
2014 554
2016 554
2018 554
2012 570
2014 570
2016 578
2018 578


In [32]:
# Drop any column that represents a margin of error

for frame in social + economic + housing:
    # Every column whose name mentions 'Margin of Error' is removed in place.
    moe_columns = [name for name in frame.columns if 'Margin of Error' in name]
    frame.drop(columns=moe_columns, inplace=True)

In [33]:
# Check column lengths between frames

# One column count per frame: social, then economic, then housing.
for frame in social + economic + housing:
    print(frame.shape[1])

304
310
310
310
280
280
280
280
288
288
292
292


In [39]:
# Realign housing columns by dropping columns not represented in all frames

# Columns removed from the third and fourth housing frames so that all four
# housing frames end up with the same number of columns.
extra_housing_columns = [
    'Estimate; YEAR STRUCTURE BUILT - Total housing units - Built 2014 or later',
    'Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2014 or later',
    'Estimate; SELECTED MONTHLY OWNER COSTS (SMOC) - Housing units without a mortgage - $800 to $999',
    'Percent; SELECTED MONTHLY OWNER COSTS (SMOC) - Housing units without a mortgage - $800 to $999',
]

for frame in (housing[2], housing[3]):
    frame.drop(columns=extra_housing_columns, inplace=True)


In [40]:
# Realign columns in social frames by dropping columns

# The second social frame spells these headers with "Households"; the third
# and fourth use "households", so each spelling needs its own list.
computer_columns_capitalised = [
    'Estimate; COMPUTERS AND INTERNET USE - Total Households',
    'Estimate; COMPUTERS AND INTERNET USE - Total Households - With a computer',
    'Estimate; COMPUTERS AND INTERNET USE - Total Households - With a broadband Internet subscription',
    'Percent; COMPUTERS AND INTERNET USE - Total Households',
    'Percent; COMPUTERS AND INTERNET USE - Total Households - With a computer',
    'Percent; COMPUTERS AND INTERNET USE - Total Households - With a broadband Internet subscription',
]
computer_columns_lowercase = [
    'Estimate; COMPUTERS AND INTERNET USE - Total households',
    'Estimate; COMPUTERS AND INTERNET USE - Total households - With a computer',
    'Estimate; COMPUTERS AND INTERNET USE - Total households - With a broadband Internet subscription',
    'Percent; COMPUTERS AND INTERNET USE - Total households',
    'Percent; COMPUTERS AND INTERNET USE - Total households - With a computer',
    'Percent; COMPUTERS AND INTERNET USE - Total households - With a broadband Internet subscription',
]

social[1].drop(columns=computer_columns_capitalised, inplace=True)
for frame in (social[2], social[3]):
    frame.drop(columns=computer_columns_lowercase, inplace=True)

Note: Determining which columns to drop was a laborious task: it involved printing every column name for every frame and lining the names up side by side to identify exactly where the columns had shifted. I've removed that code from the notebook for readability.

In [41]:
# Ensure equal number of columns in each frame

# Prints one column count per frame: social, then economic, then housing.
for frame in social + economic + housing:
    print(frame.shape[1])

304
304
304
304
280
280
280
280
288
288
288
288


In [42]:
# Set all column names to the first frame's column names

# Names are assigned by position, so this relies on the columns of every
# social frame already being in the same order as in the first one.
for frame in (social[1], social[2], social[3]):
    frame.columns = social[0].columns

In [43]:
# Give the later economic frames the first economic frame's column names.
# Names are assigned by position.
for frame in (economic[1], economic[2], economic[3]):
    frame.columns = economic[0].columns

In [44]:
# Give the later housing frames the first housing frame's column names.
# Names are assigned by position.
for frame in (housing[1], housing[2], housing[3]):
    frame.columns = housing[0].columns

In [61]:
# Row count of every frame: social, then economic, then housing.
for df in social + economic + housing:
    print(df.shape[0])

436
436
436
436
436
436
436
436
436
436
436
436


In [62]:
# Drop Washington D.C. from the frames

for frame in social + economic + housing:
    # Renumber the rows first so that label 87 refers to the same position in
    # every frame. Without drop=True the old labels are kept as a new column.
    frame.reset_index(inplace=True)
    frame.drop(index=87, inplace=True)

In [63]:
# Read in congressional district names and states

# The Census API key is read from the environment so that the credential is
# not stored in the notebook.
# NOTE(review): the key that used to be hardcoded here should be treated as
# exposed and replaced.
census_api_key = os.environ.get('CENSUS_API_KEY')
if not census_api_key:
    raise RuntimeError(
        'Set the CENSUS_API_KEY environment variable to your Census API key.'
    )

c = Census(census_api_key)

# One record per congressional district for all states; each record is a dict
# with 'NAME', 'state' and 'congressional district' keys.
names = c.acs1dp.state_district('NAME', '*', '*', year=2016)

In [64]:
# Remove the District of Columbia delegate entry (state code '11').
# The entry is checked before it is popped: removal is positional, so
# re-running this cell, or a change in the order returned by the API, would
# otherwise silently delete a real district.
if names[87]['state'] != '11':
    raise ValueError(
        'Expected the District of Columbia at position 87, found: {}'.format(names[87]['NAME'])
    )
names.pop(87)

{'NAME': 'Delegate District (at Large) (115th Congress), District of Columbia',
 'state': '11',
 'congressional district': '98'}

In [65]:
# Remove the Puerto Rico resident-commissioner entry (state code '72').
# The entry is checked before it is popped because removal is positional and
# would otherwise delete whatever record happens to sit at index 435.
if names[435]['state'] != '72':
    raise ValueError(
        'Expected Puerto Rico at position 435, found: {}'.format(names[435]['NAME'])
    )
names.pop(435)

{'NAME': 'Resident Commissioner District (at Large) (115th Congress), Puerto Rico',
 'state': '72',
 'congressional district': '98'}

In [66]:
# Number of district records left in `names` after the two pops above.
len(names)

435

In [67]:
# Create state and district columns

# Add both columns to every frame, filled with 0 as a placeholder.
for frame in social + economic + housing:
    frame['District'] = 0
    frame['State'] = 0

In [73]:
# Remove the 'level_0' and 'index' columns, then renumber the rows.
# reset_index is called without drop=True, so the labels being replaced are
# kept in a new 'index' column.
for frame in social + economic + housing:
    frame.drop(columns=['level_0', 'index'], inplace=True)
    frame.reset_index(inplace=True)

In [74]:
# Create lists of states and districts

# Both lists follow the order of `names`, so position i in each list refers
# to the same record.
states = [entry['state'] for entry in names]
districts = [entry['congressional district'] for entry in names]

In [75]:
# Align lists of states and districts in each frame

# The lists are assigned by position, so this relies on the rows of every
# frame being in the same order as `names`.
for df in social + economic + housing:
    df['District'] = districts
    df['State'] = states

In [76]:
# Set district, state, and year columns to be numeric

for df in social + economic + housing:
    for column in ('District', 'State', 'Year'):
        df[column] = pd.to_numeric(df[column])

In [84]:
# Ensure all frames have same number of columns

# Prints one column count per frame: social, then economic, then housing.
for df in social + economic + housing:
    print(df.shape[1])

306
306
306
306
282
282
282
282
290
290
290
290


In [94]:
# Ensure all frames are 435 rows

# Prints one row count per frame: social, then economic, then housing.
for df in social + economic + housing:
    print(df.shape[0])

435
435
435
435
435
435
435
435
435
435
435
435


In [85]:
# Append the frames together

# pd.concat replaces the chained DataFrame.append calls, because
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# The frames are stacked in list order and keep their row labels, as they did
# with append.
social_data = pd.concat(social)

In [86]:
# Stack the economic frames in list order. pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# Row labels are kept.
economic_data = pd.concat(economic)

In [87]:
# Stack the housing frames in list order. pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# Row labels are kept.
housing_data = pd.concat(housing)

In [95]:
# Create the full ACS1 data set

# Each data set is joined on the (State, District, Year) key.
merge_keys = ['State', 'District', 'Year']

# validate='one_to_one' makes pandas raise if a key is duplicated on either
# side, instead of silently multiplying rows in the outer join.
housing_social = pd.merge(social_data, housing_data, how='outer',
                          on=merge_keys, validate='one_to_one')

# The result is bound to acs1_data only. The earlier chained assignment also
# overwrote housing_social with the final frame.
acs1_data = pd.merge(housing_social, economic_data, how='outer',
                     on=merge_keys, validate='one_to_one')

In [101]:
# Row counts of the three stacked data sets and of the merged result.
for dataset in (social_data, economic_data, housing_data, acs1_data):
    print(len(dataset))

1740
1740
1740
1740


In [None]:
# Write each data set to a CSV file, using relative file names.
for dataset, file_name in (
    (social_data, 'social_data.csv'),
    (economic_data, 'economic_data.csv'),
    (housing_data, 'housing_data.csv'),
    (acs1_data, 'acs1_data.csv'),
):
    dataset.to_csv(file_name)