This notebook contains code to open ACS census block group CSV files and reduce table width by removing extraneous columns (so far, only the columns that hold margin-of-error data). It also removes the extra first header row from the CSVs, keeping the second row of each CSV as the header in the cleaned version.

In [60]:
import csv
import os
import pprint
import pandas as pd
from os import listdir
from os.path import isfile, join


In [61]:
# Sample ACS table that the single-file cells below read and clean
filename = "ACS_15_5YR_B01001_with_ann.csv"

In [34]:
# Load one census table to inspect its layout.
# These CSVs have two header rows; skip the first so that the second row
# (the one with the header we want) becomes the column header.
f = pd.read_csv(filename, skiprows=[0])

f.head(1)

Unnamed: 0,Id,Id2,Geography,Estimate; Total:,Margin of Error; Total:,Estimate; Male:,Margin of Error; Male:,Estimate; Male: - Under 5 years,Margin of Error; Male: - Under 5 years,Estimate; Male: - 5 to 9 years,...,Estimate; Female: - 67 to 69 years,Margin of Error; Female: - 67 to 69 years,Estimate; Female: - 70 to 74 years,Margin of Error; Female: - 70 to 74 years,Estimate; Female: - 75 to 79 years,Margin of Error; Female: - 75 to 79 years,Estimate; Female: - 80 to 84 years,Margin of Error; Female: - 80 to 84 years,Estimate; Female: - 85 years and over,Margin of Error; Female: - 85 years and over
0,1500000US410510001001,410510001001,"Block Group 1, Census Tract 1, Multnomah Count...",625,173,335,121,17,27,0,...,0,12,26,22,0,12,0,12,0,12


In [64]:
# Read one table, keep only the columns whose name does not contain
# 'Margin' (i.e. drop the margin-of-error columns), and save the result
# under a 'clean_' prefix.

f = pd.read_csv(filename, skiprows=[0])
header_list = f.columns.tolist()
print(header_list)
keep_col = [header for header in header_list if 'Margin' not in header]
print(keep_col)
new_f = f.loc[:, keep_col]
new_f.to_csv('clean_' + filename, index=False)
            

['Id', 'Id2', 'Geography', 'Estimate; Total:', 'Margin of Error; Total:', 'Estimate; Male:', 'Margin of Error; Male:', 'Estimate; Male: - Under 5 years', 'Margin of Error; Male: - Under 5 years', 'Estimate; Male: - 5 to 9 years', 'Margin of Error; Male: - 5 to 9 years', 'Estimate; Male: - 10 to 14 years', 'Margin of Error; Male: - 10 to 14 years', 'Estimate; Male: - 15 to 17 years', 'Margin of Error; Male: - 15 to 17 years', 'Estimate; Male: - 18 and 19 years', 'Margin of Error; Male: - 18 and 19 years', 'Estimate; Male: - 20 years', 'Margin of Error; Male: - 20 years', 'Estimate; Male: - 21 years', 'Margin of Error; Male: - 21 years', 'Estimate; Male: - 22 to 24 years', 'Margin of Error; Male: - 22 to 24 years', 'Estimate; Male: - 25 to 29 years', 'Margin of Error; Male: - 25 to 29 years', 'Estimate; Male: - 30 to 34 years', 'Margin of Error; Male: - 30 to 34 years', 'Estimate; Male: - 35 to 39 years', 'Margin of Error; Male: - 35 to 39 years', 'Estimate; Male: - 40 to 44 years', 'Mar

In [49]:
# List the regular files (not sub-directories) in the current working directory.
# Sorted because listdir() returns entries in arbitrary order, which would
# otherwise let the listing, and anything derived from it, vary between runs.
onlyfiles = sorted(name for name in listdir() if isfile(name))
onlyfiles

['ACS_15_5YR_B01001.txt',
 'ACS_15_5YR_B01001_metadata.csv',
 'ACS_15_5YR_B01001_with_ann.csv',
 'ACS_15_5YR_B01003.txt',
 'ACS_15_5YR_B01003_metadata.csv',
 'ACS_15_5YR_B01003_with_ann.csv',
 'ACS_15_5YR_B02001.txt',
 'ACS_15_5YR_B02001_metadata.csv',
 'ACS_15_5YR_B02001_with_ann.csv',
 'ACS_15_5YR_B08135.txt',
 'ACS_15_5YR_B08135_metadata.csv',
 'ACS_15_5YR_B08135_with_ann.csv',
 'ACS_15_5YR_B11007.txt',
 'ACS_15_5YR_B11007_metadata.csv',
 'ACS_15_5YR_B11007_with_ann.csv',
 'ACS_15_5YR_B16002.txt',
 'ACS_15_5YR_B16002_metadata.csv',
 'ACS_15_5YR_B16002_with_ann.csv',
 'ACS_15_5YR_B17021.txt',
 'ACS_15_5YR_B17021_metadata.csv',
 'ACS_15_5YR_B17021_with_ann.csv',
 'ACS_15_5YR_B19001.txt',
 'ACS_15_5YR_B19001_metadata.csv',
 'ACS_15_5YR_B19001_with_ann.csv',
 'ACS_15_5YR_B19013.txt',
 'ACS_15_5YR_B19013_metadata.csv',
 'ACS_15_5YR_B19013_with_ann.csv',
 'ACS_15_5YR_B25003.txt',
 'ACS_15_5YR_B25003_metadata.csv',
 'ACS_15_5YR_B25003_with_ann.csv',
 'aff_download.zip',
 'aff_download_read

In [50]:
# Subset the listing to the raw census data tables: names that end in
# '_with_ann.csv'. Files starting with 'clean_' are written by this notebook
# and end in the same suffix, so they are excluded; otherwise a re-run would
# treat already-cleaned files as input and skip their real header row.
census_tables = [
    file for file in onlyfiles
    if file.endswith('_with_ann.csv') and not file.startswith('clean_')
]
census_tables

['ACS_15_5YR_B01001_with_ann.csv',
 'ACS_15_5YR_B01003_with_ann.csv',
 'ACS_15_5YR_B02001_with_ann.csv',
 'ACS_15_5YR_B08135_with_ann.csv',
 'ACS_15_5YR_B11007_with_ann.csv',
 'ACS_15_5YR_B16002_with_ann.csv',
 'ACS_15_5YR_B17021_with_ann.csv',
 'ACS_15_5YR_B19001_with_ann.csv',
 'ACS_15_5YR_B19013_with_ann.csv',
 'ACS_15_5YR_B25003_with_ann.csv']

In [58]:
# make a function to clean all files in the census_tables list and write a new clean CSV file for each

def open_clean(file_list):
    """Strip margin-of-error columns from each census CSV and save a copy.

    Each file is read with its first row skipped (the tables have two
    header rows and the second is used as the header). Columns whose name
    contains 'Margin' are dropped and the result is written next to the
    input as 'clean_<original file name>'. One confirmation line is
    printed per file.

    Parameters
    ----------
    file_list : iterable of str
        Paths of the CSV files to clean. An empty iterable is a no-op.

    Returns
    -------
    None
    """
    # Iterate the argument, not the notebook-level `census_tables` list,
    # so the function cleans exactly what the caller passed in.
    for file in file_list:
        f = pd.read_csv(file, skiprows=[0])
        keep_col = [header for header in f.columns if 'Margin' not in header]
        # Prefix the file name rather than the whole path, so inputs that
        # live in another directory still get a valid output path. For a
        # bare file name this is exactly 'clean_' + file.
        folder, name = os.path.split(file)
        clean_path = os.path.join(folder, 'clean_' + name)
        f[keep_col].to_csv(clean_path, index=False)
        print('cleaned and saved: ' + clean_path)

In [59]:
# Run the cleaner on the census table list; it writes a 'clean_'-prefixed CSV and prints one line per file
open_clean(census_tables)

cleaned and saved: clean_ACS_15_5YR_B01001_with_ann.csv
cleaned and saved: clean_ACS_15_5YR_B01003_with_ann.csv
cleaned and saved: clean_ACS_15_5YR_B02001_with_ann.csv
cleaned and saved: clean_ACS_15_5YR_B08135_with_ann.csv
cleaned and saved: clean_ACS_15_5YR_B11007_with_ann.csv
cleaned and saved: clean_ACS_15_5YR_B16002_with_ann.csv
cleaned and saved: clean_ACS_15_5YR_B17021_with_ann.csv
cleaned and saved: clean_ACS_15_5YR_B19001_with_ann.csv
cleaned and saved: clean_ACS_15_5YR_B19013_with_ann.csv
cleaned and saved: clean_ACS_15_5YR_B25003_with_ann.csv
