In [1]:
import numpy as np
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
from collections import namedtuple

In [2]:
# Collect the raw CSV extracts matching the BP_*_00A1 naming pattern.
# glob() returns paths in arbitrary, OS-dependent order; sorting makes the
# order in which the files are loaded and stacked reproducible.
filenames = sorted(glob("data/raw/BP_*_00A1.csv"))

In [3]:
# Read every matched CSV into its own DataFrame, decoding the bytes as latin-1.
dataframes = list(map(lambda path: pd.read_csv(path, encoding='latin-1'), filenames))

In [4]:
# Strip ".", "-" and "_" from every column label of every loaded frame
# (the frames are modified in place).
punctuation_table = str.maketrans("", "", ".-_")
for frame in dataframes:
    frame.columns = [label.translate(punctuation_table) for label in frame.columns]

In [5]:
# Reduce every yearly frame to the columns used downstream, give them readable
# names, drop rows that are not real county/industry observations, and
# normalise a few labels. The cleaned frames are collected in `dataframes2`.

dataframes2 = []

for dataframe in dataframes:

    # keep only the needed source columns and rename them (by name, so the
    # mapping cannot silently drift out of step with the selection order)
    dataframe = dataframe[["GEOid2",
                           "GEOdisplaylabel",
                           "GEOfipsstatecode",
                           "GEOfipscountycode",
                           "NAICSdisplaylabel",
                           "ESTAB",
                           "EMP",
                           "YEARid"]].rename(columns={
        "GEOid2": "geo_id",
        "GEOdisplaylabel": "county_name",
        "GEOfipsstatecode": "state_fips",
        "GEOfipscountycode": "county_fips",
        "NAICSdisplaylabel": "naics_industry",
        "ESTAB": "establishments",
        "EMP": "employees",
        "YEARid": "year",
    })

    # one combined mask for all row exclusions
    keep = (
        (dataframe.county_fips != 999)                            # exclude state totals
        & (dataframe.state_fips != 72)                            # exclude puerto rico
        & ~dataframe.geo_id.isin([2201, 2232, 2280])              # exclude retired Alaskan geo ids
        & (dataframe.naics_industry != "Total for all sectors")   # exclude totals
    )
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the filtered data (SettingWithCopyWarning).
    dataframe = dataframe[keep].copy()

    # apply updated NAICS sector code name
    # Exact-value replace on purpose: a substring replace would also match rows
    # that already carry the long name and append its tail a second time.
    dataframe["naics_industry"] = dataframe["naics_industry"].replace(
        "Mining", "Mining, quarrying, and oil and gas extraction")

    # source typo corrections, applied as literal (non-regex) replacements
    for wrong, right in [
        ("Dona Ana County, New Mexico", "Doña Ana County, New Mexico"),
        ("DoÃ±a Ana County, New Mexico", "Doña Ana County, New Mexico"),
        ("La Salle County, Illinois", "LaSalle County, Illinois"),
        ("La Salle Parish, Louisiana", "LaSalle Parish, Louisiana"),
    ]:
        dataframe["county_name"] = dataframe["county_name"].str.replace(wrong, right, regex=False)

    dataframes2.append(dataframe)
    
    

In [6]:
# Stack the cleaned yearly frames row-wise into a single long table.
stacked = pd.concat(dataframes2, axis=0)

In [7]:
# Fill in 0s for missing Loving County entries: every industry present for the
# county gets one row per year 2005-2015, with establishments and employees
# set to 0 for years that have no row in the stacked data.

key_cols = ["geo_id", "county_name", "state_fips", "county_fips", "naics_industry"]

loving = stacked[stacked.county_name == 'Loving County, Texas']
# Select the value columns with a list: tuple-style selection on a groupby
# (groupby(...)["a", "b"]) is deprecated and rejected by newer pandas.
loving15 = loving.groupby(key_cols)[["year", "establishments", "employees"]]
loving2 = loving15.apply(
    lambda x: x.set_index("year").reindex(range(2005, 2016), fill_value=0)
).reset_index()
# Outer merge on every column: rows already in `stacked` match themselves,
# so only the newly created zero rows are added.
loving3 = stacked.merge(loving2,
                        how="outer",
                        on=["year"] + key_cols + ["establishments", "employees"])

In [8]:
# Fill in 0s for missing Kalawao County entries: every industry present for
# the county gets one row per year 2005-2015, with establishments and
# employees set to 0 for years that have no row in `loving3`.

key_cols = ["geo_id", "county_name", "state_fips", "county_fips", "naics_industry"]

kalawao = loving3[loving3.county_name == 'Kalawao County, Hawaii']
# Select the value columns with a list: tuple-style selection on a groupby
# (groupby(...)["a", "b"]) is deprecated and rejected by newer pandas.
kalawao15 = kalawao.groupby(key_cols)[["year", "establishments", "employees"]]
kalawao2 = kalawao15.apply(
    lambda x: x.set_index("year").reindex(range(2005, 2016), fill_value=0)
).reset_index()
# Outer merge on every column: rows already in `loving3` match themselves,
# so only the newly created zero rows are added.
kalawao3 = loving3.merge(kalawao2,
                         how="outer",
                         on=["year"] + key_cols + ["establishments", "employees"])

In [9]:
# Backfill the five new Alaskan counties: for each listed county and industry,
# create a row for every year 2005-2015. Years missing from the data take the
# values of the next later year that has a row (bfill); years after the last
# available row stay NaN.

fill_counties = ['Petersburg Census Area, Alaska', 'Skagway Municipality, Alaska', 'Prince of Wales-Hyder Census Area, Alaska', 'Wrangell City and Borough, Alaska', 'Hoonah-Angoon Census Area, Alaska']
key_cols = ["geo_id", "county_name", "state_fips", "county_fips", "naics_industry"]
merge_cols = ["year"] + key_cols + ["establishments", "employees"]

dataframeA = kalawao3

for fill_county in fill_counties:
    county_rows = dataframeA[dataframeA.county_name == fill_county]
    backfilled = (
        county_rows
        # list selection: tuple-style groupby(...)["a", "b"] is deprecated and
        # rejected by newer pandas
        .groupby(key_cols)[["year", "establishments", "employees"]]
        # reindex(method=...) needs a monotonic index, hence the sort_index()
        .apply(lambda x: x.set_index("year").sort_index().reindex(range(2005, 2016), method="bfill"))
        .reset_index()
    )
    # Outer merge on every column adds the backfilled rows to the running table.
    dataframeB = backfilled.merge(dataframeA, how="outer", on=merge_cols)
    dataframeA = dataframeB


In [11]:
# Replace every remaining NaN with 0 (the original note attributes these to
# privacy suppression), then reshape to wide form: one row per
# county/year, one column per (measure, NAICS industry) pair, with absent
# combinations set to 0. Finally move the FIPS codes out of the index and
# back into ordinary columns.
dataframeA = (
    dataframeA
    .fillna(0)
    .pivot_table(
        index=["geo_id", "county_name", "state_fips", "county_fips", "year"],
        columns="naics_industry",
        values=["establishments", "employees"],
        fill_value=0,
    )
    .reset_index(["state_fips", "county_fips"])
)