In [1]:
import pandas as pd

## Dataset Description and Notes

- understanding the type of population of each county (households, relationships, industry, education levels, etc)
- which types contribute to the spread most and which have managed the spread best?
- may be useful in deriving commonalities in counties in order to develop nuanced intervention approaches in different areas
- population estimates for counties w/ 90% confidence level

## Reading in datasets

In [None]:
# Load the two ACS tables; header=1 takes the column names from the second
# line of each file.
economic = pd.read_csv(
    "../../data/acs/economic/economic.csv",
    header=1,
)
social = pd.read_csv(
    "../../data/acs/social/social.csv",
    header=1,
)
# social_pr = pd.read_csv("../../data/acs/social_pr.csv")

# Load the main COVID dataset, indexed by its countyFIPS column.
overall = pd.read_csv(
    "../../data/superCOVID-19datafame.csv",
    index_col="countyFIPS",
)

## Calculating State Trends Over Past Week - Georgia

In [31]:
# Keep only the rows whose State value is "GA".
is_georgia = overall['State'] == "GA"
ga = overall.loc[is_georgia]

In [32]:
# Death-count columns for the week of 9/13/20 through 9/19/20.
# The list is defined before it is used so this cell runs on a fresh kernel
# (previously it was referenced one line before its definition).
death_cols = [
    "deaths_9/13/20",
    "deaths_9/14/20",
    "deaths_9/15/20",
    "deaths_9/16/20",
    "deaths_9/17/20",
    "deaths_9/18/20",
    "deaths_9/19/20",
]
ga_deaths = ga[death_cols]

# Column-wise sum over the rows of `ga`: one statewide total per day.
# .sum() is used instead of .agg(sum) because passing the builtin `sum` to
# agg is deprecated in recent pandas releases.
death_totals = ga_deaths.sum()

# Change from each day's total to the next; the first entry is NaN because
# it has no preceding day.
death_totals.diff()

deaths_9/13/20    nan
deaths_9/14/20   20.0
deaths_9/15/20   47.0
deaths_9/16/20   21.0
deaths_9/17/20   54.0
deaths_9/18/20   64.0
deaths_9/19/20   62.0
dtype: float64

In [33]:
# Confirmed-case columns for the week of 9/13/20 through 9/19/20.
confirmed_cols = [
    "confirmed_9/13/20",
    "confirmed_9/14/20",
    "confirmed_9/15/20",
    "confirmed_9/16/20",
    "confirmed_9/17/20",
    "confirmed_9/18/20",
    "confirmed_9/19/20",
]
ga_confirmed = ga[confirmed_cols]

# Column-wise sum over the rows of `ga`: one statewide total per day.
# .sum() is used instead of .agg(sum) because passing the builtin `sum` to
# agg is deprecated in recent pandas releases.
confirmed_totals = ga_confirmed.sum()

# Change from each day's total to the next; the first entry is NaN because
# it has no preceding day.
confirmed_totals.diff()

confirmed_9/13/20       nan
confirmed_9/14/20   1,020.0
confirmed_9/15/20   1,505.0
confirmed_9/16/20   2,215.0
confirmed_9/17/20   1,855.0
confirmed_9/18/20   1,817.0
confirmed_9/19/20   2,302.0
dtype: float64

### During the week in question, the numbers of confirmed cases and deaths in the state of Georgia were both increasing

In [38]:
# Total jump in cases: gap between the largest and the smallest
# statewide total of the week.
confirmed_peak = confirmed_totals.max()
confirmed_floor = confirmed_totals.min()
confirmed_peak - confirmed_floor

10714

In [39]:
# Total jump in deaths: gap between the largest and the smallest
# statewide total of the week.
death_peak = death_totals.max()
death_floor = death_totals.min()
death_peak - death_floor

268

## Manipulating "id" column to match "countyFIPS" in main dataset for merging

In [8]:
# Turn the ACS "id" values into integer county FIPS codes and make them the
# index, so both frames are keyed like the main dataset (indexed by countyFIPS).
for acs_frame in (economic, social):
    # Strip the "0500000US" prefix, then cast what remains to int64.
    acs_frame['id'] = acs_frame['id'].str.replace("0500000US", "").astype('int64')
    acs_frame.rename(columns = {"id": "countyFIPS"}, inplace=True)
    acs_frame.set_index("countyFIPS", inplace=True)

In [9]:
# social[["County Name", "State"]] = social["Geographic Area Name"].str.split(", ", expand=True)
# economic[["County Name", "State"]] = economic["Geographic Area Name"].str.split(", ", expand=True)

# social.drop(columns=['Geographic Area Name'], inplace=True)
# economic.drop(columns=['Geographic Area Name'], inplace=True)

# social.set_index(["County Name", "State"], inplace=True)
# economic.set_index(["County Name", "State"], inplace=True)

## Merging with super dataset

In [10]:
# Outer join of the social and economic tables, matched on their indexes.
merged = pd.merge(
    social,
    economic,
    how="outer",
    left_index=True,
    right_index=True,
)

In [11]:
# Attach the columns of `overall`, matching rows on the index of both frames
# (no `how` is given, so pandas' default join type applies).
merged = pd.merge(
    merged,
    overall,
    left_index=True,
    right_index=True,
)

## Removing unnecessary columns

In [12]:
# Keep every column whose name does not mention "error" (case-insensitive).
# A filtering comprehension is used instead of calling cols.remove(col) inside
# a loop over cols: removing from a list while iterating over it skips the
# element right after each removal, so some matching columns could survive.
all_cols = merged.columns.tolist()
print(len(all_cols))
cols = [col for col in all_cols if 'error' not in col.lower()]
print(len(cols))

1646
1068


In [13]:
# Restrict the frame to the column labels collected in `cols`.
merged = merged.loc[:, cols]

## Final Result

In [14]:
# Display the merged frame (pandas truncates the rendering of large frames).
merged

Unnamed: 0_level_0,Geographic Area Name_x,Estimate!!HOUSEHOLDS BY TYPE!!Total households,Percent Estimate!!HOUSEHOLDS BY TYPE!!Total households,Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families),Percent Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families),Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families)!!With own children of the householder under 18 years,Percent Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families)!!With own children of the householder under 18 years,Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families)!!Married-couple family,Percent Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families)!!Married-couple family,Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families)!!Married-couple family!!With own children of the householder under 18 years,...,deaths_9/10/20,deaths_9/11/20,deaths_9/12/20,deaths_9/13/20,deaths_9/14/20,deaths_9/15/20,deaths_9/16/20,deaths_9/17/20,deaths_9/18/20,deaths_9/19/20
countyFIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,"Autauga County, Alabama",21115.0,21115.0,15161.0,71.8,6787.0,32.1,11988.0,56.8,5201.0,...,24,24,24,24,24,24,24,24,24,24
1003,"Baldwin County, Alabama",78622.0,78622.0,51359.0,65.3,18035.0,22.9,41452.0,52.7,13527.0,...,46,47,47,47,47,47,47,48,48,49
1005,"Barbour County, Alabama",9186.0,9186.0,6030.0,65.6,2423.0,26.4,3908.0,42.5,1249.0,...,7,7,7,7,7,7,7,7,7,7
1007,"Bibb County, Alabama",6840.0,6840.0,4947.0,72.3,1728.0,25.3,3626.0,53.0,1321.0,...,9,9,9,9,9,9,9,9,10,10
1009,"Blount County, Alabama",20600.0,20600.0,15104.0,73.3,5566.0,27.0,11942.0,58.0,4129.0,...,12,12,13,13,13,13,13,13,14,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56037,"Sweetwater County, Wyoming",15871.0,15871.0,10824.0,68.2,5448.0,34.3,8380.0,52.8,3854.0,...,2,2,2,2,2,2,2,2,2,2
56039,"Teton County, Wyoming",9158.0,9158.0,5305.0,57.9,2254.0,24.6,4452.0,48.6,1670.0,...,1,1,1,1,1,1,1,1,1,1
56041,"Uinta County, Wyoming",7735.0,7735.0,5355.0,69.2,2611.0,33.8,4391.0,56.8,2084.0,...,2,2,2,2,2,2,2,2,2,2
56043,"Washakie County, Wyoming",3422.0,3422.0,2302.0,67.3,895.0,26.2,1920.0,56.1,654.0,...,6,6,6,6,6,6,6,6,6,6


## Exporting

In [15]:
# merged.to_csv("../../data/social_and_economic.csv")

In [16]:
# List the column labels of the merged frame.
merged.columns

Index(['Geographic Area Name_x',
       'Estimate!!HOUSEHOLDS BY TYPE!!Total households',
       'Percent Estimate!!HOUSEHOLDS BY TYPE!!Total households',
       'Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families)',
       'Percent Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families)',
       'Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families)!!With own children of the householder under 18 years',
       'Percent Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families)!!With own children of the householder under 18 years',
       'Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families)!!Married-couple family',
       'Percent Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families)!!Married-couple family',
       'Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Family households (families)!!Married-couple family!!With own children of the householder u

## Renaming columns

In [17]:
# Long ACS label fragments and the short forms that replace them.
# They are applied in this order to every column name.
# NOTE: "Married-couple family" is replaced without its trailing "!!", so
# nested columns keep a "!!" right after "married_".
label_abbreviations = [
    ("Estimate!!", "est_"),
    ("HOUSEHOLDS BY TYPE!!", "households_"),
    ("Total households!!", "total_"),
    ("Family households (families)!!", "families_"),
    ("Married-couple family", "married_"),
]

cols = list(merged.columns)
for long_form, short_form in label_abbreviations:
    cols = [name.replace(long_form, short_form) for name in cols]

merged.columns = cols
merged

Unnamed: 0_level_0,Geographic Area Name_x,est_households_Total households,Percent est_households_Total households,est_households_total_Family households (families),Percent est_households_total_Family households (families),est_households_total_families_With own children of the householder under 18 years,Percent est_households_total_families_With own children of the householder under 18 years,est_households_total_families_married_,Percent est_households_total_families_married_,est_households_total_families_married_!!With own children of the householder under 18 years,...,deaths_9/10/20,deaths_9/11/20,deaths_9/12/20,deaths_9/13/20,deaths_9/14/20,deaths_9/15/20,deaths_9/16/20,deaths_9/17/20,deaths_9/18/20,deaths_9/19/20
countyFIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,"Autauga County, Alabama",21115.0,21115.0,15161.0,71.8,6787.0,32.1,11988.0,56.8,5201.0,...,24,24,24,24,24,24,24,24,24,24
1003,"Baldwin County, Alabama",78622.0,78622.0,51359.0,65.3,18035.0,22.9,41452.0,52.7,13527.0,...,46,47,47,47,47,47,47,48,48,49
1005,"Barbour County, Alabama",9186.0,9186.0,6030.0,65.6,2423.0,26.4,3908.0,42.5,1249.0,...,7,7,7,7,7,7,7,7,7,7
1007,"Bibb County, Alabama",6840.0,6840.0,4947.0,72.3,1728.0,25.3,3626.0,53.0,1321.0,...,9,9,9,9,9,9,9,9,10,10
1009,"Blount County, Alabama",20600.0,20600.0,15104.0,73.3,5566.0,27.0,11942.0,58.0,4129.0,...,12,12,13,13,13,13,13,13,14,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56037,"Sweetwater County, Wyoming",15871.0,15871.0,10824.0,68.2,5448.0,34.3,8380.0,52.8,3854.0,...,2,2,2,2,2,2,2,2,2,2
56039,"Teton County, Wyoming",9158.0,9158.0,5305.0,57.9,2254.0,24.6,4452.0,48.6,1670.0,...,1,1,1,1,1,1,1,1,1,1
56041,"Uinta County, Wyoming",7735.0,7735.0,5355.0,69.2,2611.0,33.8,4391.0,56.8,2084.0,...,2,2,2,2,2,2,2,2,2,2
56043,"Washakie County, Wyoming",3422.0,3422.0,2302.0,67.3,895.0,26.2,1920.0,56.1,654.0,...,6,6,6,6,6,6,6,6,6,6


In [41]:
# Render floats with thousands separators, then summarise the numeric columns.
pd.options.display.float_format = "{:,}".format
merged.describe()

Unnamed: 0,est_households_Total households,Percent est_households_Total households,est_households_total_Family households (families),Percent est_households_total_Family households (families),est_households_total_families_With own children of the householder under 18 years,Percent est_households_total_families_With own children of the householder under 18 years,est_households_total_families_married_,Percent est_households_total_families_married_,est_households_total_families_married_!!With own children of the householder under 18 years,Percent est_households_total_families_married_!!With own children of the householder under 18 years,...,deaths_9/10/20,deaths_9/11/20,deaths_9/12/20,deaths_9/13/20,deaths_9/14/20,deaths_9/15/20,deaths_9/16/20,deaths_9/17/20,deaths_9/18/20,deaths_9/19/20
count,3126.0,3126.0,3126.0,3126.0,3126.0,3126.0,3126.0,3126.0,3126.0,3126.0,...,3126.0,3126.0,3126.0,3126.0,3126.0,3126.0,3126.0,3126.0,3126.0,3126.0
mean,38058.407869481765,38058.407869481765,25025.079334612925,66.5853486884197,10629.49648112604,25.9254638515675,18391.69385796545,50.86487523992323,7232.709532949456,17.607517594369803,...,60.55246321177224,60.90115163147793,61.12380038387716,61.24728087012157,61.38003838771593,61.720409468969926,62.07389635316699,62.34069097888676,62.632437619961614,62.85668586052464
std,115950.17719782228,115950.17719782228,75811.44084617589,5.398893782098096,33627.203087612084,5.150218731740709,53276.89465715665,6.735369170018073,22988.810229690484,4.638016587690296,...,317.0348891316706,318.0022663848063,318.664023743755,318.8479401103353,319.1521450415248,319.84438998381745,320.600293913748,321.09799932810915,321.6791836203644,322.1271648502885
min,33.0,33.0,8.0,17.0,0.0,0.0,8.0,17.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4221.5,4221.5,2806.25,63.5,1044.25,22.9,2165.0,47.125,687.25,14.9,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,9812.0,9812.0,6603.5,66.6,2499.5,25.8,4998.5,51.3,1666.5,17.3,...,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
75%,25760.75,25760.75,17107.5,69.9,6833.25,28.6,12777.75,55.1,4588.75,19.9,...,24.0,24.0,24.0,24.0,25.0,25.0,25.0,25.0,26.0,26.0
max,3306109.0,3306109.0,2207265.0,89.9,952402.0,52.9,1485293.0,84.0,642979.0,43.5,...,7299.0,7299.0,7306.0,7306.0,7306.0,7306.0,7311.0,7312.0,7314.0,7315.0


## Challenges

- how to rename columns and how to do so efficiently/programmatically
- 1000+ columns
- many different variable groups, many more subgroups
- finding which columns are most relevant and which can be removed w/o losing critical info
- which pandas methods work best for certain purposes
- how to arrange and group data
- displaying numbers in readable format (not scientific notation)

## Considerations
- Break dataframes down in layers starting w/ highest order / root variable?
    - This would produce almost 20 dataframes, which would then be broken down further
- Remove percent estimate columns (may be extracted by leveraging total estimates)
    - This would cut the number of columns by 50%

## Issues
- total households estimates and percent estimates are the same, what went wrong?