In [2]:
import pandas as pd

**Import Data**

In [3]:
# Read the three source CSVs (census, covid, hospital capacity) in one pass;
# the unpacking order matches the order of the paths below.
census, covid, hospital = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/census_bureau_acs_county_2018_5yr.csv',
        '../data/covid19_nyt_us_counties.csv',
        '../data/us_healthcare_capacity-county-CovidCareMap.csv',
    )
)

**Covid Data**

In [4]:
# Collapse the time series to one row per county. The counts are treated as
# cumulative, so the per-column maximum stands in for the latest totals.
# NOTE(review): max() is evaluated column by column, and rows whose
# county_fips_code is missing are left out by groupby — confirm both are intended.
per_county = covid.groupby('county_fips_code')
covid = per_county.max()

In [5]:
# share of missing values per column (mean of the boolean null mask)
covid.isna().mean()

date               0.000000
county             0.000000
state_name         0.000000
confirmed_cases    0.000000
deaths             0.024239
dtype: float64

In [6]:
# which states do the rows with a missing `deaths` value belong to?
missing_deaths = covid['deaths'].isna()
covid.loc[missing_deaths, 'state_name'].value_counts()

Puerto Rico    78
Name: state_name, dtype: int64

In [7]:
# Every missing `deaths` value belongs to Puerto Rico and accounts for less
# than 3% of the rows, so those rows are dropped rather than imputed.
# The drop is limited to the two columns the target is built from, so a null
# in an unrelated column cannot silently remove a county. The result is
# rebound (no inplace mutation) and copied so later column assignments act on
# an independent frame.
covid = covid.dropna(subset=['deaths', 'confirmed_cases']).copy()

In [8]:
# build the target: deaths divided by confirmed cases, one row per county,
# kept as a single-column frame named `death_rate`
death_rate = covid['deaths'].div(covid['confirmed_cases'])
covid = death_rate.to_frame(name='death_rate')
covid.describe()

Unnamed: 0,death_rate
count,3140.0
mean,0.019864
std,0.010036
min,0.0
25%,0.013215
50%,0.018458
75%,0.024953
max,0.097561


In [9]:
# persist the per-county death_rate table; the index is written as the first column
covid.to_csv('../data/sam_covid.csv')

**Hospital Data**

In [10]:
# share of missing values per hospital column (mean of the boolean null mask)
hospital.isna().mean()

fips_code                                     0.000000
State                                         0.000000
County Name                                   0.000000
Staffed All Beds                              0.000000
Staffed ICU Beds                              0.000000
Licensed All Beds                             0.000000
All Bed Occupancy Rate                        0.030351
ICU Bed Occupancy Rate                        0.444225
Population                                    0.000000
Population (20+)                              0.000000
Population (65+)                              0.000000
Staffed All Beds [Per 1000 People]            0.011037
Staffed All Beds [Per 1000 Adults (20+)]      0.011037
Staffed All Beds [Per 1000 Elderly (65+)]     0.011037
Staffed ICU Beds [Per 1000 People]            0.011037
Staffed ICU Beds [Per 1000 Adults (20+)]      0.011037
Staffed ICU Beds [Per 1000 Elderly (65+)]     0.011037
Licensed All Beds [Per 1000 People]           0.011037
Licensed A

In [11]:
# Drop the two ICU columns first, then drop any row that still has a missing
# value, and finally index by FIPS code in preparation for the merge with the
# covid data. Written as one chained expression instead of three
# inplace=True mutations, so the cell yields its result in a single assignment.
hospital = (
    hospital
    .drop(columns=['ICU Bed Occupancy Rate', 'ICU Bed Source Last Updated'])
    .dropna()
    .set_index('fips_code')
)

In [12]:
# list the remaining column labels after the cleanup
hospital.columns

Index(['State', 'County Name', 'Staffed All Beds', 'Staffed ICU Beds',
       'Licensed All Beds', 'All Bed Occupancy Rate', 'Population',
       'Population (20+)', 'Population (65+)',
       'Staffed All Beds [Per 1000 People]',
       'Staffed All Beds [Per 1000 Adults (20+)]',
       'Staffed All Beds [Per 1000 Elderly (65+)]',
       'Staffed ICU Beds [Per 1000 People]',
       'Staffed ICU Beds [Per 1000 Adults (20+)]',
       'Staffed ICU Beds [Per 1000 Elderly (65+)]',
       'Licensed All Beds [Per 1000 People]',
       'Licensed All Beds [Per 1000 Adults (20+)]',
       'Licensed All Beds [Per 1000 Elderly (65+)]', 'ICU Bed Source'],
      dtype='object')

In [13]:
# Keep the capacity and population measures only: the three raw bed counts,
# the occupancy rate, the three population columns, and every bed count
# expressed per 1000 people / adults (20+) / elderly (65+).
bed_counts = ['Staffed All Beds', 'Staffed ICU Beds', 'Licensed All Beds']
per_1000_groups = ['People', 'Adults (20+)', 'Elderly (65+)']
numeric_cols = (
    bed_counts
    + ['All Bed Occupancy Rate', 'Population', 'Population (20+)', 'Population (65+)']
    + ['{} [Per 1000 {}]'.format(beds, group)
       for beds in bed_counts
       for group in per_1000_groups]
)
hospital = hospital[numeric_cols]
hospital.describe()

Unnamed: 0,Staffed All Beds,Staffed ICU Beds,Licensed All Beds,All Bed Occupancy Rate,Population,Population (20+),Population (65+),Staffed All Beds [Per 1000 People],Staffed All Beds [Per 1000 Adults (20+)],Staffed All Beds [Per 1000 Elderly (65+)],Staffed ICU Beds [Per 1000 People],Staffed ICU Beds [Per 1000 Adults (20+)],Staffed ICU Beds [Per 1000 Elderly (65+)],Licensed All Beds [Per 1000 People],Licensed All Beds [Per 1000 Adults (20+)],Licensed All Beds [Per 1000 Elderly (65+)]
count,2436.0,2436.0,2436.0,2436.0,2436.0,2436.0,2436.0,2436.0,2436.0,2436.0,2436.0,2436.0,2436.0,2436.0,2436.0,2436.0
mean,332.106322,32.389163,428.863711,0.416203,129636.6,97113.67,20644.83,2.665162,3.547758,14.597846,0.164152,0.21861,0.961966,3.453977,4.594261,18.99413
std,1003.550685,103.038696,1279.834426,0.209624,374755.2,280999.6,53501.84,2.417202,3.175213,12.894066,0.231258,0.307273,1.375051,3.906481,5.076848,20.905135
min,2.0,0.0,0.0,0.0,1087.0,880.0,254.0,0.092,0.128,0.493,0.0,0.0,0.0,0.0,0.0,0.0
25%,25.0,0.0,25.0,0.24,15504.5,11710.75,3064.0,1.291,1.7105,6.9405,0.0,0.0,0.0,1.588,2.11375,8.511
50%,50.0,6.0,76.0,0.4,35633.5,26880.5,6643.5,2.0385,2.7205,11.203,0.119,0.16,0.658,2.6265,3.506,14.336
75%,209.0,20.0,272.25,0.59,96818.75,72292.0,16374.25,3.138,4.21,17.79275,0.24225,0.32225,1.3545,4.0815,5.4415,22.71725
max,23071.0,2305.0,27400.0,1.0,10105520.0,7658126.0,1375957.0,30.373,39.077,155.666,3.728,4.797,18.824,97.226,120.423,463.326


**Census Data**

In [14]:
# number of census columns that contain at least one missing value
int(census.isna().any().sum())

121

In [15]:
# distribution of per-column null counts, limited to columns that have any nulls
null_counts = census.isna().sum()
null_counts[null_counts > 0].describe()

count     121.000000
mean      274.206612
std       888.196768
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max      3220.000000
dtype: float64

In [16]:
# Columns missing exactly one value: show the row(s) responsible, with geo_id
# included so the affected county can be identified.
per_column_nulls = census.isna().sum()
one_null_lst = ['geo_id'] + per_column_nulls.index[per_column_nulls == 1].tolist()
one_null_view = census[one_null_lst]
one_null_view[one_null_view.isna().any(axis=1)]

Unnamed: 0,geo_id,pop_16_over,pop_25_64,median_income,income_per_capita,income_less_10000,income_10000_14999,income_15000_19999,income_20000_24999,income_25000_29999,...,employed_science_management_admin_waste,employed_transportation_warehousing_utilities,employed_wholesale_trade,occupation_management_arts,occupation_natural_resources_construction_maintenance,occupation_production_transportation_material,occupation_sales_office,occupation_services,management_business_sci_arts_employed,sales_office_employed
0,35039,,,,,,,,,,...,,,,,,,,,,


In [17]:
# (rows, columns) of the raw census table
census.shape

(3220, 242)

In [18]:
# Null counts for columns that are partially missing: more than 5 nulls but
# not entirely empty. The upper bound is the table's own row count rather than
# a hard-coded number, and the null counts are computed once.
null_counts = census.isna().sum()
n_rows = len(census)
null_counts[(null_counts > 5) & (null_counts < n_rows)]

population_1_year_and_over                  79
not_us_citizen_pop                          78
different_house_year_ago_different_city     79
different_house_year_ago_same_city          79
aggregate_travel_time_to_work              158
less_than_high_school_graduate              79
high_school_including_ged                   79
bachelors_degree_2                          79
graduate_professional_degree                79
some_college_and_associates_degree          79
dtype: int64

In [19]:
# shape of the single-null columns once rows with a missing value are removed
census.loc[:, census.isna().sum() == 1].dropna().shape

(3219, 95)

In [20]:
# re-display the shape of `census` (the inspection cells above do not reassign it)
census.shape

(3220, 242)

In [21]:
# keep the columns with at most 100 missing values, then keep only the rows
# that are fully populated across those columns
few_nulls = census.isna().sum() <= 100
census = census.loc[:, few_nulls].dropna()

In [22]:
# poverty_rate is the `poverty` column divided by `pop_determined_poverty_status`
census['poverty_rate'] = census['poverty'].div(census['pop_determined_poverty_status'])

In [23]:
# index by geo_id so the later merge can join on the index
census = census.set_index('geo_id')

In [24]:
# persist the cleaned census table; the geo_id index is written as the first column
census.to_csv('../data/sam_census.csv')

**Merge Datasets**

Merge the three tables on county FIPS code:
* covid
* census
* hospital

In [93]:
# Join census, covid and hospital on their indexes. how='inner' is merge()'s
# default and is spelled out here: only index values present in all three
# tables survive.
index_join = dict(how='inner', left_index=True, right_index=True)
df = census.merge(covid, **index_join).merge(hospital, **index_join)

In [94]:
# (rows, columns) of the merged table
df.shape

(2430, 248)

**Save Dataset**

In [95]:
# persist the merged table; the index is written as the first column
df.to_csv('../data/sam_dataset.csv')

## Checking Collinearity

In [75]:
def check_vif(df, drop=None):
    """Return a table of variance inflation factors, one row per column of ``df``.

    Args:
        df: DataFrame whose columns are scored. Its ``.values`` array is
            passed to ``vif`` together with each column position in turn.
        drop: Optional column label (or list of labels) removed before
            scoring. A falsy value leaves every column in place.

    Returns:
        DataFrame with a ``features`` column holding the column labels and a
        ``VIF`` column holding the value ``vif`` returned for each column.

    NOTE(review): ``vif`` is not defined or imported in the visible cells —
    presumably statsmodels' ``variance_inflation_factor``; confirm and import
    it in the notebook's import cell.
    """
    design = df.drop(drop, axis=1) if drop else df.copy()
    scores = [vif(design.values, position) for position, _ in enumerate(design.columns)]
    return pd.DataFrame({'features': design.columns, 'VIF': scores})

In [76]:
# VIF table for every column kept in `hospital`
check_vif(hospital)

Unnamed: 0,features,VIF
0,Staffed All Beds,139.614618
1,Staffed ICU Beds,36.295288
2,Licensed All Beds,97.308265
3,All Bed Occupancy Rate,3.085035
4,Population,1528.107979
5,Population (20+),1897.727269
6,Population (65+),69.766229
7,Staffed All Beds [Per 1000 People],3286.278987
8,Staffed All Beds [Per 1000 Adults (20+)],4396.739176
9,Staffed All Beds [Per 1000 Elderly (65+)],338.673213


In [77]:
# same check with 'All Bed Occupancy Rate' removed before scoring
check_vif(hospital, drop='All Bed Occupancy Rate')

Unnamed: 0,features,VIF
0,Staffed All Beds,136.397337
1,Staffed ICU Beds,36.018933
2,Licensed All Beds,95.734126
3,Population,1502.209887
4,Population (20+),1857.253617
5,Population (65+),62.751255
6,Staffed All Beds [Per 1000 People],3263.328079
7,Staffed All Beds [Per 1000 Adults (20+)],4360.853258
8,Staffed All Beds [Per 1000 Elderly (65+)],334.254105
9,Staffed ICU Beds [Per 1000 People],1774.835717


In [78]:
# full list of census column labels, used to pick candidate features
census.columns.tolist()

['do_date',
 'total_pop',
 'households',
 'male_pop',
 'female_pop',
 'median_age',
 'male_under_5',
 'male_5_to_9',
 'male_10_to_14',
 'male_15_to_17',
 'male_18_to_19',
 'male_20',
 'male_21',
 'male_22_to_24',
 'male_25_to_29',
 'male_30_to_34',
 'male_35_to_39',
 'male_40_to_44',
 'male_45_to_49',
 'male_50_to_54',
 'male_55_to_59',
 'male_60_to_61',
 'male_62_to_64',
 'male_65_to_66',
 'male_67_to_69',
 'male_70_to_74',
 'male_75_to_79',
 'male_80_to_84',
 'male_85_and_over',
 'female_under_5',
 'female_5_to_9',
 'female_10_to_14',
 'female_15_to_17',
 'female_18_to_19',
 'female_20',
 'female_21',
 'female_22_to_24',
 'female_25_to_29',
 'female_30_to_34',
 'female_35_to_39',
 'female_40_to_44',
 'female_45_to_49',
 'female_50_to_54',
 'female_55_to_59',
 'female_60_to_61',
 'female_62_to_64',
 'female_65_to_66',
 'female_67_to_69',
 'female_70_to_74',
 'female_75_to_79',
 'female_80_to_84',
 'female_85_and_over',
 'white_pop',
 'population_1_year_and_over',
 'population_3_years_

In [80]:
# Hand-picked census predictors for the collinearity check.
# 'age_65_and_over' is not a raw census column: it is the row-wise total of
# the `subset` columns and has to be added to the frame before selecting.
features = [
    'total_pop', 'households', 'male_pop', 'median_age', 'white_pop',
    'not_us_citizen_pop',
    'median_income', 'income_per_capita', 'gini_index',
    'housing_units', 'occupied_housing_units', 'median_year_structure_built',
    'households_public_asst_or_food_stamps',
    'median_rent', 'percent_income_spent_on_rent',
    'walked_to_work', 'worked_at_home', 'no_car',
    'commuters_by_public_transportation',
    'group_quarters', 'bachelors_degree_2', 'employed_pop', 'in_school',
    'poverty_rate', 'age_65_and_over',
]

# Age-by-sex columns for ages 65 and over: all male bands first, then the
# same bands for female.
senior_bands = ['65_to_66', '67_to_69', '70_to_74',
                '75_to_79', '80_to_84', '85_and_over']
subset = [sex + '_' + band for sex in ('male', 'female') for band in senior_bands]

In [81]:
# Build the frame to score without touching `census`: add the 65+ total
# (row-wise sum of the `subset` age bands) and narrow to the chosen `features`.
seniors_total = census[subset].sum(axis=1)
temp = census.assign(age_65_and_over=seniors_total)[features]

In [90]:
# score the selected census features; keep the table in `res` and display it
# NOTE(review): an `inf` VIF means a column is an exact linear combination of
# the others — presumably `households` duplicates another feature; confirm.
res = check_vif(temp)
res

Unnamed: 0,features,VIF
0,total_pop,30610.81
1,households,inf
2,male_pop,16181.68
3,median_age,95.90471
4,white_pop,68.56711
5,not_us_citizen_pop,60.61686
6,median_income,156.7094
7,income_per_capita,217.2043
8,gini_index,341.305
9,housing_units,942.4512
