In [1]:
import pandas as pd

In [2]:
# Load the raw dataset from `data.csv` (a path relative to the working directory).
# Later cells read its `date`, `state` and `county` columns.
data = pd.read_csv('data.csv')

## County Selection

In [3]:
# Number of distinct dates present in the data.
print(len(data['date'].unique()))

# First and last date on record.
# NOTE(review): the printed values look like ISO `YYYY-MM-DD` strings, for which
# min/max also order chronologically - confirm the column format.
print(data['date'].min())
print(data['date'].max())

# Number of non-null `date` rows each (state, county) pair has on record, most first.
# Selecting `date` before counting avoids tallying every other column only to
# throw the result away.
county_date = (
    data.groupby(['state', 'county'])[['date']]
    .count()
    .sort_values(by='date', ascending=False)
    .reset_index()
    .rename(columns={'date': 'num_date'})
)

county_date.head(10)

319
2020-01-21
2020-12-04


Unnamed: 0,state,county,num_date
0,Washington,Snohomish,319
1,Illinois,Cook,316
2,California,Orange,315
3,California,Los Angeles,314
4,Arizona,Maricopa,314
5,California,Santa Clara,309
6,Massachusetts,Suffolk,308
7,California,San Francisco,307
8,Wisconsin,Dane,304
9,California,San Diego,299


In [4]:
# Keep only the counties observed on roughly 70% or more of the dates.
# The cutoff is provisional and may be tuned later.
MIN_NUM_DATES = 220
well_covered = county_date.loc[county_date['num_date'] >= MIN_NUM_DATES]

# `county_date` stays on the left of the merge on purpose: the result then starts
# with `state` and `county`, and a later cell selects columns by position.
data = well_covered.merge(data, on=['state', 'county']).drop(columns='num_date')

## Data Inspection - Erica

In [5]:
# Work on the first 57 columns only. The explicit copy makes `sub_data` an
# independent frame, so adding a column to it does not raise pandas'
# SettingWithCopyWarning or risk writing through to `data`.
sub_data = data.iloc[:, :57].copy()

# One label per county, formatted as "<state>-<county>".
sub_data['entity'] = sub_data['state'] + '-' + sub_data['county']

In [6]:
# Tally the missing values in every column, ordered from fewest to most.
na_counts = sub_data.isna().sum()
num_na_df = (
    na_counts.rename_axis('col')
    .reset_index(name='num_na')
    .sort_values(by='num_na')
)

# Keep only the columns that actually have missing values.
num_na_df = num_na_df.loc[num_na_df['num_na'] != 0]
num_na_df

Unnamed: 0,col,num_na
32,percent_uninsured,729
43,num_some_college,729
44,population,729
45,percent_some_college,729
46,num_unemployed_CHR,729
47,labor_force,729
48,percent_unemployed_CHR,729
49,percent_children_in_poverty,729
53,num_single_parent_households_CHR,729
54,num_households_CHR,729


In [7]:
def stationary_check(df):
    """Find the (entity, column) pairs whose value changes within an entity.

    A column is "stationary" for an entity when every row of that entity holds
    the same value. A missing value counts as a value of its own, so a column
    that is entirely missing for an entity is stationary, while a column that
    is only partly missing is not.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``entity`` column; every other column is checked.
        Rows whose ``entity`` is missing are ignored.

    Returns
    -------
    list of [entity, column]
        One entry per non-stationary pair, with entities in order of first
        appearance and columns in sorted order. Empty when every column is
        stationary for every entity.
    """
    feature_cols = list(df.columns.difference(['entity']))
    if not feature_cols:
        return []

    # One grouped pass instead of re-filtering the whole frame for every entity.
    # dropna=False makes NaN count as a value, as it does with Series.unique().
    unique_counts = df.groupby('entity', sort=False)[feature_cols].nunique(dropna=False)

    non_stationary = []
    for entity, counts in zip(unique_counts.index, unique_counts.to_numpy()):
        for col, num_unique in zip(feature_cols, counts):
            if num_unique != 1:
                non_stationary.append([entity, col])
    return non_stationary

In [8]:
# Names of the columns that contain at least one missing value.
na_cols = num_na_df['col'].tolist()

# Are those columns constant within a county, or do they vary over time?
# The answer tells us where the gaps sit:
#     - varying  -> values are missing on particular dates of a county
#     - constant -> values are missing for the county across all dates
stationary_check(sub_data[['entity', *na_cols]])

[]

In [19]:
# The features with missing values are all stationary, so after dropping
# duplicate rows each entity is expected to appear exactly once.
df = sub_data.loc[:, ['entity'] + na_cols].drop_duplicates()

# Missing-feature count per entity as [entity, count] pairs, computed in one
# vectorized pass instead of re-filtering `df` once per entity.
county_na = [
    [entity, na_count]
    for entity, na_count in zip(df['entity'], df.isna().sum(axis=1).tolist())
]

In [20]:
# Entities ranked by how many features they are missing, worst first.
county_na_df = pd.DataFrame(county_na, columns=['entity', 'na_num'])
county_na_df.sort_values(by='na_num', ascending=False)

# Suggestion: remove St. John, St. Croix and St. Thomas (Virgin Islands).

Unnamed: 0,entity,na_num
2432,Virgin Islands-St. John,49
2431,Virgin Islands-St. Croix,49
2424,Virgin Islands-St. Thomas,49
1890,Virginia-Manassas Park city,16
2423,Alaska-Petersburg Borough,13
...,...,...
1157,South Carolina-Oconee,0
1158,Georgia-Morgan,0
1159,Colorado-Teller,0
1160,Kansas-Bourbon,0


In [21]:
# Entities excluded from the per-feature missing-value count.
excluded_entities = ['Virgin Islands-St. John', 'Virgin Islands-St. Croix',
                     'Virgin Islands-St. Thomas']

# `~` is the supported boolean NOT for a mask; unary `-` on a boolean Series is not.
# Keep the entity label plus the columns with missing values, then de-duplicate rows.
df = sub_data.loc[~sub_data['entity'].isin(excluded_entities),
                  ['entity'] + na_cols].drop_duplicates()

In [22]:
# For every feature, count the rows of `df` whose value is missing, as
# [feature, count] pairs in sorted feature order.
# A column-wise `isna().sum()` replaces a Python loop that filtered `df` once per feature.
feature_cols = df.columns.difference(['entity'])
feature_na = [
    [feature, int(na_num)]
    for feature, na_num in df[feature_cols].isna().sum().items()
]

In [23]:
# Features ranked by the number of entities with a missing value, worst first.
feature_na_df = pd.DataFrame(feature_na, columns=['feature', 'na_num'])
feature_na_df.sort_values(by='na_num', ascending=False)

# No feature needs to be removed: each is missing for fewer than 10% of counties.
# Some of the information (e.g. percent_vaccinated) may also be available elsewhere.

Unnamed: 0,feature,na_num
12,mental_health_provider_rate,143
20,num_mental_health_providers,143
48,years_of_potential_life_lost_rate,131
16,num_deaths,131
21,num_primary_care_physicians,109
44,primary_care_physicians_rate,107
4,dentist_rate,74
17,num_dentists,74
3,chlamydia_rate,62
15,num_chlamydia_cases,62
