In [9]:
import pandas as pd, numpy as np
import random

In [10]:
# Full case-surveillance CSV that the random samples are drawn from.
in_file = "./raw/COVID-19_Case_Surveillance_Public_Use_Data_with_Geography.csv"

# Available samples keyed by a size label; each value is [csv path, number of rows].
sample_options = {
    "50k": ["./raw/sample50k.csv", 50_000],
    "500k": ["./raw/sample500k.csv", 500_000],
    "1kk": ["./raw/sample1kk.csv", 1_000_000],
    "5kk": ["./raw/sample5kk.csv", 5_000_000],
}

# Select the sample to work with and unpack its path and size in one lookup.
sample_file, sample_size = sample_options["5kk"]

In [31]:
## Optional one-off step: draw the random sample with `shuf` before importing it.
## (`shuf` is part of GNU coreutils; on macOS install `coreutils` and call it as `gshuf`.)
## NOTE(review): the second command samples from every line of $in_file, header included,
## so the header line may also end up as a data row -- confirm, or skip line 1 when sampling.
#!head -n1 $in_file > $sample_file # get the header
#!gshuf -n $sample_size $in_file >> $sample_file

In [14]:
# Read the sample CSV into a DataFrame and report its dimensions.
# NOTE(review): this path repeats the one already selected through
# sample_options["5kk"] in the configuration cell; the two are kept in sync by hand.
sample_file = "./raw/sample5kk.csv"
# header=0: the first line of the file supplies the column names.
# NOTE(review): nrows=1000 limits the read to the first 1000 rows, while outputs of
# later cells report millions of rows -- confirm whether this cap is still intended.
my_df = pd.read_csv(
    filepath_or_buffer=sample_file,
    nrows=1000,
    header=0,
)
my_df.shape

(1000, 19)

In [17]:
# Preview the first five rows of the loaded sample.
my_df.head(n=5)

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2021-12,MI,26,OCEANA,26127.0,18 to 49 years,Male,White,Non-Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Unknown,
1,2022-01,IL,17,FRANKLIN,17055.0,50 to 64 years,Female,Missing,Missing,,0.0,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
2,2022-01,MA,25,ESSEX,25009.0,18 to 49 years,Female,,,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
3,2022-01,GA,13,FULTON,13121.0,18 to 49 years,Male,White,Non-Hispanic/Latino,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,Missing,Missing,Missing,
4,2021-07,FL,12,POLK,12105.0,18 to 49 years,Female,White,Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,


In [6]:
# Number of fully populated rows, i.e. rows without a missing value in any column.
len(my_df.dropna())

96750

In [38]:
# Share of missing values per column: the mean of the boolean NA mask.
my_df.isna().mean()

case_month                         0.000000
res_state                          0.000016
state_fips_code                    0.000016
res_county                         0.067335
county_fips_code                   0.067335
age_group                          0.019734
sex                                0.039485
race                               0.369996
ethnicity                          0.432406
case_positive_specimen_interval    0.642981
case_onset_interval                0.585819
process                            0.961552
exposure_yn                        0.947470
current_status                     0.000000
symptom_status                     0.607779
hosp_yn                            0.611965
icu_yn                             0.963516
death_yn                           0.672541
underlying_conditions_yn           0.958144
dtype: float64

In [39]:
# Inspect how values are distributed across the categorical columns.
# Columns treated as factors:
ls_cols = [
    "case_month",
    "res_state",
    "age_group",
    "sex",
    "race",
    "ethnicity",
    "exposure_yn",
    "current_status",
    "symptom_status",
    "hosp_yn",
    "icu_yn",
    "death_yn",
    "underlying_conditions_yn",
]
# Print one frequency table per column; dropna=False counts missing values
# as a category of their own.
for col in ls_cols:
    print(f"\n==== {col} ====")
    print(my_df[col].value_counts(dropna=False))


==== case_month ====
2022-01    1108488
2021-12     471912
2020-12     434382
2021-01     367371
2020-11     327749
2021-08     274025
2021-09     258084
2021-11     186011
2021-10     171834
2020-10     153727
2021-03     142504
2021-02     142109
2021-04     120131
2020-07     119141
2022-02     110567
2021-07     105218
2020-08      91866
2020-09      88523
2020-06      79084
2021-05      66127
2020-04      62305
2020-05      56013
2021-06      32632
2020-03      29506
2020-02        347
2020-01        344
Name: case_month, dtype: int64

==== res_state ====
CA     701022
NY     385673
FL     259356
IL     239351
PA     215743
OH     208125
NC     201568
NJ     172286
GA     161698
TN     155540
AZ     152058
IN     133606
MA     131286
SC     115332
WA     111241
MN     111063
VA     110486
MO     110158
MI     104377
CO     103078
AL      99494
LA      91072
WI      79340
TX      66927
AR      63942
KS      60650
UT      58518
OR      53834
OK      52721
MD      52210
KY      4941

In [40]:
# Show the dtype pandas inferred for each column when reading the CSV.
my_df.dtypes

case_month                          object
res_state                           object
state_fips_code                    float64
res_county                          object
county_fips_code                   float64
age_group                           object
sex                                 object
race                                object
ethnicity                           object
case_positive_specimen_interval    float64
case_onset_interval                float64
process                             object
exposure_yn                         object
current_status                      object
symptom_status                      object
hosp_yn                             object
icu_yn                              object
death_yn                            object
underlying_conditions_yn            object
dtype: object

In [49]:
# Look for special patterns among the missing values.
# Rows where the state is recorded but the county is missing:
my_df.loc[my_df["res_state"].notna() & my_df["res_county"].isna()]

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
21,2022-01,AL,1.0,,,,Male,,,,,,,Laboratory-confirmed case,,,,,
40,2020-07,MO,29.0,,,50 to 64 years,Male,White,Non-Hispanic/Latino,0.0,,,,Laboratory-confirmed case,,,,,
57,2021-09,OH,39.0,,,18 to 49 years,Male,White,Non-Hispanic/Latino,0.0,0.0,Clinical evaluation,,Laboratory-confirmed case,Symptomatic,No,,No,Yes
64,2022-01,HI,15.0,,,18 to 49 years,Male,Asian,,,,,,Laboratory-confirmed case,,,,,
68,2020-06,AL,1.0,,,50 to 64 years,,,,,0.0,,,Laboratory-confirmed case,Symptomatic,No,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4999917,2020-05,VA,51.0,,,18 to 49 years,Male,Black,Non-Hispanic/Latino,,,,,Laboratory-confirmed case,,,,,
4999923,2022-01,ND,38.0,,,18 to 49 years,Female,White,,,0.0,,,Probable Case,Symptomatic,,,,
4999966,2020-07,NJ,34.0,,,50 to 64 years,Female,White,,,,,,Laboratory-confirmed case,,,,No,
4999969,2021-09,MO,29.0,,,18 to 49 years,Female,,,0.0,,,,Laboratory-confirmed case,,,,,


In [51]:
# Consistency check: rows that name a county but have no county FIPS code.
my_df.loc[my_df["res_county"].notna() & my_df["county_fips_code"].isna()]

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn


In [52]:
# Consistency check: rows that name a state but have no state FIPS code.
my_df.loc[my_df["res_state"].notna() & my_df["state_fips_code"].isna()]

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn


In [57]:
# Rows with ethnicity "Hispanic/Latino" whose race is recorded and is not "White".
my_df.loc[
    my_df["ethnicity"].eq("Hispanic/Latino")
    & my_df["race"].notna()
    & my_df["race"].ne("White")
]

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
718,2022-01,MA,25.0,BRISTOL,25005.0,0 - 17 years,Female,Black,Hispanic/Latino,,,,,Probable Case,,,,,
963,2021-01,FL,12.0,ORANGE,12095.0,18 to 49 years,Female,Black,Hispanic/Latino,,0.0,,,Laboratory-confirmed case,Symptomatic,No,,,
1194,2021-01,NY,36.0,QUEENS,36081.0,0 - 17 years,Male,Black,Hispanic/Latino,0.0,,,,Laboratory-confirmed case,Asymptomatic,,,,
1279,2021-12,IL,17.0,COOK,17031.0,50 to 64 years,Female,Black,Hispanic/Latino,,0.0,,,Laboratory-confirmed case,,,,,
1301,2021-01,NC,37.0,MECKLENBURG,37119.0,0 - 17 years,Female,American Indian/Alaska Native,Hispanic/Latino,0.0,,,,Laboratory-confirmed case,,,,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4998650,2021-12,NY,36.0,BRONX,36005.0,18 to 49 years,Female,Black,Hispanic/Latino,0.0,,,,Probable Case,,,,,
4999072,2022-01,CA,6.0,RIVERSIDE,6065.0,18 to 49 years,Male,American Indian/Alaska Native,Hispanic/Latino,,,,,Laboratory-confirmed case,,,,,
4999341,2021-08,NY,36.0,NEW YORK,36061.0,18 to 49 years,Female,Black,Hispanic/Latino,0.0,,,,Laboratory-confirmed case,Asymptomatic,,,,
4999405,2022-01,CA,6.0,SAN BERNARDINO,6071.0,0 - 17 years,Female,Asian,Hispanic/Latino,,,,,Laboratory-confirmed case,,,,,


In [53]:
# Absolute number of missing values in each column.
my_df.isna().sum(axis="index")

case_month                               0
res_state                               78
state_fips_code                         78
res_county                          336673
county_fips_code                    336673
age_group                            98668
sex                                 197427
race                               1849981
ethnicity                          2162028
case_positive_specimen_interval    3214903
case_onset_interval                2929095
process                            4807761
exposure_yn                        4737348
current_status                           0
symptom_status                     3038896
hosp_yn                            3059825
icu_yn                             4817578
death_yn                           3362704
underlying_conditions_yn           4790719
dtype: int64

In [4]:
# Consistency check: rows that name a county although the state is missing.
my_df.loc[my_df["res_county"].notna() & my_df["res_state"].isna()]

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
