In [1]:
# Data cleaning notebook for Case Surveillance Dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from scipy import stats
import warnings
warnings.simplefilter("ignore")
import statsmodels.api as sm

In [2]:
# Single source of truth for the columns pulled from the raw CSV: each key is a
# column to load and each value is the dtype pandas should parse it as.
# Low-cardinality answer columns are read as `category`; the month and state
# columns are read as plain strings.
column_datatypes = {
    **dict.fromkeys(
        ["case_month", "res_state"],
        "str",
    ),
    **dict.fromkeys(
        [
            "age_group",
            "exposure_yn",
            "current_status",
            "symptom_status",
            "hosp_yn",
            "icu_yn",
            "death_yn",
            "underlying_conditions_yn",
        ],
        "category",
    ),
}

# Column names in load order, derived from the dtype map so the two cannot drift apart.
required_columns = list(column_datatypes)

In [3]:
%%time
case_data =  pd.read_csv("case_surveillance.csv",usecols=required_columns, dtype=column_datatypes,
                         memory_map=True)

CPU times: user 1min 19s, sys: 15.1 s, total: 1min 34s
Wall time: 1min 56s


In [4]:
# Preview the first 10 rows of the freshly loaded frame.
case_data.head(10)

Unnamed: 0,case_month,res_state,age_group,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2022-03,CA,50 to 64 years,Missing,Laboratory-confirmed case,Symptomatic,Missing,Missing,Missing,
1,2021-07,KS,50 to 64 years,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
2,2022-05,NC,50 to 64 years,Missing,Laboratory-confirmed case,Unknown,Unknown,Unknown,No,
3,2022-06,RI,65+ years,Missing,Probable Case,Missing,Missing,Missing,No,
4,2020-08,FL,18 to 49 years,Missing,Laboratory-confirmed case,Missing,Missing,Missing,No,
5,2021-09,KY,50 to 64 years,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
6,2020-11,MN,65+ years,Missing,Laboratory-confirmed case,Symptomatic,Missing,Missing,No,
7,2020-10,IL,65+ years,Missing,Probable Case,Missing,Missing,Missing,,
8,2021-05,WY,,Missing,Laboratory-confirmed case,Missing,Missing,Missing,,
9,2022-01,MI,18 to 49 years,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Unknown,


In [5]:
# NOTE: plain assignment binds a second name to the SAME DataFrame object; no
# copy is made, so any in-place operation on `case_df` also changes `case_data`.
case_df = case_data

In [6]:
# Count the null (NaN) entries in each column to see which columns need cleaning.
case_df.isna().sum()

case_month                         9
res_state                       1288
age_group                     723020
exposure_yn                        0
current_status                     0
symptom_status                     0
hosp_yn                            0
icu_yn                             0
death_yn                     2826297
underlying_conditions_yn    87213425
dtype: int64

In [7]:
# Keep only the rows in which every one of these columns is present.
# `case_df` is rebound to the filtered result instead of using inplace=True:
# `case_df` was created as an alias of `case_data`, so an in-place drop would
# also silently shrink the raw frame and make its earlier preview stale.
# Row labels are preserved (the index is not reset).
case_df = case_df.dropna(
    subset=[
        'case_month',
        'res_state',
        'age_group',
        'death_yn',
        'underlying_conditions_yn',
    ]
)

In [8]:
# Re-count nulls per column after the dropna step to confirm none remain.
case_df.isna().sum()

case_month                  0
res_state                   0
age_group                   0
exposure_yn                 0
current_status              0
symptom_status              0
hosp_yn                     0
icu_yn                      0
death_yn                    0
underlying_conditions_yn    0
dtype: int64

In [9]:
# Summarise the remaining rows: index range, column dtypes and memory footprint.
case_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2703284 entries, 27 to 90183145
Data columns (total 10 columns):
 #   Column                    Dtype   
---  ------                    -----   
 0   case_month                object  
 1   res_state                 object  
 2   age_group                 category
 3   exposure_yn               category
 4   current_status            category
 5   symptom_status            category
 6   hosp_yn                   category
 7   icu_yn                    category
 8   death_yn                  category
 9   underlying_conditions_yn  category
dtypes: category(8), object(2)
memory usage: 82.5+ MB


In [13]:
# Store the state code with pandas' dedicated `string` dtype instead of object.
case_df['res_state'] = case_df['res_state'].astype('string')

# Round-trip case_month through datetime: parsing with an explicit '%Y-%m'
# format checks every value against that pattern, and formatting it back
# leaves the column as 'YYYY-MM' text (object dtype) rather than timestamps.
case_df['case_month'] = (
    pd.to_datetime(case_df['case_month'], format='%Y-%m')
    .dt.strftime('%Y-%m')
)

In [14]:
# Re-check the column dtypes after the conversions (res_state should now be `string`).
case_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2703284 entries, 27 to 90183145
Data columns (total 10 columns):
 #   Column                    Dtype   
---  ------                    -----   
 0   case_month                object  
 1   res_state                 string  
 2   age_group                 category
 3   exposure_yn               category
 4   current_status            category
 5   symptom_status            category
 6   hosp_yn                   category
 7   icu_yn                    category
 8   death_yn                  category
 9   underlying_conditions_yn  category
dtypes: category(8), object(1), string(1)
memory usage: 82.5+ MB


In [15]:
# Spot-check the first 5 rows of the cleaned frame.
case_df.head(5)

Unnamed: 0,case_month,res_state,age_group,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
27,2020-12,WA,65+ years,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,Missing,Yes
42,2022-01,PA,65+ years,Yes,Laboratory-confirmed case,Symptomatic,No,No,No,Yes
103,2020-07,WA,65+ years,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,Unknown,Yes
188,2022-02,WY,18 to 49 years,Missing,Laboratory-confirmed case,Symptomatic,Missing,Missing,Missing,Yes
190,2022-02,NC,50 to 64 years,Unknown,Probable Case,Symptomatic,No,Unknown,No,Yes


In [21]:
# Persist the cleaned frame as a comma-separated UTF-8 file. index=False
# leaves the row labels out of the export.
case_df.to_csv(
    'cleaned_case_data_new.csv',
    sep=',',
    quotechar='"',
    encoding='utf-8',
    index=False,
)