In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from scipy import stats
import warnings
warnings.simplefilter("ignore")
import statsmodels.api as sm

### Reading the dataset ###

In [29]:
# First read the data in a dataframe
conditions_data = pd.read_csv("conditions.csv")
# See how the data looks like
conditions_data.head()

#case_surveillance.csv

Unnamed: 0,Data As Of,Start Date,End Date,Group,Year,Month,State,Condition Group,Condition,ICD10_codes,Age Group,COVID-19 Deaths,Number of Mentions,Flag
0,11/27/2022,01/01/2020,11/26/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,1454.0,1520.0,
1,11/27/2022,01/01/2020,11/26/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,25-34,5666.0,5880.0,
2,11/27/2022,01/01/2020,11/26/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,35-44,14785.0,15395.0,
3,11/27/2022,01/01/2020,11/26/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,45-54,36798.0,38235.0,
4,11/27/2022,01/01/2020,11/26/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,55-64,80764.0,83685.0,


In [30]:
### Data Preprocessing ###

In [4]:
conditions_data.isna().sum()

Data As Of                 0
Start Date                 0
End Date                   0
Group                      0
Year                   12420
Month                  49680
State                      0
Condition Group            0
Condition                  0
ICD10_codes                0
Age Group                  0
COVID-19 Deaths       141685
Number of Mentions    137129
Flag                  342695
dtype: int64

In [20]:
### Removing unnecessary columns

conditions_data = conditions_data.drop(['Flag','Number of Mentions'], axis=1)

In [21]:
conditions_data.head()

Unnamed: 0,Data As Of,Start Date,End Date,Group,Year,Month,State,Condition Group,Condition,ICD10_codes,Age Group,COVID-19 Deaths
0,11/27/2022,01/01/2020,11/26/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,1454.0
1,11/27/2022,01/01/2020,11/26/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,25-34,5666.0
2,11/27/2022,01/01/2020,11/26/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,35-44,14785.0
3,11/27/2022,01/01/2020,11/26/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,45-54,36798.0
4,11/27/2022,01/01/2020,11/26/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,55-64,80764.0


### Removing Null Values ###

In [17]:
conditions_data.isna().sum()

Data As Of                 0
Start Date                 0
End Date                   0
Group                      0
State                      0
Condition Group            0
Condition                  0
ICD10_codes                0
Age Group                  0
COVID-19 Deaths       141685
Number of Mentions    137129
Flag                  342695
dtype: int64

In [22]:
### Removing null values from the columns COVID-19 Deaths, Year, Month
df = conditions_data.dropna(subset=['COVID-19 Deaths','Year', 'Month'])

In [23]:
df.head()

Unnamed: 0,Data As Of,Start Date,End Date,Group,Year,Month,State,Condition Group,Condition,ICD10_codes,Age Group,COVID-19 Deaths
49680,11/27/2022,01/01/2020,01/31/2020,By Month,2020.0,1.0,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,0.0
49681,11/27/2022,02/01/2020,02/29/2020,By Month,2020.0,2.0,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,0.0
49682,11/27/2022,03/01/2020,03/31/2020,By Month,2020.0,3.0,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,9.0
49683,11/27/2022,04/01/2020,04/30/2020,By Month,2020.0,4.0,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,27.0
49684,11/27/2022,05/01/2020,05/31/2020,By Month,2020.0,5.0,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,19.0


In [25]:
df.isna().sum()

Data As Of         0
Start Date         0
End Date           0
Group              0
Year               0
Month              0
State              0
Condition Group    0
Condition          0
ICD10_codes        0
Age Group          0
COVID-19 Deaths    0
dtype: int64

In [5]:
%%time
required_columns = ['case_month','res_state','age_group','exposure_yn','current_status','symptom_status','hosp_yn',
                    'icu_yn','death_yn','underlying_conditions_yn']

column_datatypes = {"case_month": "str", "res_state": "str", "age_group": "category", "exposure_yn":"category",
             "current_status": "category", "symptom_status": "category", "hosp_yn": "category", "icu_yn": "category", "death_yn":"category",
             "underlying_conditions_yn": "category"}

case_data =  pd.read_csv("case_surveillance.csv",usecols=required_columns, dtype=column_datatypes,
                         memory_map=True)
# case_data =  pd.read_csv("case_surveillance.csv", nrows=1000000)

CPU times: user 1min 17s, sys: 12.4 s, total: 1min 29s
Wall time: 1min 48s


In [6]:
case_data.info(verbose=False, memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90183148 entries, 0 to 90183147
Columns: 10 entries, case_month to underlying_conditions_yn
dtypes: category(8), object(2)
memory usage: 11.0 GB


In [7]:
case_data.head(10)

Unnamed: 0,case_month,res_state,age_group,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2022-03,CA,50 to 64 years,Missing,Laboratory-confirmed case,Symptomatic,Missing,Missing,Missing,
1,2021-07,KS,50 to 64 years,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
2,2022-05,NC,50 to 64 years,Missing,Laboratory-confirmed case,Unknown,Unknown,Unknown,No,
3,2022-06,RI,65+ years,Missing,Probable Case,Missing,Missing,Missing,No,
4,2020-08,FL,18 to 49 years,Missing,Laboratory-confirmed case,Missing,Missing,Missing,No,
5,2021-09,KY,50 to 64 years,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
6,2020-11,MN,65+ years,Missing,Laboratory-confirmed case,Symptomatic,Missing,Missing,No,
7,2020-10,IL,65+ years,Missing,Probable Case,Missing,Missing,Missing,,
8,2021-05,WY,,Missing,Laboratory-confirmed case,Missing,Missing,Missing,,
9,2022-01,MI,18 to 49 years,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Unknown,


In [8]:
case_data.isna().sum()

case_month                         9
res_state                       1288
age_group                     723020
exposure_yn                        0
current_status                     0
symptom_status                     0
hosp_yn                            0
icu_yn                             0
death_yn                     2826297
underlying_conditions_yn    87213425
dtype: int64

In [9]:
case_data.dropna(subset=['underlying_conditions_yn'],inplace=True)

In [10]:
# case_data.dropna(subset=['death_yn'],inplace=True)

In [11]:
case_data.info(verbose=False, memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2969723 entries, 27 to 90183145
Columns: 10 entries, case_month to underlying_conditions_yn
dtypes: category(8), object(2)
memory usage: 393.7 MB


In [12]:
case_data.isna().sum()

case_month                       0
res_state                      136
age_group                    46397
exposure_yn                      0
current_status                   0
symptom_status                   0
hosp_yn                          0
icu_yn                           0
death_yn                    230833
underlying_conditions_yn         0
dtype: int64

In [13]:
case_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2969723 entries, 27 to 90183145
Data columns (total 10 columns):
 #   Column                    Dtype   
---  ------                    -----   
 0   case_month                object  
 1   res_state                 object  
 2   age_group                 category
 3   exposure_yn               category
 4   current_status            category
 5   symptom_status            category
 6   hosp_yn                   category
 7   icu_yn                    category
 8   death_yn                  category
 9   underlying_conditions_yn  category
dtypes: category(8), object(2)
memory usage: 90.6+ MB


In [14]:
# case_data = case_data[(case_data.death_yn !='Missing') & (case_data.death_yn !='Unknown')]
case_data = case_data[(case_data.hosp_yn !='Missing') & (case_data.hosp_yn !='Unknown')]
case_data = case_data[(case_data.icu_yn !='Missing') & (case_data.icu_yn !='Unknown')]
case_data = case_data[(case_data.symptom_status !='Missing') & (case_data.symptom_status !='Unknown')]
case_data = case_data[(case_data.current_status !='Probable Case')]

In [15]:
# case_data['underlying_conditions_yn'] = case_data.underlying_conditions_yn.replace({"Yes":1,
#                                                         "No": 0})

# case_data['death_yn'] = case_data.death_yn.replace({"Yes":1,
#                                                         "No": 0})

case_data['hosp_yn'] = case_data.hosp_yn.replace({"Yes":1,
                                                        "No": 0})

case_data['icu_yn'] = case_data.icu_yn.replace({"Yes":1,
                                                        "No": 0})

case_data['symptom_status'] = case_data.icu_yn.replace({"Symptomatic":1,
                                                        "Asymptomatic": 0})

In [16]:
# case_data.symptom_status.unique()
case_data.info()
# case_data['underlying_conditions_yn'] = case_data['underlying_conditions_yn'].astype('int')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 730310 entries, 42 to 90183106
Data columns (total 10 columns):
 #   Column                    Non-Null Count   Dtype   
---  ------                    --------------   -----   
 0   case_month                730310 non-null  object  
 1   res_state                 730246 non-null  object  
 2   age_group                 715907 non-null  category
 3   exposure_yn               730310 non-null  category
 4   current_status            730310 non-null  category
 5   symptom_status            730310 non-null  category
 6   hosp_yn                   730310 non-null  category
 7   icu_yn                    730310 non-null  category
 8   death_yn                  640684 non-null  category
 9   underlying_conditions_yn  730310 non-null  category
dtypes: category(8), object(2)
memory usage: 22.3+ MB


In [17]:
# case_data['death_yn'] = case_data['death_yn'].astype(str).astype('int')

In [18]:
case_data['hosp_yn'] = case_data['hosp_yn'].astype(str).astype('int')

In [19]:
case_data['icu_yn'] = case_data['icu_yn'].astype(str).astype('int')
case_data['symptom_status'] = case_data['symptom_status'].astype(str).astype('int')

In [20]:
# # Multiple Regression
# features = ['underlying_conditions_yn','hosp_yn','icu_yn','symptom_status']
# label = ['death_yn']
# X = case_data[features]
# Y = case_data[label]
# X_new = sm.add_constant(X)
# regcorr = sm.OLS(Y,X_new)
# stats = regcorr.fit()
# print(stats.summary())

In [27]:
from scipy import stats
r, p = stats.pearsonr(case_data.symptom_status,case_data.hosp_yn)
print(r,p)

0.5057817213703841 0.0
