In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from scipy import stats
import warnings
warnings.simplefilter("ignore")
import statsmodels.api as sm

### Reading the dataset ###

In [2]:
# Load the conditions CSV into a dataframe
conditions_data = pd.read_csv("conditions.csv")
# Preview the first rows to see what the data looks like
conditions_data.head()

#case_surveillance.csv

Unnamed: 0,Data As Of,Start Date,End Date,Group,Year,Month,State,Condition Group,Condition,ICD10_codes,Age Group,COVID-19 Deaths,Number of Mentions,Flag
0,11/27/2022,01/01/2020,11/26/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,1454.0,1520.0,
1,11/27/2022,01/01/2020,11/26/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,25-34,5666.0,5880.0,
2,11/27/2022,01/01/2020,11/26/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,35-44,14785.0,15395.0,
3,11/27/2022,01/01/2020,11/26/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,45-54,36798.0,38235.0,
4,11/27/2022,01/01/2020,11/26/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,55-64,80764.0,83685.0,


### Data Preprocessing ###

In [3]:
# Count the missing (NaN) values in each column of the raw frame
conditions_data.isna().sum()

Data As Of                 0
Start Date                 0
End Date                   0
Group                      0
Year                   12420
Month                  49680
State                      0
Condition Group            0
Condition                  0
ICD10_codes                0
Age Group                  0
COVID-19 Deaths       141685
Number of Mentions    137129
Flag                  342695
dtype: int64

In [4]:
# Drop the columns this analysis treats as unnecessary.
conditions_data = conditions_data.drop(
    columns=['Flag', 'Number of Mentions', 'Data As Of']
)

In [5]:
# Preview the frame to confirm which columns remain
conditions_data.head()

Unnamed: 0,Start Date,End Date,Group,Year,Month,State,Condition Group,Condition,ICD10_codes,Age Group,COVID-19 Deaths
0,01/01/2020,11/26/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,1454.0
1,01/01/2020,11/26/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,25-34,5666.0
2,01/01/2020,11/26/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,35-44,14785.0
3,01/01/2020,11/26/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,45-54,36798.0
4,01/01/2020,11/26/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,55-64,80764.0


### Removing Null Values ###

In [6]:
# Count the missing (NaN) values per column to decide which rows to drop
conditions_data.isna().sum()

Start Date              0
End Date                0
Group                   0
Year                12420
Month               49680
State                   0
Condition Group         0
Condition               0
ICD10_codes             0
Age Group               0
COVID-19 Deaths    141685
dtype: int64

In [7]:
# Keep only the rows that have a value in all of COVID-19 Deaths, Year and
# Month (whole rows are removed, not just the null cells).
# `.copy()` makes `df` an independent frame, so later column assignments on it
# are not treated as writes to a slice of `conditions_data` (the usual trigger
# for pandas' SettingWithCopyWarning).
df = conditions_data.dropna(subset=['COVID-19 Deaths', 'Year', 'Month']).copy()

In [8]:
# Preview the filtered frame
df.head()

Unnamed: 0,Start Date,End Date,Group,Year,Month,State,Condition Group,Condition,ICD10_codes,Age Group,COVID-19 Deaths
49680,01/01/2020,01/31/2020,By Month,2020.0,1.0,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,0.0
49681,02/01/2020,02/29/2020,By Month,2020.0,2.0,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,0.0
49682,03/01/2020,03/31/2020,By Month,2020.0,3.0,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,9.0
49683,04/01/2020,04/30/2020,By Month,2020.0,4.0,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,27.0
49684,05/01/2020,05/31/2020,By Month,2020.0,5.0,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,19.0


In [9]:
# Count the missing (NaN) values per column in the filtered frame
df.isna().sum()

Start Date         0
End Date           0
Group              0
Year               0
Month              0
State              0
Condition Group    0
Condition          0
ICD10_codes        0
Age Group          0
COVID-19 Deaths    0
dtype: int64

In [10]:
# Summarise the column dtypes and non-null counts of the filtered frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 302414 entries, 49680 to 484379
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Start Date       302414 non-null  object 
 1   End Date         302414 non-null  object 
 2   Group            302414 non-null  object 
 3   Year             302414 non-null  float64
 4   Month            302414 non-null  float64
 5   State            302414 non-null  object 
 6   Condition Group  302414 non-null  object 
 7   Condition        302414 non-null  object 
 8   ICD10_codes      302414 non-null  object 
 9   Age Group        302414 non-null  object 
 10  COVID-19 Deaths  302414 non-null  float64
dtypes: float64(3), object(8)
memory usage: 27.7+ MB


### Changing the data type ###

In [11]:
# Convert the columns to their proper dtypes in one re-runnable step.
# - Start Date / End Date hold MM/DD/YYYY strings; parse them with an explicit
#   format instead of the unit-less `astype('datetime64')` cast, which
#   pandas >= 2.0 rejects.
# - Year, Month and COVID-19 Deaths are float columns; rows with nulls in them
#   were dropped when `df` was created, so they can be cast to integers.
# `df` is rebound to the new frame rather than assigned column by column, which
# avoids chained-assignment warnings on a derived frame.
df = df.assign(
    **{
        date_col: pd.to_datetime(df[date_col], format='%m/%d/%Y')
        for date_col in ('Start Date', 'End Date')
    }
).astype({'Year': 'int', 'Month': 'int', 'COVID-19 Deaths': 'int'})

In [12]:
df.head()

Unnamed: 0,Start Date,End Date,Group,Year,Month,State,Condition Group,Condition,ICD10_codes,Age Group,COVID-19 Deaths
49680,2020-01-01,2020-01-31,By Month,2020,1,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,0
49681,2020-02-01,2020-02-29,By Month,2020,2,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,0
49682,2020-03-01,2020-03-31,By Month,2020,3,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,9
49683,2020-04-01,2020-04-30,By Month,2020,4,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,27
49684,2020-05-01,2020-05-31,By Month,2020,5,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,19


In [13]:
# Record and print the current number of rows in df
num_rows = len(df)

print(num_rows)

302414


### Dropping duplicate rows if any ###

In [17]:
# Remove fully duplicated rows, keeping the first occurrence of each;
# the surviving rows keep their original index labels.
df = df.drop_duplicates(keep='first')
df

Unnamed: 0,Start Date,End Date,Group,Year,Month,State,Condition Group,Condition,ICD10_codes,Age Group,COVID-19 Deaths
49680,2020-01-01,2020-01-31,By Month,2020,1,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,0
49681,2020-02-01,2020-02-29,By Month,2020,2,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,0
49682,2020-03-01,2020-03-31,By Month,2020,3,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,9
49683,2020-04-01,2020-04-30,By Month,2020,4,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,27
49684,2020-05-01,2020-05-31,By Month,2020,5,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,19
...,...,...,...,...,...,...,...,...,...,...,...
484375,2022-07-01,2022-07-31,By Month,2022,7,Puerto Rico,COVID-19,COVID-19,U071,All Ages,201
484376,2022-08-01,2022-08-31,By Month,2022,8,Puerto Rico,COVID-19,COVID-19,U071,All Ages,222
484377,2022-09-01,2022-09-30,By Month,2022,9,Puerto Rico,COVID-19,COVID-19,U071,All Ages,168
484378,2022-10-01,2022-10-31,By Month,2022,10,Puerto Rico,COVID-19,COVID-19,U071,All Ages,115


In [None]:
### Correlation between Condition and Deaths