In [146]:
import pandas as pd 
import numpy as np
from datetime import timedelta
import missingno as msno
from scipy import stats
import statistics
import seaborn as sns
import matplotlib.pyplot as plt 
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

In [147]:
# Read the case-surveillance CSV from the relative ../input directory into df.
df=pd.read_csv(r'../input/covid19-case-surveillance-public-use-dataset/COVID-19_Case_Surveillance_Public_Use_Data.csv')


In [148]:
# First three rows: a quick look at the columns and how values are written.
df.head(3)

In [149]:
# Last three rows.
df.tail(3)

In [150]:
# df.shape is (rows, columns).
print("The number of columns present in a dataset are:",df.shape[1])
print("The number of rows present in a dataset are:",df.shape[0])

In [151]:
# Column labels of the loaded frame.
df.columns

# Column Description

1.cdc_report_dt - earlier clinical dates related to illness or specimen collection

2.pos_spec_dt   - date of the first positive specimen collection

3.onset_dt      - date on which symptoms first appeared

4.sex           - gender of a person

5.age_group     - various age groups of a person

6.Race and ethnicity - social group or cultural tradition of a person

7.hosp_yn       - whether the person was admitted to hospital

8.icu_yn        - status of admission in ICU

9.death_yn      - whether the patient died as a result of this illness

10.medcond_yn   - status of pre-existing medical condition

In [152]:
#data type of various columns
df.info()

In [153]:
#Changing object to datetime 
df['cdc_report_dt']=pd.to_datetime(df['cdc_report_dt'])

df['onset_dt']=pd.to_datetime(df.onset_dt)
df['pos_spec_dt']=pd.to_datetime(df.pos_spec_dt)


In [154]:
#checking the datatypes
df.info()

In [155]:
#After changing the columns datatype let's check for missing in the dataset
df.isnull().sum()

In [156]:
#Percentage of missing values in each columns would be considered better for understanding

def percentage_of_missing(frame=None):
    """Print the percentage of missing values for every column.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Table to inspect. When omitted, the notebook-level ``df`` is used, so
        the existing zero-argument call keeps working.

    Each output line is the full column name followed by the share of null
    entries in that column, rounded to four decimals and expressed in percent.
    """
    if frame is None:
        frame = df  # fall back to the notebook's global DataFrame
    # isnull().mean() is the per-column fraction of null entries.
    missing_share = frame.isnull().mean()
    for column, share in missing_share.items():
        print(column, '\t\t', round(share * 100, 4), '%')
# Print the missing-value percentage of every column of df.
percentage_of_missing()

In [157]:
# missingno matrix view of where values are missing in df.
msno.matrix(df)


In [158]:
#Heatmap with msno helps to know how strongly the presence or absence of one variable affects the presence of another
msno.heatmap(df)

In [159]:
 # missingno dendrogram view of the missingness in df.
 msno.dendrogram(df)

 # DataPreprocessing

In [160]:

# Rather than dropping pos_spec_dt and onset_dt, measure the day gaps between the
# date columns so the missing dates can be filled from cdc_report_dt.
# Subtraction aligns on the index, so rows lacking either date come out as NaN.
df['onset_pos_difference'] = df['onset_dt'].sub(df['pos_spec_dt'].dropna()).dt.days
df['pos_difference'] = df['cdc_report_dt'].sub(df['pos_spec_dt'].dropna()).dt.days
df['onset_difference'] = df['cdc_report_dt'].sub(df['onset_dt'].dropna()).dt.days

In [161]:
# Summary statistics of each day-gap column. The statistics are printed in the
# order listed per row (the last row prints mode before median).
for gap_label, gap_column, stat_order in (
    ('on_pos_set:-', 'onset_pos_difference', ('mean', 'median', 'mode')),
    ('onset_set:-', 'onset_difference', ('mean', 'median', 'mode')),
    ('pos_difference:-', 'pos_difference', ('mean', 'mode', 'median')),
):
    print(gap_label, *(getattr(df[gap_column], stat_name)() for stat_name in stat_order))


In [162]:
# Fill missing onset / specimen dates with the report date shifted by two days.
# NOTE(review): the two-day offset is presumably taken from the gap statistics
# printed in the previous cell - confirm against that output.
# The result is assigned back instead of calling fillna(inplace=True) on
# df[...]: the in-place call acts on a temporary Series and leaves df unchanged
# under pandas Copy-on-Write.
fallback_date = df['cdc_report_dt'] + timedelta(days=2)
df['onset_dt'] = df['onset_dt'].fillna(fallback_date)
df['pos_spec_dt'] = df['pos_spec_dt'].fillna(fallback_date)

In [163]:
#Extracting month
df['month_cdc']=df['cdc_report_dt'].dt.month
#Extracting week
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# dt.isocalendar().week is the supported replacement.
iso_week = df['cdc_report_dt'].dt.isocalendar().week
# isocalendar() returns a nullable UInt32 column. Convert it to the plain dtype
# .dt.week used to produce: int64, or float64 with NaN when a date is missing.
df['week_cdc'] = iso_week.astype('float64') if iso_week.isna().any() else iso_week.astype('int64')

In [164]:
# The day-gap helper columns are no longer needed once the dates are filled.
df = df.drop(
    columns=['onset_pos_difference', 'onset_difference', 'pos_difference']
)

In [165]:
# Missing-value count per column at this point.
df.isnull().sum()

In [166]:
# Distinct values recorded in the sex column.
df.sex.unique()

In [167]:
# Frequency of each sex value.
df['sex'].value_counts()

In [168]:
# Up to the first 29 rows whose age_group is NaN.
df[df['age_group'].isna()][:29]

In [169]:
df['sex']=df.groupby(['death_yn','icu_yn','month_cdc']).sex.transform(lambda x: x.fillna(x.mode()[0]))

In [170]:
df['sex'].unique()

In [171]:
df['sex'].value_counts()

In [172]:
df['sex'].replace({'Missing':np.nan,'Unknown':np.nan},inplace=True)

In [173]:
df['sex']=df.groupby(['death_yn','icu_yn','month_cdc']).sex.transform(lambda x: x.fillna(x.mode()[0]))

df['sex'].unique()

In [174]:
#filling missing values in Race and ethnicity
# Start by listing the distinct values recorded in the column.
df['Race and ethnicity (combined)'].unique()

# Apart from true missing values, the column also contains separate "Unknown" and "Missing" labels
How many missing values are present?

How many "Unknown" values are present?


In [175]:
df['Race and ethnicity (combined)'].value_counts()

In [176]:
#Replacing unkowns and missing to nan and then trying to fill the values 
df["Race and ethnicity (combined)"].replace({"Missing": np.nan, "Unknown": np.nan}, inplace=True)

In [177]:
df['Race and ethnicity (combined)'].unique()

In [178]:
df['Race and ethnicity (combined)']=df.groupby(['death_yn','icu_yn','month_cdc'])['Race and ethnicity (combined)'].transform(lambda x: x.fillna(x.mode()[0]))

In [179]:

df['Race and ethnicity (combined)'].unique()

In [180]:
#Missing values in age_group column

df['age_group'].unique()

In [181]:
df['age_group'].replace({'Unknown':np.nan},inplace=True)

In [182]:
df['age_group'].unique()

In [183]:
df['age_group']=df.groupby(['death_yn','icu_yn','month_cdc','Race and ethnicity (combined)'])['age_group'].transform(lambda x: x.fillna(x.mode()[0]))

In [184]:
df['age_group'].unique()

In [185]:
#checking in other columns  for missing and Unkown data

df['hosp_yn'].unique()

In [186]:
df['hosp_yn'].replace({'Missing':np.nan,'Unknown':np.nan},inplace=True)
df['hosp_yn'].unique()


In [187]:
df['hosp_yn']=df.groupby(['death_yn','Race and ethnicity (combined)'])['hosp_yn'].transform(lambda x: x.fillna(x.mode()[0]))
df['hosp_yn'].unique()

In [188]:
df['hosp_yn'].unique()

In [189]:
#Checking icu_yn
df['icu_yn'].unique()

In [190]:
df['icu_yn'].replace({'Missing':np.nan,'Unknown':np.nan},inplace=True)

In [191]:
df['icu_yn']=df.groupby(['age_group','Race and ethnicity (combined)'])['icu_yn'].transform(lambda x: x.fillna(x.mode()[0]))

In [192]:
df['icu_yn'].unique()

In [193]:
#death column
df['death_yn'].unique()

In [194]:
df['death_yn'].replace({'Missing':np.nan,'Unknown':np.nan},inplace=True)

In [195]:
df['death_yn'].unique()

In [196]:
df['death_yn']=df.groupby(['age_group','icu_yn','Race and ethnicity (combined)'])['death_yn'].transform(lambda x: x.fillna(x.mode()[0]))

In [197]:
df['death_yn'].unique()

In [198]:
#medcond_yn column
df['medcond_yn'].unique()

In [199]:
df['medcond_yn'].replace({'Missing':np.nan},inplace=True)

In [200]:
df['medcond_yn']=df.groupby(['icu_yn','Race and ethnicity (combined)'])['medcond_yn'].transform(lambda x: x.fillna(x.mode()[0]))


In [201]:
df['medcond_yn'].unique()

In [202]:
#checking current status column
df['current_status'].unique()

In [203]:
# Remaining missing-value count per column after the fills above.
df.isnull().sum()

In [204]:
# Summary statistics, transposed so each column of df becomes a row.
df.describe().T

In [205]:
# Values present in death_yn before it is encoded to 0/1.
df['death_yn'].unique()

In [206]:
# Encode the death flag as 1 (Yes) / 0 (No).
# Assigned back because replace(inplace=True) on df['death_yn'] acts on a
# temporary Series and leaves df unchanged under pandas Copy-on-Write.
df['death_yn']=df['death_yn'].replace({'Yes':1,'No':0})

# DataVisualization 

# How to know the values distribution of age group ?

In [207]:
# Number of cases per age group on a log scale.
# plt.hist on a string column bins the category positions into 10 numeric bins,
# so the bars drift away from their tick labels. Counting per category keeps
# one bar per label, and sort_index() puts the age bands in ascending order.
fig, ax = plt.subplots(figsize=(10,6))
df['age_group'].value_counts().sort_index().plot(kind='bar', logy=True, rot=45, ax=ax)
ax.set_xlabel("Age group")
ax.set_ylabel("Number of cases (log scale)")
plt.show()

# How to know the values distribution of Race and ethnicity..?

In [208]:
# Number of cases per race/ethnicity label on a log scale.
# plt.hist on a string column bins the category positions into 10 numeric bins,
# so the bars drift away from their tick labels. A bar chart of value_counts()
# keeps exactly one bar per label.
fig, ax = plt.subplots(figsize=(10,6))
df['Race and ethnicity (combined)'].value_counts().plot(kind='bar', logy=True, rot=45, ax=ax)
ax.set_xlabel("Race Ethnicity")
ax.set_ylabel("Number of cases (log scale)")
plt.show()

# How to know the values distribution in Medical condition..?

In [209]:
# Number of cases per pre-existing-condition label on a log scale.
# plt.hist on a string column bins the category positions into 10 numeric bins,
# so the bars drift away from their tick labels. A bar chart of value_counts()
# keeps exactly one bar per label.
fig, ax = plt.subplots(figsize=(10,6))
df['medcond_yn'].value_counts().plot(kind='bar', logy=True, rot=45, ax=ax)
ax.set_xlabel("Medical condition")
ax.set_ylabel("Number of cases (log scale)")
plt.show()

# Spread of admission based on gender in ICU..?


In [210]:

# Stacked counts of ICU admission status, split by sex.
fig, ax = plt.subplots(figsize=(10,7))
sns.histplot(data=df, x="icu_yn", hue="sex", multiple="stack", ax=ax)
ax.set_xlabel("Admission's in ICU")
plt.show()

# How to know the current status of people based on age group..?

In [211]:
# Case counts per current_status, one bar per age group.
fig, ax = plt.subplots(figsize=(8,6))
sns.countplot(data=df, y="current_status", hue='age_group', ax=ax)
plt.show()

# How to know the death of people based on age group..?


In [212]:

# Case counts per death_yn value, one bar per age group.
fig, ax = plt.subplots(figsize=(8,6))
sns.countplot(data=df, y='death_yn', hue='age_group', ax=ax)
ax.set_ylabel('death')
plt.show()

# How to know the count of people admitted to the ICU based on age group..?

In [213]:

# Case counts per icu_yn value, one bar per age group.
fig, ax = plt.subplots(figsize=(8,6))
sns.countplot(data=df, y='icu_yn', hue='age_group', ax=ax)
ax.set_ylabel('admission in icu')
plt.show()

# How to know the death of people based on race and ethnicity..?

In [214]:
# Case counts per death_yn value, one bar per race/ethnicity label.
fig, ax = plt.subplots(figsize=(8,8))
sns.countplot(data=df, y='death_yn', hue='Race and ethnicity (combined)', ax=ax)
ax.set_ylabel('Death')
plt.show()

# How to know the count of people based on age group and gender..?


In [215]:
# Stacked case counts per age group, split by sex.
fig, ax = plt.subplots(figsize=(15,7))
sns.histplot(data=df, x='age_group', hue='sex', bins=30, multiple='stack', ax=ax)
ax.set_xlabel("Age group")
ax.set_title(" Count of age group's")
plt.show()

# How to figure out the count of people admitted to the ICU based on age groups..?

In [216]:
plt.figure(figsize=(15,7))
sns.histplot(data=df,x='age_group',bins=30,hue='icu_yn',multiple='stack')
plt.xlabel("Age group")
plt.title(" Count of age group's")
plt.show()

In [217]:
df['death_yn'].unique()

In [218]:
values = df['death_yn'].value_counts().tolist()
names = [ 'Yes', 'No']
fig = px.pie(names=names,values=values, title="Distribution Pre-Existing Medical Conditions",)
fig.show()


In [219]:
# Pie chart of pre-existing medical condition status.
# value_counts() orders rows by frequency, so the labels are read from its index
# instead of a hard-coded list that may not match that order.
medcond_counts = df['medcond_yn'].value_counts()
values = medcond_counts.tolist()
names = medcond_counts.index.tolist()
fig = px.pie(names=names,values=values, title="Distribution Pre-Existing Medical Conditions",)
fig.show()


In [220]:
# Pie chart of case status.
# Labels come from the value_counts() index (shortened for display) so each
# slice keeps its own label whatever the frequency order is.
status_counts = df['current_status'].value_counts().rename(
    index={'Laboratory-confirmed case': 'Confirmed', 'Probable Case': 'Probable'})
values = status_counts.tolist()
names = status_counts.index.tolist()
px.pie( names=names,    values=values,title="Case Status Distribution")



In [221]:
# Pairwise correlation of the numeric columns only.
# df still holds string and datetime columns here; recent pandas no longer drops
# them silently inside corr(), so they are filtered out explicitly.
df.select_dtypes(include='number').corr()

In [222]:
# Pairwise scatter/histogram grid of df's numeric columns.
# sns.pairplot builds its own figure, so the plt.figure(...) call that used to
# precede it only produced an extra empty figure and has been removed.
sns.pairplot(df)
plt.show()

In [223]:
#df['year']
# Latest and earliest calendar year found in cdc_report_dt, shown as a tuple.
df['cdc_report_dt'].dt.year.max(),df['cdc_report_dt'].dt.year.min()#as the year is same so not considering year column

In [224]:
# Non-null counts of every column per (week, death flag) pair, with the group
# keys turned back into ordinary columns.
data = (
    df.groupby(["week_cdc", 'death_yn'])
    .count()
    .reset_index()
)

# How to know the trend of deaths based on weeks of a year..?

In [225]:
# Weekly number of cases, one line per death_yn value.
# `data` holds one row per (week, death_yn) pair. Without hue, seaborn averages
# the rows that share a week, blending the death and non-death counts into a
# single line, so the death trend is not visible.
fig, ax = plt.subplots(figsize=(10,7))
sns.lineplot(data=data,x='week_cdc',y='cdc_report_dt',hue='death_yn',ax=ax)
ax.set_xlabel('Week of year')
ax.set_ylabel('Number of cases')
plt.show()

#  How to know the trend of deaths based on weeks of a year , age group , admissions in icu and death of a person..?

In [226]:
data=df.groupby(["week_cdc",'age_group','icu_yn','death_yn']).count()
data.reset_index(inplace=True)

In [227]:
plt.figure(figsize=(10,7))
sns.lineplot(data=data,x='week_cdc',y='cdc_report_dt')
plt.show()

In [228]:
df.head()

In [229]:
df.info()

In [230]:
df.current_status.unique()

In [231]:
df.sex.unique()

In [232]:
df.age_group.unique()

In [233]:
df['Race and ethnicity (combined)'].unique()

In [234]:
df.hosp_yn.unique()

In [235]:
df.icu_yn.unique()

In [236]:
df.medcond_yn.unique()

In [237]:
df['month_cdc'].unique()

In [238]:
df.current_status.replace({'Laboratory-confirmed case':1,'Probable Case':2},inplace=True)

In [239]:
df.sex.replace({'Male':1,'Female':2,'Other':3},inplace=True)


In [240]:
df.age_group.replace({'10 - 19 Years':1, '20 - 29 Years':2, '30 - 39 Years':2, '40 - 49 Years':3,'50 - 59 Years':3, '0 - 9 Years':1, '60 - 69 Years':4, '80+ Years':4,'70 - 79 Years':4},inplace=True)

In [241]:
df['Race and ethnicity (combined)'].replace({'Black, Non-Hispanic':1, 'White, Non-Hispanic':2,
       'Asian, Non-Hispanic':3,
       'American Indian/Alaska Native, Non-Hispanic':4,
       'Multiple/Other, Non-Hispanic':5,
       'Native Hawaiian/Other Pacific Islander, Non-Hispanic':6,
       'Hispanic/Latino':7},inplace=True)

In [242]:
df['hosp_yn'].replace({'No':0,'Yes':1},inplace=True)

In [243]:
df['icu_yn'].replace({'No':0,'Yes':1},inplace=True)

In [244]:
df['medcond_yn'].replace({'No':0, 'Yes':1, 'Unknown':2},inplace=True)

In [245]:
# Annotated correlation heatmap of the numeric columns.
# The datetime columns (and any column still holding strings) are filtered out
# first, because recent pandas no longer drops them silently inside corr().
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(df.select_dtypes(include='number').corr(),annot=True,annot_kws={'size':9},ax=ax)

# Implementing machine learning algorithms

# Performing classification problem based on Death condition


In [246]:
X=df.drop(columns=['death_yn','month_cdc','week_cdc','Race and ethnicity (combined)','age_group','pos_spec_dt','onset_dt','cdc_report_dt'])

In [247]:
y=df['death_yn']


In [248]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier  


In [249]:
from sklearn.model_selection import train_test_split

In [250]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix


In [251]:

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

In [252]:
#Logistic Regression
logistic_reg = LogisticRegression()
pred=logistic_reg.fit(X_train, y_train)
#Predicting values
Y_pred = logistic_reg.predict(X_test)

In [253]:
print(classification_report(y_test, Y_pred))

In [254]:
confusion_matrix(y_test,Y_pred)

In [255]:
plot_confusion_matrix(logistic_reg, X_test, y_test)  

In [257]:
#RandomForest

rf_c=RandomForestClassifier()
pred_rf=rf_c.fit(X_train, y_train)
#Predicting values
Y_pred_rf = pred_rf.predict(X_test)

In [258]:
print(classification_report(y_test, Y_pred_rf))

In [259]:
confusion_matrix(y_test,Y_pred_rf)

In [260]:
plot_confusion_matrix(rf_c, X_test, y_test)  

In [261]:
#Naive Bayes
Nb=GaussianNB()
pred_nb=Nb.fit(X_train, y_train)
#Predicting values
Y_pred_nb = pred_nb.predict(X_test)

In [262]:
print(classification_report(y_test, Y_pred_nb))

In [263]:
confusion_matrix(y_test,Y_pred_nb)

In [264]:
plot_confusion_matrix(Nb, X_test, y_test)

In [265]:
#Decision tree classifier
DC= DecisionTreeClassifier()  
pred_dc=DC.fit(X_train, y_train)

In [266]:
Y_pred_dc= pred_dc.predict(X_test)

In [267]:
print(classification_report(y_test, Y_pred_dc))

In [268]:
confusion_matrix(y_test,Y_pred_dc)

In [269]:
plot_confusion_matrix(DC, X_test, y_test)

# Performing classification problem based on joined in icu 

In [270]:
X=df.drop(columns=['death_yn','month_cdc','week_cdc','Race and ethnicity (combined)','age_group','pos_spec_dt','onset_dt','cdc_report_dt','current_status','icu_yn'])

In [271]:
y=df['icu_yn']

In [273]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [275]:
#Logistic Regression
logistic_reg = LogisticRegression()
pred=logistic_reg.fit(X_train, y_train)
#Predicting values
Y_pred = logistic_reg.predict(X_test)

In [276]:
print(classification_report(y_test, Y_pred))

In [277]:
confusion_matrix(y_test,Y_pred)

In [278]:
plot_confusion_matrix(logistic_reg, X_test, y_test)

In [279]:
#RandomForest

rf_c=RandomForestClassifier()
pred_rf=rf_c.fit(X_train, y_train)
#Predicting values
Y_pred_rf = pred_rf.predict(X_test)

In [282]:
print(classification_report(y_test, Y_pred_rf))

In [283]:
confusion_matrix(y_test,Y_pred_rf)

In [284]:
plot_confusion_matrix(rf_c, X_test, y_test)  

In [285]:
#Naive Bayes
Nb=GaussianNB()
pred_nb=Nb.fit(X_train, y_train)
#Predicting values
Y_pred_nb = pred_nb.predict(X_test)

In [286]:
print(classification_report(y_test, Y_pred_nb))

In [287]:
confusion_matrix(y_test,Y_pred_nb)

In [288]:
plot_confusion_matrix(Nb, X_test, y_test)

In [289]:
#Decision tree classifier
DC= DecisionTreeClassifier()  
pred_dc=DC.fit(X_train, y_train)

In [290]:
Y_pred_dc= pred_dc.predict(X_test)

In [291]:
print(classification_report(y_test, Y_pred_dc))

In [292]:
confusion_matrix(y_test,Y_pred_dc)

In [293]:
plot_confusion_matrix(DC, X_test, y_test)