In [62]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
# Read the COVID-19 case surveillance public-use CSV and preview the first rows
data_path = '../input/covid19-case-surveillance-public-use-dataset/COVID-19_Case_Surveillance_Public_Use_Data.csv'
df = pd.read_csv(data_path)
df.head()

In [3]:
# Turn the literal string 'Missing' into a real missing value (NaN) everywhere
df = df.replace('Missing', np.nan)

In [4]:
df.info()

In [5]:
df.isnull().sum()

In [6]:
plt.figure(figsize=(16,8))
sns.heatmap(df.isnull())

In [6]:
# Flag a case as symptomatic when a symptom-onset date is present
df['symptomatic_yn'] = np.where(df['onset_dt'].notnull(), 'Yes', 'No')
df[['symptomatic_yn', 'onset_dt']].head(10)

In [7]:
# Flag a case as tested when a positive-specimen date is present.
# The column is selected as a Series (single brackets) so np.where returns a
# 1-D array rather than an (n, 1) array.
df['tested_yn'] = np.where(df['pos_spec_dt'].isnull(), 'No', 'Yes')
# Preview a few rows instead of rendering the whole frame
df[['tested_yn', 'pos_spec_dt']].head(10)

In [8]:
# The two date columns have been summarised as yes/no flags, so remove them
df = df.drop(columns=['onset_dt', 'pos_spec_dt'])

In [9]:
# Convert the report date column to pandas datetimes
df['cdc_report_dt'] = pd.to_datetime(df['cdc_report_dt'])

In [10]:
# Index the frame by report date; later cells group on df.index (per month, per day)
df.set_index('cdc_report_dt', inplace=True)
df.head()

In [11]:
# Keep only the rows that have at least 7 non-null values
df = df.dropna(thresh=7)

In [12]:
df.shape

In [14]:
plt.figure(figsize=(16,8))
sns.heatmap(df.isnull())

## Data Exploration

In [15]:
df.isnull().sum()

In [16]:
df['symptomatic_yn'].value_counts()

In [17]:
df['tested_yn'].value_counts()

In [18]:
df['current_status'].value_counts()

In [19]:
df['sex'].value_counts()

In [20]:
df['age_group'].value_counts()

In [21]:
df['Race and ethnicity (combined)'].value_counts()

In [22]:
df['hosp_yn'].value_counts()

In [23]:
df['icu_yn'].value_counts()

In [24]:
df['death_yn'].value_counts()

In [25]:
df['medcond_yn'].value_counts()

In [None]:
#df['month'] = pd.DatetimeIndex(df['cdc_report_dt']).month
#df['day'] = pd.DatetimeIndex(df['cdc_report_dt']).day
#df['month_year'] = pd.to_datetime(df['cdc_report_dt']).dt.to_period('M')

In [26]:
# Count of non-null current_status entries for each month of the report date
monthly_cases = df['current_status'].groupby(df.index.month).count()
sns.barplot(x=monthly_cases.index, y=monthly_cases.values);
plt.xticks(rotation=90);

In [13]:
# Cases per report date split by sex: one row per index value, one column per sex value
d = df.groupby([df.index]).sex.value_counts().sort_index().unstack()

In [28]:
# Daily case counts for each sex. Every line carries its own label so the legend
# cannot get out of step with the plotting order (Female is blue, Male is red).
plt.figure(figsize=(20,8))
d.Female.plot(color='b', label='Female');
d.Male.plot(color='r', label='Male');
plt.xticks(rotation=0);
plt.xlabel('date', fontsize=18)
plt.ylabel('number of cases per day', fontsize=18)
plt.legend();

In [None]:
#x_male = df_male.groupby('month').count()['cdc_report_dt']
#x_female = df_female.groupby('month').count()['cdc_report_dt']
#sns.lineplot(x=x_male.index, y=x_male.values, color='b');
#sns.lineplot(x=x_female.index, y=x_female.values, color='r');
#plt.xticks(rotation=90);
#plt.legend(['Male', 'Female'])

In [None]:
sns.histplot(data=df, x="age_group");
plt.xticks(rotation=60);

In [None]:
sns.histplot(data=df, x="Race and ethnicity (combined)");
plt.xticks(rotation=90);

In [None]:
sns.histplot(data=df, x="sex");
plt.xticks(rotation=45);

In [None]:
# df has no 'day' column (the report date is the index), so derive the
# day-of-month from the index on a temporary copy for plotting only
sns.histplot(data=df.assign(day=df.index.day), x="day");
plt.xticks(rotation=90);

In [None]:
sns.histplot(data=df, x="hosp_yn");
plt.xticks(rotation=90);

In [None]:
sns.histplot(data=df, x="icu_yn");
plt.xticks(rotation=90);

In [None]:
sns.histplot(data=df, x="medcond_yn");
plt.xticks(rotation=90);

In [None]:
sns.histplot(data=df, x="death_yn");
plt.xticks(rotation=90);

In [15]:
# Treat 'Unknown' age groups as missing and drop those rows.
# The result is assigned back explicitly: calling an inplace method on a column
# selection is chained assignment, which pandas does not guarantee will update df.
df['age_group'] = df['age_group'].replace('Unknown', np.nan)
df = df.dropna(subset=['age_group'])

In [16]:
df.shape

In [None]:
# drop rows with Missing and Unknown and Other values from 'sex' column
#df.drop(df.loc[(df['sex'].isnull()) | (df['sex']=='Missing') | (df['sex']=='Unknown') | (df['sex']=='Other')].index, inplace=True)
#df['sex'].value_counts()

In [None]:
# drop rows with Missing and Unknown and Other values from 'Race and ethnicity (combined)' column
#df.drop(df.loc[df['Race and ethnicity (combined)']=='Missing'].index, inplace=True)
#df['Race and ethnicity (combined)'].value_counts()

## Data Visualization

In [17]:
# Deaths by age group. Columns are passed by keyword because newer seaborn
# releases treat the first positional argument as `data`, which clashes with data=df.
sns.countplot(x='age_group', hue='death_yn', data=df)
# Rotate the tick labels after the plot has created them
plt.xticks(rotation=90);

In [None]:
# Deaths by report month. df has no 'month' column (the report date is the index),
# so the month is derived from the index on a temporary copy for plotting only.
sns.countplot(x='month', hue='death_yn', data=df.assign(month=df.index.month))
plt.xticks(rotation=90);

In [None]:
# Hospitalisations by report month; the month is derived from the datetime index
# because df has no 'month' column
sns.countplot(x='month', hue='hosp_yn', data=df.assign(month=df.index.month));

In [19]:
# Hospitalisations by age group (columns passed by keyword; a positional first
# argument is read as `data` by newer seaborn and clashes with data=df)
sns.countplot(x='age_group', hue='hosp_yn', data=df);
plt.xticks(rotation=90);

In [20]:
# Hospitalisation status within each death_yn value (keyword x avoids the
# positional-argument clash with data=df on newer seaborn)
sns.countplot(x='death_yn', hue='hosp_yn', data=df);

In [21]:
# Hospitalisation status within each icu_yn value (keyword x avoids the
# positional-argument clash with data=df on newer seaborn)
sns.countplot(x='icu_yn', hue='hosp_yn', data=df);

These figures show that a high percentage of the people who went to hospital did so in the first months, that most of the people who went to hospital were older than 60, that most of the people who died had been hospitalised, and that everyone who went to the ICU had also been hospitalised, which makes sense.

In [None]:
# ICU admissions by report month; the month is derived from the datetime index
# because df has no 'month' column
sns.countplot(x='month', hue='icu_yn', data=df.assign(month=df.index.month));

In [None]:
# Underlying medical conditions by report month; the month is derived from the
# datetime index because df has no 'month' column
sns.countplot(x='month', hue='medcond_yn', data=df.assign(month=df.index.month));

In [22]:
# Age-group labels to tabulate, listed from youngest to oldest
age = ['0 - 9 Years', '10 - 19 Years', '20 - 29 Years', '30 - 39 Years', '40 - 49 Years',
       '50 - 59 Years', '60 - 69 Years', '70 - 79 Years', '80+ Years']

def create_list(value, data=None):
    """Tabulate deaths and survivals for each requested age group.

    Parameters
    ----------
    value : iterable of str
        Age-group labels to report, in the order the rows should appear.
    data : pandas.DataFrame, optional
        Frame with 'age_group' and 'death_yn' columns. When omitted, the
        notebook-level ``df`` is used, as before.

    Returns
    -------
    list of list
        One ``[label, n_death_yes, n_death_no]`` entry per label. Labels that
        do not occur in the data get zero counts.
    """
    frame = df if data is None else data
    # Count each outcome once for all age groups instead of re-scanning the
    # whole frame twice per label.
    died = frame.loc[frame['death_yn'] == 'Yes', 'age_group'].value_counts()
    survived = frame.loc[frame['death_yn'] == 'No', 'age_group'].value_counts()
    return [
        [label, int(died.get(label, 0)), int(survived.get(label, 0))]
        for label in value
    ]

## Convert the list to a DataFrame; the 'year' column holds the age-group label
new_df = pd.DataFrame(create_list(age), columns=['year', 'Death', 'Alive'])

In [23]:
## Plot pie chart Death
plt.figure(figsize=(15, 15))
plt.subplot(121)
# Offset every wedge by the same amount. The list is sized from new_df so it
# always matches the number of wedges (plt.pie rejects a length mismatch).
explode = [0.1] * len(new_df)
plt.pie(new_df['Death'], startangle = 90, explode=explode, autopct='%1.0f%%', shadow=True)
plt.legend(labels=new_df['year'])
plt.title("Death in 2020/11/10 to 2020/11/14")

## Plot pie chart Alive
plt.subplot(122)
plt.pie(new_df['Alive'], startangle = 90, explode=explode, autopct='%1.0f%%', shadow=True)
plt.legend(labels=new_df['year'])
plt.title("Alive in 2020/11/10 to 2020/11/14")
plt.show()

## Show the underlying counts as a table
fig, ax = plt.subplots()
ax.table(cellText=new_df.values, colLabels=new_df.columns, loc="center")
ax.axis('off')
fig.tight_layout()
plt.show()

This pie chart shows that most of the people who died were older than 50, although most of the cases were among young people.

In [24]:
# Death outcome by case status (x passed by keyword; a positional first argument
# is read as `data` by newer seaborn and clashes with data=df)
sns.countplot(x='current_status', hue='death_yn', data=df);

In [25]:
# Hospitalisation by case status (x passed by keyword; a positional first argument
# is read as `data` by newer seaborn and clashes with data=df)
sns.countplot(x='current_status', hue='hosp_yn', data=df);

In [26]:
# ICU admission by case status (x passed by keyword; a positional first argument
# is read as `data` by newer seaborn and clashes with data=df)
sns.countplot(x='current_status', hue='icu_yn', data=df);

## Data Preprocessing

In [27]:
# Ordinal code for each age group (0 = youngest ... 8 = oldest)
age_codes = {'0 - 9 Years':0, '10 - 19 Years':1, '20 - 29 Years':2, '30 - 39 Years':3, '40 - 49 Years':4,
       '50 - 59 Years':5, '60 - 69 Years':6, '70 - 79 Years':7, '80+ Years':8}
# map() produces a numeric column directly; replace() on a string column relies on
# pandas silently downcasting the result, which is deprecated behaviour.
df['age'] = df['age_group'].map(age_codes)
# map() turns unexpected labels into NaN, so fail loudly rather than train on them
if df['age'].isnull().any():
    raise ValueError('age_group contains values without an age code: %s'
                     % sorted(set(df.loc[df['age'].isnull(), 'age_group'].astype(str))))

In [28]:
# One-hot encode the listed categorical columns; drop_first=True leaves out the first level of each
df_encoded = pd.get_dummies(df, columns=["current_status", "sex", "symptomatic_yn", "tested_yn"], drop_first=True)
df_encoded.head()

In [29]:
df_encoded.shape

In [30]:
# age_group is now represented by the numeric 'age' column, so remove the text version
df_encoded = df_encoded.drop(columns=['age_group'])

In [31]:
df_encoded.isnull().sum()

In [32]:
# Use a RandomForestClassifier to fill the missing values of 'hosp_yn'.
# random_state is fixed so the imputed values are the same on every run.
rf_hosp = RandomForestClassifier(n_estimators=10, random_state=42)

# One-hot encode the other categorical columns so the forest gets numeric inputs
df_hosp = pd.get_dummies(df_encoded, columns=["icu_yn", "death_yn", "medcond_yn", 'Race and ethnicity (combined)'])
hosp_known = df_encoded['hosp_yn'].notnull()
train_hosp = df_hosp.loc[hosp_known]   # rows with a recorded hosp_yn
test_hosp = df_hosp.loc[~hosp_known]   # rows whose hosp_yn has to be imputed

features = train_hosp.drop(['hosp_yn'], axis=1)
label = train_hosp['hosp_yn']
X_train, X_val, y_train, y_val = train_test_split(features, label, test_size=.2, random_state=42)
X_test = test_hosp.drop(['hosp_yn'], axis=1)

rf_hosp.fit(X_train, y_train)

# Accuracy on the training split, then on the held-out validation split
print(rf_hosp.score(X_train, y_train))
print(rf_hosp.score(X_val, y_val))

In [33]:
# Validation metrics for the hosp_yn imputer; predict once and reuse the result
hosp_val_pred = rf_hosp.predict(X_val)
print(confusion_matrix(y_val, hosp_val_pred))
print(classification_report(y_val, hosp_val_pred))

In [34]:
# Write the predicted hosp_yn into the rows where it is missing. The guard makes
# the cell a no-op when nothing is missing (e.g. when it is run a second time).
hosp_missing = df_encoded['hosp_yn'].isnull()
if hosp_missing.any():
    df_encoded.loc[hosp_missing, 'hosp_yn'] = rf_hosp.predict(X_test)

In [None]:
# Overlay hosp_yn counts: red = df_encoded (after imputation), blue = df (not imputed)
sns.histplot(data=df_encoded, x="hosp_yn", color='r');
sns.histplot(data=df, x="hosp_yn", color='b');
plt.xticks(rotation=90);

In [36]:
# Use a RandomForestClassifier to fill the missing values of 'medcond_yn'.
# random_state is fixed so the imputed values are the same on every run.
rf_medcond = RandomForestClassifier(n_estimators=10, random_state=42)

# One-hot encode the other categorical columns so the forest gets numeric inputs
df_medcond = pd.get_dummies(df_encoded, columns=["icu_yn", "death_yn", "hosp_yn", 'Race and ethnicity (combined)'])
medcond_known = df_encoded['medcond_yn'].notnull()
train_medcond = df_medcond.loc[medcond_known]   # rows with a recorded medcond_yn
test_medcond = df_medcond.loc[~medcond_known]   # rows whose medcond_yn has to be imputed

features = train_medcond.drop(['medcond_yn'], axis=1)
label = train_medcond['medcond_yn']
X_train, X_val, y_train, y_val = train_test_split(features, label, test_size=.2, random_state=42)
X_test = test_medcond.drop(['medcond_yn'], axis=1)

rf_medcond.fit(X_train, y_train)

# Accuracy on the training split, then on the held-out validation split
print(rf_medcond.score(X_train, y_train))
print(rf_medcond.score(X_val, y_val))

In [37]:
# Validation metrics for the medcond_yn imputer; predict once and reuse the result
medcond_val_pred = rf_medcond.predict(X_val)
print(confusion_matrix(y_val, medcond_val_pred))
print(classification_report(y_val, medcond_val_pred))

In [38]:
# Write the predicted medcond_yn into the rows where it is missing. The guard makes
# the cell a no-op when nothing is missing (e.g. when it is run a second time).
medcond_missing = df_encoded['medcond_yn'].isnull()
if medcond_missing.any():
    df_encoded.loc[medcond_missing, 'medcond_yn'] = rf_medcond.predict(X_test)

In [None]:
#df_encoded['medcond_yn'].value_counts()

In [None]:
# Overlay medcond_yn counts: red = df_encoded (after imputation), blue = df (not imputed)
sns.histplot(data=df_encoded, x="medcond_yn", color='r');
sns.histplot(data=df, x="medcond_yn", color='b');
plt.xticks(rotation=90);

In [39]:
# Use a RandomForestClassifier to fill the missing values of 'icu_yn'.
# random_state is fixed so the imputed values are the same on every run.
rf_icu = RandomForestClassifier(n_estimators=10, random_state=42)

# One-hot encode the other categorical columns so the forest gets numeric inputs
df_icu = pd.get_dummies(df_encoded, columns=["medcond_yn", "death_yn", "hosp_yn",'Race and ethnicity (combined)'])
icu_known = df_encoded['icu_yn'].notnull()
train_icu = df_icu.loc[icu_known]   # rows with a recorded icu_yn
test_icu = df_icu.loc[~icu_known]   # rows whose icu_yn has to be imputed

features = train_icu.drop('icu_yn', axis=1)
label = train_icu['icu_yn']
X_train, X_val, y_train, y_val = train_test_split(features, label, test_size=.2, random_state=42)
X_test = test_icu.drop('icu_yn', axis=1)

rf_icu.fit(X_train, y_train)

# Accuracy on the training split, then on the held-out validation split
print(rf_icu.score(X_train, y_train))
print(rf_icu.score(X_val, y_val))

In [40]:
# Validation metrics for the icu_yn imputer; predict once and reuse the result
icu_val_pred = rf_icu.predict(X_val)
print(confusion_matrix(y_val, icu_val_pred))
print(classification_report(y_val, icu_val_pred))

In [41]:
# Write the predicted icu_yn into the rows where it is missing. The guard makes
# the cell a no-op when nothing is missing (e.g. when it is run a second time).
icu_missing = df_encoded['icu_yn'].isnull()
if icu_missing.any():
    df_encoded.loc[icu_missing, 'icu_yn'] = rf_icu.predict(X_test)

In [None]:
#df_encoded['icu_yn'].value_counts()

In [None]:
# Overlay icu_yn counts: red = df_encoded (after imputation), blue = df (not imputed)
sns.histplot(data=df_encoded, x="icu_yn", color='r');
sns.histplot(data=df, x="icu_yn", color='b');
plt.xticks(rotation=90);

In [42]:
# Use a RandomForestClassifier to fill the missing values of 'Race and ethnicity (combined)'.
# random_state is fixed so the imputed values are the same on every run.
rf_race = RandomForestClassifier(n_estimators=10, random_state=42)

# One-hot encode the other categorical columns so the forest gets numeric inputs
df_race = pd.get_dummies(df_encoded, columns=["icu_yn", "death_yn", "medcond_yn", 'hosp_yn'])
race_known = df_encoded['Race and ethnicity (combined)'].notnull()
train_race = df_race.loc[race_known]   # rows with a recorded race/ethnicity
test_race = df_race.loc[~race_known]   # rows whose race/ethnicity has to be imputed

features = train_race.drop(['Race and ethnicity (combined)'], axis=1)
label = train_race['Race and ethnicity (combined)']
X_train, X_val, y_train, y_val = train_test_split(features, label, test_size=.2, random_state=42)
X_test = test_race.drop(['Race and ethnicity (combined)'], axis=1)

rf_race.fit(X_train, y_train)

# Accuracy on the training split, then on the held-out validation split
print(rf_race.score(X_train, y_train))
print(rf_race.score(X_val, y_val))

In [43]:
# Validation metrics for the race/ethnicity imputer; predict once and reuse the result
race_val_pred = rf_race.predict(X_val)
print(confusion_matrix(y_val, race_val_pred))
print(classification_report(y_val, race_val_pred))

In [44]:
# Write the predicted race/ethnicity into the rows where it is missing. The guard makes
# the cell a no-op when nothing is missing (e.g. when it is run a second time).
race_missing = df_encoded['Race and ethnicity (combined)'].isnull()
if race_missing.any():
    df_encoded.loc[race_missing, 'Race and ethnicity (combined)'] = rf_race.predict(X_test)

In [None]:
# Overlay race/ethnicity counts: red = df_encoded (after imputation), blue = df (not imputed)
sns.histplot(data=df_encoded, x="Race and ethnicity (combined)", color='r');
sns.histplot(data=df, x="Race and ethnicity (combined)", color='b');
plt.xticks(rotation=90);

In [45]:
# Use a RandomForestClassifier to fill the missing values of 'death_yn'.
# random_state is fixed so the imputed values are the same on every run.
# NOTE(review): death_yn is also the label of the classifiers trained further down,
# so model-imputed labels end up in their train and test sets - confirm this is intended.
rf_death = RandomForestClassifier(n_estimators=10, random_state=42)

# One-hot encode the other categorical columns so the forest gets numeric inputs
df_death = pd.get_dummies(df_encoded, columns=["medcond_yn", "icu_yn", "hosp_yn", 'Race and ethnicity (combined)'])
death_known = df_encoded['death_yn'].notnull()
train_death = df_death.loc[death_known]   # rows with a recorded death_yn
test_death = df_death.loc[~death_known]   # rows whose death_yn has to be imputed

features = train_death.drop('death_yn', axis=1)
label = train_death['death_yn']
X_train, X_val, y_train, y_val = train_test_split(features, label, test_size=.2, random_state=42)
X_test = test_death.drop('death_yn', axis=1)

rf_death.fit(X_train, y_train)

# Accuracy on the training split, then on the held-out validation split
print(rf_death.score(X_train, y_train))
print(rf_death.score(X_val, y_val))

In [46]:
# Validation metrics for the death_yn imputer; predict once and reuse the result
death_val_pred = rf_death.predict(X_val)
print(confusion_matrix(y_val, death_val_pred))
print(classification_report(y_val, death_val_pred))

In [47]:
# Write the predicted death_yn into the rows where it is missing. The guard makes
# the cell a no-op when nothing is missing (e.g. when it is run a second time).
death_missing = df_encoded['death_yn'].isnull()
if death_missing.any():
    df_encoded.loc[death_missing, 'death_yn'] = rf_death.predict(X_test)

In [48]:
df_encoded['death_yn'].value_counts()

In [None]:
# Overlay death_yn counts: red = df_encoded (after imputation), blue = df (not imputed)
sns.histplot(data=df_encoded, x="death_yn", color='r');
sns.histplot(data=df, x="death_yn", color='b');
plt.xticks(rotation=90);

In [49]:
df_encoded.head()

In [50]:
df_encoded.to_csv('final_data.csv', index=False)

## Classification Models

In [51]:
# #df_final = pd.read_csv('final_data.csv')
df_final = df_encoded
df_final.head()

In [52]:
df_final.isnull().sum()

In [53]:
# One-hot encode the remaining categorical predictors (death_yn stays as the label column);
# drop_first=True leaves out the first level of each
df_final = pd.get_dummies(df_final, columns=['hosp_yn','icu_yn','medcond_yn', 'Race and ethnicity (combined)'], drop_first=True)
df_final.head()

In [54]:
df_final.shape

In [55]:
# Separate the label (death_yn) from the predictors and hold out 20% of the rows for testing
label = df_final['death_yn']
features = df_final.drop(columns=['death_yn'])
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=42)

In [64]:
models = []

In [58]:

def evaluate_models(y_true, y_pred):
    """Print a classification report and return summary metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model.

    Returns
    -------
    tuple
        ``(confusion_matrix, accuracy)`` as computed by scikit-learn.
    """
    matrix = confusion_matrix(y_true, y_pred)
    print(classification_report(y_true, y_pred))
    return matrix, accuracy_score(y_true, y_pred)
    

In [59]:
def plot_cm(cm, title='', labels=('No', 'Unknown', 'Yes')):
    """Draw a confusion matrix as an annotated heatmap and show it.

    Parameters
    ----------
    cm : 2-D array of int
        Confusion matrix. Assumed to follow scikit-learn's layout
        (rows = true class, columns = predicted class).
    title : str, optional
        Figure title.
    labels : sequence of str, optional
        Class names for the ticks on both axes, in matrix order. The default
        matches the three death_yn classes used in this notebook.
    """
    tick_labels = list(labels)
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=tick_labels, yticklabels=tick_labels, cmap=plt.cm.Blues)
    # Name the axes so the figure can be read on its own
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.title(title)
    plt.show()

**Random Forest Classifier**

In [60]:
# random_state is fixed so the reported accuracies are reproducible between runs
rf = RandomForestClassifier(n_estimators=10, random_state=42)
rf.fit(X_train, y_train)
# Accuracy on the training split, then on the test split
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))

In [65]:
y_pred_rf = rf.predict(X_test)
cm_rf, acc_rf = evaluate_models(y_test, y_pred_rf)
plot_cm(cm_rf, 'Random Forest Confusion Matrix ')
models.append(['Random Forest', acc_rf])

**AdaBoost Classifier**

In [66]:
from sklearn.ensemble import AdaBoostClassifier
# random_state is fixed so the model comparison is reproducible between runs
clf = AdaBoostClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred_ada = clf.predict(X_test)

In [67]:
cm_ada, acc_ada = evaluate_models(y_test, y_pred_ada)
plot_cm(cm_ada, 'AdaBoost Classifier Confusion Matrix ')
models.append(['AdaBoost', acc_ada])

**Decision Tree**

In [68]:
from sklearn.tree import DecisionTreeClassifier
# random_state is fixed so the model comparison is reproducible between runs
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)
y_pred_dtc = dtc.predict(X_test)

In [69]:
cm_dt, acc_dt = evaluate_models(y_test, y_pred_dtc)
plot_cm(cm_dt, 'Decision Tree Confusion Matrix')
models.append(['Decision Tree', acc_dt])

**Neural Network**

In [74]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
# Import the public Keras utilities module rather than the private
# `tensorflow.python.keras` path. The alias keeps the existing
# `np_utils.to_categorical(...)` call working unchanged.
from tensorflow.keras import utils as np_utils

In [71]:
# Integer class codes for the network's labels; the order matches the tick labels used in plot_cm
pmap = {'No':0,'Unknown':1,'Yes':2}
df_final['death'] = df_final['death_yn'].map(pmap)
df_final.head()

In [75]:
# Predictors for the network: everything except the two label columns.
# Cast to one float dtype up front; a frame mixing bool dummy columns with integer
# columns converts to an object array, which Keras cannot turn into a tensor.
features_nn = df_final.drop(['death_yn', 'death'], axis=1).astype('float32')
label_nn = df_final['death']
# One-hot encode the integer class codes for categorical cross-entropy
y = np_utils.to_categorical(label_nn)
X_train_nn, X_test_nn, y_train_nn, y_test_nn = train_test_split(features_nn, y, test_size=0.2, random_state=42)

In [79]:
y_train_nn.shape

In [80]:
# Fully connected network: five ReLU hidden layers narrowing from 128 to 8 units,
# then a softmax layer with one unit per one-hot label column
model = Sequential()
model.add(Dense(128, input_dim=X_train_nn.shape[1], activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(y_train_nn.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_nn, y_train_nn, epochs=10, batch_size=64)
# Loss and accuracy measured on the training data itself
eva_train = model.evaluate(X_train_nn, y_train_nn)
eva_train

In [82]:
y_pred_nn = model.predict(X_test_nn)
# Collapse the one-hot test labels to class indices. y_test_nn is overwritten in
# place, so only convert while it is still 2-D; this keeps the cell safe to re-run.
if y_test_nn.ndim > 1:
    y_test_nn = y_test_nn.argmax(1)
# Pick the most probable class for each test row
y_pred_nn = np.argmax(y_pred_nn, axis=1)

In [83]:
cm_nn, acc_nn = evaluate_models(y_test_nn, y_pred_nn)
plot_cm(cm_nn, 'Neural Network Confusion Matrix')
models.append(['Neural Network', acc_nn])

In [84]:
# plt.figure(figsize=(7, 5))
# One bar per [name, accuracy] entry. The loop variables are deliberately not
# called `model`, which would overwrite the trained Keras network of that name.
for model_name, model_acc in models:
    plt.bar(model_name, model_acc)

plt.xlabel('Models')
plt.ylabel("Accuracy")
plt.title('Compare Models')
plt.show()