In [None]:
# import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Print the full path of every file found under /kaggle/input.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
# Read the case-surveillance CSV into `df` and preview the first rows.
DATA_PATH = '/kaggle/input/covid19-case-surveillance-public-use-dataset/COVID-19_Case_Surveillance_Public_Use_Data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Number of null entries per column.
df.isnull().sum()

In [None]:
# Drop the 'onset_dt' and 'pos_spec_dt' columns from df (in place).
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of raising KeyError.
df.drop(columns=['onset_dt', 'pos_spec_dt'], errors='ignore', inplace=True)

In [None]:
# Drop rows with null values.
# With dropna's defaults a row is removed if any of its columns is null;
# inplace=True mutates df directly, so later cells see the reduced frame.
df.dropna(inplace=True)

In [None]:
# Re-check the per-column null counts after the column and row drops.
df.isnull().sum()

In [None]:
#df['cdc_report_dt'].sort_values()

In [None]:
#df['onset_dt'].value_counts()

In [None]:
#df['pos_spec_dt'].value_counts()

In [None]:
# Frequency of each distinct value in 'current_status'.
df['current_status'].value_counts()

In [None]:
# Frequency of each distinct value in 'sex'.
df['sex'].value_counts()

In [None]:
# Frequency of each distinct value in 'age_group'.
df['age_group'].value_counts()

In [None]:
# Frequency of each distinct value in 'Race and ethnicity (combined)'.
df['Race and ethnicity (combined)'].value_counts()

In [None]:
# Frequency of each distinct value in 'hosp_yn'.
df['hosp_yn'].value_counts()

In [None]:
# Frequency of each distinct value in 'icu_yn'.
df['icu_yn'].value_counts()

In [None]:
# Frequency of each distinct value in 'death_yn'.
df['death_yn'].value_counts()

In [None]:
# Frequency of each distinct value in 'medcond_yn'.
df['medcond_yn'].value_counts()

In [None]:
# Derive calendar features from 'cdc_report_dt'.
# The column is converted to datetimes once and the result reused for all three
# features; previously the same column was parsed three separate times.
report_date = pd.to_datetime(df['cdc_report_dt'])
df['month'] = report_date.dt.month              # month number
df['day'] = report_date.dt.day                  # day of the month
df['month_year'] = report_date.dt.to_period('M')  # monthly period, used for grouping

In [None]:
# Bar chart of the number of rows reported in each 'month_year' period.
cases_per_month = df.groupby('month_year')['cdc_report_dt'].count()
sns.barplot(x=cases_per_month.index, y=cases_per_month.values)
plt.xticks(rotation=90);

In [None]:
# Count of rows per 'age_group' value; tick labels tilted 60 degrees.
sns.histplot(x="age_group", data=df)
plt.xticks(rotation=60);

In [None]:
# Count of rows per 'Race and ethnicity (combined)' value; vertical tick labels.
sns.histplot(x="Race and ethnicity (combined)", data=df)
plt.xticks(rotation=90);

In [None]:
# Count of rows per 'sex' value; tick labels tilted 45 degrees.
sns.histplot(x="sex", data=df)
plt.xticks(rotation=45);

In [None]:
# Histogram of the derived 'day' (day-of-month) column.
sns.histplot(x="day", data=df)
plt.xticks(rotation=90);

In [None]:
# Count of rows per 'hosp_yn' value.
sns.histplot(x="hosp_yn", data=df)
plt.xticks(rotation=90);

In [None]:
# Count of rows per 'icu_yn' value.
sns.histplot(x="icu_yn", data=df)
plt.xticks(rotation=90);

In [None]:
# Count of rows per 'medcond_yn' value.
sns.histplot(x="medcond_yn", data=df)
plt.xticks(rotation=90);

In [None]:
# Count of rows per 'death_yn' value.
sns.histplot(x="death_yn", data=df)
plt.xticks(rotation=90);

In [None]:
# Remove every row whose 'age_group' is 'Unknown' (in place), then show the
# remaining distribution.
unknown_age_index = df.index[df['age_group'] == 'Unknown']
df.drop(index=unknown_age_index, inplace=True)
df['age_group'].value_counts()

In [None]:
# Remove every row whose 'sex' is 'Missing', 'Unknown' or 'Other' (in place),
# then show the remaining distribution.
invalid_sex_index = df.index[df['sex'].isin(['Missing', 'Unknown', 'Other'])]
df.drop(index=invalid_sex_index, inplace=True)
df['sex'].value_counts()

In [None]:
# Remove every row whose 'Race and ethnicity (combined)' is 'Missing' (in place),
# then show the remaining distribution.
# NOTE(review): only the literal 'Missing' is filtered here; any other placeholder
# values in this column are kept — confirm that is intended.
missing_race_index = df.index[df['Race and ethnicity (combined)'] == 'Missing']
df.drop(index=missing_race_index, inplace=True)
df['Race and ethnicity (combined)'].value_counts()

In [None]:
# Drop only the rows in which 'icu_yn', 'hosp_yn', 'death_yn' and 'medcond_yn'
# are ALL 'Missing' at the same time; a row with at least one of the four
# not equal to 'Missing' is kept.
outcome_columns = ['icu_yn', 'hosp_yn', 'death_yn', 'medcond_yn']
all_outcomes_missing = (df[outcome_columns] == 'Missing').all(axis=1)
df.drop(index=df.index[all_outcomes_missing], inplace=True)

In [None]:
# (rows, columns) remaining after the filtering cells.
df.shape

In [None]:
# One histogram per outcome column, each rendered as its own figure.
columns_to_plot = ('hosp_yn', 'icu_yn', 'medcond_yn', 'death_yn')
for column_name in columns_to_plot:
    sns.histplot(x=column_name, data=df)
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# One-hot encode "current_status", "age_group" and "sex" into a new frame,
# df_encoded, and preview it. df itself is left as it was.
categorical_columns = ["current_status", "age_group", "sex"]
df_encoded = pd.get_dummies(df, columns=categorical_columns)
df_encoded.head()

In [None]:
# Names of the one-hot columns used as model features, grouped by the source
# column they were encoded from. The final order is status, age bands, sex.
status_features = [
    'current_status_Laboratory-confirmed case',
    'current_status_Probable Case',
]
# Ten-year bands '0 - 9 Years' through '70 - 79 Years', then the open-ended top band.
age_features = ['age_group_{} - {} Years'.format(start, start + 9) for start in range(0, 80, 10)]
age_features.append('age_group_80+ Years')
sex_features = ['sex_Female', 'sex_Male']

train_columns = status_features + age_features + sex_features

In [None]:
# Fit a RandomForestClassifier used to fill the 'Missing' values of 'hosp_yn'.
# random_state pins the forest's internal randomness so the printed scores and
# the labels imputed later are the same on every run (the split was already seeded).
dt = RandomForestClassifier(n_estimators=10, random_state=42)

train = df_encoded.loc[(df_encoded['hosp_yn'] != 'Missing')]  # rows whose hosp_yn is not 'Missing'
test = df_encoded.loc[(df_encoded['hosp_yn'] == 'Missing')]  # rows whose hosp_yn has to be imputed

features = train[train_columns]
label = train['hosp_yn']
# Hold out 40% of the labelled rows as a validation split.
X_train, X_val, y_train, y_val = train_test_split(features, label, test_size=.4, random_state=42)
X_test = test[train_columns]

dt.fit(X_train, y_train)

# Mean accuracy on the training split, then on the validation split.
print(dt.score(X_train, y_train))
print(dt.score(X_val, y_val))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the current classifier `dt` on the validation split
# (true labels first, predicted labels second).
val_predictions = dt.predict(X_val)
confusion_matrix(y_val, val_predictions)

In [None]:
# Replace every 'Missing' hosp_yn in df_encoded with the label predicted by `dt`.
# The mask and the feature rows are computed here rather than reusing X_test from
# the fit cell, so they cannot get out of sync; the guard makes the cell a no-op
# (instead of an error) when nothing is left to fill, e.g. on re-run.
missing_hosp = df_encoded['hosp_yn'] == 'Missing'
if missing_hosp.any():
    df_encoded.loc[missing_hosp, 'hosp_yn'] = dt.predict(df_encoded.loc[missing_hosp, train_columns])

In [None]:
#df_encoded['hosp_yn'].value_counts()

In [None]:
#df['hosp_yn'].value_counts()

In [None]:
# Overlay the hosp_yn distribution of df_encoded (red) on that of df (blue).
# Labels and a legend are added so the two series can be told apart.
sns.histplot(data=df_encoded, x="hosp_yn", color='r', label='df_encoded (after imputation)')
sns.histplot(data=df, x="hosp_yn", color='b', label='df (before imputation)')
plt.legend()
plt.xticks(rotation=90);

In [None]:
# Fit a RandomForestClassifier used to fill the 'Missing' values of 'medcond_yn'.
# random_state pins the forest's internal randomness so the printed scores and
# the labels imputed later are the same on every run (the split was already seeded).
# RandomForestClassifier is already imported in the notebook's import cell.
dt = RandomForestClassifier(n_estimators=10, random_state=42)

train = df_encoded.loc[(df_encoded['medcond_yn'] != 'Missing')]  # rows whose medcond_yn is not 'Missing'
test = df_encoded.loc[(df_encoded['medcond_yn'] == 'Missing')]  # rows whose medcond_yn has to be imputed

features = train[train_columns]
label = train['medcond_yn']
# Hold out 40% of the labelled rows as a validation split.
X_train, X_val, y_train, y_val = train_test_split(features, label, test_size=.4, random_state=42)
X_test = test[train_columns]

dt.fit(X_train, y_train)

# Mean accuracy on the training split, then on the validation split.
print(dt.score(X_train, y_train))
print(dt.score(X_val, y_val))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the current classifier `dt` on the validation split
# (true labels first, predicted labels second).
val_predictions = dt.predict(X_val)
confusion_matrix(y_val, val_predictions)

In [None]:
# Replace every 'Missing' medcond_yn in df_encoded with the label predicted by `dt`.
# The mask and the feature rows are computed here rather than reusing X_test from
# the fit cell, so they cannot get out of sync; the guard makes the cell a no-op
# (instead of an error) when nothing is left to fill, e.g. on re-run.
missing_medcond = df_encoded['medcond_yn'] == 'Missing'
if missing_medcond.any():
    df_encoded.loc[missing_medcond, 'medcond_yn'] = dt.predict(df_encoded.loc[missing_medcond, train_columns])

In [None]:
# Overlay the medcond_yn distribution of df_encoded (red) on that of df (blue).
# Labels and a legend are added so the two series can be told apart.
sns.histplot(data=df_encoded, x="medcond_yn", color='r', label='df_encoded (after imputation)')
sns.histplot(data=df, x="medcond_yn", color='b', label='df (before imputation)')
plt.legend()
plt.xticks(rotation=90);

In [None]:
# Fit a RandomForestClassifier used to fill the 'Missing' values of 'icu_yn'.
# random_state pins the forest's internal randomness so the printed scores and
# the labels imputed later are the same on every run (the split was already seeded).
# RandomForestClassifier is already imported in the notebook's import cell.
dt = RandomForestClassifier(n_estimators=10, random_state=42)

train = df_encoded.loc[(df_encoded['icu_yn'] != 'Missing')]  # rows whose icu_yn is not 'Missing'
test = df_encoded.loc[(df_encoded['icu_yn'] == 'Missing')]  # rows whose icu_yn has to be imputed

features = train[train_columns]
label = train['icu_yn']
# Hold out 40% of the labelled rows as a validation split.
X_train, X_val, y_train, y_val = train_test_split(features, label, test_size=.4, random_state=42)
X_test = test[train_columns]

dt.fit(X_train, y_train)

# Mean accuracy on the training split, then on the validation split.
print(dt.score(X_train, y_train))
print(dt.score(X_val, y_val))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the current classifier `dt` on the validation split
# (true labels first, predicted labels second).
val_predictions = dt.predict(X_val)
confusion_matrix(y_val, val_predictions)

In [None]:
# Replace every 'Missing' icu_yn in df_encoded with the label predicted by `dt`.
# The mask and the feature rows are computed here rather than reusing X_test from
# the fit cell, so they cannot get out of sync; the guard makes the cell a no-op
# (instead of an error) when nothing is left to fill, e.g. on re-run.
missing_icu = df_encoded['icu_yn'] == 'Missing'
if missing_icu.any():
    df_encoded.loc[missing_icu, 'icu_yn'] = dt.predict(df_encoded.loc[missing_icu, train_columns])

In [None]:
# Overlay the icu_yn distribution of df_encoded (red) on that of df (blue).
# Labels and a legend are added so the two series can be told apart.
sns.histplot(data=df_encoded, x="icu_yn", color='r', label='df_encoded (after imputation)')
sns.histplot(data=df, x="icu_yn", color='b', label='df (before imputation)')
plt.legend()
plt.xticks(rotation=90);

In [None]:
# Fit a RandomForestClassifier used to fill the 'Missing' values of 'death_yn'.
# random_state pins the forest's internal randomness so the printed scores and
# the labels imputed later are the same on every run (the split was already seeded).
dt = RandomForestClassifier(n_estimators=10, random_state=42)

train = df_encoded.loc[(df_encoded['death_yn'] != 'Missing')]  # rows whose death_yn is not 'Missing'
test = df_encoded.loc[(df_encoded['death_yn'] == 'Missing')]  # rows whose death_yn has to be imputed

features = train[train_columns]
label = train['death_yn']
# Hold out 40% of the labelled rows as a validation split.
X_train, X_val, y_train, y_val = train_test_split(features, label, test_size=.4, random_state=42)
X_test = test[train_columns]

dt.fit(X_train, y_train)

# Mean accuracy on the training split, then on the validation split.
print(dt.score(X_train, y_train))
print(dt.score(X_val, y_val))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the current classifier `dt` on the validation split
# (true labels first, predicted labels second).
val_predictions = dt.predict(X_val)
confusion_matrix(y_val, val_predictions)

In [None]:
# Replace every 'Missing' death_yn in df_encoded with the label predicted by `dt`.
# The mask and the feature rows are computed here rather than reusing X_test from
# the fit cell, so they cannot get out of sync; the guard makes the cell a no-op
# (instead of an error) when nothing is left to fill, e.g. on re-run.
missing_death = df_encoded['death_yn'] == 'Missing'
if missing_death.any():
    df_encoded.loc[missing_death, 'death_yn'] = dt.predict(df_encoded.loc[missing_death, train_columns])

In [None]:
# Overlay the death_yn distribution of df_encoded (red) on that of df (blue).
# Labels and a legend are added so the two series can be told apart.
sns.histplot(data=df_encoded, x="death_yn", color='r', label='df_encoded (after imputation)')
sns.histplot(data=df, x="death_yn", color='b', label='df (before imputation)')
plt.legend()
plt.xticks(rotation=90);

In [None]:
# Frequency of each 'death_yn' value in df_encoded.
df_encoded['death_yn'].value_counts()

In [None]:
# Preview the first rows of df_encoded.
df_encoded.head()

In [None]:
#df['onset_dt'].fillna('Asymptomatic', inplace=True) 

In [None]:
#df.loc[(df['icu_yn']=='Missing') | (df['hosp_yn']=='Missing') | (df['death_yn']=='Missing') | (df['medcond_yn']=='Missing')]

In [None]:
# drop rows with Missing values in any of the 'hosp_yn', 'icu_yn', 'death_yn' or 'medcond_yn' columns
#df.drop(df.loc[(df['icu_yn']=='Missing') | (df['hosp_yn']=='Missing') | (df['death_yn']=='Missing') | (df['medcond_yn']=='Missing')].index, inplace=True)

In [None]:
# Bar chart of the number of rows per 'month_year' period, drawn from df as it
# stands after the filtering cells.
cases_per_month = df.groupby('month_year')['cdc_report_dt'].count()
sns.barplot(x=cases_per_month.index, y=cases_per_month.values)
plt.xticks(rotation=90);

In [None]:
# Count of rows per 'age_group' value in the filtered df; labels tilted 60 degrees.
sns.histplot(x="age_group", data=df)
plt.xticks(rotation=60);

In [None]:
# Count of rows per 'Race and ethnicity (combined)' value in the filtered df.
sns.histplot(x="Race and ethnicity (combined)", data=df)
plt.xticks(rotation=90);

In [None]:
# Count of rows per 'sex' value in the filtered df; labels tilted 45 degrees.
sns.histplot(x="sex", data=df)
plt.xticks(rotation=45);

In [None]:
# Histogram of the derived 'day' (day-of-month) column in the filtered df.
sns.histplot(x="day", data=df)
plt.xticks(rotation=90);