In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the COVID-19 case surveillance public-use CSV from the notebook's input directory.
df = pd.read_csv(
    filepath_or_buffer='../input/covid19-case-surveillance-public-use-dataset/COVID-19_Case_Surveillance_Public_Use_Data.csv'
)
# (rows, columns) of the raw table.
df.shape

In [None]:
# Preview the first ten rows to see the available columns and their raw values.
df.head(10)

# Drop rows with missing target values (Unknown, Missing)

In [None]:
# Tally every value of the target column; dropna=False keeps NaN as its own bucket.
print(df['death_yn'].value_counts(dropna=False))

In [None]:
# Keep only rows whose outcome label is neither 'Missing' nor 'Unknown', then
# renumber the rows from 0. Rows with NaN in death_yn, if any, are not removed here.
df = df.loc[~df.death_yn.isin(['Missing', 'Unknown'])].reset_index(drop=True)
df.shape

# Data overview, cleaning and preprocessing

In [None]:
def NaN_info(df):
    global null_view
    try:
        null_view = df[[col for col in df.columns if df[col].isna().sum() > 0]].isna().sum().sort_values(ascending = True)
        null_view = pd.DataFrame(null_view, columns=['NANs'])
        null_view[['PERCENT']] = null_view.NANs.apply(lambda x: round((x/len(df))*100, 2))
        null_view[['TYPE']] = df.dtypes
    except:
        return null_view
    return null_view

# Show the missing-value summary for the frame as it stands after the target filter.
NaN_info(df)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Heatmap of the null mask: one heatmap row per record, one column per field.
missing_mask = df.isna()
plt.figure(figsize=(20, 50))
sns.heatmap(missing_mask, cbar=False)

# Drop rows with NaNs in Race and ethnicity (combined), sex, or age_group

In [None]:
# Discard every row that lacks any of the three demographic fields, then
# renumber the remaining rows from 0.
df = (
    df
    .dropna(subset=['Race and ethnicity (combined)', 'sex', 'age_group'], how='any')
    .reset_index(drop=True)
)
df.shape

In [None]:
# Re-check which columns still contain NaNs after dropping incomplete demographic rows.
NaN_info(df)

In [None]:
# Print the value distribution (NaN included) of every column under its own banner.
for column in df.columns:
    print(f'======================= {column} =======================')
    print(df[column].value_counts(dropna=False))
    print()

# Feature Engineering

In [None]:
# Flag a row 'Yes' when onset_dt is present and 'No' when it is null.
df['symptomatic_yn'] = np.where(df['onset_dt'].notna(), 'Yes', 'No')
# Spot-check the new flag against its source column on the first ten rows.
df[['symptomatic_yn', 'onset_dt']].head(10)

In [None]:
# Flag a row 'Yes' when pos_spec_dt is present and 'No' when it is null.
# The Series (not a one-column DataFrame) is passed to np.where so the result
# is a 1-D array, matching how symptomatic_yn is built.
df['tested_yn'] = np.where(df['pos_spec_dt'].isnull(), 'No', 'Yes')
# Show the new flag next to its source column (pandas truncates the display).
df[['tested_yn', 'pos_spec_dt']]

In [None]:
# Parse cdc_report_dt as dates and keep only the month number as a feature.
report_dates = pd.DatetimeIndex(df['cdc_report_dt'])
df['month'] = report_dates.month
# Row count per month, with unparsed/missing dates shown as NaN.
df['month'].value_counts(dropna=False)

In [None]:
# List the distinct age_group labels so the numeric mapping below can cover them.
print(df.age_group.unique())

In [None]:
# Map each age bracket label to its exclusive upper bound in years
# ('0 - 9 Years' -> 10, ..., '70 - 79 Years' -> 80).
change = {f'{low} - {low + 9} Years': low + 10 for low in range(0, 80, 10)}
change['80+ Years'] = 90  # open-ended top bracket

# Labels without an entry in `change` become NaN in the new column.
df['decades_of_age(less_than)'] = df['age_group'].map(change)
# Spot-check the mapping on rows 55-64.
df[['decades_of_age(less_than)', 'age_group']].iloc[55:65]

In [None]:
# Re-check missing values now that the engineered columns have been added.
NaN_info(df)

In [None]:
# Drop rows whose numeric age code is NaN, then renumber the rows from 0.
df = df.dropna(subset=['decades_of_age(less_than)']).reset_index(drop=True)
df.shape

In [None]:
# Target label and the list of candidate predictor columns (everything else).
target_column = ['death_yn']
# Keep the DataFrame's own column order. A set difference would order the names
# by string hash, which changes between kernel restarts and would reshuffle
# any per-predictor output.
predictors = [col for col in df.columns if col not in target_column]

In [None]:
sns.set(font_scale=1.5)

# One count plot per predictor, split by outcome. Plotting stays best-effort:
# a column that cannot be plotted is reported and skipped instead of being
# silently ignored, and its empty figure is closed so it is not rendered.
for el in predictors:
    fig = plt.figure(figsize=(20, 10))
    plot_data = df[['death_yn', el]]
    try:
        sns.countplot(x=el, hue='death_yn', data=plot_data, palette='Set1')
    except Exception as exc:  # not a bare except: KeyboardInterrupt must still stop the loop
        plt.close(fig)
        print(f'Skipping count plot for {el!r}: {exc}')

In [None]:
# Pair plot of the numeric age code, coloured by death_yn.
# sns.pairplot builds its own figure (sized through height/aspect), so no
# plt.figure(...) call precedes it; that would only leave an empty extra figure.
plot_data = df[['death_yn', 'decades_of_age(less_than)']]

g = sns.pairplot(plot_data, hue='death_yn', palette='Set1', height=10, aspect=2)

# Add a figure-level legend for the hue classes at the top centre.
# NOTE(review): _legend_data is a private seaborn attribute and may change between versions.
handles = g._legend_data.values()
labels = g._legend_data.keys()
g.fig.legend(handles=handles, labels=labels, loc='upper center', ncol=1)

In [None]:
# Pair plot of the report month, coloured by death_yn.
# sns.pairplot builds its own figure (sized through height/aspect), so no
# plt.figure(...) call precedes it; that would only leave an empty extra figure.
plot_data = df[['death_yn', 'month']]

g = sns.pairplot(plot_data, hue='death_yn', palette='Set1', height=10, aspect=2)

# Add a figure-level legend for the hue classes at the top centre.
# NOTE(review): _legend_data is a private seaborn attribute and may change between versions.
handles = g._legend_data.values()
labels = g._legend_data.keys()
g.fig.legend(handles=handles, labels=labels, loc='upper center', ncol=1)

In [None]:
# List the columns available at this point, including the engineered features.
df.columns

# Is this synthetic data?

In [None]:
# Line plot of the age code against row position, to look for patterns in row order.
age_col = 'decades_of_age(less_than)'
plt.figure(figsize=(20, 20))
plot_data = df[age_col].to_frame()
plt.plot(plot_data.index, plot_data[[age_col]], label="Age")

In [None]:
# Same data as a scatter plot; relies on `plot_data` built in the previous cell.
age_values = plot_data[['decades_of_age(less_than)']]
plt.figure(figsize=(20, 20))
plt.scatter(plot_data.index, age_values, label="Age")