## Importing Packages & Displaying Available Data Files

In [None]:
# Import required packages
import math
import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shap

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, f1_score, recall_score, precision_score, SCORERS

from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.model_selection import GridSearchCV

# Notebook display settings: no column-width truncation, up to 999 rows shown
pd.options.display.max_colwidth = None
pd.set_option('display.max_rows', 999)

# Print the full path of every file found under the Kaggle input directory
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))


## Loading Dataset

In [None]:
# Read the source CSV into a DataFrame and display it
raw_data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/characteristics-corona-patients/Characteristics_Corona_patients_version_6 - 19-7-2020.csv"
)
raw_data

## Data Cleaning

In [None]:
# Work on a copy so the loaded raw_data frame is left unmodified
df = raw_data.copy()

def is_dead(row):
    """Return 1 if the row records a deceased date, otherwise 0.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) that contains the
            key ``'deceased_date_D'``.

    Returns:
        0 when ``row['deceased_date_D']`` is missing, 1 otherwise.

    ``pd.isna`` is used rather than ``math.isnan`` so that every kind of
    missing value (NaN, None, NaT) counts as "not deceased" and non-float
    values such as date strings do not raise ``TypeError``.
    """
    return 0 if pd.isna(row['deceased_date_D']) else 1

# Target flag: 1 when a deceased date is present, 0 when it is missing.
# Computed column-wise with notna() rather than a row-wise apply; this gives
# the same 0/1 values without a Python-level call per row, and it has to run
# before 'deceased_date_D' is dropped below.
df['is_dead'] = df['deceased_date_D'].notna().astype('int64')

# Columns removed from the working frame
columns_to_drop = [
    'confirmed_date_D',
    'deceased_date_D',
    'released_date_D',
    'return_date_D',
    'date_onset_symptoms_D',
    'age_band',
    'background_diseases_binary',
    'country',
    'origin',
    'return_date_until_date_onset_symptoms',
    'confirmed_date_until_released_date',
    'confirmed_date_until_deceased_date',
]
df.drop(columns=columns_to_drop, inplace=True)

# Move the target to the first column (pop removes it, concat puts it in front)
df = pd.concat([df.pop('is_dead'), df], axis=1)

# Fraction of missing values per column
df.isna().sum()/len(df)

In [None]:
# Cleaning 'treatment' column
def clean_treatment(data):
    """Normalise a single raw 'treatment' value.

    Args:
        data: One cell value; may be a string, a number, or a missing value.

    Returns:
        1.0 for the string ``'1'``; ``np.nan`` for any other string and for
        missing values (NaN / None); any other value is returned unchanged.
    """
    if data == '1':
        return 1.0
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0;
    # pd.isna also accepts None, which math.isnan would reject with TypeError.
    if isinstance(data, str) or pd.isna(data):
        return np.nan
    return data
    
# Run clean_treatment over every value of the 'treatment' column
treatment_raw = df['treatment']
df['treatment'] = treatment_raw.apply(clean_treatment)

## Data Wrangling

In [None]:
# Snapshot of df at this stage; the missing-data checks that follow read df2
df2 = df.copy()
df2

In [None]:
# Gather the symptom and background-disease column names by their prefix
all_symptoms = [name for name in df2.columns if name.startswith("symptoms")]
all_bd = [name for name in df2.columns if name.startswith("background_diseases")]

# Keep only rows where every symptom column is populated, then look at the
# share of missing values in the first eight columns of what remains
temp = df2.dropna(how='any', subset=all_symptoms).reset_index(drop=True)

print("Length of symptoms data:", len(temp))

temp.iloc[:, :8].isna().sum() / len(temp)

In [None]:
# Keep only rows where every background-disease column is populated, then
# look at the share of missing values in the first eight columns
temp = df2.dropna(how='any', subset=all_bd).reset_index(drop=True)

print("Length of background_diseases data:", len(temp))

temp.iloc[:, :8].isna().sum() / len(temp)

## Further Data Cleaning

In [None]:
# Remove the two severity columns and every symptom column, then keep only
# rows with no missing values and renumber the index
df = df.drop(columns=['severity_illness_infectious_person', 'severity_illness'])

all_symptoms = [name for name in df.columns if name.startswith("symptoms")]

df = df.drop(columns=all_symptoms).dropna(how='any').reset_index(drop=True)

df

In [None]:
# Show the column names currently in df
list(df.columns)

In [None]:
temp = df.copy()

# For each column, print its value frequencies followed by a 'NaN' row holding
# the number of missing values. Series.append was removed in pandas 2.0, so
# the extra row is attached with pd.concat instead.
for col in df.columns:
    print("Column:", col)
    value_freq = temp[col].value_counts()
    missing_row = pd.Series(temp[col].isna().sum(), index=['NaN'])
    print(pd.concat([value_freq, missing_row]))
    print("\n")

In [None]:
# Cast every column to bool except the two that are left untouched
non_bool_cols = ("age", "date_onset_symptoms_until_confirmed_date")
for col in list(df.columns):
    if col not in non_bool_cols:
        df[col] = df[col].astype('bool')

# df.info() writes its summary to stdout and returns None, so wrapping it in
# print() emitted a stray "None" line after the summary.
df.info()
df

In [None]:
# Train-test split: the first column is the target, the rest are features
y = df.iloc[:, :1].copy()
X = df.iloc[:, 1:].copy()

# 20% of rows held out for testing, shuffled with a fixed seed
split_options = dict(test_size=0.2, shuffle=True, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Reassemble the training split with the target as the first column
train_parts = [y_train, X_train]
df_train = pd.concat(train_parts, axis="columns")
df_train

In [None]:
# Reassemble the test split with the target as the first column
test_parts = [y_test, X_test]
df_test = pd.concat(test_parts, axis="columns")
df_test

In [None]:
# Write both splits to CSV files in the working directory, without the index
csv_outputs = {
    "Exception_Datasets (Train).csv": df_train,
    "Exception_Datasets (Test).csv": df_test,
}
for csv_name, split_frame in csv_outputs.items():
    split_frame.to_csv(csv_name, index=False)