In [None]:
# Install all packages

# !python -m pip install --upgrade pip
# !pip3 install pandas
# !pip3 install numpy
# !pip3 install plotly
# !pip3 install cufflinks
# !pip3 install seaborn
# !pip3 install scikit-learn
# !pip3 install scikit-plot
# !pip3 install chart_studio

In [None]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV, train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

import seaborn as sns

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import precision_recall_curve, classification_report
from sklearn.metrics import confusion_matrix
import itertools  

from sklearn.metrics import roc_curve, auc


import seaborn as sns
from sklearn.datasets import make_circles
from sklearn.model_selection import train_test_split

import scikitplot as skplt
import plotly.express as px
import matplotlib.pyplot as plt

In [None]:
# Load the per-patient records (path is relative to the notebook's working directory)
PATIENT_INFO_CSV = "../input/coronavirusdataset/PatientInfo.csv"
df = pd.read_csv(PATIENT_INFO_CSV)
df.head()

In [None]:
# Remove symbols from df: strip every non-digit character from `age`.
# The result is assigned back to the column instead of calling an in-place
# method on the `df['age']` selection: the in-place form works on an
# intermediate object and is not propagated to `df` when pandas copy-on-write
# is active (and emits a chained-assignment warning before that).
df['age'] = df['age'].replace(to_replace=r'\D', value=r'', regex=True)

In [None]:
# Number of patients per cleaned age value, most frequent first
age_dist = df['age'].value_counts()

age_fig = px.bar(age_dist, title='Age distribution')
age_fig

In [None]:
# Drop the rows whose sex is unknown.
# Calling dropna(inplace=True) on the `df['sex']` selection only shortens a
# temporary Series and leaves every row of `df` in place, so the row filter
# has to be applied to the DataFrame itself.
df = df.dropna(subset=['sex'])

In [None]:
# Change to int: encode the categorical columns as numeric codes.
d_state = {"released": 0, 'deceased': 1, 'isolated': 2}
d_sex = {"male": 1, "female": 0,}

# fill nan values: a missing or unrecognised state is treated as a still-active
# case (code 2). map() and fillna() are chained and assigned back so the column
# in `df` is really updated; an in-place fillna on the `df['state']` selection
# is not guaranteed to reach the DataFrame.
# NOTE: this cell is not idempotent - running it a second time maps the already
# encoded numbers to NaN. Re-run the data-loading cell first.
df['state'] = df['state'].map(d_state).fillna(2)
df['sex'] = df['sex'].map(d_sex)

In [None]:
# Keep closed cases only: state code 2 marks a patient that is still active
is_active = df['state'] == 2
df_c = df.loc[~is_active]

In [None]:
# Cross-tabulate sex (rows) against outcome state (columns) for the closed cases.
# normalize=True divides every cell by the grand total, so each value is a share
# of all closed cases, not a per-sex rate.
# NOTE(review): if a fatality rate within each sex is wanted, normalize='index' is the option to use - confirm intent.
death_by_sex = pd.crosstab(df_c['sex'], df_c['state'], normalize=True)


In [None]:
# Bar chart of the normalised sex-by-state table
px.bar(death_by_sex, title='Death by sex')

In [None]:
# Save only informative features and change datatypes.
# An explicit copy is taken so the assignments below modify `df_closed` itself
# rather than a slice of `df_c`.
df_closed = df_c[['state', 'age', 'sex']].copy()

# Convert age to float64 and keep the result (a bare astype() call returns a new
# Series and changes nothing); values that cannot be parsed become NaN.
df_closed['age'] = pd.to_numeric(df_closed['age'], errors='coerce').astype('float64')

# Drop incomplete rows instead of filling them with 0: a 0 would silently turn
# an unknown age into age 0 and an unknown sex into code 0 ("female").
df_closed = df_closed.dropna()

assert not df_closed.empty, "no complete closed cases left - check the age/sex cleaning cells"

# Stratified Train/Holdout Split

In [None]:
# Target column vs. the remaining feature columns
y = df_closed['state']
X = df_closed.drop(columns='state')

# 70/30 split, stratified on the target so both parts keep the same class ratio
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=10,
)

In [None]:
# Sanity check: shapes of the feature/target parts of both splits
X_train.shape, X_holdout.shape, y_train.shape, y_holdout.shape

In [None]:
# Class counts in the holdout target (shows how imbalanced the outcome is)
y_holdout.value_counts()

# Model Training

In [None]:
# Decision tree limited to depth 10; class_weight='balanced' re-weights the classes to offset the imbalance
tree = DecisionTreeClassifier(max_depth=10, random_state=10, class_weight='balanced')

In [None]:
# 3-nearest-neighbours baseline
# NOTE(review): no feature scaling is applied before this model, so distances are dominated by `age` - confirm this is intended
knn = KNeighborsClassifier(n_neighbors=3)

In [None]:
%%time
# Fit the k-NN baseline on the training split
knn.fit(X_train, y_train)

In [None]:
%%time
# Fit the decision tree on the training split
tree.fit(X_train, y_train)

In [None]:
# Balanced accuracy of the k-NN baseline on the holdout split
knn_pred = knn.predict(X_holdout)
metrics.balanced_accuracy_score(y_holdout, knn_pred)

In [None]:
# Balanced accuracy of the decision tree on the holdout split
tree_pred = tree.predict(X_holdout)
metrics.balanced_accuracy_score(y_holdout, tree_pred)

In [None]:
# Initialize hyperparameters for grid search.
# min_samples_split starts at 2 because scikit-learn requires an integer value
# of at least 2; a value of 1 is rejected and every candidate using it fails.
parameters = {'min_samples_leaf': range(1, 10, 2), 'min_samples_split': range(2, 25, 5)}
rfc = RandomForestClassifier(n_estimators=500, random_state=42,
                             n_jobs=-1, criterion='gini', class_weight='balanced')
gcv = GridSearchCV(rfc, parameters, n_jobs=2, cv=10, verbose=1)

# Tune on the training split only. Fitting on the full X, y would let the model
# see the holdout rows, and every holdout metric computed afterwards would be
# inflated by that leakage.
gcv.fit(X_train, y_train)

In [None]:
# Best hyperparameter combination and its mean cross-validated score
print(gcv.best_estimator_, gcv.best_score_)

In [None]:
# Balanced accuracy of the tuned random forest on the holdout split
metrics.balanced_accuracy_score(y_holdout, gcv.best_estimator_.predict(X_holdout))

### Metrics of a Binary Classifier

$$rec = TPR = \frac{TP}{TP + FN},\quad SPC = \frac{TN}{TN + FP},\quad prec = PPV = \frac{TP}{TP + FP},\quad FPR = 1 - SPC,$$

$$ACC = \frac{TP + TN}{TP + TN + FP + FN},\quad F1 = 2\frac{PPV\cdot TPR}{PPV + TPR}.$$

 $TPR$ (True positive rate, recall, sensitivity)

 $SPC$ (Specificity, true negative rate)

 $PPV$ (Positive predictive value, precision)

$FPR$ (False positive rate)

$ACC$ (Accuracy)

$F1$ (F1-measure)

In [None]:
# Score the tuned random forest on the holdout split.
true_labels = y_holdout
predicted_labels = np.array(gcv.best_estimator_.predict(X_holdout))

# A bare expression is rendered only when it is the last line of a cell, so the
# confusion matrix has to be printed explicitly to appear in the output.
M = metrics.confusion_matrix(true_labels, predicted_labels)
print(M)

PPV = metrics.precision_score(true_labels, predicted_labels)
TPR = metrics.recall_score(true_labels, predicted_labels)
F1 = metrics.f1_score(true_labels, predicted_labels)
ACC = metrics.accuracy_score(true_labels, predicted_labels)
Balanced = metrics.balanced_accuracy_score(true_labels, predicted_labels)
# Recall and TPR are the same quantity; reuse the value instead of recomputing it.
recall = TPR

print(f' PPV: {PPV}, \n TPR: {TPR}, \n ACC: {ACC}, \n F1: {F1}, \n Balanced: {Balanced}, \n recall: {recall}')

In [None]:
# Second predict_proba column, used as the score for the positive label (pos_label=1)
holdout_scores = gcv.best_estimator_.predict_proba(X_holdout)[:, 1]
fpr, tpr, thresholds = roc_curve(y_holdout, holdout_scores, pos_label=1)
roc_auc = auc(fpr, tpr)

fig = px.area(
    x=fpr,
    y=tpr,
    title=f'ROC Curve (AUC={roc_auc:.4f})',
    labels={'x': 'False Positive Rate', 'y': 'True Positive Rate'},
    width=800,
    height=800,
)

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference
fig.add_shape(type='line', line={'dash': 'dash'}, x0=0, x1=1, y0=0, y1=1)

# Same scale on both axes so the plot stays square
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

# Confusion Matrix for the Best Random Forest Model

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    The heat map is drawn on the current matplotlib figure; creating the
    figure beforehand and showing or saving it afterwards is left to the caller.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix; rows are labelled "True label" and columns
        "Predicted label" on the plot.
    classes : sequence
        Tick labels, in the same order as the rows/columns of `cm`.
    normalize : bool, default False
        If True, every row is converted to percentages of that row's total.
    title : str, default 'Confusion matrix'
        Title of the plot.
    cmap : matplotlib colormap, default plt.cm.Blues
        Colormap used for the cells.

    Returns
    -------
    None
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A row without samples would divide by zero; report it as 0 % instead of NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0) * 100
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    # Draw only after the optional normalisation so that the cell colours, the
    # colour bar and the printed cell values all describe the same numbers.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cell text format: one decimal for percentages, plain integers for counts.
    if normalize:
        fmt = '.1f'
    elif np.issubdtype(cm.dtype, np.integer):
        fmt = 'd'
    else:
        fmt = 'g'

    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    # Set the labels before tight_layout so they are included in the layout.
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

# Larger default font for the figures drawn from here on
plt.rc('font', size=15)

# Confusion matrix of the tuned random forest on the holdout split
holdout_predictions = gcv.best_estimator_.predict(X_holdout)
cnf_matrix = confusion_matrix(y_holdout, holdout_predictions)

plt.figure(figsize=(10, 8))
plot_confusion_matrix(
    cnf_matrix,
    classes=['Survived', 'Dead'],
    title='Confusion matrix',
    normalize=False,
)
plt.savefig("conf_matrix.png")
plt.show()

# Feature Importances

In [None]:
# Feature importances of the tuned random forest, largest first (at most five bars)
skplt.estimators.plot_feature_importances(
    gcv.best_estimator_,
    title='Feature Importance',
    feature_names=X_holdout.columns,
    max_num_features=5,
    order='descending',
    x_tick_rotation=45,
    ax=None,
    figsize=None,
    title_fontsize='medium',
    text_fontsize='medium',
)

In [None]:
# Deriving Class probabilities: one column per class for every holdout sample
predicted_probabilities = gcv.best_estimator_.predict_proba(X_holdout)

# Creating the plot: cumulative gain chart built from the holdout labels and the predicted class probabilities
skplt.metrics.plot_cumulative_gain(y_holdout, predicted_probabilities)