In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load in 

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the "../input/" directory.
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # Any results you write to the current directory are saved as output.

In [None]:
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go
import cufflinks as cf
import plotly
import missingno as msno
import seaborn as sns
%pylab inline

# NOTE: `np`, `plt` (and other bare names) used later in this notebook are never
# imported explicitly; they are only in scope because of the `%pylab inline` magic above.
# Put cufflinks / plotly into offline notebook mode (presumably so the `.iplot()`
# calls below render inline without a plotly account — confirm).
cf.go_offline()
py.init_notebook_mode()

# Use seaborn's white-grid theme for the matplotlib/seaborn figures.
sns.set_style('whitegrid')

In [None]:
# Read the Einstein COVID-19 diagnosis CSV from the Kaggle input directory
# and preview its first five rows.
DATA_PATH = "/kaggle/input/uncover/UNCOVER/einstein/diagnosis-of-covid-19-and-its-clinical-spectrum.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Column names kept as constants so later cells do not repeat the string literals.
# NOTE: the "addmited" misspelling must stay as-is: later cells select these columns
# with `df.filter(like="addmited")`, presumably matching the CSV headers — confirm.
PATIENT_ID = "patient_id"
PATIENT_AGE_QUANTILE = "patient_age_quantile"
ADMITTED_REGULAR = "patient_addmited_to_regular_ward_1_yes_0_no"
ADMITTED_SEMI_ICU = "patient_addmited_to_semi_intensive_unit_1_yes_0_no"
ADMITTED_ICU = "patient_addmited_to_intensive_care_unit_1_yes_0_no"
# Column holding the SARS-CoV-2 test outcome (compared against "positive"/"negative" below).
EXAM_RESULT = "sars_cov_2_exam_result"

# Missing Values Analysis
- There are a lot of null values. The columns with the most non-null values are the blood chemistry tests and the patient's viral test history.

In [None]:
# Fraction of missing values per column, ordered from most to least missing.
null_pct = (df.isnull().sum() / len(df)).sort_values()[::-1]
# Columns whose every value is null carry no information, so drop them.
all_null_columns = null_pct.index[null_pct == 1]
df = df.drop(columns=all_null_columns)

In [None]:
# missingno bar chart of per-column completeness for the first 50 columns.
msno.bar(df.iloc[:, :50]);

In [None]:
# Same completeness bar chart for columns 50-99.
msno.bar(df.iloc[:, 50:100]);

In [None]:
# missingno nullity matrix over the whole frame (row-wise pattern of missing values).
msno.matrix(df);

In [None]:
# Cluster the columns by how similar their missing-value patterns are.
msno.dendrogram(df);

# Target Distributions
- We have 558 positive results (9.88%) from the SARS-COV-2 exam.
- The admission variables can be set for patients with either a positive or a negative exam result.
- *Define severity in our case definition to be: Positive, Positive and admitted to regular ward, admitted to semi-ICU and admitted to ICU*

In [None]:
# Share of all rows with each exam result (the stray trailing backtick was a SyntaxError).
df[EXAM_RESULT].value_counts() / len(df)

In [None]:
# Overall count of each exam result.
df[EXAM_RESULT].value_counts().iplot(kind='barh', title='Positive SARS-COV-2 Result')

# For every admission column, count admitted / not admitted within each exam result.
# Only the ICU chart is drawn stacked, as in the original cell.
admission_charts = [
    (ADMITTED_REGULAR, dict(title='Admitted to regular ward')),
    (ADMITTED_SEMI_ICU, dict(title='Admitted to semi-ICU')),
    (ADMITTED_ICU, dict(barmode='stack', title='Admitted to ICU')),
]
for admitted_col, chart_kwargs in admission_charts:
    admitted_counts = df.groupby(EXAM_RESULT)[admitted_col].value_counts().unstack()
    admitted_counts.iplot(kind='barh', **chart_kwargs)

In [None]:
TARGETS = [ADMITTED_REGULAR, ADMITTED_SEMI_ICU, ADMITTED_ICU]
SEVERITY = "Severity"
SEVERITY_CATEGORICAL = "SeverityCategorical"

# Boolean row masks, computed once and shared by the ordinal and the labelled column.
is_positive = df[EXAM_RESULT] == "positive"
is_negative = df[EXAM_RESULT] == "negative"
never_admitted = (df.filter(like="addmited") == 'f').all(axis=1)

# (mask, ordinal level, label). Order matters: later entries overwrite earlier ones,
# so a positive patient ends up with the most severe ward they were admitted to.
# Negative patients who were admitted anywhere match no mask and stay null.
severity_rules = [
    (is_positive, 1, "Positive"),
    (is_positive & (df[ADMITTED_REGULAR] == "t"), 2, "Admitted to regular ward"),
    (is_positive & (df[ADMITTED_SEMI_ICU] == "t"), 3, "Admitted to semi-ICU"),
    (is_positive & (df[ADMITTED_ICU] == "t"), 4, "Admitted to ICU"),
    (is_negative & never_admitted, 0, "Negative"),
]
for rule_mask, level, label in severity_rules:
    df.loc[rule_mask, SEVERITY] = level
    df.loc[rule_mask, SEVERITY_CATEGORICAL] = label

# Rows that matched no rule have a null severity and are dropped.
df = df.dropna(subset=[SEVERITY, SEVERITY_CATEGORICAL])

In [None]:
# https://stackoverflow.com/questions/43214978/seaborn-barplot-displaying-values
def show_values_on_bars(axs, fmt="{:.2f}"):
    """Write each bar's height as a text label centred on top of the bar.

    Parameters
    ----------
    axs : a single axes-like object, or a numpy array of them
        Every element must expose ``patches`` and ``text``.
    fmt : str
        ``str.format`` template applied to each bar height.
    """
    def _annotate(ax):
        for bar in ax.patches:
            height = bar.get_height()
            ax.text(
                bar.get_x() + bar.get_width() / 2,  # horizontal centre of the bar
                bar.get_y() + height,               # top edge of the bar
                fmt.format(height),
                ha="center",
            )

    # Treat a lone axes as a one-element collection so both cases share one loop.
    targets = axs.flat if isinstance(axs, np.ndarray) else [axs]
    for ax in targets:
        _annotate(ax)

In [None]:
# Count the cases in each severity category.
# The columns are named explicitly instead of renaming an "index" column:
# pandas >= 2.0 names the reset_index() columns differently, which made the old
# rename a no-op and left no SEVERITY column for the plot.
vc = df[SEVERITY_CATEGORICAL].value_counts().reset_index()
vc.columns = [SEVERITY, "Count of Cases"]

plt.figure(figsize=(15, 7))
g = sns.barplot(x=SEVERITY, y='Count of Cases', data=vc)
# Print the integer count on top of every bar.
show_values_on_bars(g, fmt='{:.0f}')

# Distribution of the blood chemistry variables

In [None]:
# Blood-chemistry columns used for the distribution plots and as model features.
BLOOD_VARIABLES = [
    'hematocrit',
    'hemoglobin',
    'platelets',
    'mean_platelet_volume',
    'red_blood_cells',
    'neutrophils',
    'lymphocytes',
    'mean_corpuscular_hemoglobin_concentration_mchc',
    'leukocytes',
    'basophils',
    'mean_corpuscular_hemoglobin_mch',
    'eosinophils',
    'mean_corpuscular_volume_mcv',
    'monocytes',
    'red_blood_cell_distribution_width_rdw',
    'serum_glucose',
    'creatinine',
    'sodium',
    'urea',
    'potassium',
    'proteina_c_reativa_mg_dl',
]

## Correlation

In [None]:
# Pairwise correlation matrix of the blood variables, rounded to two decimals for display.
df[BLOOD_VARIABLES].corr().round(2)

## Positive vs Negative

In [None]:
# Overlay each blood variable's distribution for negative (severity 0)
# versus all positive (severity > 0) patients, one figure per variable.
negative_cases = df[df[SEVERITY] == 0]
positive_cases = df[df[SEVERITY] > 0]
for var_name in BLOOD_VARIABLES:
    fig = plt.figure(figsize=(15, 7))
    sns.distplot(negative_cases[var_name], label="Negative")
    sns.distplot(positive_cases[var_name], label="Positive, all")
    figure_title = "{} Positive vs Negative".format(var_name).title()
    fig.suptitle(figure_title)
    plt.legend()

## Negative vs Admitted

In [None]:
# Overlay each blood variable's distribution for negative (severity 0) versus
# admitted (severity > 1) patients, one figure per variable.
for var_name in BLOOD_VARIABLES:
    fig = plt.figure(figsize=(15, 7))
    sns.distplot(df[df[SEVERITY] == 0][var_name], label="Negative")
    sns.distplot(df[df[SEVERITY] > 1][var_name], label="Admitted, all")
    # The title used to say "Positive vs Admitted", contradicting the plotted groups.
    fig.suptitle("{} Negative vs Admitted".format(var_name).title())
    plt.legend()

# Antigen Variables

In [None]:
# Columns holding the results of the other respiratory-pathogen tests.
ANTIGEN_VARIABLES = [
    'respiratory_syncytial_virus',
    'influenza_a',
    'influenza_b',
    'parainfluenza_1',
    'coronavirusnl63',
    'rhinovirus_enterovirus',
    'coronavirus_hku1',
    'parainfluenza_3',
    'chlamydophila_pneumoniae',
    'adenovirus',
    'parainfluenza_4',
    'coronavirus229e',
    'coronavirusoc43',
    'inf_a_h1n1_2009',
    'bordetella_pertussis',
    'metapneumovirus',
    'parainfluenza_2',
    'influenza_b_rapid_test',
    'influenza_a_rapid_test',
]

In [None]:
# Stacked counts of every antigen result within each COVID-positive severity level.
# No matplotlib figure is opened here: iplot renders through plotly, so the old
# plt.figure() call only produced an empty figure per variable.
severity_order = ["Admitted to ICU", "Admitted to semi-ICU", "Admitted to regular ward", "Positive"]
for var_name in ANTIGEN_VARIABLES:
    vc = df.groupby(var_name)[SEVERITY_CATEGORICAL].value_counts().unstack().T
    # reindex (not .loc) so a severity level with no rows for this antigen becomes an
    # empty row instead of raising KeyError; it also leaves out the "Negative" row.
    vc = vc.reindex(severity_order)
    vc.iplot(kind='bar', barmode='stack', title="{} on Severity".format(var_name).title())

# Antigen Correlation with Positive Results

In [None]:
# For each antigen test, count its results within each SARS-CoV-2 exam result.
for var_name in ANTIGEN_VARIABLES:
    vc = df.groupby(EXAM_RESULT)[var_name].value_counts().unstack()
    # Title every chart: without one the per-antigen plots cannot be told apart.
    g = vc.plot.barh(title="{} by SARS-CoV-2 exam result".format(var_name))

# Modeling
- I am choosing only the blood chemistry variables for this, since most other tests are null.
- The antigen variables seem to not indicate anything useful from the above figures.
- I am combining all the positive results due to the lack of samples for admitted categories.

- *First run: dropping all rows with null blood variables*
- *Second run: imputing all null blood variables with 0. The values appear to be standardized (z-scores), in which case 0 corresponds to the mean; this assumption should be verified.*

In [None]:
# Severity distribution among rows that have every blood variable present.
df.dropna(subset=BLOOD_VARIABLES)[SEVERITY_CATEGORICAL].value_counts()

In [None]:
from sklearn.model_selection import train_test_split, cross_validate, StratifiedShuffleSplit
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV

In [None]:
# First run: keep only the rows where every blood variable is present.
complete_rows = df.dropna(subset=BLOOD_VARIABLES)
# Binary target: 0 for a negative exam, 1 for anything else.
y1 = np.where(complete_rows[EXAM_RESULT] == "negative", 0, 1)
# Positional index so the integer indices from the CV splitters line up with y1.
X1 = complete_rows[BLOOD_VARIABLES].reset_index(drop=True)

## Removing collinearity through VIF

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor    

def calculate_vif_(X, thresh=5.0):
    """Iteratively drop the column with the highest VIF until all are <= ``thresh``.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature frame; it is not modified.
    thresh : float
        A column is dropped while the largest variance inflation factor exceeds this.

    Returns
    -------
    pandas.DataFrame
        ``X`` restricted to the surviving columns, in their original order.

    NOTE(review): no intercept column is added before calling
    ``variance_inflation_factor`` — confirm the inputs are centred, or add a constant.
    """
    variables = list(range(X.shape[1]))
    # A VIF needs at least one other regressor, so stop once a single column is left
    # (this also makes an empty frame a no-op instead of a max([]) error).
    while len(variables) > 1:
        remaining = X.iloc[:, variables]
        values = remaining.values  # sliced once per round, not once per VIF call
        vif = [variance_inflation_factor(values, ix) for ix in range(values.shape[1])]

        worst = max(vif)
        # `not (worst > thresh)` keeps the original behaviour of stopping on NaN.
        if not worst > thresh:
            break
        maxloc = vif.index(worst)
        # The printed index is the position among the remaining columns.
        print('dropping \'' + str(remaining.columns[maxloc]) +
              '\' at index: ' + str(maxloc))
        del variables[maxloc]

    print('Remaining variables:')
    print(X.columns[variables])
    return X.iloc[:, variables]

In [None]:
# Keep only the blood variables whose VIF stays at or below the default threshold.
X1_selected = calculate_vif_(X1)

In [None]:
# A very large C makes the penalty negligible, i.e. an effectively
# unregularised logistic regression.
lr = LogisticRegression(C=1e10, max_iter=10000)
cv_metrics = ['roc_auc', "precision", "recall"]
results = cross_validate(lr, X1_selected, y1, cv=10, scoring=cv_metrics)
# One row per fold: timings plus the three test scores.
pd.DataFrame(results)

In [None]:
# Repeated stratified hold-out evaluation over n_repeats * 10 random splits
# (StratifiedShuffleSplit's default test size).
# One seeded splitter keeps every split distinct and makes the averages reproducible;
# the previous unseeded splitter gave different numbers on every run.
n_repeats = 20
list_results = []
lpo = StratifiedShuffleSplit(n_splits=n_repeats * 10, random_state=42)
for train_index, test_index in lpo.split(X1_selected, y1):
    lr.fit(X1_selected.iloc[train_index], y1[train_index])
    y_preds = lr.predict(X1_selected.iloc[test_index])
    # Per split: (precision, recall, fscore, support), each an array with one entry per class.
    list_results.append(precision_recall_fscore_support(y1[test_index], y_preds))


In [None]:
# Flatten each split's (4 metrics x 2 classes) result into one row of eight values,
# then show the mean of every metric over all splits.
metric_columns = ["{}{}".format(metric, class_label)
                  for metric in ("Precision", "Recall", "FScore", "Support")
                  for class_label in (0, 1)]
flat_results = np.array(list_results).reshape(n_repeats * 10, -1)
df_results = pd.DataFrame(flat_results, columns=metric_columns)
display(df_results.mean().to_frame().T)

## Coefficients

In [None]:
# Refit on all rows (the evaluation loop left `lr` fitted on a single training split)
# and plot the coefficients sorted from most negative to most positive.
lr.fit(X1_selected, y1)
coef = pd.Series(lr.coef_.ravel(), index=X1_selected.columns)
coef.sort_values().plot.barh()

In [None]:
import statsmodels.api as sm
# Fit the same logistic model with statsmodels to get its summary table.
# The intercept column is appended after the features (prepend=False).
design_matrix = sm.add_constant(X1_selected, prepend=False)

res1 = sm.Logit(y1, design_matrix).fit()
print(res1.summary())

# Support Vector Machine
- The best result is a linear kernel. That does not bode well for our nonlinear classifiers.

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid-search a class-balanced SVM over penalty strength and kernel,
# scored by ROC AUC with 10-fold cross-validation.
svc = SVC(class_weight='balanced')
params = {
    "C": [0.01, 0.1, 1, 10],
    "kernel": ["linear", "poly", "rbf"],
}
gs = GridSearchCV(estimator=svc, param_grid=params, scoring='roc_auc', cv=10)
gs.fit(X1, y1)

In [None]:
# Report the best mean cross-validated ROC AUC and the parameters that achieved it.
print("Best ROC AUC Score:", gs.best_score_.round(2))
print("Best Params:", gs.best_params_)

In [None]:
# Refit the winning SVM on all rows so re-running this cell always starts from a full fit.
svc = gs.best_estimator_.fit(X1, y1)
# SVC only exposes `coef_` for a linear kernel; guard so the cell does not raise
# AttributeError when the grid search picks "poly" or "rbf".
if svc.kernel == "linear":
    coef = pd.Series(svc.coef_.ravel(), index=X1.columns)
    coef.sort_values().plot.barh()
else:
    print("Best kernel is '{}': no coefficients to plot (coef_ needs a linear kernel).".format(svc.kernel))

In [None]:
# Repeated stratified hold-out evaluation of the tuned SVM over n_repeats * 10 random
# splits (StratifiedShuffleSplit's default test size).
# One seeded splitter keeps every split distinct and makes the averages reproducible;
# the previous unseeded splitter gave different numbers on every run.
n_repeats = 20
list_results = []
lpo = StratifiedShuffleSplit(n_splits=n_repeats * 10, random_state=42)
for train_index, test_index in lpo.split(X1, y1):
    svc.fit(X1.iloc[train_index], y1[train_index])
    y_preds = svc.predict(X1.iloc[test_index])
    # Per split: (precision, recall, fscore, support), each an array with one entry per class.
    list_results.append(precision_recall_fscore_support(y1[test_index], y_preds))


In [None]:
# Flatten each split's (4 metrics x 2 classes) result into one row of eight values,
# then show the mean of every metric over all splits.
metric_columns = ["{}{}".format(metric, class_label)
                  for metric in ("Precision", "Recall", "FScore", "Support")
                  for class_label in (0, 1)]
flat_results = np.array(list_results).reshape(n_repeats * 10, -1)
df_results = pd.DataFrame(flat_results, columns=metric_columns)
display(df_results.mean().to_frame().T)

# Decision Tree
- Not good at all

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid-search a class-balanced decision tree over depth and minimum split size,
# scored by ROC AUC with 10-fold cross-validation. Seeded so reruns give the same tree.
dt = DecisionTreeClassifier(class_weight='balanced', random_state=42)
# min_samples_split: floats are fractions of the training set, ints are absolute
# counts and must be >= 2, so the invalid int `1` from the old grid is replaced by 2.
params = {"max_depth": [3, 5, 7], "min_samples_split": [0.01, 0.03, 0.1, 0.3, 2, 3, 10, 30]}
gs = GridSearchCV(dt, params, scoring='roc_auc', cv=10)
gs.fit(X1, y1)

In [None]:
# Report the best mean cross-validated ROC AUC and the parameters that achieved it.
print("Best ROC AUC Score:", gs.best_score_.round(2))
print("Best Params:", gs.best_params_)

# Imputation

In [None]:
# Second run: keep every row and replace missing blood values with 0.
# NOTE: this overwrites the complete-case X1 / y1 built earlier in the notebook.
X1 = df[BLOOD_VARIABLES].fillna(0).reset_index(drop=True)
# Binary target: 0 for a negative exam, 1 for anything else.
y1 = np.where(df[EXAM_RESULT] == "negative", 0, 1)

## Removing collinearity through VIF

In [None]:
# Repeat the VIF-based column selection on the imputed feature frame.
X1_selected = calculate_vif_(X1)

In [None]:
# Effectively unregularised logistic regression (very large C), this time with
# balanced class weights.
lr = LogisticRegression(C=1e10, max_iter=10000, class_weight='balanced')
cv_metrics = ['roc_auc', "precision", "recall"]
results = cross_validate(lr, X1_selected, y1, cv=10, scoring=cv_metrics)
# One row per fold: timings plus the three test scores.
pd.DataFrame(results)

In [None]:
# Repeated stratified hold-out evaluation over n_repeats * 10 random splits
# (StratifiedShuffleSplit's default test size).
# One seeded splitter keeps every split distinct and makes the averages reproducible;
# the previous unseeded splitter gave different numbers on every run.
n_repeats = 20
list_results = []
lpo = StratifiedShuffleSplit(n_splits=n_repeats * 10, random_state=42)
for train_index, test_index in lpo.split(X1_selected, y1):
    lr.fit(X1_selected.iloc[train_index], y1[train_index])
    y_preds = lr.predict(X1_selected.iloc[test_index])
    # Per split: (precision, recall, fscore, support), each an array with one entry per class.
    list_results.append(precision_recall_fscore_support(y1[test_index], y_preds))


In [None]:
# Flatten each split's (4 metrics x 2 classes) result into one row of eight values,
# then show the mean of every metric over all splits.
metric_columns = ["{}{}".format(metric, class_label)
                  for metric in ("Precision", "Recall", "FScore", "Support")
                  for class_label in (0, 1)]
flat_results = np.array(list_results).reshape(n_repeats * 10, -1)
df_results = pd.DataFrame(flat_results, columns=metric_columns)
display(df_results.mean().to_frame().T)

## Coefficients

In [None]:
# Refit on all rows (the evaluation loop left `lr` fitted on a single training split)
# and plot the coefficients sorted from most negative to most positive.
lr.fit(X1_selected, y1)
coef = pd.Series(lr.coef_.ravel(), index=X1_selected.columns)
coef.sort_values().plot.barh()

In [None]:
import statsmodels.api as sm
# Fit the same logistic model with statsmodels to get its summary table.
# The intercept column is appended after the features (prepend=False).
design_matrix = sm.add_constant(X1_selected, prepend=False)

res1 = sm.Logit(y1, design_matrix).fit()
print(res1.summary())

# Support Vector Machine
- The best result is a linear kernel. That does not bode well for our nonlinear classifiers.
- The best score is not very impressive either.

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid-search a class-balanced SVM over penalty strength and kernel, scored by ROC AUC.
# Only 3 folds here, versus 10 in the complete-case run.
svc = SVC(class_weight='balanced')
params = {
    "C": [0.01, 0.1, 1, 10],
    "kernel": ["linear", "poly", "rbf"],
}
gs = GridSearchCV(estimator=svc, param_grid=params, scoring='roc_auc', cv=3)
gs.fit(X1, y1)

In [None]:
# Report the best mean cross-validated ROC AUC and the parameters that achieved it.
print("Best ROC AUC Score:", gs.best_score_.round(2))
print("Best Params:", gs.best_params_)

In [None]:
# Refit the winning SVM on all rows so re-running this cell always starts from a full fit.
svc = gs.best_estimator_.fit(X1, y1)
# SVC only exposes `coef_` for a linear kernel; guard so the cell does not raise
# AttributeError when the grid search picks "poly" or "rbf".
if svc.kernel == "linear":
    coef = pd.Series(svc.coef_.ravel(), index=X1.columns)
    coef.sort_values().plot.barh()
else:
    print("Best kernel is '{}': no coefficients to plot (coef_ needs a linear kernel).".format(svc.kernel))

In [None]:
# Repeated stratified hold-out evaluation of the tuned SVM over n_repeats * 10 random
# splits (StratifiedShuffleSplit's default test size).
# One seeded splitter keeps every split distinct and makes the averages reproducible;
# the previous unseeded splitter gave different numbers on every run.
n_repeats = 20
list_results = []
lpo = StratifiedShuffleSplit(n_splits=n_repeats * 10, random_state=42)
for train_index, test_index in lpo.split(X1, y1):
    svc.fit(X1.iloc[train_index], y1[train_index])
    y_preds = svc.predict(X1.iloc[test_index])
    # Per split: (precision, recall, fscore, support), each an array with one entry per class.
    list_results.append(precision_recall_fscore_support(y1[test_index], y_preds))


In [None]:
# Flatten each split's (4 metrics x 2 classes) result into one row of eight values,
# then show the mean of every metric over all splits.
metric_columns = ["{}{}".format(metric, class_label)
                  for metric in ("Precision", "Recall", "FScore", "Support")
                  for class_label in (0, 1)]
flat_results = np.array(list_results).reshape(n_repeats * 10, -1)
df_results = pd.DataFrame(flat_results, columns=metric_columns)
display(df_results.mean().to_frame().T)

# Decision Tree
- Not good at all

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid-search a class-balanced decision tree over depth and minimum split size,
# scored by ROC AUC with 10-fold cross-validation. Seeded so reruns give the same tree.
dt = DecisionTreeClassifier(class_weight='balanced', random_state=42)
# min_samples_split: floats are fractions of the training set, ints are absolute
# counts and must be >= 2, so the invalid int `1` from the old grid is replaced by 2.
params = {"max_depth": [3, 5, 7], "min_samples_split": [0.01, 0.03, 0.1, 0.3, 2, 3, 10, 30]}
gs = GridSearchCV(dt, params, scoring='roc_auc', cv=10)
gs.fit(X1, y1)

In [None]:
# Report the best mean cross-validated ROC AUC and the parameters that achieved it.
print("Best ROC AUC Score:", gs.best_score_.round(2))
print("Best Params:", gs.best_params_)