# Pipeline

In [24]:
import pandas as pd

# Fixed seed so randomized steps (e.g. the train/test split) are reproducible.
RANDOM_STATE = 137

In [ ]:
# Load the tested-individuals dataset. 'age_60_and_above' is forced to str
# rather than leaving its dtype to pandas' inference.
covid = pd.read_csv(
    'data/corona_tested_individuals_ver_006.english.csv',
    dtype={'age_60_and_above': 'str'},
)

In [ ]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
covid.info()

In [ ]:
# covid = covid.drop(columns=['age_60_and_above', 'gender', 'test_date'])

In [ ]:
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler

from sklearn.compose import ColumnTransformer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Models
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

# Evaluation Metric
from sklearn.metrics import roc_auc_score

# Columns intended for the numeric branch of the preprocessor. That branch is
# currently disabled, so these columns reach the model through
# remainder='passthrough' instead.
num_features = ['cough', 'fever', 'sore_throat', 'shortness_of_breath', 'head_ache']

# Columns that are one-hot encoded.
cat_features = [
    'test_indication',
    # 'age_60_and_above',
    # 'gender'
]

# Placeholder for numeric preprocessing; unused while the 'num' transformer
# below stays commented out.
num_transformer = Pipeline([
    # ('imputer', SimpleImputer(strategy='median'))
])

cat_transformer = Pipeline([
    ('encoder', OneHotEncoder()),
    # ("selector", SelectPercentile(chi2, percentile=50))
])

# NOTE(review): remainder='passthrough' forwards every column of X that is not
# listed in cat_features. 'age_60_and_above' is loaded as str and is not
# dropped from X below, so it presumably reaches the scalers as text -- confirm
# the frame is fully numeric apart from cat_features, or drop/encode the rest.
preprocessor = ColumnTransformer(
    transformers=[
        # ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ],
    remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # Placeholder step; the grid search swaps in the scalers listed in `params`.
    ('scaler', 'passthrough'),
    # ('classifier', XGBClassifier())
    # Seeded so repeated runs of the search produce the same trees.
    ('classifier', DecisionTreeClassifier(random_state=RANDOM_STATE))
])

# Search space: 2 scalers x 3 tree depths.
params = {
    'scaler': [StandardScaler(), MaxAbsScaler()],
    # XGBClassifier-only settings. DecisionTreeClassifier has neither
    # parameter, and GridSearchCV rejects grid keys the estimator does not
    # accept, so re-enable these only together with the XGBClassifier step.
    # 'classifier__learning_rate': [0.3, 0.5, 0.7],
    # 'classifier__n_estimators': [10, 50, 100],
    'classifier__max_depth': [2, 4, 6],
}

model = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='roc_auc', # https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    cv=2
)

# Features are everything except the target and the test date; the target is
# the encoded test result.
X = covid.drop(columns=['encoded_corona_result', 'test_date']).copy()
y = covid.encoded_corona_result.copy()

# Stratified 80/20 split so both parts keep the target's class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=RANDOM_STATE)

# model = XGBClassifier(objective='multi:softmax')
# model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [ ]:
# Hyperparameter combination selected by the grid search.
model.best_params_

In [ ]:

# print(f'ROC AUC score: {roc_auc_score(y_test, model.predict(X_test)):0.4f}')

# n_estimators=100, max_depth=6, objective='multi:softmax'

In [ ]:
# Display the held-out target labels.
y_test

In [ ]:
# Class-label predictions for the held-out test features.
pred = model.predict(X_test)

In [ ]:
# Per-class probability estimates for the held-out test features.
predicted_probabilities = model.predict_proba(X_test)

In [ ]:
# Display the predicted labels.
pred

In [ ]:
# ROC AUC on the test split, scaled to a percentage. The last predict_proba
# column is used as the score for each sample.
roc_auc_score(y_test, predicted_probabilities[:, -1])*100

In [ ]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


# Number of TN, FP, FN, TP among the predicted diagnoses

def plot_confusion_matrix(y_test, predictions):
    """Draw the confusion matrix of the predictions as an annotated heatmap.

    Rows are the true labels and columns the predicted labels. Assuming the
    negative class sorts before the positive one (e.g. a 0/1 encoding), the
    cells read:

        top-left     - true negatives
        top-right    - false positives
        bottom-left  - false negatives
        bottom-right - true positives

    Args:
        y_test: True labels.
        predictions: Predicted labels, aligned with ``y_test``.
    """
    tick_labels = ["Negative", "Positive"]
    counts = confusion_matrix(y_test, predictions)

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        counts,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=tick_labels,
        yticklabels=tick_labels,
    )
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()


# Plot the confusion matrix for the test-set predictions.
plot_confusion_matrix(y_test, pred)

## Initial interpretation

- Model exhibits high performance for correctly classifying negative diagnoses, as expected given the class imbalance.
- The number of false positives is low, indicating that few negative cases are misdiagnosed as positive.
- Significantly more FN than FP, indicating the model is more likely to miss actual positive cases (FN) than to misdiagnose negative cases as positive (FP).

- Expected to see this type of distribution, given the heavy class imbalance
- The TP and FP counts are not far apart, given the small subset of positive diagnoses overall. This is a strong indicator that the model struggles to identify actual positive cases.