# Pipeline

In [1]:
import pandas as pd

RANDOM_STATE=137

In [2]:
covid = pd.read_csv('data/corona_tested_individuals_ver_006.english.csv', dtype={
    'corona_result': 'string[pyarrow]',
    'age_60_and_above': 'string[pyarrow]',
    'gender': 'string[pyarrow]',
    'test_indication': 'string[pyarrow]'
}, parse_dates=['test_date'])
covid.sample(5)

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication
22036,2020-04-27,0.0,0.0,0.0,0.0,0.0,negative,,female,Other
48570,2020-04-23,0.0,0.0,0.0,0.0,0.0,negative,,male,Other
190827,2020-04-03,0.0,0.0,0.0,0.0,0.0,negative,No,male,Other
238966,2020-03-27,0.0,0.0,0.0,0.0,0.0,negative,No,male,Other
31276,2020-04-26,0.0,0.0,0.0,0.0,0.0,negative,,male,Other


In [3]:
print(covid.isna().sum() / covid.shape[0] * 100)
print()
print(covid.isna().sum())

test_date               0.000000
cough                   0.090372
fever                   0.090372
sore_throat             0.000359
shortness_of_breath     0.000359
head_ache               0.000359
corona_result           0.000000
age_60_and_above       45.659284
gender                  7.015650
test_indication         0.000000
dtype: float64

test_date                   0
cough                     252
fever                     252
sore_throat                 1
shortness_of_breath         1
head_ache                   1
corona_result               0
age_60_and_above       127320
gender                  19563
test_indication             0
dtype: int64


In [4]:
from ydata_profiling import ProfileReport

ProfileReport(covid, title="Covid Profiling Report").to_notebook_iframe()

ModuleNotFoundError: No module named 'ydata_profiling'

In [8]:
# Removes the "other" label from the target
covid = covid[covid.corona_result != 'other']
# Creates "contact_with_confirmed" column out of "test_indication"
covid = covid.assign(contact_with_confirmed=(covid.test_indication == 'Contact with confirmed').map({True: 1, False: 0}));
# Removes "test_indication" column
covid = covid.drop(columns=['test_indication'])

In [6]:
covid.corona_result = covid.corona_result.map({'negative': 0, 'positive': 1})
covid.gender = covid.gender.map({'female': 0, 'male': 1})

In [7]:
covid = covid.drop(columns='age_60_and_above')

In [8]:
covid = covid.drop(columns='test_date')

In [9]:
covid = covid.dropna()

In [10]:
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler

from sklearn.compose import ColumnTransformer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import KMeansSMOTE

# Models
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

# Evaluation Metric
from sklearn.metrics import roc_auc_score

num_features = ['cough', 'fever', 'sore_throat', 'shortness_of_breath', 'head_ache', 'gender', 'contact_with_confirmed']

num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median'))
])

cat_transformer = Pipeline([
    # ('encoder', OneHotEncoder()),
    # ("selector", SelectPercentile(chi2, percentile=50))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        # ('cat', cat_transformer, cat_features)
    ]
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # This should be wrapped in a "Transformer" that implements "fit" and "transform", otherwise it doesn't work
    # ('resampler', KMeansSMOTE(sampling_strategy='minority', random_state=RANDOM_STATE, k_neighbors=2)),
    ('scaler', 'passthrough'),
    ('classifier', XGBClassifier())
])

params = {
    'scaler': [StandardScaler(), MaxAbsScaler(), 'passthrough'],
    # 'classifier__learning_rate': [0.3, 0.5, 0.7],
    'classifier__n_estimators': [10, 100],
    'classifier__max_depth': [2, 6]
}

model = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='roc_auc', # https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    cv=2
)

X = covid.drop(columns=['corona_result'])
y = covid.corona_result

sm = KMeansSMOTE(sampling_strategy='minority', random_state=RANDOM_STATE, k_neighbors=2)
X, y = sm.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=RANDOM_STATE)

model.fit(X_train, y_train)

In [11]:
model.best_params_

{'classifier__max_depth': 6,
 'classifier__n_estimators': 100,
 'scaler': StandardScaler()}

In [12]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [13]:
print(f'ROC AUC score: {roc_auc_score(y_test, y_pred_proba[:, 1]):0.4f}')

ROC AUC score: 0.9923


In [14]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, target_names=['negative', 'positive']))

              precision    recall  f1-score   support

    negative       0.97      0.99      0.98     48428
    positive       0.99      0.97      0.98     48427

    accuracy                           0.98     96855
   macro avg       0.98      0.98      0.98     96855
weighted avg       0.98      0.98      0.98     96855

