# Pipeline

In [1]:
import pandas as pd

RANDOM_STATE=137

In [9]:
covid = pd.read_csv('data/corona_tested_individuals_ver_006.english.csv', dtype={
    'age_60_and_above': 'str'
})

In [10]:
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278848 entries, 0 to 278847
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   test_date            278848 non-null  object 
 1   cough                278596 non-null  float64
 2   fever                278596 non-null  float64
 3   sore_throat          278847 non-null  float64
 4   shortness_of_breath  278847 non-null  float64
 5   head_ache            278847 non-null  float64
 6   corona_result        278848 non-null  object 
 7   age_60_and_above     151528 non-null  object 
 8   gender               259285 non-null  object 
 9   test_indication      278848 non-null  object 
dtypes: float64(5), object(5)
memory usage: 21.3+ MB


In [206]:
# covid = covid.drop(columns=['age_60_and_above', 'gender', 'test_date'])

  covid = covid.replace({


In [220]:
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler

from sklearn.compose import ColumnTransformer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Models
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

# Evaluation Metric
from sklearn.metrics import roc_auc_score

num_features = ['cough', 'fever', 'sore_throat', 'shortness_of_breath', 'head_ache']
cat_features = ['test_indication']

num_transformer = Pipeline([
    # ('imputer', SimpleImputer(strategy='median'))
])

cat_transformer = Pipeline([
    ('encoder', OneHotEncoder()),
    # ("selector", SelectPercentile(chi2, percentile=50))
])

preprocessor = ColumnTransformer(
    transformers=[
        # ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ],
    remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', 'passthrough'),
    # ('classifier', XGBClassifier())
    ('classifier', DecisionTreeClassifier())
])

params = {
    'scaler': [StandardScaler(), MaxAbsScaler()],
    # 'classifier__learning_rate': [0.3, 0.5, 0.7],
    'classifier__n_estimators': [10, 50, 100]
    'classifier__max_depth': [2, 4, 6]
}

model = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='roc_auc', # https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    cv=2
)

X = covid.drop(columns=['encoded_corona_result', 'test_date']).copy()
y = covid.encoded_corona_result.copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=RANDOM_STATE)

# model = XGBClassifier(objective='multi:softmax')
# model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [221]:
model.best_params_

{'classifier__n_estimators': 10}

In [187]:

# print(f'ROC AUC score: {roc_auc_score(y_test, model.predict(X_test)):0.4f}')

# n_estimators=100, max_depth=6, objective='multi:softmax'

array([[0.9316995 , 0.03658113, 0.03171932],
       [0.9316995 , 0.03658113, 0.03171932],
       [0.9316995 , 0.03658113, 0.03171932],
       ...,
       [0.9316995 , 0.03658113, 0.03171932],
       [0.9316995 , 0.03658113, 0.03171932],
       [0.9316995 , 0.03658113, 0.03171932]], dtype=float32)

In [223]:
y_test

267506    0
210664    0
33542     0
232328    0
254684    0
         ..
87890     0
203900    0
256779    1
271317    0
239038    1
Name: encoded_corona_result, Length: 54941, dtype: int64

In [225]:
model.predict(X_test)

array([0, 0, 0, ..., 1, 0, 0])

In [229]:
model.predict_proba(X_test)[:, 1]

array([0.01593744, 0.01593744, 0.01593744, ..., 0.92425686, 0.01593744,
       0.09324414], dtype=float32)