# Pipeline

In [1]:
import pandas as pd

RANDOM_STATE=137

In [5]:
covid = pd.read_csv('data/corona_tested_individuals_ver_006.english.csv', dtype={
    'age_60_and_above': 'str'
})
covid.test_date = pd.to_datetime(covid.test_date)

In [6]:
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278848 entries, 0 to 278847
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   test_date            278848 non-null  datetime64[ns]
 1   cough                278596 non-null  float64       
 2   fever                278596 non-null  float64       
 3   sore_throat          278847 non-null  float64       
 4   shortness_of_breath  278847 non-null  float64       
 5   head_ache            278847 non-null  float64       
 6   corona_result        278848 non-null  object        
 7   age_60_and_above     151528 non-null  object        
 8   gender               259285 non-null  object        
 9   test_indication      278848 non-null  object        
dtypes: datetime64[ns](1), float64(5), object(4)
memory usage: 21.3+ MB


In [4]:
covid.describe(include='all')

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication
count,278848,278596.0,278596.0,278847.0,278847.0,278847.0,278848,151528,259285,278848
unique,51,,,,,,3,2,2,3
top,2020-04-20,,,,,,negative,No,female,Other
freq,10921,,,,,,260227,125703,130158,242741
mean,,0.151574,0.078077,0.006907,0.005655,0.008657,,,,
std,,0.358608,0.268294,0.082821,0.07499,0.09264,,,,
min,,0.0,0.0,0.0,0.0,0.0,,,,
25%,,0.0,0.0,0.0,0.0,0.0,,,,
50%,,0.0,0.0,0.0,0.0,0.0,,,,
75%,,0.0,0.0,0.0,0.0,0.0,,,,


In [7]:
covid.sample(5)

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication
253577,2020-03-24,1.0,0.0,0.0,0.0,0.0,negative,No,female,Abroad
44447,2020-04-23,0.0,0.0,0.0,0.0,0.0,negative,,female,Other
39069,2020-04-24,0.0,0.0,0.0,0.0,0.0,negative,,female,Other
262577,2020-03-22,1.0,1.0,0.0,0.0,0.0,negative,No,male,Contact with confirmed
152229,2020-04-10,1.0,0.0,0.0,0.0,0.0,negative,No,female,Other


In [8]:
from ydata_profiling import ProfileReport

ProfileReport(covid, title="Covid Profiling Report").to_notebook_iframe()

ModuleNotFoundError: No module named 'ydata_profiling'

In [220]:
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler

from sklearn.compose import ColumnTransformer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Models
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

# Evaluation Metric
from sklearn.metrics import roc_auc_score

num_features = ['cough', 'fever', 'sore_throat', 'shortness_of_breath', 'head_ache']
cat_features = ['test_indication']

num_transformer = Pipeline([
    # ('imputer', SimpleImputer(strategy='median'))
])

cat_transformer = Pipeline([
    ('encoder', OneHotEncoder()),
    # ("selector", SelectPercentile(chi2, percentile=50))
])

preprocessor = ColumnTransformer(
    transformers=[
        # ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ],
    remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', 'passthrough'),
    # ('classifier', XGBClassifier())
    ('classifier', DecisionTreeClassifier())
])

params = {
    'scaler': [StandardScaler(), MaxAbsScaler()],
    # 'classifier__learning_rate': [0.3, 0.5, 0.7],
    'classifier__n_estimators': [10, 50, 100]
    'classifier__max_depth': [2, 4, 6]
}

model = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='roc_auc', # https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    cv=2
)

X = covid.drop(columns=['encoded_corona_result', 'test_date']).copy()
y = covid.encoded_corona_result.copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=RANDOM_STATE)

# model = XGBClassifier(objective='multi:softmax')
# model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [221]:
model.best_params_

{'classifier__n_estimators': 10}

In [187]:

# print(f'ROC AUC score: {roc_auc_score(y_test, model.predict(X_test)):0.4f}')

# n_estimators=100, max_depth=6, objective='multi:softmax'

array([[0.9316995 , 0.03658113, 0.03171932],
       [0.9316995 , 0.03658113, 0.03171932],
       [0.9316995 , 0.03658113, 0.03171932],
       ...,
       [0.9316995 , 0.03658113, 0.03171932],
       [0.9316995 , 0.03658113, 0.03171932],
       [0.9316995 , 0.03658113, 0.03171932]], dtype=float32)

In [223]:
y_test

267506    0
210664    0
33542     0
232328    0
254684    0
         ..
87890     0
203900    0
256779    1
271317    0
239038    1
Name: encoded_corona_result, Length: 54941, dtype: int64

In [225]:
model.predict(X_test)

array([0, 0, 0, ..., 1, 0, 0])

In [229]:
model.predict_proba(X_test)[:, 1]

array([0.01593744, 0.01593744, 0.01593744, ..., 0.92425686, 0.01593744,
       0.09324414], dtype=float32)