In [1]:
%run "/Users/rileyfox/Code/Feature_Engineering/Grid_Search_Functions.ipynb"

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, StandardScaler, PowerTransformer, KBinsDiscretizer
from sklearn.pipeline import Pipeline, FeatureUnion
from feature_engine.imputation import EndTailImputer
from sklearn.feature_selection import mutual_info_classif, SelectKBest, chi2, SelectFromModel
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from Transformer_for_Risk_Factors import DummifyRiskFactor

In [3]:
covid_flu = pd.read_csv('/Users/rileyfox/Code/Feature_Engineering/Chapter_3/covid_flu.csv')

In [4]:
# make a Female binary feature instead of M/F sex
covid_flu['Female'] = covid_flu['Sex'] == 'F'
del covid_flu['Sex']

# replace Yes/No with True/False
covid_flu = covid_flu.replace({'Yes': True, 'No': False})

# FluSymptoms feature
flu_symptoms = [
    'Diarrhea', 'Fever', 'Coughing', 'SoreThroat',
    'NauseaVomitting', 'Fatigue'
]
covid_flu['FluSymptoms'] = covid_flu[flu_symptoms].sum(axis=1) >= 2

# make age positive for box-cox
covid_flu['Age'] = covid_flu['Age'] + .01

In [5]:
X, y = covid_flu.drop(['Diagnosis'], axis=1), covid_flu['Diagnosis']
x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0, test_size=.2)

In [6]:
# vectorize risk factors using custom transformer
risk_factor_pipeline = Pipeline(
    [
        ('select_and_parse_risk_factor', FunctionTransformer(lambda df: df['RiskFactors'])),
        ('dummify', DummifyRiskFactor()),
        ('tree_selector', SelectFromModel(max_features=20, estimator=DecisionTreeClassifier()))
    ]
)

# deal with binary columns

binary_features = [
    'Female', 'GroundGlassOpacity', 'CTscanResults',
    'Diarrhea', 'Fever', 'FluSymptoms', 'Coughing',
    'SoreThroat', 'NauseaVomitting', 'Fatigue', 'InitialPCRDiagnosis'
]

binary_pipeline = Pipeline(
    [
        ('select_categorical_features', FunctionTransformer(lambda df: df[binary_features])),
        ('fillna', SimpleImputer(strategy='constant', fill_value=False))
    ]
)

# deal with numerical columns

numeric_types = ['float16', 'float32', 'float64', 'int16', 'int32', 'int64']  # pandas numeric types

numerical_columns = covid_flu.select_dtypes(include=numeric_types).columns.tolist()

numerical_pipeline = Pipeline(
    [
        ('select_numerical_features', FunctionTransformer(lambda df: df[numerical_columns])),
        ('Box-Cox', PowerTransformer('box-cox', standardize=True)),
        ('turn_into_df', FunctionTransformer(lambda matrix: pd.DataFrame(matrix))),
        ('end_of_tail', EndTailImputer(imputation_method='gaussian')),
        ('ordinal_bins', KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='kmeans'))
    ]
)

In [7]:
# concat all pipelines
simple_fe = FeatureUnion([
    ('risk_factors', risk_factor_pipeline),
    ('binary_pipeline', binary_pipeline),
    ('numerical_pipeline', numerical_pipeline)
])

simple_fe.fit_transform(x_train, y_train).shape

best_model = simple_grid_search(x_train, y_train, x_test, y_test, simple_fe)

2023-01-09 19:29:15,849 - Feature Engineering Log - Parsing took 0.08 seconds
2023-01-09 19:29:24,581 - Feature Engineering Log - Training took 8.81 seconds
2023-01-09 19:29:24,604 - Feature Engineering Log - Overall took 8.83 seconds


              precision    recall  f1-score   support

     COVID19       0.84      0.84      0.84        82
        H1N1       0.94      0.94      0.94       215

    accuracy                           0.91       297
   macro avg       0.89      0.89      0.89       297
weighted avg       0.91      0.91      0.91       297



In [8]:
simple_fe.transformer_list

[('risk_factors',
  Pipeline(steps=[('select_and_parse_risk_factor',
                   FunctionTransformer(func=<function <lambda> at 0x13b5ec400>)),
                  ('dummify', DummifyRiskFactor()),
                  ('tree_selector',
                   SelectFromModel(estimator=DecisionTreeClassifier(),
                                   max_features=20))])),
 ('binary_pipeline',
  Pipeline(steps=[('select_categorical_features',
                   FunctionTransformer(func=<function <lambda> at 0x13b5ec540>)),
                  ('fillna',
                   SimpleImputer(fill_value=False, strategy='constant'))])),
 ('numerical_pipeline',
  Pipeline(steps=[('select_numerical_features',
                   FunctionTransformer(func=<function <lambda> at 0x105128c20>)),
                  ('Box-Cox', PowerTransformer(method='box-cox')),
                  ('turn_into_df',
                   FunctionTransformer(func=<function <lambda> at 0x105128ea0>)),
                  ('end_of_tail', En