# Pipeline to automate data preprocessing and model training

## Load data

In [11]:
import pandas as pd

path = '../../../../data/Heart-disease/output/filled_missing.csv'
df = pd.read_csv(path, index_col=0)
df

Unnamed: 0,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,disease
0,63,Male,Cleveland,typical angina,145,233,1,lv hypertrophy,150,0,2.300000,No
1,67,Male,Cleveland,asymptomatic,160,286,0,lv hypertrophy,108,1,1.500000,Moderate
...,...,...,...,...,...,...,...,...,...,...,...,...
918,58,Male,VA Long Beach,asymptomatic,139,385,1,lv hypertrophy,124,1,1.287899,No
919,62,Male,VA Long Beach,atypical angina,120,254,0,lv hypertrophy,93,1,0.000000,Mild


## Feature selection

In [12]:
target = 'disease'

y = df[target]
X = df.drop(columns=target)

## Train test split

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

## Pipeline

### Data preprocessing

In [14]:
features = X.dtypes

features_categorical = features[features == 'object'].index
features_numerical = features[features != 'object'].index

print(f'Categorical: {features_categorical}')
print(f'Numerical: {features_numerical}')

Categorical: Index(['sex', 'dataset', 'cp', 'restecg'], dtype='object')
Numerical: Index(['age', 'trestbps', 'chol', 'fbs', 'thalch', 'exang', 'oldpeak'], dtype='object')


In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), features_categorical),
        ('scaler', MinMaxScaler(), features_numerical)
    ])

### Model

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier()

### Alltogether

In [20]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

pipeline.fit(X_train, y_train)

In [21]:
pipeline.score(X_test, y_test)

0.5615942028985508

In [22]:
pipeline.score(X_train, y_train)

0.9130434782608695

## Pipeline with GridSearchCV

In [25]:
from sklearn.model_selection import GridSearchCV

model = GradientBoostingClassifier()
param_grid = {
    'max_depth': [5, 7, 9, 11],
    'learning_rate': [0.001, 0.01, 0.1, 1]
}

cv = GridSearchCV(model, param_grid)

In [26]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', cv)
])

pipeline.fit(X_train, y_train)

In [27]:
pipeline.named_steps['model'].best_params_

{'learning_rate': 0.01, 'max_depth': 5}

In [28]:
pipeline.score(X_test, y_test)

0.5869565217391305

In [29]:
pipeline.score(X_train, y_train)

0.8183229813664596

## Pipeline with another model

In [30]:
from sklearn.svm import SVC

model = SVC(probability=True)
model.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': True,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [31]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}

cv = GridSearchCV(model, param_grid)
cv

In [18]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', cv)
])

pipeline.fit(X_train, y_train)

In [19]:
pipeline.score(X_test, y_test)

0.7101449275362319

In [20]:
pipeline.score(X_train, y_train)

0.7432712215320911

In [21]:
pipeline.steps[1][1].best_params_

{'C': 10, 'gamma': 0.1, 'kernel': 'linear'}

In [23]:
import pickle

path = '../../../artifacts/pipeline.pkl'

with open(path, 'wb') as file:
    pickle.dump(pipeline, file)