<a href="https://colab.research.google.com/github/rtajeong/2025_machine_learning_class/blob/main/Pipeline_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pipeline example
- This is a simple but informative example for Pipeline.
- notebook source from https://drive.google.com/drive/folders/1j0ZkQ3bWlYYBt60idE14VtDGPi7-bsqM
- download the dataset from: https://www.kaggle.com/datasets/kumargh/pimaindiansdiabetescsv

In [None]:
# Disabled: would display a local screenshot. The path below is an absolute
# Windows path, so this cell cannot run on other machines as written.
# from IPython.display import Image
# Image(filename='C:\\Users\\User\\Desktop\\capture.png')

In [None]:
# Filtering warnings.
# NOTE: with no category or message given, the 'ignore' action applies to every
# warning raised for the rest of the session, so library warnings (e.g.
# deprecation or convergence notices) will not be shown by later cells.
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

- data preparation

In [None]:
# Alternative source once the archive is extracted:
# pd.read_csv("pima-indians-diabetes.csv")
#
# The archive is read with header=None, i.e. its first line is treated as data,
# so the nine column names are attached explicitly afterwards.
column_names = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
    "Class",
]
df = pd.read_csv("archive.zip", header=None).set_axis(column_names, axis="columns")

In [None]:
# Preview the first rows to check that the column names were applied as intended.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


- split the data into training and test sets

In [None]:
# Target label is the 'Class' column; every other column is a feature.
y = df['Class']
X = df.drop(columns=['Class'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

- create pipelines: Pipeline steps will include
  - 1. Data Preprocessing using StandardScaler
  - 2. Reducing Dimensionality using PCA
  - 3. Training respective models

In [None]:
# Seed for the stochastic estimators so that re-running the notebook from a
# fresh kernel reproduces the same fitted models and accuracies.
RANDOM_STATE = 1


def build_pipeline(classifier_step, classifier, n_components=3):
    """Build a scaler -> PCA -> classifier pipeline.

    All three pipelines share the same preprocessing and differ only in the
    final estimator, so the common part lives here.

    Parameters
    ----------
    classifier_step : str
        Name under which the classifier is registered in the pipeline
        (used later via ``named_steps[...]`` and ``get_params()`` keys).
    classifier : estimator
        Unfitted scikit-learn classifier used as the final step.
    n_components : int, default=3
        Number of principal components kept by the PCA step.

    Returns
    -------
    Pipeline
        Unfitted pipeline with steps 'myscaler', 'mypca' and `classifier_step`.
    """
    return Pipeline([('myscaler', StandardScaler()),
                     ('mypca', PCA(n_components=n_components)),
                     (classifier_step, classifier)])


# Logistic Regression Pipeline
LR_Pipe = build_pipeline('logistic_classifier', LogisticRegression())

# Decision Tree Pipeline (seeded: tree construction is randomized)
DT_Pipe = build_pipeline('decisiontree_classifier',
                         DecisionTreeClassifier(random_state=RANDOM_STATE))

# Random Forest Pipeline (seeded: bootstrap samples and feature subsets are random)
RF_Pipe = build_pipeline('randomforest_classifier',
                         RandomForestClassifier(random_state=RANDOM_STATE))

- model training and validation

In [None]:
# Collect the pipelines in a list. The order matters: later cells look up each
# pipeline's display name by its position in this list.
mypipeline = [LR_Pipe, DT_Pipe, RF_Pipe]

In [None]:
# Display name for each pipeline, keyed by its position in `mypipeline`.
pipeline_labels = ['Logistic Regression', 'Decision Tree', 'Random Forest']
PipelineDict = dict(enumerate(pipeline_labels))

# Train every pipeline on the training split. Each fit call runs the scaler,
# PCA and classifier steps in sequence on the same data.
for mypipe in mypipeline:
    mypipe.fit(X=X_train, y=y_train)

In [None]:
# Report the held-out accuracy of every fitted pipeline, labelled by name.
for i, model in enumerate(mypipeline):
    label = PipelineDict[i]
    test_accuracy = model.score(X_test, y_test)
    print("{} Test Accuracy: {}".format(label, test_accuracy))

Logistic Regression Test Accuracy: 0.7727272727272727
Decision Tree Test Accuracy: 0.7142857142857143
Random Forest Test Accuracy: 0.7272727272727273


In [None]:
# Choose the best model for the given data.
#
# Each fitted pipeline is scored exactly once on the test split, then the index
# of the highest score is taken. `max` returns the first maximal element, so on
# a tie the earliest pipeline in `mypipeline` wins.
# `pipeline` is always bound to a real Pipeline object, never a placeholder
# string, so later cells can call its methods safely.
#
# NOTE(review): the winner is picked using the same test split that is reported
# as "test accuracy", so that figure is likely optimistic for the chosen model;
# consider a separate validation split or cross-validation for selection.
test_scores = [model.score(X_test, y_test) for model in mypipeline]

classifier = max(range(len(test_scores)), key=test_scores.__getitem__)
accuracy = test_scores[classifier]
pipeline = mypipeline[classifier]
print('Classifier with best accuracy:{}'.format(PipelineDict[classifier]))

Classifier with best accuracy:Logistic Regression


In [None]:
# List the parameters of the selected pipeline. Parameters of the individual
# steps appear under keys of the form '<step name>__<parameter name>'.
pipeline.get_params()

{'memory': None,
 'steps': [('myscaler', StandardScaler()),
  ('mypca', PCA(n_components=3)),
  ('logistic_classifier', LogisticRegression())],
 'transform_input': None,
 'verbose': False,
 'myscaler': StandardScaler(),
 'mypca': PCA(n_components=3),
 'logistic_classifier': LogisticRegression(),
 'myscaler__copy': True,
 'myscaler__with_mean': True,
 'myscaler__with_std': True,
 'mypca__copy': True,
 'mypca__iterated_power': 'auto',
 'mypca__n_components': 3,
 'mypca__n_oversamples': 10,
 'mypca__power_iteration_normalizer': 'auto',
 'mypca__random_state': None,
 'mypca__svd_solver': 'auto',
 'mypca__tol': 0.0,
 'mypca__whiten': False,
 'logistic_classifier__C': 1.0,
 'logistic_classifier__class_weight': None,
 'logistic_classifier__dual': False,
 'logistic_classifier__fit_intercept': True,
 'logistic_classifier__intercept_scaling': 1,
 'logistic_classifier__l1_ratio': None,
 'logistic_classifier__max_iter': 100,
 'logistic_classifier__multi_class': 'deprecated',
 'logistic_classifier_

In [None]:
# Display the selected pipeline object itself (bare expression -> cell output).
pipeline

In [None]:
# Access the object of a specific pipeline step by its name
pipeline.named_steps['myscaler'].get_params()  # returns all parameters without a trailing underscore, i.e. those set before training

{'copy': True, 'with_mean': True, 'with_std': True}

- 밑줄(`_`)로 끝나는 속성들(예: `mean_`, `coef_`)은 모델을 훈련한 후에만 접근할 수 있으며, `get_params()`에는 포함되지 않으므로 직접 찾아야 함.

In [None]:
# Fitted attributes of the scaler step (trailing underscore): the per-feature
# mean and scale it learned when the pipeline was fitted on the training split.
pipeline.named_steps['myscaler'].mean_, pipeline.named_steps['myscaler'].scale_

(array([  3.86644951, 121.25732899,  68.86156352,  19.82084691,
         79.03583062,  31.83241042,   0.46379479,  33.2980456 ]),
 array([  3.38492752,  31.63480905,  19.72255578,  15.84679774,
        116.49057512,   7.62427174,   0.327095  ,  11.80062782]))

In [None]:
# Coefficients of the fitted logistic regression step.
# NOTE: only LR_Pipe has a step named 'logistic_classifier'; if a different
# pipeline was selected as `pipeline`, this lookup raises KeyError.
pipeline.named_steps['logistic_classifier'].coef_

array([[ 0.63726878,  0.40106695, -0.50568754]])

In [None]:
# Intercept term of the fitted logistic regression step.
# NOTE: only LR_Pipe has a step named 'logistic_classifier'; if a different
# pipeline was selected as `pipeline`, this lookup raises KeyError.
pipeline.named_steps['logistic_classifier'].intercept_

array([-0.76494229])

------------------------------