<a href="https://colab.research.google.com/github/rahiakela/machine-learning-research-and-practice/blob/main/machine-learning-with-pytorch-and-scikit-learn/06-model-evaluation-and-hyperparameter-tuning/01_streamlining_ml_workflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Streamlining ML workflow

**Reference**

[Machine Learning with PyTorch and Scikit-Learn](https://github.com/rasbt/machine-learning-book)

## Setup

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LogisticRegression

from sklearn.decomposition import PCA

import matplotlib.pyplot as plt

## Dataset

In [2]:
# Load the Breast Cancer Wisconsin (wdbc.data) file straight from the UCI archive.
# header=None: the first line is read as data and the columns get integer labels.
wdbc_df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data",
    header=None,
)

In [3]:
# Preview the first rows to inspect the column layout of the raw frame.
wdbc_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
# (number of rows, number of columns) of the loaded frame.
wdbc_df.shape

(569, 32)

In [5]:
# Split the frame by column label: column 1 is the class label ('M' / 'B'),
# columns 2 onward are the 30 features. Column 0 is used for neither X nor y.
y = wdbc_df.loc[:, 1].values
X = wdbc_df.loc[:, 2:].values

# Encode the string class labels as integers. The fitted encoder is kept so the
# mapping can be inspected or reused later.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)

# Classes in encoded order: the position in this array is the integer code.
label_encoder.classes_

array(['B', 'M'], dtype=object)

In [6]:
# Sanity check: show which integer code the fitted encoder assigns to each
# of the two original class labels.
label_encoder.transform(["M", "B"])

array([1, 0])

In [7]:
# Hold out 20% of the samples as a test set. Stratifying on y keeps the class
# proportions the same in both splits; the fixed random_state makes the split
# repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

## ML pipeline

In [9]:
# Chain standardisation, a 2-component PCA projection and logistic regression
# into one estimator, and fit it on the training split only. The transformers
# are therefore fitted on training data and merely applied to the test data.
ml_pipeline = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    LogisticRegression(),
).fit(x_train, y_train)

# Predicted class labels for the held-out samples.
y_pred = ml_pipeline.predict(x_test)

# score() runs the same transform steps on x_test and reports the final
# classifier's mean accuracy against y_test.
test_accuracy = ml_pipeline.score(x_test, y_test)
print(f"Test accuracy: {test_accuracy:.3f}")

Test accuracy: 0.956
