# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer  # added for OneHotEncoder in the pipeline
from sklearn.compose import make_column_selector # added to further automate the OneHotEncoder functionality
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.svm import SVC

from sklearn.metrics import classification_report
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Load the dataset

In [2]:
# Read the cleaned penguins CSV, then display its dimensions and column names.
df = pd.read_csv(filepath_or_buffer='./data/penguins_clean.csv')
(df.shape, df.columns)

((333, 7),
 Index(['species', 'island', 'bill_length_mm', 'bill_depth_mm',
        'flipper_length_mm', 'body_mass_g', 'sex'],
       dtype='object'))

# Perform EDA

In [3]:
# EDA is deliberately minimal here: the notebook focuses on how a simple
# pipeline works. We only list the columns that contain missing values
# (an empty Series means there are none).
null_counts = df.isna().sum()
nulls = null_counts > 0
null_counts[nulls]

Series([], dtype: int64)

In [4]:
# Summarise each column's dtype and non-null count to confirm which features
# are numeric and which are categorical before building the preprocessor.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333 entries, 0 to 332
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            333 non-null    object 
 1   island             333 non-null    object 
 2   bill_length_mm     333 non-null    float64
 3   bill_depth_mm      333 non-null    float64
 4   flipper_length_mm  333 non-null    int64  
 5   body_mass_g        333 non-null    int64  
 6   sex                333 non-null    object 
dtypes: float64(2), int64(2), object(3)
memory usage: 18.3+ KB


## Y-data Profile Output
[Penguins Clean](https://psdbia.github.io/ydata/penguins_profile.html)

# Prepare the data for modeling


In [5]:
# 'species' is the prediction target; every other column is a feature.
y = df['species']
X = df.drop(['species'], axis=1)

# Split the data into train and test sets

In [6]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
(X_train.shape, X_test.shape)

((223, 6), (110, 6))

# Define column transformers (numeric scaling and one-hot encoding)

In [7]:
# Column transformer with explicitly named columns:
#   - numeric features are standardised with StandardScaler
#   - categorical features are one-hot encoded (drop='first')
numeric_features = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
categorical_features = ['island', 'sex']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features),
    ]
)

preprocessor

In [8]:
# Column transformer - Pat Method: columns are picked by dtype via
# make_column_selector instead of being listed by name.
#   - int64/float64 columns are scaled with MinMaxScaler
#   - object/category columns are one-hot encoded (drop='first')
numeric_selector = make_column_selector(dtype_include=['int64', 'float64'])
categorical_selector = make_column_selector(dtype_include=['object', 'category'])

preprocessor_minmax = ColumnTransformer(
    transformers=[
        ('num_minmax', MinMaxScaler(), numeric_selector),
        ('cat_onehot', OneHotEncoder(drop='first'), categorical_selector),
    ]
)
preprocessor_minmax

# Create the pipeline using the preprocessor and the chosen classification model

In [9]:
# Pipeline 1: the StandardScaler-based preprocessor followed by a
# k-nearest-neighbours classifier (default hyperparameters).
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('mdl', KNeighborsClassifier()),
    ]
)

pipeline

In [10]:
# Pipeline 2: the MinMaxScaler-based preprocessor followed by a
# k-nearest-neighbours classifier (default hyperparameters).
pipeline_mm = Pipeline(
    steps=[
        ('preprocessor_minmax', preprocessor_minmax),
        ('mdl_mm', KNeighborsClassifier()),
    ]
)

pipeline_mm

# Train and evaluate the models

In [11]:
# Fit the StandardScaler + KNN pipeline on the training split, then predict
# the held-out test split (fit returns the fitted pipeline, so the calls chain).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

      Adelie       0.98      1.00      0.99        52
   Chinstrap       1.00      0.95      0.97        20
      Gentoo       1.00      1.00      1.00        38

    accuracy                           0.99       110
   macro avg       0.99      0.98      0.99       110
weighted avg       0.99      0.99      0.99       110



In [12]:
# Fit the MinMaxScaler + KNN pipeline on the training split, then predict
# the held-out test split. Note: y_pred is overwritten with this pipeline's
# predictions.
y_pred = pipeline_mm.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

      Adelie       0.98      0.98      0.98        52
   Chinstrap       0.95      0.95      0.95        20
      Gentoo       1.00      1.00      1.00        38

    accuracy                           0.98       110
   macro avg       0.98      0.98      0.98       110
weighted avg       0.98      0.98      0.98       110

