In [1]:
import pandas as pd
import numpy as np
 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer  # added for OneHotEncoder in the pipeline
from sklearn.compose import make_column_selector # added to further automate the OneHotEncoder functionality
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
 
from sklearn.metrics import classification_report
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

In [2]:
# Load the cleaned penguins table and discard the 'Unnamed: 0' column
# (presumably a row index written out when the CSV was saved -- confirm).
df = (
    pd.read_csv('./data/penguins_clean.csv')
    .drop(columns='Unnamed: 0')
)
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
4,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


In [3]:
# Per-column count of missing values, showing only columns that have any.
# An empty Series means there is nothing left to impute.
df.isna().sum().loc[lambda na_counts: na_counts > 0]

Series([], dtype: int64)

In [4]:
# Target: the species label. Features: every remaining column.
y = df['species']
X = df.drop(columns=['species'])

In [5]:
# Hold out 35% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=42,
)

In [6]:
# Preprocessing with explicitly listed columns: the four numeric measurements
# are standardised, while island and sex are one-hot encoded with the first
# level of each dropped.
preprocessor = ColumnTransformer(transformers=[
    (
        'num',
        StandardScaler(),
        ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'],
    ),
    (
        'cat',
        OneHotEncoder(drop='first'),
        ['island', 'sex'],
    ),
])
preprocessor

In [7]:
# Alternative preprocessing that picks columns by dtype instead of by name:
# int64/float64 columns are min-max scaled, object/category columns are
# one-hot encoded with the first level of each dropped.
preprocessor_minmax = ColumnTransformer(transformers=[
    (
        'num_minmax',
        MinMaxScaler(),
        make_column_selector(dtype_include=['int64', 'float64']),
    ),
    (
        'cat_onehot',
        OneHotEncoder(drop='first'),
        make_column_selector(dtype_include=['object', 'category']),
    ),
])
preprocessor_minmax

In [8]:
# Chain the StandardScaler-based preprocessing with a k-nearest-neighbours
# classifier left at its default settings.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('mdl', KNeighborsClassifier()),
])
pipeline

In [9]:
# Same model as `pipeline`, but fed by the MinMaxScaler / dtype-selected
# preprocessing so the two scaling choices can be compared.
pipeline_minmax = Pipeline(steps=[
    ('preprocessor_minmax', preprocessor_minmax),
    ('mdl_minmax', KNeighborsClassifier()),
])
pipeline_minmax

In [10]:
# Fit the StandardScaler + KNN pipeline on the training split, predict the
# held-out split, and print per-class precision / recall / F1.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

      Adelie       0.98      0.98      0.98        55
   Chinstrap       0.96      0.96      0.96        23
      Gentoo       1.00      1.00      1.00        39

    accuracy                           0.98       117
   macro avg       0.98      0.98      0.98       117
weighted avg       0.98      0.98      0.98       117



In [11]:
# Fit the MinMaxScaler + KNN pipeline on the same training split and report
# per-class metrics on the same held-out split.
# NOTE: `y_pred` is reassigned here, replacing the predictions from the
# StandardScaler pipeline.
y_pred = pipeline_minmax.fit(X_train, y_train).predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

      Adelie       0.98      0.98      0.98        55
   Chinstrap       0.96      0.96      0.96        23
      Gentoo       1.00      1.00      1.00        39

    accuracy                           0.98       117
   macro avg       0.98      0.98      0.98       117
weighted avg       0.98      0.98      0.98       117

