In [23]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the iris dataset and assemble one DataFrame: a column per feature plus
# a "target" column that holds the species name instead of the integer label.
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(
    # Index the array of class names with the integer labels (code -> name).
    target=data.target_names[data.target]
)
df.head(3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa


In [14]:
# Remove duplicated rows. keep=False marks *every* member of a duplicate
# group, so no copy of a repeated row is retained (not even the first one).
df = df.loc[~df.duplicated(keep=False)]

In [15]:
# Fraction of missing values per column (mean of the boolean null mask).
df.isna().mean()

sepal length (cm)    0.0
sepal width (cm)     0.0
petal length (cm)    0.0
petal width (cm)     0.0
target               0.0
dtype: float64

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns
# of the de-duplicated frame.
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,148.0,148.0,148.0,148.0
mean,5.843919,3.062162,3.739865,1.189865
std,0.833665,0.436811,1.770266,0.762982
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.575,0.3
50%,5.8,3.0,4.3,1.3
75%,6.4,3.325,5.1,1.8
max,7.9,4.4,6.9,2.5


In [18]:
# Separate the four measurement columns (features) from the species label,
# then hold out 20% of the rows as a test set with a fixed seed.
feature_cols = [
    "sepal length (cm)",
    "sepal width (cm)",
    "petal length (cm)",
    "petal width (cm)",
]
X = df[feature_cols]
y = df["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=34,
)
X_train.head(3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
32,5.2,4.1,1.5,0.1
113,5.7,2.5,5.0,2.0
50,7.0,3.2,4.7,1.4


In [39]:
# Manual preprocessing: mean-impute, then standardize.
# Both transformers are fitted on the training split only and then applied
# unchanged to the test split, so no test-set statistics leak into training.

# Step 1 - imputation
imputer = SimpleImputer(strategy="mean").fit(X_train)
X_train_imp = imputer.transform(X_train)
X_test_imp = imputer.transform(X_test)

# Step 2 - scaling (fitted on the imputed training data)
scaler = StandardScaler().fit(X_train_imp)
X_train_scaled = scaler.transform(X_train_imp)
X_test_scaled = scaler.transform(X_test_imp)

In [40]:
# Baseline: logistic regression trained on the manually preprocessed features.
# LogisticRegression is imported here because no earlier cell imports it;
# without this the cell raises NameError on Restart Kernel -> Run All.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

model = LogisticRegression(random_state=34)
model.fit(X_train_scaled, y_train)

# Evaluate on the held-out test split (scaled with the training-set scaler).
y_pred1 = model.predict(X_test_scaled)
print(classification_report(y_test, y_pred1))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        13
  versicolor       1.00      1.00      1.00        12
   virginica       1.00      1.00      1.00         5

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [37]:
# Instead of preprocessing by hand, bundle imputation, scaling and the
# classifier into a single scikit-learn pipeline, so that `fit` learns the
# preprocessing statistics from whatever data it is given.
# SimpleImputer, StandardScaler and OneHotEncoder come from the notebook's
# top import cell; only the names that cell does not provide are imported here.
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline


num_cols = ["sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)"]
# X holds only the four numeric measurements, so there are no categorical
# features. The list must still be defined: the original cell referenced
# `cat_cols` without defining it, which raises NameError on a fresh kernel.
# The "cat" branch is kept as a template for datasets that do have categorical columns.
cat_cols = []

preprocessor = ColumnTransformer([
    ("num", make_pipeline(SimpleImputer(strategy="mean"), StandardScaler()), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols)
])

pipeline = make_pipeline(preprocessor, LogisticRegression(random_state=34))

# Fit on the raw (unscaled) training split; the pipeline preprocesses internally.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        13
  versicolor       1.00      1.00      1.00        12
   virginica       1.00      1.00      1.00         5

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [30]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# 5-fold stratified cross-validation of the whole pipeline on the full
# feature matrix and labels. The pipeline is passed as the estimator, so it
# is re-fitted (preprocessing included) for each fold.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=34,
)
scores = cross_val_score(
    pipeline,
    X,
    y,
    cv=cv,
    scoring="accuracy",
    n_jobs=-1,  # use all available cores
)

mean_accuracy = scores.mean()
print("CV accuracies:", scores)
print("Mean CV accuracy:", mean_accuracy)


CV accuracies: [0.93333333 0.96666667 0.96666667 0.93103448 1.        ]
Mean CV accuracy: 0.9595402298850575
