In [1]:

import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle

# Load the breast-cancer dataset and wrap it in pandas objects:
# one feature frame (columns named after the dataset's feature names)
# and one label series called "target".
dataset = load_breast_cancer()

# Shuffle features and labels together with a fixed seed so the row order
# is randomized reproducibly; the original row labels are kept in the index.
X, y = shuffle(
    pd.DataFrame(dataset.data, columns=dataset.feature_names),
    pd.Series(dataset.target, name="target"),
    random_state=42,
)

# Preview the first few shuffled rows.
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
204,12.47,18.6,81.09,481.9,0.09965,0.1058,0.08005,0.03821,0.1925,0.06373,...,14.97,24.64,96.05,677.9,0.1426,0.2378,0.2671,0.1015,0.3014,0.0875
70,18.94,21.31,123.6,1130.0,0.09009,0.1029,0.108,0.07951,0.1582,0.05461,...,24.86,26.58,165.9,1866.0,0.1193,0.2336,0.2687,0.1789,0.2551,0.06589
131,15.46,19.48,101.7,748.9,0.1092,0.1223,0.1466,0.08087,0.1931,0.05796,...,19.26,26.0,124.9,1156.0,0.1546,0.2394,0.3791,0.1514,0.2837,0.08019
431,12.4,17.68,81.47,467.8,0.1054,0.1316,0.07741,0.02799,0.1811,0.07102,...,12.88,22.91,89.61,515.8,0.145,0.2629,0.2403,0.0737,0.2556,0.09359
540,11.54,14.44,74.65,402.9,0.09984,0.112,0.06737,0.02594,0.1818,0.06782,...,12.26,19.68,78.78,457.8,0.1345,0.2118,0.1797,0.06918,0.2329,0.08134


In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import load_iris

# --- Data: two-class iris problem -------------------------------------------
# Keep only the samples whose label is 0 or 1 (label 2 is discarded).
iris = load_iris()
keep = iris.target != 2
X = iris.data[keep]
y = iris.target[keep]

# --- Inject missing values --------------------------------------------------
# Every entry is independently replaced by NaN when its uniform draw is < 0.1.
rng = np.random.RandomState(42)
missing_mask = rng.rand(*X.shape) < 0.1
X_missing = np.where(missing_mask, np.nan, X)

X_train, X_test, y_train, y_test = train_test_split(X_missing, y, test_size=0.2, random_state=42)


def _scaler_and_classifier():
    """Return fresh (name, estimator) steps shared by both pipelines.

    A new StandardScaler and LogisticRegression are created on every call so
    the two pipelines never share fitted state.
    """
    return [
        ('scaler', StandardScaler()),
        ('log_reg', LogisticRegression(max_iter=1000, solver='lbfgs', class_weight='balanced')),
    ]


# --- Variant 1: median imputation, all rows kept ----------------------------
model_with_imputer = Pipeline(
    [('imputer', SimpleImputer(strategy='median'))] + _scaler_and_classifier()
)

model_with_imputer.fit(X_train, y_train)
y_pred = model_with_imputer.predict(X_test)
accuracy_with_imputer = accuracy_score(y_test, y_pred)

print("Accuracy WITH SimpleImputer (Median):", accuracy_with_imputer)
print("\nClassification Report WITH Imputer:\n", classification_report(y_test, y_pred))

# --- Variant 2: no imputer, rows containing any NaN are dropped -------------
model_without_imputer = Pipeline(_scaler_and_classifier())

# Compute each complete-row mask once and reuse it for features and labels.
train_complete = ~np.isnan(X_train).any(axis=1)
test_complete = ~np.isnan(X_test).any(axis=1)

X_train_no_nan = X_train[train_complete]
y_train_no_nan = y_train[train_complete]
X_test_no_nan = X_test[test_complete]
y_test_no_nan = y_test[test_complete]

model_without_imputer.fit(X_train_no_nan, y_train_no_nan)
y_pred_no_imputer = model_without_imputer.predict(X_test_no_nan)
# NOTE: this score is measured only on test rows without NaN, i.e. on a
# subset of the rows used for the imputer pipeline above, so the two
# accuracies are not computed on the same evaluation set.
accuracy_without_imputer = accuracy_score(y_test_no_nan, y_pred_no_imputer)

print("\nAccuracy WITHOUT Imputer (rows dropped):", accuracy_without_imputer)
print("\nClassification Report WITHOUT Imputer:\n", classification_report(y_test_no_nan, y_pred_no_imputer))

# Number of solver iterations used by the classifier in the imputer pipeline.
print("\nIterations with imputer:", model_with_imputer.named_steps['log_reg'].n_iter_)



Accuracy WITH SimpleImputer (Median): 1.0

Classification Report WITH Imputer:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         8

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20


Accuracy WITHOUT Imputer (rows dropped): 1.0

Classification Report WITHOUT Imputer:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         5

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15


Iterations with imputer: [8]


In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer

# --- Data -------------------------------------------------------------------
# Load the breast-cancer data as a DataFrame and separate features from target.
data = load_breast_cancer(as_frame=True)
X = data.frame.drop(columns="target")
y = data.frame["target"]

# --- Inject missing values into a single column -----------------------------
# Pick int(5% of the rows) distinct positions at random (seeded) and set the
# "mean radius" value of those rows to NaN on a copy, leaving X untouched.
rng = np.random.default_rng(0)
X_miss = X.copy()
col = "mean radius"
n = len(X_miss)
idx = rng.choice(n, size=int(0.05 * n), replace=False)
X_miss.loc[X_miss.index[idx], col] = np.nan

# Stratified 80/20 split so both parts keep the class proportions of y.
Xtr, Xte, ytr, yte = train_test_split(X_miss, y, test_size=0.2, random_state=42, stratify=y)


def fit_imputation_pipeline(strategy, X_train, y_train, X_test, y_test):
    """Fit an impute -> scale -> logistic-regression pipeline and score it.

    Parameters
    ----------
    strategy : str
        Value passed to ``SimpleImputer(strategy=...)`` (e.g. "mean", "median").
    X_train, y_train
        Data the pipeline is fitted on.
    X_test, y_test
        Data the fitted pipeline is evaluated on.

    Returns
    -------
    tuple
        ``(fitted_pipeline, accuracy)`` where accuracy is
        ``accuracy_score(y_test, pipeline.predict(X_test))``.
    """
    pipe = Pipeline([
        ("imp", SimpleImputer(strategy=strategy)),
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=5000)),
    ])
    pipe.fit(X_train, y_train)
    return pipe, accuracy_score(y_test, pipe.predict(X_test))


# --- Compare imputation strategies ------------------------------------------
# The two runs differ only in the imputer strategy.
pipe_mean, acc_mean = fit_imputation_pipeline("mean", Xtr, ytr, Xte, yte)
print("Mean imputation acc:", acc_mean)

pipe_median, acc_median = fit_imputation_pipeline("median", Xtr, ytr, Xte, yte)
print("Median imputation acc:", acc_median)


Mean imputation acc: 0.9824561403508771
Median imputation acc: 0.9824561403508771
