In [3]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

data = np.array([["Red"], ["Blue"], ["Green"], ["Red"]])
encoder = OneHotEncoder(sparse_output=False)  # convert to dense array
encoded = encoder.fit_transform(data)

print(encoded)

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [4]:
from sklearn.preprocessing import StandardScaler
import numpy as np

data = np.array([[10], [20], [30]])
scaler = StandardScaler()
scaled = scaler.fit_transform(data)

print(scaled)

[[-1.22474487]
 [ 0.        ]
 [ 1.22474487]]


In [5]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

data = np.array([[10], [20], [30]])
scaler = MinMaxScaler()
scaled = scaler.fit_transform(data)

print(scaled)

[[0. ]
 [0.5]
 [1. ]]


In [6]:

import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle

# Load data
dataset = load_breast_cancer()
X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
y = pd.Series(dataset.target, name="target")
X, y = shuffle(X, y, random_state=42)

X.head()


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
204,12.47,18.6,81.09,481.9,0.09965,0.1058,0.08005,0.03821,0.1925,0.06373,...,14.97,24.64,96.05,677.9,0.1426,0.2378,0.2671,0.1015,0.3014,0.0875
70,18.94,21.31,123.6,1130.0,0.09009,0.1029,0.108,0.07951,0.1582,0.05461,...,24.86,26.58,165.9,1866.0,0.1193,0.2336,0.2687,0.1789,0.2551,0.06589
131,15.46,19.48,101.7,748.9,0.1092,0.1223,0.1466,0.08087,0.1931,0.05796,...,19.26,26.0,124.9,1156.0,0.1546,0.2394,0.3791,0.1514,0.2837,0.08019
431,12.4,17.68,81.47,467.8,0.1054,0.1316,0.07741,0.02799,0.1811,0.07102,...,12.88,22.91,89.61,515.8,0.145,0.2629,0.2403,0.0737,0.2556,0.09359
540,11.54,14.44,74.65,402.9,0.09984,0.112,0.06737,0.02594,0.1818,0.06782,...,12.26,19.68,78.78,457.8,0.1345,0.2118,0.1797,0.06918,0.2329,0.08134


In [7]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

baseline = Pipeline([
    ("impute", SimpleImputer(strategy="mean")),
    ("clf", LogisticRegression(max_iter=1000))
])
baseline.fit(X_train, y_train)
y_pred = baseline.predict(X_test)
print("Baseline accuracy:", accuracy_score(y_test, y_pred))


Baseline accuracy: 0.9473684210526315


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
from sklearn.impute import SimpleImputer

median_imputer_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("clf", LogisticRegression(max_iter=1000))
])

median_imputer_pipeline.fit(X_train, y_train)

y_pred_median = median_imputer_pipeline.predict(X_test)

print("Accuracy with median imputation:", accuracy_score(y_test, y_pred_median))

Accuracy with median imputation: 0.9473684210526315


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:

rng = np.random.default_rng(0)
X_miss = X.copy()
col = "mean radius"
n = len(X_miss)
idx = rng.choice(n, size=int(0.05*n), replace=False)
X_miss.loc[X_miss.index[idx], col] = np.nan

pipe_mean = Pipeline([
    ("imp", SimpleImputer(strategy="mean")),
    ("clf", LogisticRegression(max_iter=1000))
])
Xtr, Xte, ytr, yte = train_test_split(X_miss, y, test_size=0.2, random_state=42, stratify=y)
pipe_mean.fit(Xtr, ytr)
print("Mean imputation acc:", accuracy_score(yte, pipe_mean.predict(Xte)))


Mean imputation acc: 0.9385964912280702


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Mean imputation acc: 0.9385964912280702


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
rng = np.random.default_rng(0)
X_miss = X.copy()
col = "mean radius"
n = len(X_miss)
idx = rng.choice(n, size=int(0.05*n), replace=False)
X_miss.loc[X_miss.index[idx], col] = np.nan

pipe_median = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("clf", LogisticRegression(max_iter=1000))
])
Xtr, Xte, ytr, yte = train_test_split(X_miss, y, test_size=0.2, random_state=42, stratify=y)
pipe_median.fit(Xtr, ytr)
print("Median imputation acc:", accuracy_score(yte, pipe_median.predict(Xte)))

Median imputation acc: 0.9473684210526315


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:

X_enc = X.copy()
X_enc['radius_cat'] = pd.qcut(X_enc['mean radius'], q=3, labels=['small','medium','large'])
cat_cols = ['radius_cat']
num_cols = [c for c in X_enc.columns if c not in cat_cols]

pre_no_scale = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ("num", SimpleImputer(strategy="mean"), num_cols)
])
pipe = Pipeline([("pre", pre_no_scale), ("clf", LogisticRegression(max_iter=2000))])
Xtr, Xte, ytr, yte = train_test_split(X_enc, y, test_size=0.2, random_state=42, stratify=y)
pipe.fit(Xtr, ytr)
print("OHE no scaling acc:", accuracy_score(yte, pipe.predict(Xte)))
