# Standardizing Datasets

Author: Pierre Nugues

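A few experiments with standardization and normalization techniques: a logistic regression classifier is trained on the scikit-learn breast cancer dataset with raw, standardized, and normalized features, and with a pipeline that combines both transformations. All classification reports are computed on the same data that was used for fitting.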

In [1]:

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import Pipeline

In [2]:
# Load the breast cancer dataset shipped with scikit-learn.
cancer = datasets.load_breast_cancer()

In [3]:
# Show the loaded object: the features are under 'data', the labels under 'target'.
cancer

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [4]:
# Logistic regression classifier, constructed with default parameters.
clf = LogisticRegression()

In [5]:
# Baseline: train on the raw, unscaled features.
clf.fit(X=cancer['data'], y=cancer['target'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [6]:
# Predict on the same samples the model was fitted on (no held-out test set here).
y_pred = clf.predict(cancer['data'])

In [7]:
# Per-class precision/recall/F1 for the raw-feature baseline (training-set scores).
print(classification_report(y_true=cancer['target'], y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.94      0.91      0.93       212
           1       0.95      0.97      0.96       357

    accuracy                           0.95       569
   macro avg       0.94      0.94      0.94       569
weighted avg       0.95      0.95      0.95       569



In [8]:
# Scaler for the standardization experiment, constructed with default parameters.
sscaler = StandardScaler()

In [9]:
# Fit the scaler on the full feature matrix and transform that same matrix in one step.
X_std = sscaler.fit_transform(cancer['data'])

In [10]:
# Fit the same `clf` object again, this time on the standardized features.
clf.fit(X=X_std, y=cancer['target'])

LogisticRegression()

In [11]:
# Predictions on the standardized features that `clf` was just fitted on.
y_pred_std = clf.predict(X_std)

In [12]:
# Report for the model trained on standardized features (training-set scores).
print(classification_report(y_true=cancer['target'], y_pred=y_pred_std))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       212
           1       0.99      0.99      0.99       357

    accuracy                           0.99       569
   macro avg       0.99      0.99      0.99       569
weighted avg       0.99      0.99      0.99       569



In [13]:
# Normalizer with default parameters, for comparison with StandardScaler.
normalizer = Normalizer()

In [14]:
# Applied to the raw features, not to the standardized X_std.
X_norm = normalizer.fit_transform(cancer['data'])

In [15]:
# Fit `clf` once more, now on the normalized features.
clf.fit(X=X_norm, y=cancer['target'])

LogisticRegression()

In [16]:
# Predictions on the normalized features that `clf` was just fitted on.
y_pred_norm = clf.predict(X_norm)

In [17]:
# Report for the model trained on normalized features (training-set scores).
print(classification_report(y_true=cancer['target'], y_pred=y_pred_norm))

              precision    recall  f1-score   support

           0       0.99      0.49      0.65       212
           1       0.77      1.00      0.87       357

    accuracy                           0.81       569
   macro avg       0.88      0.74      0.76       569
weighted avg       0.85      0.81      0.79       569



In [18]:
# Chain standardization, normalization, and logistic regression into one estimator.
pipe = Pipeline(
    steps=[
        ('sscaler', StandardScaler()),
        ('normalizer', Normalizer()),
        ('lr', LogisticRegression()),
    ]
)

In [19]:
# Fit the whole pipeline on the raw features; the value returned by fit is kept in clf_ns.
clf_ns = pipe.fit(X=cancer['data'], y=cancer['target'])

In [20]:
# Predict with the fitted pipeline on the same raw samples it was fitted on.
y_pred_ns = pipe.predict(cancer['data'])

In [21]:
# Report for the standardize-then-normalize pipeline (training-set scores).
print(classification_report(y_true=cancer['target'], y_pred=y_pred_ns))

              precision    recall  f1-score   support

           0       0.99      0.96      0.98       212
           1       0.98      0.99      0.99       357

    accuracy                           0.98       569
   macro avg       0.98      0.98      0.98       569
weighted avg       0.98      0.98      0.98       569

