## Applying MinMaxScaler on breast cancer data

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [3]:
# Create the (still unfitted) min-max scaler and load the breast cancer dataset.
scaler = MinMaxScaler()
cancer = load_breast_cancer()

In [4]:
# Show the raw, unscaled feature matrix (numpy abbreviates the middle rows and columns).
print(cancer.data)

[[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]
 [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]
 [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]
 ...
 [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]
 [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]
 [7.760e+00 2.454e+01 4.792e+01 ... 0.000e+00 2.871e-01 7.039e-02]]


In [5]:
# Split features and labels into train and test parts; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=0,
)

In [6]:
# Learn the per-feature minimum and maximum from the training split only.
scaler.fit(X=x_train)

In [7]:
# Apply the fitted scaler to the training split to obtain the scaled training data.
x_train_scaled = scaler.transform(X=x_train)

In [8]:
# Inspect the scaled training matrix. The scaler was fitted on this same data,
# so every value is expected to lie between 0 and 1.
print(x_train_scaled)

[[0.23044157 0.32157676 0.21940433 ... 0.31484671 0.30277942 0.09858323]
 [0.20062473 0.42116183 0.19452699 ... 0.06965208 0.34042973 0.06677161]
 [0.62232003 0.76929461 0.60403566 ... 0.56079917 0.19850187 0.07431457]
 ...
 [0.11619102 0.35726141 0.11077327 ... 0.17402687 0.17524147 0.17263545]
 [0.12963226 0.35311203 0.11706171 ... 0.         0.06780997 0.06919848]
 [0.21434995 0.59004149 0.21235575 ... 0.33251808 0.10782574 0.21172767]]


In [9]:
# Shape of the training split before scaling.
print(x_train.shape)

(426, 30)


In [10]:
# Shape of the scaled training split; scaling changes values, not the array dimensions.
print(x_train_scaled.shape)

(426, 30)


Due to scaling, the minimum of each feature in the training data is mapped to 0 and the maximum is mapped to 1.

In [11]:
# Per-feature (column-wise) minimum of the unscaled training split.
print(f"Minimum of each feature before scaling \n{x_train.min(axis=0)}")

Minimum of each feature before scaling 
[6.981e+00 9.710e+00 4.379e+01 1.435e+02 5.263e-02 1.938e-02 0.000e+00
 0.000e+00 1.060e-01 4.996e-02 1.115e-01 3.628e-01 7.570e-01 7.228e+00
 1.713e-03 2.252e-03 0.000e+00 0.000e+00 7.882e-03 8.948e-04 7.930e+00
 1.202e+01 5.041e+01 1.852e+02 7.117e-02 2.729e-02 0.000e+00 0.000e+00
 1.565e-01 5.504e-02]


In [12]:
# Per-feature (column-wise) maximum of the unscaled training split.
print(f"Maximum of each feature before scaling \n{x_train.max(axis=0)}")

Maximum of each feature before scaling 
[2.811e+01 3.381e+01 1.885e+02 2.501e+03 1.447e-01 3.114e-01 4.268e-01
 2.012e-01 3.040e-01 9.744e-02 2.873e+00 4.885e+00 2.198e+01 5.422e+02
 2.333e-02 1.064e-01 3.960e-01 5.279e-02 6.146e-02 2.984e-02 3.604e+01
 4.954e+01 2.512e+02 4.254e+03 2.226e-01 1.058e+00 1.252e+00 2.903e-01
 6.638e-01 2.075e-01]


In [13]:
# Per-feature minimum of the scaled training split.
print(f"Minimum of each feature after scaling \n{x_train_scaled.min(axis=0)}")

Minimum of each feature after scaling 
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]


In [14]:
# Per-feature maximum of the scaled training split.
print(f"Maximum of each feature after scaling \n{x_train_scaled.max(axis=0)}")

Maximum of each feature after scaling 
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]


In [15]:
# Scale the test split with the scaler fitted on the training split (the scaler is not refitted here).
x_test_scaled = scaler.transform(X=x_test)

In [16]:
# Per-feature (column-wise) minimum of the unscaled test split.
print(f"Minimum of each feature of test before scaling \n{x_test.min(axis=0)}")

Minimum of each feature of test before scaling 
[7.729e+00 1.072e+01 4.798e+01 1.788e+02 6.576e-02 3.398e-02 0.000e+00
 0.000e+00 1.203e-01 5.024e-02 1.144e-01 3.602e-01 7.714e-01 6.802e+00
 2.826e-03 3.746e-03 0.000e+00 0.000e+00 1.013e-02 1.217e-03 8.964e+00
 1.249e+01 5.717e+01 2.422e+02 8.409e-02 4.619e-02 0.000e+00 0.000e+00
 1.603e-01 5.865e-02]


In [17]:
# Per-feature (column-wise) maximum of the unscaled test split.
print(f"Maximum of each feature of test before scaling \n{x_test.max(axis=0)}")

Maximum of each feature of test before scaling 
[2.321e+01 3.928e+01 1.535e+02 1.670e+03 1.634e-01 3.454e-01 4.264e-01
 1.823e-01 2.906e-01 9.502e-02 1.370e+00 3.647e+00 1.107e+01 1.765e+02
 3.113e-02 1.354e-01 1.438e-01 4.090e-02 7.895e-02 2.193e-02 3.101e+01
 4.487e+01 2.068e+02 2.944e+03 1.902e-01 9.327e-01 1.170e+00 2.910e-01
 5.440e-01 1.446e-01]


In [18]:
# Per-feature minimum of the *test* split after applying the scaler fitted on the training split.
# The label names the test split explicitly so this output is not confused with the
# identically worded training-set check earlier in the notebook.
print("Minimum of each feature of test after scaling \n{}".format(x_test_scaled.min(axis=0)))

Minimum of each feature after scaling 
[ 0.03540158  0.04190871  0.02895446  0.01497349  0.14260888  0.04999658
  0.          0.          0.07222222  0.00589722  0.00105015 -0.00057494
  0.00067851 -0.0007963   0.05148726  0.01434497  0.          0.
  0.04195752  0.01113138  0.03678406  0.01252665  0.03366702  0.01400904
  0.08531995  0.01833687  0.          0.          0.00749064  0.02367834]


In [19]:
# Per-feature maximum of the *test* split after applying the scaler fitted on the training split.
# The label names the test split explicitly so this output is not confused with the
# identically worded training-set check earlier in the notebook.
print("Maximum of each feature of test after scaling \n{}".format(x_test_scaled.max(axis=0)))

Maximum of each feature after scaling 
[0.76809125 1.22697095 0.75813696 0.64750795 1.20310633 1.11643038
 0.99906279 0.90606362 0.93232323 0.94903117 0.45573058 0.72623944
 0.48593507 0.31641282 1.36082713 1.2784499  0.36313131 0.77476795
 1.32643996 0.72672498 0.82106012 0.87553305 0.77887345 0.67803775
 0.78603975 0.87843331 0.93450479 1.0024113  0.76384782 0.58743277]


Notice how some of the per-feature maxima went above 1 in the test data (and a few of the minima fell slightly below 0). This happens because the scaler is fitted only on the training data: it uses the per-feature minimum and maximum of the training set, not of the test set. Hence, the parameters required to scale are those of the training data, which can differ from the minimum and maximum of the testing data, so the scaled test values are not guaranteed to lie between 0 and 1.

In [20]:
from sklearn.svm import SVC #Importing SVC for finding accuracy

In [21]:
# Baseline: train an SVM on the raw (unscaled) features and report its test accuracy.
svm = SVC(C=100).fit(x_train, y_train)
print(f"Accuracy for test data is {svm.score(x_test, y_test):.2f} ")

Accuracy for test data is 0.94 


In [22]:
# Refit the same classifier on the min-max scaled training features,
# then report accuracy on the scaled test split.
svm.fit(x_train_scaled, y_train)
print(f"Accuracy for scaled test data is {svm.score(x_test_scaled, y_test):.2f}")


Accuracy for scaled test data is 0.97


In [24]:
# Repeat the comparison with StandardScaler instead of MinMaxScaler.
from sklearn.preprocessing import StandardScaler

# Fit on the training split only, then apply the same fitted scaler to both splits.
scaler_standard = StandardScaler().fit(x_train)
x_train_scaled_std = scaler_standard.transform(x_train)
x_test_scaled_std = scaler_standard.transform(x_test)

# Refit the existing classifier on the standardized features and report test accuracy.
svm.fit(x_train_scaled_std, y_train)
print(f"SVM test set accuracy for standard scaling is {svm.score(x_test_scaled_std, y_test):.2f}")

SVM test set accuracy for standard scaling is 0.96
