# Preparing the Data

In [1]:
import pandas
import scipy
import numpy
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer

In [2]:
# Load the Pima Indians diabetes dataset used by every example below.
# The column labels are supplied here, so the file is read with no header row
# (header=None is what pandas already infers when `names` is passed).
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
url = "pima-indians-diabetes.csv"
dataframe = pandas.read_csv(url, header=None, names=names)
# Work on the underlying NumPy array from here on.
array = dataframe.values

In [3]:
# Split the table column-wise: the first eight columns are the input
# features (X); the column at index 8 is the label (y).
X = array[:, :8]
y = array[:, 8]

## Rescale
Transform the features so that they have a value between 0 and 1.

In [4]:
# Show three decimal places when arrays are printed.
numpy.set_printoptions(precision=3)

# Fit a min-max scaler on the features, then map them into the [0, 1] range.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(X)
rescaledX = scaler.transform(X)

# Preview the first 5 transformed rows.
print(rescaledX[:5])

[[0.353 0.744 0.59  0.354 0.    0.501 0.234 0.483]
 [0.059 0.427 0.541 0.293 0.    0.396 0.117 0.167]
 [0.471 0.92  0.525 0.    0.    0.347 0.254 0.183]
 [0.059 0.447 0.541 0.232 0.111 0.419 0.038 0.   ]
 [0.    0.688 0.328 0.354 0.199 0.642 0.944 0.2  ]]


## Standardize
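Transform each feature so that it has a mean of 0 and a standard deviation of 1, i.e. the scale of a standard Gaussian distribution. Standardizing only shifts and rescales the values; it does not change the shape of their distribution. It is therefore most suitable for input variables that are already approximately Gaussian, and for algorithms that assume Gaussian-distributed inputs.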

In [5]:
# Show three decimal places when arrays are printed.
numpy.set_printoptions(precision=3)

# Standardize the features with StandardScaler: fit on X and transform X
# in a single step.
scaler = StandardScaler()
rescaledX = scaler.fit_transform(X)

# Preview the first 5 transformed rows.
print(rescaledX[:5])

[[ 0.64   0.848  0.15   0.907 -0.693  0.204  0.468  1.426]
 [-0.845 -1.123 -0.161  0.531 -0.693 -0.684 -0.365 -0.191]
 [ 1.234  1.944 -0.264 -1.288 -0.693 -1.103  0.604 -0.106]
 [-0.845 -0.998 -0.161  0.155  0.123 -0.494 -0.921 -1.042]
 [-1.142  0.504 -1.505  0.907  0.766  1.41   5.485 -0.02 ]]


## Normalize
Transform each observation (i.e. row) to have a length of 1, i.e. so that the square root of the sum of the squared values in the row equals 1: $\sqrt{\sum_i x_i^2} = 1$. This is also called scaling to unit norm. Useful for sparse datasets (lots of zeros) with attributes of varying scales when using algorithms that apply weights to input variables (e.g. neural networks) or use distance measures (e.g. K-nearest neighbours).

In [6]:
# Show five decimal places when arrays are printed.
numpy.set_printoptions(precision=5)

# Apply Normalizer to the features: fit on X and transform X in one step.
scaler = Normalizer()
normalizedX = scaler.fit_transform(X)

# Preview the first 5 transformed rows.
print(normalizedX[:5])

[[0.03355 0.82763 0.40263 0.19572 0.      0.18789 0.00351 0.2796 ]
 [0.00842 0.71604 0.55598 0.2443  0.      0.22408 0.00296 0.26114]
 [0.0404  0.9241  0.32318 0.      0.      0.11766 0.00339 0.16159]
 [0.00661 0.58847 0.43639 0.15208 0.62153 0.1858  0.0011  0.13885]
 [0.      0.59639 0.17413 0.15236 0.73134 0.18762 0.00996 0.14366]]


## Binarize
Transform the data to ones or zeros. Useful for when you have probabilities that you want to make crisp. In this example, all values <= 0 are marked as zero and all those > 0 are marked as one.

In [7]:
# Show three decimal places when arrays are printed.
numpy.set_printoptions(precision=3)

# Apply Binarizer with a threshold of 0.0 to the features: fit on X and
# transform X in one step.
binarizer = Binarizer(threshold=0.0)
binaryX = binarizer.fit_transform(X)

# Preview the first 5 transformed rows.
print(binaryX[:5])

[[1. 1. 1. 1. 0. 1. 1. 1.]
 [1. 1. 1. 1. 0. 1. 1. 1.]
 [1. 1. 1. 0. 0. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 1. 1. 1. 1. 1. 1. 1.]]
