In [4]:
import sklearn
import numpy as np

## Data Standardization

In [5]:
# Sample matrix used by the standardization cells: 5 rows x 3 integer columns.
pizza_data = np.array([
    [2100, 10, 800],
    [2500, 11, 850],
    [1800, 10, 760],
    [2000, 12, 800],
    [2300, 11, 810],
])

In [6]:
from sklearn.preprocessing import scale

In [7]:
# Per-column mean: axis=0 reduces over the rows, leaving one value per column.
Pizza_mean = np.mean(pizza_data, axis=0)
print(str(Pizza_mean) + '\n')

[2140.    10.8  804. ]



In [8]:
# Per-column standard deviation (NumPy's default ddof=0, i.e. population std).
Pizza_sd = np.std(pizza_data, axis=0)
print(str(Pizza_sd) + '\n')

[241.66091947   0.74833148  28.70540019]



In [9]:
# Standardize each column independently; axis=0 is scale()'s default,
# written out here to make the column-wise direction explicit.
col_standardization = scale(pizza_data, axis=0)
print(str(col_standardization) + '\n')

[[-0.16552118 -1.06904497 -0.1393466 ]
 [ 1.4896906   0.26726124  1.60248593]
 [-1.40693001 -1.06904497 -1.53281263]
 [-0.57932412  1.60356745 -0.1393466 ]
 [ 0.66208471  0.26726124  0.2090199 ]]



In [10]:
# Column means of the standardized data, rounded to 3 decimals so that
# floating-point residue is displayed as zero.
col_means = np.round(col_standardization.mean(axis=0), decimals=3)
print(repr(col_means) + '\n')

array([ 0., -0.,  0.])



In [11]:

# Standard deviation of every column of the standardized data
col_stds = np.std(col_standardization, axis=0)
print(repr(col_stds) + '\n')

array([1., 1., 1.])



In [12]:
def standardize_data(data):
  """Return ``data`` after passing it through ``scale``.

  Thin wrapper: ``data`` is forwarded unchanged as the only argument, so
  all of ``scale``'s defaults apply, and its result is returned as-is.
  """
  return scale(data)

## Range Compression (Min Max Scaler)

In [13]:
# predefined data: 4 rows x 2 float columns
data = np.array([
    [1.2, 3.2],
    [-0.3, -1.2],
    [6.5, 10.1],
    [2.2, -8.4],
])
print(repr(data) + '\n')


array([[ 1.2,  3.2],
       [-0.3, -1.2],
       [ 6.5, 10.1],
       [ 2.2, -8.4]])



In [14]:

from sklearn.preprocessing import MinMaxScaler

# (0, 1) is MinMaxScaler's default feature_range; spelled out for clarity.
default_scaler = MinMaxScaler(feature_range=(0, 1))
# Learn each column's min/max from `data` and rescale it in one step.
transformed = default_scaler.fit_transform(data)
print(repr(transformed) + '\n')


array([[0.22058824, 0.62702703],
       [0.        , 0.38918919],
       [1.        , 1.        ],
       [0.36764706, 0.        ]])



In [15]:

# Same data, but each column is mapped onto the custom range [-2, 3].
custom_scaler = MinMaxScaler(feature_range=(-2, 3))
transformed = custom_scaler.fit_transform(data)
print(repr(transformed) + '\n')

array([[-0.89705882,  1.13513514],
       [-2.        , -0.05405405],
       [ 3.        ,  3.        ],
       [-0.16176471, -2.        ]])



In [16]:
# Second sample matrix: 3 rows x 2 float columns.
new_data = np.array([
    [1.2, -0.5],
    [5.3, 2.3],
    [-3.3, 4.1],
])

In [17]:
# Show the array's repr followed by a blank line.
print(repr(new_data) + '\n')

array([[ 1.2, -0.5],
       [ 5.3,  2.3],
       [-3.3,  4.1]])



In [18]:
# Default range [0, 1]. The scaler is fitted on new_data itself, so the
# min/max of new_data's own columns define the mapping.
default_scaler = MinMaxScaler(feature_range=(0, 1))
transformed = default_scaler.fit_transform(new_data)
print(repr(transformed) + '\n')

array([[0.52325581, 0.        ],
       [1.        , 0.60869565],
       [0.        , 1.        ]])



In [19]:
# New instance fitted on `data` (fit() returns the fitted scaler itself),
# then applied to a different array. Values of new_data that fall outside
# the min/max seen in `data` are mapped outside [0, 1].
default_scaler = MinMaxScaler().fit(data)
transformed = default_scaler.transform(new_data)
print(repr(transformed) + '\n')

array([[ 0.22058824,  0.42702703],
       [ 0.82352941,  0.57837838],
       [-0.44117647,  0.67567568]])



In [20]:
# predefined data: 9 rows x 2 float columns
data = np.array([
    [1.2, 2.3],
    [2.1, 4.2],
    [-1.9, 3.1],
    [-2.5, 2.5],
    [0.8, 3.0],
    [6.3, 2.1],
    [-1.5, 2.7],
    [1.4, 2.9],
    [1.8, 3.2],
])
print(repr(data) + '\n')


array([[ 1.2,  2.3],
       [ 2.1,  4.2],
       [-1.9,  3.1],
       [-2.5,  2.5],
       [ 0.8,  3. ],
       [ 6.3,  2.1],
       [-1.5,  2.7],
       [ 1.4,  2.9],
       [ 1.8,  3.2]])



In [21]:
from sklearn.preprocessing import RobustScaler

# With default settings RobustScaler removes each column's median and
# scales by its interquartile range (per the scikit-learn documentation).
robust_scaler = RobustScaler()
transformed = robust_scaler.fit_transform(data)
print(repr(transformed) + '\n')

array([[ 0.        , -1.        ],
       [ 0.27272727,  2.16666667],
       [-0.93939394,  0.33333333],
       [-1.12121212, -0.66666667],
       [-0.12121212,  0.16666667],
       [ 1.54545455, -1.33333333],
       [-0.81818182, -0.33333333],
       [ 0.06060606,  0.        ],
       [ 0.18181818,  0.5       ]])



## Data Imputation

In [28]:
# 5 x 4 float matrix with np.nan marking the missing entries to impute.
data = np.array([
    [1.0, 2.0, np.nan, 2.0],
    [5.0, np.nan, 1.0, 2.0],
    [4.0, np.nan, 3.0, np.nan],
    [5.0, 6.0, 8.0, 1.0],
    [np.nan, 7.0, np.nan, 0.0],
])
print(repr(data) + '\n')


array([[ 1.,  2., nan,  2.],
       [ 5., nan,  1.,  2.],
       [ 4., nan,  3., nan],
       [ 5.,  6.,  8.,  1.],
       [nan,  7., nan,  0.]])



In [32]:
# 'mean' is SimpleImputer's default strategy; it is written out explicitly
# here. Each NaN is replaced by the mean of its column.
from sklearn.impute import SimpleImputer

imp_mean = SimpleImputer(strategy='mean')
transformed = imp_mean.fit_transform(data)
print(repr(transformed) + '\n')

array([[1.  , 2.  , 4.  , 2.  ],
       [5.  , 5.  , 1.  , 2.  ],
       [4.  , 5.  , 3.  , 1.25],
       [5.  , 6.  , 8.  , 1.  ],
       [3.75, 7.  , 4.  , 0.  ]])



In [33]:
# Replace each NaN with the median of its column.
imp_median = SimpleImputer(strategy='median')
transformed = imp_median.fit_transform(data)
print(repr(transformed) + '\n')

array([[1. , 2. , 3. , 2. ],
       [5. , 6. , 1. , 2. ],
       [4. , 6. , 3. , 1.5],
       [5. , 6. , 8. , 1. ],
       [4.5, 7. , 3. , 0. ]])



In [34]:
# Replace each NaN with the most frequent value of its column; when several
# values tie, scikit-learn documents that the smallest one is used.
imp_frequent = SimpleImputer(strategy='most_frequent')
transformed = imp_frequent.fit_transform(data)
print(repr(transformed) + '\n')

array([[1., 2., 1., 2.],
       [5., 2., 1., 2.],
       [4., 2., 3., 2.],
       [5., 6., 8., 1.],
       [5., 7., 1., 0.]])



In [36]:
# Replace every NaN with one fixed, caller-chosen value (-1 here).
imp_constant = SimpleImputer(strategy='constant', fill_value=-1)
transformed = imp_constant.fit_transform(data)
print(repr(transformed) + '\n')

array([[ 1.,  2., -1.,  2.],
       [ 5., -1.,  1.,  2.],
       [ 4., -1.,  3., -1.],
       [ 5.,  6.,  8.,  1.],
       [-1.,  7., -1.,  0.]])

