### 1. Rescaling Numerical data

Rescale numerical data data to be between teo values

In [1]:
import numpy as np
from sklearn import preprocessing

feature = np.array([[-500.5], [-100.1], [0], [100.1], [900.9]])

feature

array([[-500.5],
       [-100.1],
       [   0. ],
       [ 100.1],
       [ 900.9]])

In [4]:
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

scaled_feature = minmax_scale.fit_transform(feature)

scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

### 2. Standardize a feature

Transform a feature to have mean 0 and standard deviation of 1

In [6]:
import numpy as np
from sklearn import preprocessing

feature = np.array([[-900.5], [-250.5], [150.9], [711.9], [9999.1]])

feature

array([[-900.5],
       [-250.5],
       [ 150.9],
       [ 711.9],
       [9999.1]])

In [8]:
standard_scaler = preprocessing.StandardScaler()

scaled_feature = standard_scaler.fit_transform(feature)

scaled_feature

array([[-0.69971382],
       [-0.53971903],
       [-0.4409161 ],
       [-0.30282829],
       [ 1.98317724]])

### 3. Normalizing observations

Rescale the feature values of observations to have unit norm (total length of 1)

In [9]:
import numpy as np
from sklearn import preprocessing

feature = np.array([[0.4, 0.4], [1.5, 3.5], [1.2, 15.5], [1.89, 38.9], [20.2, 5.2]])

feature

array([[ 0.4 ,  0.4 ],
       [ 1.5 ,  3.5 ],
       [ 1.2 , 15.5 ],
       [ 1.89, 38.9 ],
       [20.2 ,  5.2 ]])

In [11]:
# L2 normalizing
normalizer = preprocessing.Normalizer(norm='l2')

normalized_feature = normalizer.transform(feature)

normalized_feature

array([[0.70710678, 0.70710678],
       [0.3939193 , 0.91914503],
       [0.07718838, 0.99701653],
       [0.04852887, 0.99882178],
       [0.96842682, 0.24929799]])

In [12]:
# L1 normalizing
normalizer = preprocessing.Normalizer(norm='l1')

normalized_feature = normalizer.transform(feature)

normalized_feature

array([[0.5       , 0.5       ],
       [0.3       , 0.7       ],
       [0.07185629, 0.92814371],
       [0.04633489, 0.95366511],
       [0.79527559, 0.20472441]])

### 4. Generate Polynomial and Intercation features

Polynomial features are often created when we want to include the notion that there exists a nonlinear relationship between the features and the target

In [15]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

features = np.array([[2, 3], [2, 3]])

polynomial_interaction = PolynomialFeatures(degree=2, include_bias=False)

transformed_features = polynomial_interaction.fit_transform(features)

transformed_features

array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

In [16]:
# Setting interaction to True

polynomial_interaction = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

transformed_features = polynomial_interaction.fit_transform(features)

transformed_features

array([[2., 3., 6.],
       [2., 3., 6.]])

### 5. Custom transforming features

Custom transformation to one or more features. We might want to create a feature that is the natural log of values of different feature. We can do this by creating a function and then mapping it to features using sklearn FunctionTransformer or pandas apply function

In [17]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer

features = np.array([[2, 3], [2, 3], [2, 3]])

features

array([[2, 3],
       [2, 3],
       [2, 3]])

In [18]:
def add_ten(x):
    return x+10

ten_transformer = FunctionTransformer(add_ten)

transformed_features = ten_transformer.transform(features)

transformed_features

array([[12, 13],
       [12, 13],
       [12, 13]])

In [27]:
import pandas as pd

# Alternate way using pandas
df = pd.DataFrame(features, columns=['feature1', 'feature2'])

df.apply(add_ten)

print(df)
print('\n')

# convert back to numpy array
print('Converted back to numpy array :')
df.to_numpy()

   feature1  feature2
0         2         3
1         2         3
2         2         3


Converted back to numpy array :


array([[2, 3],
       [2, 3],
       [2, 3]])

### 6. Handling Outliers