## Machine Learning Operations

### 1. Rescaling Numerical data

Rescale numerical data data to be between teo values

In [1]:
import numpy as np
from sklearn import preprocessing

feature = np.array([[-500.5], [-100.1], [0], [100.1], [900.9]])

feature

array([[-500.5],
       [-100.1],
       [   0. ],
       [ 100.1],
       [ 900.9]])

In [4]:
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

scaled_feature = minmax_scale.fit_transform(feature)

scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

### 2. Standardize a feature(Numerical data)

Transform a feature to have mean 0 and standard deviation of 1

In [6]:
import numpy as np
from sklearn import preprocessing

feature = np.array([[-900.5], [-250.5], [150.9], [711.9], [9999.1]])

feature

array([[-900.5],
       [-250.5],
       [ 150.9],
       [ 711.9],
       [9999.1]])

In [8]:
standard_scaler = preprocessing.StandardScaler()

scaled_feature = standard_scaler.fit_transform(feature)

scaled_feature

array([[-0.69971382],
       [-0.53971903],
       [-0.4409161 ],
       [-0.30282829],
       [ 1.98317724]])

### 3. Normalizing observations(Numerical data)

Rescale the feature values of observations to have unit norm (total length of 1)

In [9]:
import numpy as np
from sklearn import preprocessing

feature = np.array([[0.4, 0.4], [1.5, 3.5], [1.2, 15.5], [1.89, 38.9], [20.2, 5.2]])

feature

array([[ 0.4 ,  0.4 ],
       [ 1.5 ,  3.5 ],
       [ 1.2 , 15.5 ],
       [ 1.89, 38.9 ],
       [20.2 ,  5.2 ]])

In [11]:
# L2 normalizing
normalizer = preprocessing.Normalizer(norm='l2')

normalized_feature = normalizer.transform(feature)

normalized_feature

array([[0.70710678, 0.70710678],
       [0.3939193 , 0.91914503],
       [0.07718838, 0.99701653],
       [0.04852887, 0.99882178],
       [0.96842682, 0.24929799]])

In [12]:
# L1 normalizing
normalizer = preprocessing.Normalizer(norm='l1')

normalized_feature = normalizer.transform(feature)

normalized_feature

array([[0.5       , 0.5       ],
       [0.3       , 0.7       ],
       [0.07185629, 0.92814371],
       [0.04633489, 0.95366511],
       [0.79527559, 0.20472441]])

### 4. Generate Polynomial and Intercation features(Numerical data)

Polynomial features are often created when we want to include the notion that there exists a nonlinear relationship between the features and the target

In [15]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

features = np.array([[2, 3], [2, 3]])

polynomial_interaction = PolynomialFeatures(degree=2, include_bias=False)

transformed_features = polynomial_interaction.fit_transform(features)

transformed_features

array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

In [16]:
# Setting interaction to True

polynomial_interaction = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

transformed_features = polynomial_interaction.fit_transform(features)

transformed_features

array([[2., 3., 6.],
       [2., 3., 6.]])

### 5. Custom transforming features(Numerical data)

Custom transformation to one or more features. We might want to create a feature that is the natural log of values of different feature. We can do this by creating a function and then mapping it to features using sklearn FunctionTransformer or pandas apply function

In [17]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer

features = np.array([[2, 3], [2, 3], [2, 3]])

features

array([[2, 3],
       [2, 3],
       [2, 3]])

In [18]:
def add_ten(x):
    return x+10

ten_transformer = FunctionTransformer(add_ten)

transformed_features = ten_transformer.transform(features)

transformed_features

array([[12, 13],
       [12, 13],
       [12, 13]])

In [27]:
import pandas as pd

# Alternate way using pandas
df = pd.DataFrame(features, columns=['feature1', 'feature2'])

df.apply(add_ten)

print(df)
print('\n')

# convert back to numpy array
print('Converted back to numpy array :')
df.to_numpy()

   feature1  feature2
0         2         3
1         2         3
2         2         3


Converted back to numpy array :


array([[2, 3],
       [2, 3],
       [2, 3]])

### 6. Handling Outliers(Numerical data)

There are 3 ways we can handle outliers.
1. We can drop them
2. We can include them as feature
3. We can transform them

In [7]:
import pandas as pd

# create a dataframe
houses = pd.DataFrame()
houses['Price'] = [534433, 392333, 293222, 4322032]
houses['Rooms'] = [2, 3.5, 2, 116]
houses['Square_Feet'] = [1500, 2500, 1500, 40000]

houses

Unnamed: 0,Price,Rooms,Square_Feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500
3,4322032,116.0,40000


In [3]:
# First way, dropping the observation
houses = houses[houses['Rooms'] < 20]
houses

Unnamed: 0,Price,Rooms,Square_Feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [6]:
# note : execute first step before proceeding
import numpy as np

# Second way, incluse them as feature
houses['Outlier'] = np.where(houses['Rooms'] < 20, 0, 1)
houses

Unnamed: 0,Price,Rooms,Square_Feet,Outlier
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,40000,1


In [8]:
# note : execute first step before proceeding
import numpy as np

# Third way, transforming the features
houses['Log_of_square_feet'] = [np.log(x) for x in houses['Square_Feet']]
houses

Unnamed: 0,Price,Rooms,Square_Feet,Log_of_square_feet
0,534433,2.0,1500,7.31322
1,392333,3.5,2500,7.824046
2,293222,2.0,1500,7.31322
3,4322032,116.0,40000,10.596635


### 7. Discretizing Features(NUmerical data)

We have numerical values and want to break them into discrete bins.<br>
Depending on how we want to break up data, there are two techniques.<br>
-> Binarize feature according to some threshold<br>
-> Break up numerical values according to multiple thresholds

In [9]:
# Binarize feature according to some threshold
import numpy as np
from sklearn.preprocessing import Binarizer

# feature
age = np.array([[6], [12], [20], [36], [65]])

binarizer = Binarizer(18)

transformed_age = binarizer.fit_transform(age)
transformed_age

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [10]:
# Break up numerical values according to multiple thresholds

second_transformed_age = np.digitize(age, bins=[20, 30, 40]) # right=True, includes bins as well
second_transformed_age

# Note : we can also use digitize to binarize, having only one bin

array([[0],
       [0],
       [1],
       [2],
       [3]], dtype=int64)

### 8. Grouping observations using clustering(Numerical data)

We want to cluster observations so that similar observations are grouped together. If we know, we have k groups, we can use k-means clustering to group similar observations and output a new feature containing each observation's group membership

In [18]:
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

features, _ = make_blobs(n_samples=50, n_features=2, centers=3, random_state=1)

df = pd.DataFrame(features, columns=['feature_1', 'feature_2'])

clusterer = KMeans(3, random_state=0)

clusterer.fit(features)

df['group'] = clusterer.predict(features)

df.head(5)

Unnamed: 0,feature_1,feature_2,group
0,-9.877554,-3.336145,2
1,-7.28721,-8.353986,0
2,-6.943061,-7.023744,0
3,-7.440167,-8.791959,0
4,-6.641388,-8.075888,0


### 9. Deleting Observations with Missing Values(Numerical data)

In [22]:
import numpy as np

features = np.array([[1.1, 11.1], [2.2, 22.2], [3.3, 33.3], [4.4, 44.4], [np.nan, 55]])

# Keep only observations that are not (denoted by -) missing
new_features = features[~np.isnan(features).any(axis=1)]

new_features

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

In [24]:
# Alternate way of dropping of mossing observations using Pandas
import pandas as pd

df = pd.DataFrame(features, columns=['feature_1', 'feature_2'])

df = df.dropna()

df

Unnamed: 0,feature_1,feature_2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4
