In [1]:
#importing libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston, load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, Normalizer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA

**loading boston dataset**

In [2]:
boston_dataset = load_boston()
X = boston_dataset.data
y = boston_dataset.target

In [3]:
X.shape

(506, 13)

In [4]:
y.shape

(506,)

In [5]:
y = np.reshape(y, (-1, 1))
y.shape

(506, 1)

**splitting boston dataset into training and testing dataset**

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/4, random_state=0)
print(f'lenth of training data = {len(X_train)}\nlength of testing data = {len(X_test)}')


lenth of training data = 379
length of testing data = 127


**use of lable encoder**

In [7]:
#we will create one numpy arry and perform encoding on it
data1 = np.random.choice(('Male', 'Female'), size=10)
le = LabelEncoder()
data1_encoded = le.fit_transform(data1)
print(f'Initial array = {data1}')
print(f'Encoded array = {data1_encoded}')

Initial array = ['Female' 'Female' 'Male' 'Male' 'Female' 'Male' 'Female' 'Male' 'Female'
 'Female']
Encoded array = [0 0 1 1 0 1 0 1 0 0]


**handling NaN's using SimpleImputer**

In [8]:
#we will create 2d numpy array with NaN values in it and then handle
#them with SimpleImputer using three strategies
data2 = np.array(
    [
        [1,np.nan,2],
        [2,3,np.nan],
        [-1,4,2]
    ]
)
mean_imp = SimpleImputer(strategy='mean')
median_imp = SimpleImputer(strategy='median')
most_freq_imp = SimpleImputer(strategy='most_frequent')
print(
    f'Original data :\n {data2}\n\n'
    f'Strategy = Mean :\n {mean_imp.fit_transform(data2)}\n\n',
    f'Strategy = Median :\n {median_imp.fit_transform(data2)}\n\n',
    f'Strategy = Most Frequent :\n {most_freq_imp.fit_transform(data2)}'
)

Original data :
 [[ 1. nan  2.]
 [ 2.  3. nan]
 [-1.  4.  2.]]

Strategy = Mean :
 [[ 1.   3.5  2. ]
 [ 2.   3.   2. ]
 [-1.   4.   2. ]]

 Strategy = Median :
 [[ 1.   3.5  2. ]
 [ 2.   3.   2. ]
 [-1.   4.   2. ]]

 Strategy = Most Frequent :
 [[ 1.  3.  2.]
 [ 2.  3.  2.]
 [-1.  4.  2.]]


**using Normalizer to normalize values**

In [9]:
#we will create 2d array and perform 3 types of normaisations on it
data3 =[
        [ 1., -1.,  2.],
        [ 2.,  0.,  0.],
        [ 0.,  1., -1.]
]
norm_max = Normalizer(norm='max')
norm_l1 = Normalizer(norm='l1')
norm_l2 = Normalizer(norm='l2')
print(
    f'originl data :\n {data3}\n\n',
    f'max normalization :\n {norm_max.fit_transform(data3)}\n\n',
    f'l1 normalization :\n {norm_l1.fit_transform(data3)}\n\n',
    f'l2 normalization :\n {norm_l2.fit_transform(data3)}'
)

originl data :
 [[1.0, -1.0, 2.0], [2.0, 0.0, 0.0], [0.0, 1.0, -1.0]]

 max normalization :
 [[ 0.5 -0.5  1. ]
 [ 1.   0.   0. ]
 [ 0.   1.  -1. ]]

 l1 normalization :
 [[ 0.25 -0.25  0.5 ]
 [ 1.    0.    0.  ]
 [ 0.    0.5  -0.5 ]]

 l2 normalization :
 [[ 0.40824829 -0.40824829  0.81649658]
 [ 1.          0.          0.        ]
 [ 0.          0.70710678 -0.70710678]]


**Using variance thereshold to neglet fetures with less variance**

In [10]:
#we will take boston data that was imported in 2nd cell
#and we will remove the features that has very less variance
print(
    f'Data (1st five rows for representation) :\n{X[:3, :]}\n\n',
    f'Number of features : {len(X[0])}\n'
)

Data (1st five rows for representation) :
[[6.3200e-03 1.8000e+01 2.3100e+00 0.0000e+00 5.3800e-01 6.5750e+00
  6.5200e+01 4.0900e+00 1.0000e+00 2.9600e+02 1.5300e+01 3.9690e+02
  4.9800e+00]
 [2.7310e-02 0.0000e+00 7.0700e+00 0.0000e+00 4.6900e-01 6.4210e+00
  7.8900e+01 4.9671e+00 2.0000e+00 2.4200e+02 1.7800e+01 3.9690e+02
  9.1400e+00]
 [2.7290e-02 0.0000e+00 7.0700e+00 0.0000e+00 4.6900e-01 7.1850e+00
  6.1100e+01 4.9671e+00 2.0000e+00 2.4200e+02 1.7800e+01 3.9283e+02
  4.0300e+00]]

 Number of features : 13



In [11]:
varience_threshold = VarianceThreshold(threshold=1.5)
X_after_applying_threshold = varience_threshold.fit_transform(X)
print(
    f'Data after removing features having variance 1.5 :\n{X_after_applying_threshold[:5, :]}\n\n',
    f'New number of features : {len(X_after_applying_threshold[0])}\n'
)

Data after removing features having variance 1.5 :
[[6.3200e-03 1.8000e+01 2.3100e+00 6.5200e+01 4.0900e+00 1.0000e+00
  2.9600e+02 1.5300e+01 3.9690e+02 4.9800e+00]
 [2.7310e-02 0.0000e+00 7.0700e+00 7.8900e+01 4.9671e+00 2.0000e+00
  2.4200e+02 1.7800e+01 3.9690e+02 9.1400e+00]
 [2.7290e-02 0.0000e+00 7.0700e+00 6.1100e+01 4.9671e+00 2.0000e+00
  2.4200e+02 1.7800e+01 3.9283e+02 4.0300e+00]
 [3.2370e-02 0.0000e+00 2.1800e+00 4.5800e+01 6.0622e+00 3.0000e+00
  2.2200e+02 1.8700e+01 3.9463e+02 2.9400e+00]
 [6.9050e-02 0.0000e+00 2.1800e+00 5.4200e+01 6.0622e+00 3.0000e+00
  2.2200e+02 1.8700e+01 3.9690e+02 5.3300e+00]]

 New number of features : 10

