In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively list every file under the read-only Kaggle input directory,
# printing each one as a full path.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for fname in filenames:
        full_path = os.path.join(dirname, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/iris/Iris.csv
/kaggle/input/iris/database.sqlite


# Scaling in Machine Learning
* Feature scaling is about transforming the values of different numerical features to fall within a similar range like each other. The feature scaling is used to prevent the supervised learning models from getting biased toward a specific range of values. For example, if your model is based on linear regression and you do not scale features, then some features may have a higher impact than others which will affect the performance of predictions by giving undue advantage for some variables over others.

* Data Scaling is a common preprocessing step in Machine Learning that involves
transforming the input variables to a similar scale or distribution. This can
improve the performance and stability of some Machine Learning algorithms.
* Scaling is especially important for distance-based algorithms, such as k-nearest neighbors (KNN) or support vector machines (SVM), where the distance between data points affects the model's performance. Scaling also aids gradient descent-based optimization algorithms, like in neural networks, by allowing faster convergence and preventing one feature from dominating the learning process.

* It's a good practice to perform scaling during the preprocessing phase before training a machine learning model to ensure the features are on a similar scale and contribute fairly to the model's learning process. This notebook covers eight scaling techniques.


# 1. Standard Scaler
* Standardization scales each input variable separately by subtracting the mean (called centering) and dividing by the standard deviation to shift the distribution to have a mean of zero and a standard deviation of one.
* Why the scale of the data matters:
Machine learning models learn a mapping from input variables to an output variable.
As such, the scale and distribution of the data drawn from the domain may be different for each variable.
Input variables may have different units (e.g. feet, kilometers, and hours) that, in turn, may mean the variables have different scales.

* Differences in the scales across input variables may increase the difficulty of the problem being modeled. An example of this is that large input values (e.g. a spread of hundreds or thousands of units) can result in a model that learns large weight values. A model with large weight values is often unstable, meaning that it may suffer from poor performance during learning and sensitivity to input values resulting in higher generalization error.

In [2]:
# import Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



In [3]:
# Load the Iris dataset from the Kaggle input directory and preview the first five rows
df = pd.read_csv(filepath_or_buffer='/kaggle/input/iris/Iris.csv')
df.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [5]:
# Data type of each column
df.dtypes

Id                 int64
SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object

In [6]:
# (number of rows, number of columns) of the loaded frame
df.shape

(150, 6)

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


In [8]:
# Features are every column except the last; the target is the last column (Species).
# NOTE(review): this slice keeps the `Id` row identifier as a feature, so it is
# scaled together with the four measurements -- consider dropping it.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [9]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Create the StandardScaler (centers each feature on its mean and scales it to
# unit standard deviation)
scaler = StandardScaler()

In [11]:
# Fit the StandardScaler on the training data only, then apply the same fitted
# transformation to the test data so no test-set statistics leak into the scaler
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# Per-column minimum and maximum of the unscaled training features
train_min = X_train.min()
train_max = X_train.max()
print('X_train', train_min, '-', train_max)

X_train Id               1.0
SepalLengthCm    4.3
SepalWidthCm     2.0
PetalLengthCm    1.0
PetalWidthCm     0.1
dtype: float64 - Id               150.0
SepalLengthCm      7.7
SepalWidthCm       4.4
PetalLengthCm      6.7
PetalWidthCm       2.5
dtype: float64


In [13]:
# Global minimum and maximum of the scaled array (reduced over all features at once)
scaled_min = X_train_scaled.min()
scaled_max = X_train_scaled.max()
print('X_train_scaled(Standard Scalar):', scaled_min, '-', scaled_max)

X_train_scaled(Standard Scalar): -2.3788960166206907 - 3.020016928901447


# 2. Min-Max Scaler
* MinMaxScaler scales the data to a fixed range, typically between 0 and 1. On the other hand, StandardScaler rescales the data to have a mean of 0 and a standard deviation of 1. This results in a distribution with zero mean and unit variance. The choice between MinMaxScaler and StandardScaler depends on the data distribution, the nature of the analysis, and the algorithm being used.

* MinMaxScaler is useful when the data has a bounded range or when the distribution is not Gaussian. For example, in image processing, pixel values are typically in the range of 0-255. Scaling these values using MinMaxScaler ensures that the values are within a fixed range and contributes equally to the analysis.

In [14]:
# Re-read the Iris CSV into a fresh frame for the Min-Max section and preview five rows
df1 = pd.read_csv(filepath_or_buffer='/kaggle/input/iris/Iris.csv')
df1.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [15]:
# Features are every column except the last; the target is the last column (Species).
# NOTE(review): the `Id` row identifier is still part of X here -- consider dropping it.
X, y = df1.iloc[:, :-1], df1.iloc[:, -1]

In [16]:
# 80/20 train/test split with a fixed seed so the partition is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Create a MinMaxScaler (rescales each feature to a fixed range, 0 to 1 here)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [18]:
# Fit the MinMaxScaler on the training data only, then reuse the fitted
# per-feature min/max to transform the test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [19]:
# Per-column minimum and maximum of the unscaled training features
train_min = X_train.min()
train_max = X_train.max()
print('X_train', train_min, '-', train_max)

X_train Id               1.0
SepalLengthCm    4.3
SepalWidthCm     2.0
PetalLengthCm    1.0
PetalWidthCm     0.1
dtype: float64 - Id               150.0
SepalLengthCm      7.7
SepalWidthCm       4.4
PetalLengthCm      6.7
PetalWidthCm       2.5
dtype: float64


In [20]:
# Global minimum and maximum of the Min-Max scaled array (all features together)
scaled_min = X_train_scaled.min()
scaled_max = X_train_scaled.max()
print('X_train_scaled(Min Max Scaler):', scaled_min, '-', scaled_max)

X_train_scaled(Min Max Scaler): 0.0 - 1.0


# 3: Robust Scaler
* Scaling dataset with robust scaler. 
Scale features using statistics that are robust to outliers.
This scaler removes the median and scales the data according to the quantile range (defaults to IQR: interquartile range). The IQR is the range between the 1st quartile (25th percentile) and the 3rd quartile (75th percentile).
Centering and scaling happen independently on each feature by computing the relevant statistics on the samples in the training set. Median and interquartile range are then stored to be used on later data using the transform method.
*  Standardization can become skewed or biased if the input variable contains outlier values.

* To overcome this, the median and interquartile range can be used when standardizing numerical input variables, generally referred to as robust scaling.

In [21]:
# Re-read the Iris CSV into a fresh frame for the Robust Scaler section and preview five rows
df2 = pd.read_csv(filepath_or_buffer='/kaggle/input/iris/Iris.csv')
df2.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [22]:
# Features are every column except the last; the target is the last column (Species).
# NOTE(review): the `Id` row identifier is still part of X here -- consider dropping it.
X, y = df2.iloc[:, :-1], df2.iloc[:, -1]

In [23]:
# 80/20 train/test split with a fixed seed so the partition is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Create a RobustScaler (centers each feature on its median and scales it by
# the interquartile range)
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()

In [25]:
# Fit the RobustScaler on the training data only, then reuse the fitted
# median / interquartile range to transform the test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [26]:
# Per-column minimum and maximum of the unscaled training features
train_min = X_train.min()
train_max = X_train.max()
print('X_train', train_min, '-', train_max)

X_train Id               1.0
SepalLengthCm    4.3
SepalWidthCm     2.0
PetalLengthCm    1.0
PetalWidthCm     0.1
dtype: float64 - Id               150.0
SepalLengthCm      7.7
SepalWidthCm       4.4
PetalLengthCm      6.7
PetalWidthCm       2.5
dtype: float64


In [27]:
# Print the global minimum and maximum of the RobustScaler output.
# The label previously read "Min Max Scaler" (copied from the section above),
# which mislabelled these values; re-run the cell to refresh the stored output.
print('X_train_scaled(Robust Scaler):',X_train_scaled.min(),'-',X_train_scaled.max())

X_train_scaled(Min Max Scaler): -1.904761904761905 - 2.666666666666668


# 4: MaxAbs Scaler
* Scale each feature by its maximum absolute value. This estimator scales and translates each feature individually such that the maximal absolute value of each feature in the training set will be 1.0. It does not shift/center the data, and thus does not destroy any sparsity.


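* MaxAbsScaler is a scaling technique used in machine learning to scale features to a range of [-1, 1] without shifting the data distribution or distorting the relative relationships between data points. It is particularly useful for sparse features, because zero entries stay zero. Note, however, that because every value is divided by the largest absolute value in the feature, it is sensitive to outliers: a single extreme value compresses all the other values toward zero.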

* The MaxAbsScaler scales the features by dividing each value by the maximum absolute value in the feature. The formula for scaling a feature x using MaxAbsScaler is:

  x_scaled = x / max_abs_value

  where x_scaled is the scaled value, x is the original value, and max_abs_value   is the maximum absolute value in the feature.

  The MaxAbsScaler preserves the sign of the values and scales them in a way   that the maximum absolute value in each feature becomes 1. The scaling is done independently for each feature, so the features are not affected by the scaling of other features.



In [28]:
# Create a MaxAbsScaler (divides each feature by its maximum absolute value).
# NOTE: this section does not reload or re-split the data; X_train / X_test
# are the ones produced by the most recent train_test_split call above.
from sklearn.preprocessing import MaxAbsScaler
scaler = MaxAbsScaler()

In [29]:
# Fit the MaxAbsScaler on the training data only, then apply the same fitted
# per-feature scale to the test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [30]:
# Per-column minimum and maximum of the unscaled training features
train_min = X_train.min()
train_max = X_train.max()
print('X_Train: ', train_min, '-', train_max)

X_Train:  Id               1.0
SepalLengthCm    4.3
SepalWidthCm     2.0
PetalLengthCm    1.0
PetalWidthCm     0.1
dtype: float64 - Id               150.0
SepalLengthCm      7.7
SepalWidthCm       4.4
PetalLengthCm      6.7
PetalWidthCm       2.5
dtype: float64


In [31]:
# Global minimum and maximum of the MaxAbs-scaled array (all features together)
scaled_min = X_train_scaled.min()
scaled_max = X_train_scaled.max()
print('X_train_scaled( MaxAbsScaler )', scaled_min, '-', scaled_max)

X_train_scaled( MaxAbsScaler ) 0.006666666666666667 - 1.0


# 5: Quantile Transformer
* QuantileTransformer is a data transformation technique used in machine learning to map the features of a dataset to a uniform or Gaussian distribution. It is particularly useful when the data does not follow a normal distribution or when you want to make the data more robust to outliers.

* The QuantileTransformer works by estimating the cumulative distribution function (CDF) of each feature and transforming the data based on the desired output distribution. It maps the values of each feature to a predefined probability distribution, which can be uniform or Gaussian.
* The key steps involved in using the QuantileTransformer are as follows:

* Estimate the cumulative distribution function (CDF) of each feature in the dataset.
* Map the original feature values to their corresponding quantiles based on the desired output distribution.
* Transform the quantiles to obtain the transformed feature values.
* The resulting transformed data will have a similar distribution across all features, which can be useful for certain machine learning algorithms that assume a specific data distribution, such as linear regression or Gaussian-based models.




In [32]:
from sklearn.preprocessing import QuantileTransformer

In [33]:
# Map each feature onto a normal distribution via its empirical quantiles.
# n_quantiles defaults to 1000, which is more than the number of training rows
# (80% of the 150-row dataset); scikit-learn then warns and clips it to the
# sample count, so cap it explicitly to keep the run warning-free.
# random_state pins the transformer's internal subsampling for reproducibility.
scaler = QuantileTransformer(
    n_quantiles=min(1000, len(X_train)),
    output_distribution='normal',
    random_state=42,
)
# Fit on the training data only, then reuse the fitted quantiles on the test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [34]:
# Per-column minimum and maximum of the unscaled training features
train_min = X_train.min()
train_max = X_train.max()
print('X_train :', train_min, '-', train_max)

X_train : Id               1.0
SepalLengthCm    4.3
SepalWidthCm     2.0
PetalLengthCm    1.0
PetalWidthCm     0.1
dtype: float64 - Id               150.0
SepalLengthCm      7.7
SepalWidthCm       4.4
PetalLengthCm      6.7
PetalWidthCm       2.5
dtype: float64


In [35]:
# Global minimum and maximum of the quantile-transformed array (all features together)
scaled_min = X_train_scaled.min()
scaled_max = X_train_scaled.max()
print('X_train_scaled(Quantile Transformer): ', scaled_min, '-', scaled_max)

X_train_scaled(Quantile Transformer):  -5.199337582605575 - 5.19933758270342


# 6: Power Transformation

The Power Transformer method transforms the features to follow a Normal Distribution by applying a power transformation, useful for non linear data in which output is more normally distributed.

The Power Transformer method applies a power transformation to the data to make it more Gaussian-like. The `method` parameter can be set to `yeo-johnson` or `box-cox` to control the type of power transformation used.

The default is Yeo-Johnson. The Box-Cox method is limited to strictly positive data. Power transformations are commonly applied before models such as neural networks and k-nearest neighbors.

Power transformation is a data transformation technique used in machine learning to adjust the distribution of data by applying a power function to the feature values. It is particularly useful when the data exhibits skewness or heteroscedasticity (unequal variances) and when you want to make the data more closely resemble a normal distribution.

The power transformation (in its Box-Cox form, for $\lambda \neq 0$) is defined by the formula:

$$x_{\text{transformed}} = \frac{x^{\lambda} - 1}{\lambda}$$

where $x_{\text{transformed}}$ is the transformed value, $x$ is the original value, and $\lambda$ is the power parameter. The choice of $\lambda$ determines the type of power transformation applied.

There are three common types of power transformations:

* Box-Cox Transformation: The Box-Cox transformation is a family of power transformations that covers a range of lambda values. It is suitable only for strictly positive data values. The optimal lambda value is typically determined by maximizing the log-likelihood function.

* Yeo-Johnson Transformation: The Yeo-Johnson transformation is an extension of the Box-Cox transformation that supports both positive and negative data values. Unlike the Box-Cox transformation, it handles zero and negative values. The optimal lambda value is determined by a numerical optimization method.

* Log Transformation: The log transformation is a special case of the power transformation where lambda is set to 0. It is commonly used to reduce skewness in data and is especially effective for positive-valued data.



In [36]:
from sklearn.preprocessing import PowerTransformer

In [37]:
# Build a PowerTransformer with its default settings, fit it on the training
# data only, and apply the fitted transformation to both splits.
transformer = PowerTransformer()
X_train_scaled = transformer.fit_transform(X_train)
X_test_scaled = transformer.transform(X_test)

In [38]:
# Per-column minimum and maximum of the unscaled training features
train_min = X_train.min()
train_max = X_train.max()
print('X_train: ', train_min, '-', train_max)

X_train:  Id               1.0
SepalLengthCm    4.3
SepalWidthCm     2.0
PetalLengthCm    1.0
PetalWidthCm     0.1
dtype: float64 - Id               150.0
SepalLengthCm      7.7
SepalWidthCm       4.4
PetalLengthCm      6.7
PetalWidthCm       2.5
dtype: float64


In [39]:
# Global minimum and maximum of the power-transformed array (all features together)
scaled_min = X_train_scaled.min()
scaled_max = X_train_scaled.max()
print('X_train_scaled (Power Transformation):', scaled_min, '-', scaled_max)

X_train_scaled (Power Transformation): -2.702619947273049 - 2.6774798180501573


# 7: Normalizer
In machine learning, the term "Normalizer" refers to a data preprocessing technique that rescales each individual sample (row) of a dataset independently. Unlike the scalers above, which work on feature columns, Normalizer works row by row, making samples comparable and ensuring that every component falls within a bounded range.

The Normalizer rescales the values of each sample so that the sample has unit norm.
This can be useful for sparse datasets (lots of zeros) with attributes of varying scales, when using algorithms that weight input values, such as neural networks, or algorithms that use distance measures, such as K-Nearest Neighbors.

Normalizer operates on the rows of the dataset independently, normalizing each sample separately. The normalization process ensures that each sample has a Euclidean norm (also known as L2 norm) of 1. The formula for normalizing a sample x is:

x_normalized = x / ||x||

where x_normalized is the normalized version of x and ||x|| represents the Euclidean norm of x.

The Normalizer can be applied using different norms, including L1 norm and Max norm, but the default is the L2 norm.

In [40]:
from sklearn.preprocessing import Normalizer

In [41]:
# Normalizer rescales every sample (row) to unit L2 norm by default.
scaler = Normalizer()
X_train_scaled = scaler.fit_transform(X_train)
# Transform the held-out test rows. The original passed X_train here, so
# X_test_scaled silently held a second copy of the training data.
X_test_scaled = scaler.transform(X_test)

In [42]:
# Per-column minimum and maximum of the unscaled training features
train_min = X_train.min()
train_max = X_train.max()
print('X_train:', train_min, '-', train_max)

X_train: Id               1.0
SepalLengthCm    4.3
SepalWidthCm     2.0
PetalLengthCm    1.0
PetalWidthCm     0.1
dtype: float64 - Id               150.0
SepalLengthCm      7.7
SepalWidthCm       4.4
PetalLengthCm      6.7
PetalWidthCm       2.5
dtype: float64


In [43]:
# Global minimum and maximum of the normalized array (all features together)
scaled_min = X_train_scaled.min()
scaled_max = X_train_scaled.max()
print('X_train_scaled (Normalizer):', scaled_min, '-', scaled_max)

X_train_scaled (Normalizer): 0.0025994816110760767 - 0.9983803876747591


# 8: Binarizer
* In machine learning, the term "Binarizer" refers to a data preprocessing technique that converts numerical features into binary values based on a threshold. It is commonly used when you want to convert continuous numerical data into binary values based on a specific cutoff point.

* The Binarizer operates on each individual value in a feature column and transforms it into 0 or 1, depending on whether it is below or above the specified threshold. Any value equal to or below the threshold is transformed into 0, while any value above the threshold is transformed into 1.

* Binarizer is commonly used when you want to convert continuous numerical features into binary values to indicate the presence or absence of a certain characteristic or condition. For example, you might use it to convert probabilities into binary labels based on a specific threshold.

* It's important to note that Binarizer operates independently on each feature column and does not take into account the relationships between features. If you need to scale or transform the data collectively or consider the distributions of the features, other techniques such as StandardScaler or PowerTransformer might be more suitable.






In [44]:
from sklearn.preprocessing import Binarizer

In [45]:
# Binarize with a 0.5 cutoff: values above 0.5 become 1, values at or below it become 0.
# NOTE(review): the same 0.5 threshold is applied to every column of X_train
# (including `Id`), even though the columns have very different ranges.
scaler = Binarizer(threshold=0.5)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [46]:
# Per-column minimum and maximum of the unscaled training features
train_min = X_train.min()
train_max = X_train.max()
print('X_train:', train_min, '-', train_max)

X_train: Id               1.0
SepalLengthCm    4.3
SepalWidthCm     2.0
PetalLengthCm    1.0
PetalWidthCm     0.1
dtype: float64 - Id               150.0
SepalLengthCm      7.7
SepalWidthCm       4.4
PetalLengthCm      6.7
PetalWidthCm       2.5
dtype: float64


In [47]:
# Global minimum and maximum of the binarized array (all features together)
scaled_min = X_train_scaled.min()
scaled_max = X_train_scaled.max()
print('X_train_scaled (Binarizer ):', scaled_min, '-', scaled_max)

X_train_scaled (Binarizer ): 0.0 - 1.0


# Note:
* The Binarizer method converts the data to binary values(0 or 1) based on threshold. This can be useful when you want to treat the data as a binary classification problem
* It is important to choose the appropriate scaling method based on the specific problem and data at hand, and to experiment with different techniques to find the best approach for your particular problem.