# Normalization

In [5]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

## Introduction

__Why should we normalize the data?__

Make sure different feature take on similar ranges of values so that __Gradient Descent can converge more quickly__.

<div class="alert alert-block alert-info">
    <img src='https://icons.iconarchive.com/icons/paomedia/small-n-flat/16/sign-info-icon.png'>
    To change the values of numeric columns in the dataset to a common scale without distorting differences in the range of values.
</div>

## Implementation

In this section, the `Load Breast Cance` dataset can be used to show a practical example of normalization.

In [127]:
# Load Data
data = load_breast_cancer()

# Create DataFrame
breast_data = pd.DataFrame(data=data.data, columns=data.feature_names)

# Descriptive Analysis of the Data
display(breast_data.describe().round(2).T)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mean radius,569.0,14.13,3.52,6.98,11.7,13.37,15.78,28.11
mean texture,569.0,19.29,4.3,9.71,16.17,18.84,21.8,39.28
mean perimeter,569.0,91.97,24.3,43.79,75.17,86.24,104.1,188.5
mean area,569.0,654.89,351.91,143.5,420.3,551.1,782.7,2501.0
mean smoothness,569.0,0.1,0.01,0.05,0.09,0.1,0.11,0.16
mean compactness,569.0,0.1,0.05,0.02,0.06,0.09,0.13,0.35
mean concavity,569.0,0.09,0.08,0.0,0.03,0.06,0.13,0.43
mean concave points,569.0,0.05,0.04,0.0,0.02,0.03,0.07,0.2
mean symmetry,569.0,0.18,0.03,0.11,0.16,0.18,0.2,0.3
mean fractal dimension,569.0,0.06,0.01,0.05,0.06,0.06,0.07,0.1


## Select Features

Based on the descriptive analysis of the features, we should normalize only 10 out of the 30 features since the rest have similar ranges.

The featureas are shown as follows:

In [113]:
# List of Features
selected_features = ["mean radius","mean texture","mean perimeter","mean area", "perimeter error",
                     "area error","worst radius", "worst texture", "worst perimeter", "worst area"]

display(breast_data[selected_features].describe().round(2).T)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mean radius,569.0,14.13,3.52,6.98,11.7,13.37,15.78,28.11
mean texture,569.0,19.29,4.3,9.71,16.17,18.84,21.8,39.28
mean perimeter,569.0,91.97,24.3,43.79,75.17,86.24,104.1,188.5
mean area,569.0,654.89,351.91,143.5,420.3,551.1,782.7,2501.0
perimeter error,569.0,2.87,2.02,0.76,1.61,2.29,3.36,21.98
area error,569.0,40.34,45.49,6.8,17.85,24.53,45.19,542.2
worst radius,569.0,16.27,4.83,7.93,13.01,14.97,18.79,36.04
worst texture,569.0,25.68,6.15,12.02,21.08,25.41,29.72,49.54
worst perimeter,569.0,107.26,33.6,50.41,84.11,97.66,125.4,251.2
worst area,569.0,880.58,569.36,185.2,515.3,686.5,1084.0,4254.0


## Normalize Data

Normalize the data with the following Range __[0,1]__

The normalization is performed by applying the following formular: 
<div style="font-size: xx-large"> 
    $\frac{(x - x_{min})}{Range \, of \, x}$
</div> 
<br> 
<div style="font-size: medium"> 
    where <i>Range of x</i> is equal to $x_{max} - x_{min}$
</div>

In [143]:
def normalize_data(df, features=None):
    """Normalize data between 0 and 1"""
    
    if not features:
        features = df.columns
        
    # Range of X
    df_np = df[features].to_numpy()
    min_val = df_np.min()
    range_x = df_np.max() - min_val
    
    # Normalize Data
    # Whole dataset
    normalized_features = (df[features] - min_val) / range_x
    
    # Each feature
    #normalized_features = (df[features] - df[features].min(axis=0)) / (df[features].max(axis=0) - df[features].min(axis=0))
    
    # Set normalized data
    df_norm = df.copy()
    df_norm[features] = normalized_features.values
            
    return df_norm

In [144]:
# Normalize dataset
normalize_data(df=breast_data, features=selected_features).describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mean radius,569.0,0.003144,0.000829,0.001463,0.002573,0.002966,0.003532,0.006431
mean texture,569.0,0.004357,0.001011,0.002105,0.003624,0.004252,0.004948,0.009057
mean perimeter,569.0,0.021445,0.005713,0.010118,0.017496,0.020098,0.024297,0.044141
mean area,569.0,0.153796,0.08274,0.033561,0.098641,0.129394,0.183846,0.587844
mean smoothness,569.0,0.09636,0.014064,0.05263,0.08637,0.09587,0.1053,0.1634
mean compactness,569.0,0.104341,0.052813,0.01938,0.06492,0.09263,0.1304,0.3454
mean concavity,569.0,0.088799,0.07972,0.0,0.02956,0.06154,0.1307,0.4268
mean concave points,569.0,0.048919,0.038803,0.0,0.02031,0.0335,0.074,0.2012
mean symmetry,569.0,0.181162,0.027414,0.106,0.1619,0.1792,0.1957,0.304
mean fractal dimension,569.0,0.062798,0.00706,0.04996,0.0577,0.06154,0.06612,0.09744


## Normalize with Sklean

Let's use the `normalize` function from Scikit-Learn.

In [146]:
from sklearn.preprocessing import normalize

In [150]:
# Normalize Data
X_norm = normalize(X=breast_data[selected_features])

# Set Normalized Data
breast_data_norm = breast_data.copy()
breast_data_norm[selected_features] = X_norm

# Display Results
display(breast_data_norm.describe().T)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mean radius,569.0,0.014843,0.004011,0.005512,0.01169,0.015061,0.017354,0.028471
mean texture,569.0,0.02201,0.011077,0.004568,0.014104,0.020352,0.026888,0.08661
mean perimeter,569.0,0.096006,0.024912,0.036396,0.076533,0.097151,0.111015,0.178589
mean area,569.0,0.605086,0.047242,0.376233,0.582554,0.615221,0.639476,0.697401
mean smoothness,569.0,0.09636,0.014064,0.05263,0.08637,0.09587,0.1053,0.1634
mean compactness,569.0,0.104341,0.052813,0.01938,0.06492,0.09263,0.1304,0.3454
mean concavity,569.0,0.088799,0.07972,0.0,0.02956,0.06154,0.1307,0.4268
mean concave points,569.0,0.048919,0.038803,0.0,0.02031,0.0335,0.074,0.2012
mean symmetry,569.0,0.181162,0.027414,0.106,0.1619,0.1792,0.1957,0.304
mean fractal dimension,569.0,0.062798,0.00706,0.04996,0.0577,0.06154,0.06612,0.09744


***