In [1]:
# Normalization: a data preprocessing technique used to rescale numerical features so that they fall within a
# specific range, typically [0, 1].

In [3]:
# | Technique           | Range Target     | Use Case                          |
# | ------------------- | ---------------- | --------------------------------- |
# | **Standardization** | Mean = 0, SD = 1 | When data is normally distributed |
# | **Normalization**   | Range [0, 1]     | When you need bounded values      |


In [7]:
# 1. Data Division:
# After data collection and data preprocessing, data splitting (train_test_split) is done.
# The dataframe is divided into two parts: 1. input (x) 2. target (y), where df = input + target.
# From x, x_train and x_test are created   ..... x is the independent data
# From y, y_train and y_test are created   ..... y is the dependent data
# We divide the data into two parts, training and testing data, where the majority of the data is used for training and the minority for testing.

# We must use both X_train and y_train to train a supervised machine learning model.
# Here’s why:
# X_train = input features (what the model sees).
# y_train = target labels (what the model is trying to predict).
# We actively use y_train during training — it’s not just for reference
# The model computes (predicts) the output from X_train, then compares its predictions with y_train to learn and improve.

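# Normalization (or scaling) is usually considered part of data preprocessing, but it should be applied after splitting:
# the scaler is fit on the training set only and then used to transform both the training and test sets, so that no
# information from the test set leaks into training. The important thing is when and how you apply it: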

# ✅ Correct Order
# Data Collection
# Data Preprocessing (handle missing values, encode categories, remove duplicates, etc.)
# Data Splitting (train/test/validation split)
# Normalization / Standardization

# Standardization is actually a type of normalization.
# Types of normalization:
# 1. Min-Max Scaling (the most commonly used type; used roughly 90% of the time)
# 2. Z-Score Normalization (Standardization)
# 3. Mean Normalization

In [9]:
#Normalization Formula:
# x' = (x - x_min) / (x_max - x_min)
# Example: If age = 40, min = 20, max = 60 → normalized = 0.5


# x → The actual value (your raw data point)
# x_min → The smallest value in that column (feature)
# x_max → The largest value in that column
# x - x_min → How far the value is from the minimum
# x_max - x_min → The full range of values (max − min)
#x' → The new scaled value/normalized value (between 0 and 1)

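# The scaled values of the data the scaler was fit on lie in the range [0, 1] only.
# (Data transformed later, e.g. a test set, can fall slightly outside [0, 1] if it contains values beyond the fitted min/max.)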

In [112]:
import numpy as np
import pandas as pd

In [114]:
# Load only the first three columns of the wine dataset
# (class_label, alcohol, malic_acid).
df = pd.read_csv(
    "wine_data.csv",
    usecols=[0, 1, 2],
)

In [116]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,class_label,alcohol,malic_acid
0,1,14.23,1.71
1,1,13.2,1.78
2,1,13.16,2.36
3,1,14.37,1.95
4,1,13.24,2.59


In [118]:
from sklearn.model_selection import train_test_split

# Inputs are every column except the label; the label column is the target.
# 30% of the rows are held out for testing, and random_state pins the
# shuffle so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns="class_label"),
    df["class_label"],
    test_size=0.3,
    random_state=0,
)

In [120]:
# (rows, columns) of the training and test inputs, to confirm the 70/30 split.
(x_train.shape, x_test.shape)

((124, 2), (54, 2))

In [122]:
from sklearn.preprocessing import MinMaxScaler

# Learn each column's min and max from the training rows only, so that
# nothing about the test rows influences the scaling parameters.
# fit() returns the fitted scaler itself, so creation and fitting can be chained.
scaler = MinMaxScaler().fit(x_train)

# Rescale both splits with the same train-derived parameters.
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [128]:
# scaler.transform() returns plain NumPy arrays, so wrap them back into
# DataFrames to regain the column names.
# The original row index is passed too: without it the frames would get a
# fresh 0..n-1 index that no longer lines up with y_train / y_test (which keep
# the shuffled index from the split), and any index-based join or concat
# with the labels or the unscaled data would pair the wrong rows.
x_train_scaled = pd.DataFrame(
    x_train_scaled,
    index=x_train.index,
    columns=x_train.columns,
)
x_test_scaled = pd.DataFrame(
    x_test_scaled,
    index=x_test.index,
    columns=x_test.columns,
)

In [130]:
# Summary statistics of the unscaled training inputs, rounded to 2 decimals.
x_train.describe().round(2)

Unnamed: 0,alcohol,malic_acid
count,124.0,124.0
mean,12.98,2.38
std,0.8,1.14
min,11.03,0.89
25%,12.36,1.61
50%,13.04,1.88
75%,13.64,3.25
max,14.75,5.65


In [132]:
# Summary statistics after min-max scaling, rounded to 2 decimals;
# min and max of each column should now be 0 and 1.
x_train_scaled.describe().round(2)

Unnamed: 0,alcohol,malic_acid
count,124.0,124.0
mean,0.53,0.31
std,0.22,0.24
min,0.0,0.0
25%,0.36,0.15
50%,0.54,0.21
75%,0.7,0.5
max,1.0,1.0
