# Normalization Practice

# What is Normalization?
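Normalization is a feature-scaling technique that rescales numerical data into a fixed range, usually 0 to 1. Min-max normalization computes $x' = \frac{x - x_{\min}}{x_{\max} - x_{\min}}$ separately for each feature.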
# Why is Normalization Needed in Machine Learning?
Because:

Different features may have different scales (e.g., Age: 18–60, Salary: 10k–100k)

Many ML algorithms (like KNN, SVM, and models trained with gradient descent) work better when features are on similar scales

Normalization prevents a feature with a large numeric range from dominating the others



In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Toy dataset: two numeric features whose value ranges differ widely
# (Age is tens, Salary is tens of thousands).
data = dict(
    Age=[18, 25, 35, 45, 55],
    Salary=[15000, 25000, 35000, 45000, 55000],
)

# Each dict key becomes a column; the rows get a default 0..4 index.
df = pd.DataFrame.from_dict(data)
print(df)


   Age  Salary
0   18   15000
1   25   25000
2   35   35000
3   45   45000
4   55   55000


In [3]:
from sklearn.preprocessing import MinMaxScaler

# Learn each column's minimum and maximum from the frame ...
scaler = MinMaxScaler()
scaler.fit(df)

# ... then rescale every column with those learned bounds.
scaled_data = scaler.transform(df)

# Wrap the scaled values in a DataFrame so the column labels are restored.
scaled_df = pd.DataFrame(data=scaled_data, columns=df.columns)

print(scaled_df)


        Age  Salary
0  0.000000    0.00
1  0.189189    0.25
2  0.459459    0.50
3  0.729730    0.75
4  1.000000    1.00


# Example with Output Column
👇 Sample Dataset:


In [5]:
# Same toy features as before, plus a binary label column.

data = dict(
    Age=[18, 25, 35, 45, 55],
    Salary=[15000, 25000, 35000, 45000, 55000],
    Purchased=[0, 1, 0, 1, 0],  # Target/output column
)

# Each dict key becomes a column; the rows get a default 0..4 index.
df = pd.DataFrame.from_dict(data)


In [6]:
from sklearn.preprocessing import MinMaxScaler

# Step 1: split the frame into model inputs (X) and the label column (y)
X = df.loc[:, ['Age', 'Salary']]   # Features
y = df['Purchased']                # Target/output

# Step 2: scale the features only; the label is left untouched
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Step 3: wrap the scaled values in a DataFrame to restore the column labels
X_scaled_df = pd.DataFrame(data=X_scaled, columns=X.columns)

# Step 4: put the label column back next to the scaled features
# (rows are matched on the index, which is 0..4 on both sides here)
final_df = pd.concat(objs=[X_scaled_df, y], axis='columns')

print(final_df)


        Age  Salary  Purchased
0  0.000000    0.00          0
1  0.189189    0.25          1
2  0.459459    0.50          0
3  0.729730    0.75          1
4  1.000000    1.00          0


# CSV file

In [9]:
# Load the diabetes dataset.
# NOTE(review): the absolute path looks like a Google Colab upload location;
# adjust it when running elsewhere.
csv_path = '/content/diabetes.csv'
df = pd.read_csv(csv_path)

In [10]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [11]:
# Keep four predictor columns plus the label; all other columns are dropped.
columns_to_keep = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'Outcome']
df = df[columns_to_keep]

In [12]:
# Show the column-reduced frame (pandas abbreviates the middle rows).
df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,Outcome
0,148,72,35,0,1
1,85,66,29,0,0
2,183,64,0,0,1
3,89,66,23,94,0
4,137,40,35,168,1
...,...,...,...,...,...
763,101,76,48,180,0
764,122,70,27,0,0
765,121,72,23,112,0
766,126,60,0,0,1


# train test split

In [13]:
from sklearn.model_selection import train_test_split

In [15]:
# Everything except the label column is a model input; the label is the target.
target_column = 'Outcome'
x = df.drop(columns=[target_column])
y = df[target_column]

In [17]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Inspect the training features; the row labels are the original (shuffled) index values.
x_train

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin
60,84,0,0,0
618,112,82,24,0
346,139,46,19,83
294,161,50,0,0
231,134,80,37,370
...,...,...,...,...
71,139,64,35,140
106,96,122,0,0
270,101,86,37,0
435,141,0,0,0


In [19]:
# Inspect the training labels; they carry the same row labels as x_train.
y_train

Unnamed: 0,Outcome
60,0
618,1
346,0
294,0
231,1
...,...
71,0
106,0
270,1
435,1


# Feature Scaling: Normalization with MinMaxScaler

In [29]:
# Learn the per-column min/max from the training rows only, then apply that
# same mapping to both splits so nothing from the test set influences the scaling.
# NOTE: `scaler` is the MinMaxScaler instance created in an earlier cell;
# fitting it again replaces whatever it had learned before.
scaler.fit(x_train)
x_train_scalled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [30]:
# Inspect the scaled training features (a NumPy array at this point).
x_train_scalled

array([[0.42211055, 0.        , 0.        , 0.        ],
       [0.56281407, 0.67213115, 0.38095238, 0.        ],
       [0.69849246, 0.37704918, 0.3015873 , 0.09810875],
       ...,
       [0.50753769, 0.70491803, 0.58730159, 0.        ],
       [0.70854271, 0.        , 0.        , 0.        ],
       [0.6281407 , 0.78688525, 0.        , 0.        ]])

In [31]:
# Inspect the scaled test features (a NumPy array).
x_test_scaled

array([[0.49246231, 0.47540984, 0.52380952, 0.22458629],
       [0.56281407, 0.6147541 , 0.50793651, 0.        ],
       [0.54271357, 0.52459016, 0.        , 0.        ],
       [0.53768844, 0.6557377 , 0.        , 0.        ],
       [0.68341709, 0.73770492, 0.        , 0.        ],
       [0.51758794, 0.59016393, 0.50793651, 0.22458629],
       [0.35678392, 0.39344262, 0.28571429, 0.08983452],
       [0.5879397 , 0.        , 0.        , 0.        ],
       [0.77386935, 0.59016393, 0.46031746, 0.14893617],
       [0.73869347, 0.63934426, 0.        , 0.        ],
       [0.55778894, 0.57377049, 0.42857143, 0.        ],
       [0.89949749, 0.77868852, 0.49206349, 0.        ],
       [0.74371859, 0.49180328, 0.42857143, 0.37588652],
       [0.48241206, 0.60655738, 0.28571429, 0.07919622],
       [0.44221106, 0.47540984, 0.41269841, 0.01891253],
       [0.6281407 , 0.40983607, 0.63492063, 0.19739953],
       [0.42211055, 0.59016393, 0.50793651, 0.        ],
       [0.4321608 , 0.55737705,

In [32]:
# Wrap the scaled NumPy arrays in DataFrames so the feature names are restored.
# The row labels become a fresh 0..n-1 range; pass index=x_train.index /
# index=x_test.index instead if the rows must stay aligned with y_train / y_test.
x_train_scalled = pd.DataFrame(data=x_train_scalled, columns=x_train.columns)

# The scaled test array is named `x_test_scaled` (single "l"). Reading
# `x_test_scalled` here, before it is assigned, raises NameError on a fresh kernel.
x_test_scalled = pd.DataFrame(data=x_test_scaled, columns=x_test.columns)

In [33]:
# Inspect the scaled training features, now a DataFrame with named columns.
x_train_scalled

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin
0,0.422111,0.000000,0.000000,0.000000
1,0.562814,0.672131,0.380952,0.000000
2,0.698492,0.377049,0.301587,0.098109
3,0.809045,0.409836,0.000000,0.000000
4,0.673367,0.655738,0.587302,0.437352
...,...,...,...,...
609,0.698492,0.524590,0.555556,0.165485
610,0.482412,1.000000,0.000000,0.000000
611,0.507538,0.704918,0.587302,0.000000
612,0.708543,0.000000,0.000000,0.000000


In [35]:
# Inspect the scaled test features as a DataFrame with named columns.
x_test_scalled

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin
0,0.492462,0.475410,0.523810,0.224586
1,0.562814,0.614754,0.507937,0.000000
2,0.542714,0.524590,0.000000,0.000000
3,0.537688,0.655738,0.000000,0.000000
4,0.683417,0.737705,0.000000,0.000000
...,...,...,...,...
149,0.829146,0.721311,0.000000,0.000000
150,0.386935,0.459016,0.476190,0.066194
151,0.477387,0.590164,0.000000,0.000000
152,0.733668,0.573770,0.603175,0.425532
