In [3]:
# Feature Engineering:
# Feature engineering is the process of creating new features, modifying existing features, and selecting features from raw data
# in order to improve the performance of a machine learning model. It helps the model capture hidden patterns in the data.
# It includes parts of data preprocessing and data cleaning.
# It involves feature transformation, feature construction, feature selection, and feature extraction:
# Feature Transformation: Modifying existing features to make them suitable for the model (e.g., scaling or normalizing).

# Feature Construction: Creating new features from raw data to capture hidden patterns (e.g., age from date_of_birth).

# Feature Selection: Choosing the most relevant features to improve model performance and reduce noise.

# Feature Extraction: Reducing dimensionality by combining or transforming features into a new set (e.g., PCA).

In [5]:
# Feature Scaling: The process of normalizing or standardizing numerical features so that they are on a similar scale, ensuring that no single 
# feature dominates the model due to its magnitude.
#This helps machine learning models perform better, especially those that rely on distances or gradients.


# Types of feature scaling: 1. Standardization  2. Normalization
# 1. Standardization:
# Standardization is the process of transforming numerical features in a dataset so that they have:
# a mean (μ) of 0, and
# a standard deviation (σ) of 1

# Standardization involves two steps:
# 1. Mean centering (subtract μ)  2. Scaling by the standard deviation (divide by σ)

# Standardization Formula:
# z=(𝑥-μ)/𝜎
# Where:
# 𝑥: original feature value
# 𝜇: mean of the feature
# 𝜎: standard deviation of the feature
# 𝑧: standardized value


# Note: standardization is also called Z-score normalization.


In [None]:
# StandardScaler is a class in scikit-learn (sklearn.preprocessing) that is specifically designed to apply the standardization formula.
# When this formula is applied:
# z=(𝑥-μ)/𝜎
# each feature ends up with a mean (μ) of 0, and
# a standard deviation (σ) of 1.
# The standardization formula is applied when you call:
# X_scaled = scaler.fit_transform(X)


# or in two steps:
# scaler.fit(X)          # calculates mean (μ) and std (σ)
# X_scaled = scaler.transform(X)  # applies the formula


In [7]:
# NOTE: Before performing feature scaling (either standardization or normalization), it is preferable to perform train_test_split first, so that the scaler learns its statistics from the training data only.

In [127]:
import numpy as np
import pandas as pd

In [129]:
# Load the Social Network Ads dataset from the working directory.
df = pd.read_csv(filepath_or_buffer="Social_Network_Ads.csv")

In [None]:
# Syntax of using iloc:
# df.iloc[rows, columns]
# df.iloc[:, 2:] selects all rows and every column from position 2 onward, i.e. it drops the columns at positions 0 and 1.

In [19]:
# df.head() → Shows the first 5 rows of the DataFrame by default.
# df.sample() → Shows 1 random row of the DataFrame by default.
#df.sample(n)-> Shows n number of random rows from the dataframe

In [131]:
# Peek at two randomly chosen rows of the raw data.
df.sample(n=2)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
97,15582492,Male,28,123000,1
143,15783029,Male,30,89000,0


In [133]:
# Show the first two rows of the raw data.
df.head(n=2)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0


In [135]:
# Keep only the modelling columns. On the freshly loaded frame this gives the same
# result as df.iloc[:, 2:] (it drops "User ID" and "Gender"), but selecting by name
# makes the cell idempotent: re-running it does not strip further columns from the
# already-trimmed df.
df = df.loc[:, ["Age", "EstimatedSalary", "Purchased"]]

In [137]:
# Confirm the trimmed frame by looking at two random rows.
df.sample(n=2)

Unnamed: 0,Age,EstimatedSalary,Purchased
104,19,21000,0
79,26,17000,0


In [35]:
# Now we perform feature scaling, using the standardization method. Before any feature scaling (standardization or normalization),
# it is always recommended to perform train_test_split first.

# TRAIN_TEST_SPLIT

In [139]:
from sklearn.model_selection import train_test_split

# Features are every column except the "Purchased" label; hold out 30% of the rows
# for testing, with a fixed random_state so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns=["Purchased"]),
    df["Purchased"],
    test_size=0.3,
    random_state=0,
)

In [141]:
# Report the (rows, columns) of the training and test feature sets, one per line.
print(x_train.shape, x_test.shape, sep="\n")

(280, 2)
(120, 2)


# Standard Scaler

In [143]:
# Import the StandardScaler class from scikit-learn to perform standardization.
from sklearn.preprocessing import StandardScaler

# Create the scaler and fit it: the mean and standard deviation of each column are
# computed here. This learning step is done on the training data only.
scaler = StandardScaler().fit(x_train)

# Apply the standardization formula. The transformation is done on both the
# training and the testing data, using the statistics learned from x_train.
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [145]:
# Important note: x_train is a DataFrame, but after standardization the result
# ("x_train_scaled") is a NumPy array, so it prints without column names or row labels.
print(x_train.iloc[:2])      # first 2 rows of the DataFrame
print(x_train_scaled[:2])    # first 2 rows of the NumPy array


     Age  EstimatedSalary
157   29            75000
109   38            80000
[[-0.84252154  0.1301563 ]
 [ 0.04175763  0.2777019 ]]


In [85]:
# To solve the above problem, we convert the NumPy arrays back into DataFrames, i.e. x_train_scaled and x_test_scaled are rebuilt as DataFrames.

In [147]:
# The scaler returned plain NumPy arrays; wrap them back into DataFrames.
# - columns= restores the feature names.
# - index= restores the original row labels, so the scaled rows stay aligned with
#   y_train / y_test (without it the index is reset to 0..n-1).
# - np.asarray(...) feeds raw values to the constructor, so re-running this cell
#   after the variables are already DataFrames gives the same result.
x_train_scaled = pd.DataFrame(
    np.asarray(x_train_scaled),
    columns=x_train.columns,
    index=x_train.index,
)
x_test_scaled = pd.DataFrame(
    np.asarray(x_test_scaled),
    columns=x_test.columns,
    index=x_test.index,
)

In [157]:
# Summary statistics before scaling, rounded to 1 decimal place.
# The original mean is 37.6 for the Age column and 70589.3 for the EstimatedSalary column.
x_train.describe().round(1)

Unnamed: 0,Age,EstimatedSalary
count,280.0,280.0
mean,37.6,70589.3
std,10.2,33948.5
min,18.0,15000.0
25%,30.0,44000.0
50%,37.0,71000.0
75%,45.0,88000.0
max,60.0,150000.0


In [159]:
# Summary statistics after scaling, rounded to 1 decimal place:
# each column should now show a mean of about 0 and a standard deviation of about 1.
x_train_scaled.describe().round(1)

Unnamed: 0,Age,EstimatedSalary
count,280.0,280.0
mean,-0.0,0.0
std,1.0,1.0
min,-1.9,-1.6
25%,-0.7,-0.8
50%,-0.1,0.0
75%,0.7,0.5
max,2.2,2.3
