<a href="https://colab.research.google.com/github/rajat1911996sharma/Machine-Learning-Model-Life-Cycle/blob/main/Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Data Preprocessing Tools

#Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


#Importing Dataset

In [6]:
df = pd.read_csv('Data.csv')
#Now, we wil create a matrix of features and a vector of independent variable
X=df.drop('Purchased',axis=1).values
#can also be written as 
#X=df.iloc[:] means we are taking all the rows
#X=df.iloc[:, ] for column
#X=df.iloc[:, :-1] for selecting every variable except last
y=df['Purchased']
#y= df.iloc[:, -1].values
#we wanna get the last row
#we can do this by removeing the range in brackets [:. ':-1']
#with -1

In [5]:
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [7]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [8]:
y

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

#Taking Care of Missing Data

In [9]:
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [10]:
#We having a missing salary
# we will replace it by mean of Salaries
#we will use Scikit learn
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3])
#selecting 2,3
X[:, 1:3] = imputer.transform(X[:, 1:3])
#we need to replace the transformed values inside the dataset too

In [12]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

#Encoding Categorical Data

##Encoding the Independent Variable

In [17]:
#we need to change the values that are in country and Purchased 
#because our machine can not handle this type of input
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[0])], remainder='passthrough')
#we want to use encoder
#then we will specify OneHotEncoder 
#then we will choose on which columns to apply on
#then in remainder we mention passthrough
#otherwise they will o/p the unique cells single time and all other values removed
#onehotencoder used because the data is not Ordinal
#the countries can not be ranked

#the value we want to fit and transform
#ct.fit_transform(X)

#here, the value will be replaced in X
#X = ct.fit_transform(X)

X = np.array(ct.fit_transform(X))
#fit_tranform do not o/p numpy array
# numpy array as i/p is expected for machine learning algo.


In [18]:
X

array([[0.0, 1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 1.0, 38.0, 61000.0],
       [1.0, 0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [0.0, 1.0, 0.0, 0.0, 35.0, 58000.0],
       [1.0, 0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [0.0, 1.0, 0.0, 0.0, 48.0, 79000.0],
       [1.0, 0.0, 1.0, 0.0, 50.0, 83000.0],
       [0.0, 1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

##Encoding Dependent Variable

In [19]:
#Label encoder used when the data can be ordinal or ordered
#like Jr. Sr., Primary, high school
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)


In [20]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

#Splitting the Dataset into Training Set and Test Set

In [21]:
#do we have apply feature scaling before or after splitting
#after the dataset splitting
#feature scaling takes place so that all features scale homogeneously

#test set is supposed to be a brand new set
#feature scaling get mean and std of dataset
#if applied before, it can get those values for all the datasets
#not supposed to have
#(info leakage is prevented)

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)


In [22]:
X_train

array([[1.0, 0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [0.0, 1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 1.0, 38.0, 61000.0],
       [1.0, 0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 0.0, 48.0, 79000.0],
       [1.0, 0.0, 1.0, 0.0, 50.0, 83000.0],
       [0.0, 1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [23]:
X_test

array([[1.0, 0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [24]:
y_train

array([0, 1, 0, 0, 1, 1, 0, 1])

In [26]:
y_test

array([0, 1])

#Feature Scaling

In [28]:
#why do we need feature scaling ?
#to avoid some features to be dominated by other features
#in such a way that dominated features are not even considered by the ML model

#not required in all the ML Models

#two main feature scaling techniques
#1. Standardisation
'''
(x-mean(x))/(std(x))
(-3,3) more or less
'''
#2. Normalisation
'''
(x-min(x))/(max(x)-min(x))
all values between (0,1)
'''

#when to use these ?
'''
Normalisation used when we have a normal distribution

Standardisation works well all the time

'''

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
#here, we have no pararmeters

#Do we have to apply Standardisstion to dummy variable ?
'''
Simply, the answer is no
Goal of feature scaling is to have all the features in same range
Stanadardistaion gives value between (-3,3)
the dummy variable are already between (-3,3)
it will make it worse if we do it again
Only, apply to those numerical values
'''



#only to be applied on age and salary columns
#below you can see, 4th column is the age column
#so, we will select 3 upto all the other columns
X_train[:, 3:]  = scaler.fit_transform(X_train[:, 3:])

#the test variables need to be scaled to the train variables
#it makes th predictions relevant
X_test[:, 3:] = scaler.transform(X_test[:, 3:])


In [30]:
X_train

array([[1.0, 0.0, 0.0, 1.2909944487358056, -0.19159184384578545,
        -1.0781259408412425],
       [1.0, 0.0, 1.0, -0.7745966692414834, -0.014117293757057777,
        -0.07013167641635372],
       [0.0, 1.0, 0.0, -0.7745966692414834, 0.566708506533324,
        0.633562432710455],
       [1.0, 0.0, 0.0, 1.2909944487358056, -0.30453019390224867,
        -0.30786617274297867],
       [1.0, 0.0, 0.0, 1.2909944487358056, -1.9018011447007988,
        -1.420463615551582],
       [0.0, 1.0, 0.0, -0.7745966692414834, 1.1475343068237058,
        1.232653363453549],
       [1.0, 0.0, 1.0, -0.7745966692414834, 1.4379472069688968,
        1.5749910381638885],
       [0.0, 1.0, 0.0, -0.7745966692414834, -0.7401495441200351,
        -0.5646194287757332]], dtype=object)

In [29]:
X_test

array([[1.0, 0.0, 1.0, -0.7745966692414834, -1.4661817944830124,
        -0.9069571034860727],
       [0.0, 1.0, 0.0, -0.7745966692414834, -0.44973664397484414,
        0.2056403393225306]], dtype=object)