# Data Preprocessing
- Dealing With Duplicate Values
- Dealing With Missing Values
- Scaling
- Dealing With Categorical Values
- Splitting The Data For Training and Testing

In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [64]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Data.csv')
# A bare trailing expression renders the frame as a rich table.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [65]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    11 non-null     object 
 1   Age        10 non-null     float64
 2   Salary     10 non-null     float64
 3   Purchased  11 non-null     object 
dtypes: float64(2), object(2)
memory usage: 480.0+ bytes


In [66]:
# Count the distinct values in each column.
df.nunique()

Country      3
Age          9
Salary       9
Purchased    2
dtype: int64

In [67]:
# Show the distinct categories of each categorical column.
for label, column in (('Countries : ', 'Country'), ('Purchased : ', 'Purchased')):
    print(label, df[column].unique())

Countries :  ['France' 'Spain' 'Germany']
Purchased :  ['No' 'Yes']


## Dealing With Duplicate Values
- If duplicate rows are present, drop them

In [68]:
# Number of rows flagged as duplicates of another row.
df.duplicated().sum()

1

In [69]:
# Rebind df to a de-duplicated copy of itself instead of mutating it in place.
df = df.drop_duplicates()
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# ================================================

## Dealing With Missing Values
- If a column has a large share of missing values relative to its total number of values, the column can be dropped
- If the missing values are in a numerical column, they can be replaced by the mean of that column
- If the missing values are in a categorical column, they can be replaced by the mode of that column
- If only a few records contain missing values relative to the total number of records, those records can be dropped

In [70]:
# If the dataset marks missing entries with some symbol other than np.nan,
# replace that symbol with np.nan first so pandas can detect them.

# Per-column count of missing values.
df.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

### Using Pandas

In [71]:
# Column means, used below to fill the gaps in Age and Salary.
avg_age, avg_salary = (df[name].mean() for name in ('Age', 'Salary'))
print('Average Age    : ', avg_age)
print('Average Salary : ', avg_salary)

Average Age    :  38.77777777777778
Average Salary :  63777.77777777778


In [72]:
# Preview only: replace() returns a new Series and the result is not stored.
df['Age'].replace(to_replace=np.nan, value=avg_age)

0    44.000000
1    27.000000
2    30.000000
3    38.000000
4    40.000000
5    35.000000
6    38.777778
7    48.000000
8    50.000000
9    37.000000
Name: Age, dtype: float64

In [73]:
# Preview only: replace() returns a new Series and the result is not stored.
df['Salary'].replace(to_replace=np.nan, value=avg_salary)

0    72000.000000
1    48000.000000
2    54000.000000
3    61000.000000
4    63777.777778
5    58000.000000
6    52000.000000
7    79000.000000
8    83000.000000
9    67000.000000
Name: Salary, dtype: float64

In [74]:
df           # still contains NaN: the replace() calls above returned new Series whose results were never assigned back to df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [75]:
# Fill the missing Age and Salary entries with their respective column means.
# The result is assigned back to df rather than calling replace(..., inplace=True)
# on the Series returned by df.Age / df.Salary: that chained in-place form acts on
# an intermediate object and does not update df under pandas Copy-on-Write.
df = df.fillna({'Age': df['Age'].mean(), 'Salary': df['Salary'].mean()})
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### Using Scikit Learn

In [76]:
# Start again from the raw file so the scikit-learn imputer has gaps to fill.
df2 = pd.read_csv('Data.csv').drop_duplicates()
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [77]:
from sklearn.impute import SimpleImputer

# Replace NaN in the numeric columns with the mean of each column.
numeric_cols = ['Age', 'Salary']
impute = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform learns the column means and applies them in a single step,
# so no separate fit() call is needed beforehand.
df2[numeric_cols] = impute.fit_transform(df2[numeric_cols])
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# ================================================

# Scaling
- Used to bring features to the same scale
1. Standard Scaler
2. MinMax Scaler

In [78]:
# Feature matrix (Country, Age, Salary) and single-column target as NumPy arrays.
X = df2[['Country', 'Age', 'Salary']].to_numpy()
Y = df2[['Purchased']].to_numpy()

# Second copy of the features so that each scaling technique below
# starts from the same unscaled values.
X2 = np.copy(X)

In [79]:
# Feature matrix before any scaling or encoding.
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [80]:
# Target values, still as 'Yes'/'No' strings in a single-column 2-D array.
Y

array([['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes']], dtype=object)

## Standard Scaler
- X_scaled = (X - X_mean) / X_std
- Performs z-score normalization
- Unit Variance
- Zero Mean

In [81]:
from sklearn.preprocessing import StandardScaler

# Standardize the numeric columns (Age, Salary); column 0 (Country) is left untouched.
sc = StandardScaler()
sc.fit(X[:, 1:])
X[:, 1:] = sc.transform(X[:, 1:])
X

array([['France', 0.758874361590019, 0.7494732544921677],
       ['Spain', -1.7115038793306814, -1.4381784072687531],
       ['Germany', -1.2755547779917342, -0.8912654918285229],
       ['Spain', -0.1130238410878753, -0.253200423814921],
       ['Germany', 0.17760889313808945, 6.632191985654332e-16],
       ['France', -0.5489729424268225, -0.5266568815350361],
       ['Spain', 0.0, -1.0735697969752662],
       ['France', 1.3401398300419485, 1.3875383225057696],
       ['Germany', 1.6307725642679132, 1.7521469327992565],
       ['France', -0.2583402082008577, 0.29371249162530916]], dtype=object)

In [82]:
# Variance of the standardized Age column.
X[:,1].var()                          # Unit Variance

1.0

In [83]:
# Variance of the standardized Salary column (1 up to floating-point rounding).
X[:,2].var()

1.0000000000000002

In [84]:
# Mean of the standardized Age column.
X[:,1].mean()                          # Similar to zero

-8.881784197001253e-17

In [85]:
# Mean of the standardized Salary column (zero up to floating-point rounding).
X[:,2].mean()

4.274358644806853e-16

## Min Max Scaler
- X_scaled = (X - X_min) / (X_max - X_min)
- Rescales the data to the range 0 to 1

In [86]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the numeric columns of the untouched copy X2; Country (column 0) is left as is.
mm = MinMaxScaler()
mm.fit(X2[:, 1:])
X2[:, 1:] = mm.transform(X2[:, 1:])
X2

array([['France', 0.7391304347826089, 0.6857142857142855],
       ['Spain', 0.0, 0.0],
       ['Germany', 0.1304347826086958, 0.17142857142857149],
       ['Spain', 0.4782608695652175, 0.37142857142857144],
       ['Germany', 0.5652173913043479, 0.45079365079365075],
       ['France', 0.34782608695652173, 0.2857142857142856],
       ['Spain', 0.5120772946859904, 0.11428571428571432],
       ['France', 0.9130434782608696, 0.8857142857142857],
       ['Germany', 1.0, 1.0],
       ['France', 0.43478260869565233, 0.5428571428571427]], dtype=object)

# ================================================

# Dealing With Categorical Values
- One Hot Encoding
- Ordinal Encoding
- Label Encoding

### Using Pandas

In [87]:
# The mean-imputed frame, before any categorical encoding.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [88]:
# One-hot encode Country: one indicator column per distinct country.
dummy1 = pd.get_dummies(df['Country'])
dummy1

Unnamed: 0,France,Germany,Spain
0,1,0,0
1,0,0,1
2,0,1,0
3,0,0,1
4,0,1,0
5,1,0,0
6,0,0,1
7,1,0,0
8,0,1,0
9,1,0,0


In [89]:
# Prepend the indicator columns to df. Indicator columns that are already
# present in df (because this cell was executed before) are dropped first, so
# re-running the cell does not accumulate duplicate country columns.
df = pd.concat([dummy1, df.drop(columns=dummy1.columns, errors='ignore')], axis=1)
df

Unnamed: 0,France,Germany,Spain,Country,Age,Salary,Purchased
0,1,0,0,France,44.0,72000.0,No
1,0,0,1,Spain,27.0,48000.0,Yes
2,0,1,0,Germany,30.0,54000.0,No
3,0,0,1,Spain,38.0,61000.0,No
4,0,1,0,Germany,40.0,63777.777778,Yes
5,1,0,0,France,35.0,58000.0,Yes
6,0,0,1,Spain,38.777778,52000.0,No
7,1,0,0,France,48.0,79000.0,Yes
8,0,1,0,Germany,50.0,83000.0,No
9,1,0,0,France,37.0,67000.0,Yes


In [90]:
# Ordinal / label encoding of the target: 'Yes' -> 1, 'No' -> 0.
# Series.map turns every value that is missing from the mapping into NaN, so
# the identity entries for 1 and 0 keep the cell safe to re-run on a column
# that has already been encoded.
df['Purchased'] = df['Purchased'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})
df

Unnamed: 0,France,Germany,Spain,Country,Age,Salary,Purchased
0,1,0,0,France,44.0,72000.0,0
1,0,0,1,Spain,27.0,48000.0,1
2,0,1,0,Germany,30.0,54000.0,0
3,0,0,1,Spain,38.0,61000.0,0
4,0,1,0,Germany,40.0,63777.777778,1
5,1,0,0,France,35.0,58000.0,1
6,0,0,1,Spain,38.777778,52000.0,0
7,1,0,0,France,48.0,79000.0,1
8,0,1,0,Germany,50.0,83000.0,0
9,1,0,0,France,37.0,67000.0,1


## Using Scikit Learn

In [50]:
# Features after standard scaling; column 0 still holds the raw Country strings.
X

array([['France', 0.758874361590019, 0.7494732544921677],
       ['Spain', -1.7115038793306814, -1.4381784072687531],
       ['Germany', -1.2755547779917342, -0.8912654918285229],
       ['Spain', -0.1130238410878753, -0.253200423814921],
       ['Germany', 0.17760889313808945, 6.632191985654332e-16],
       ['France', -0.5489729424268225, -0.5266568815350361],
       ['Spain', 0.0, -1.0735697969752662],
       ['France', 1.3401398300419485, 1.3875383225057696],
       ['Germany', 1.6307725642679132, 1.7521469327992565],
       ['France', -0.2583402082008577, 0.29371249162530916]], dtype=object)

## ColumnTransformer

In [92]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode column 0 (Country); every other column is passed through unchanged.
country_encoder = ('encoder', OneHotEncoder(), [0])
ct = ColumnTransformer(transformers=[country_encoder], remainder='passthrough')
X = ct.fit_transform(X)
X

array([[1.0, 0.0, 0.0, 0.758874361590019, 0.7494732544921677],
       [0.0, 0.0, 1.0, -1.7115038793306814, -1.4381784072687531],
       [0.0, 1.0, 0.0, -1.2755547779917342, -0.8912654918285229],
       [0.0, 0.0, 1.0, -0.1130238410878753, -0.253200423814921],
       [0.0, 1.0, 0.0, 0.17760889313808945, 6.632191985654332e-16],
       [1.0, 0.0, 0.0, -0.5489729424268225, -0.5266568815350361],
       [0.0, 0.0, 1.0, 0.0, -1.0735697969752662],
       [1.0, 0.0, 0.0, 1.3401398300419485, 1.3875383225057696],
       [0.0, 1.0, 0.0, 1.6307725642679132, 1.7521469327992565],
       [1.0, 0.0, 0.0, -0.2583402082008577, 0.29371249162530916]],
      dtype=object)

#### Applying multiple transformations

In [55]:
# The original (non-indicator) columns that the combined transformer below operates on.
df[['Country','Age', 'Salary', 'Purchased']]

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [58]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

X4 = df[['Country', 'Age', 'Salary', 'Purchased']].to_numpy()

# Each entry is (name, transformer, positions of the columns in X4 it applies to).
# StandardScaler comes from the import in the scaling section above.
steps = [
    ('encoder1', OneHotEncoder(), [0]),      # Country   -> indicator columns
    ('scaler1', StandardScaler(), [1, 2]),   # Age, Salary -> standardized
    ('Encoder2', OrdinalEncoder(), [3]),     # Purchased -> ordinal codes
]
ct2 = ColumnTransformer(transformers=steps)

ct2.fit_transform(X4)

array([[ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         7.58874362e-01,  7.49473254e-01,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
        -1.71150388e+00, -1.43817841e+00,  1.00000000e+00],
       [ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
        -1.27555478e+00, -8.91265492e-01,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
        -1.13023841e-01, -2.53200424e-01,  0.00000000e+00],
       [ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         1.77608893e-01,  6.63219199e-16,  1.00000000e+00],
       [ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -5.48972942e-01, -5.26656882e-01,  1.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00, -1.07356980e+00,  0.00000000e+00],
       [ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.34013983e+00,  1.38753832e+00,  1.00000000e+00],
       [ 0.00000000e+00,  1.00000000e+00,  0.000

## Label Encoding

In [98]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# LabelEncoder expects a 1-D array of labels. Y was built as a single-column
# 2-D array, so flatten it first; passing the column vector directly makes
# scikit-learn emit a DataConversionWarning.
Y = le.fit_transform(np.ravel(Y))
Y

  return f(*args, **kwargs)


array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [99]:
# Use a fresh encoder for Country: refitting `le` here would overwrite the
# classes it learned for the target, so it could no longer decode Y.
LabelEncoder().fit_transform(df['Country'])

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0])

# ================================================

# Splitting The data into training and testing sets
- The model is trained using train set
- The performance is evaluated on testing data (unseen data)

In [100]:
# Final feature matrix: one-hot Country columns followed by standardized Age and Salary.
X

array([[1.0, 0.0, 0.0, 0.758874361590019, 0.7494732544921677],
       [0.0, 0.0, 1.0, -1.7115038793306814, -1.4381784072687531],
       [0.0, 1.0, 0.0, -1.2755547779917342, -0.8912654918285229],
       [0.0, 0.0, 1.0, -0.1130238410878753, -0.253200423814921],
       [0.0, 1.0, 0.0, 0.17760889313808945, 6.632191985654332e-16],
       [1.0, 0.0, 0.0, -0.5489729424268225, -0.5266568815350361],
       [0.0, 0.0, 1.0, 0.0, -1.0735697969752662],
       [1.0, 0.0, 0.0, 1.3401398300419485, 1.3875383225057696],
       [0.0, 1.0, 0.0, 1.6307725642679132, 1.7521469327992565],
       [1.0, 0.0, 0.0, -0.2583402082008577, 0.29371249162530916]],
      dtype=object)

In [101]:
# Label-encoded target.
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [107]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=5,
)

In [108]:
# Feature rows assigned to the training set.
x_train

array([[0.0, 1.0, 0.0, 0.17760889313808945, 6.632191985654332e-16],
       [1.0, 0.0, 0.0, 1.3401398300419485, 1.3875383225057696],
       [0.0, 0.0, 1.0, -1.7115038793306814, -1.4381784072687531],
       [1.0, 0.0, 0.0, 0.758874361590019, 0.7494732544921677],
       [0.0, 1.0, 0.0, 1.6307725642679132, 1.7521469327992565],
       [0.0, 0.0, 1.0, 0.0, -1.0735697969752662],
       [0.0, 0.0, 1.0, -0.1130238410878753, -0.253200423814921]],
      dtype=object)

In [109]:
# Labels for the training rows.
y_train

array([1, 1, 1, 0, 0, 0, 0])

In [110]:
# Feature rows held out for testing.
x_test

array([[1.0, 0.0, 0.0, -0.2583402082008577, 0.29371249162530916],
       [1.0, 0.0, 0.0, -0.5489729424268225, -0.5266568815350361],
       [0.0, 1.0, 0.0, -1.2755547779917342, -0.8912654918285229]],
      dtype=object)

In [111]:
# Labels for the held-out test rows.
y_test

array([1, 1, 0])