In [1]:
# importing required libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Replace the missing numerical data
from sklearn.impute import SimpleImputer
# Encoding the independent variable
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Encoding the dependent variable
from sklearn.preprocessing import LabelEncoder
# train-test-split
from sklearn.model_selection import train_test_split
# Feature Scaling
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the raw data; the relative path is resolved against the current working directory
dataset = pd.read_csv(filepath_or_buffer="Data.csv")
# Preview the first five rows (5 is also the default for head())
dataset.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [3]:
# Separate the features (independent variables) from the target (dependent variable)

# Features: every row, all columns except the last one.
# Target:   every row, the last column only.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

# Show both arrays; sep='\n' reproduces the line break between two separate print calls
print(
    f'The independant data is: \n {X}',
    f'\n The dependant data is: \n {y}',
    sep='\n',
)

The independant data is: 
 [['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]

 The dependant data is: 
 ['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [4]:
# Fill in the missing numerical values

# Column 0 (Country) is categorical, so only columns 1 onward (Age, Salary)
# are handed to the imputer. Fitting learns the mean of each of those columns.
# NOTE(review): the means are learned from every row, including rows that end up
# in the test set after the later train/test split - consider fitting the imputer
# on the training rows only if leakage matters for this analysis.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X[:, 1:])

# Overwrite the numerical columns in place with their NaNs replaced by the column mean
X[:, 1:] = imputer.transform(X[:, 1:])
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [5]:
# Encode the categorical feature

# One-hot encode column 0 (Country); remainder='passthrough' keeps the other
# columns unchanged and places them after the new dummy columns.
# NOTE: this cell overwrites X, so re-running it without re-running the earlier
# cells would encode the first dummy column again.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [6]:
# Encoding the dependent variable
# The target holds the strings 'No' / 'Yes'; LabelEncoder maps each distinct
# label to an integer code (here 'No' -> 0 and 'Yes' -> 1, as the output shows).

le = LabelEncoder()

# Learn the set of labels and replace y with the integer codes in one step.
# `le` is kept so the codes can be mapped back to labels later if needed.
y = le.fit_transform(y)

y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [7]:
# Split the data into a training set and a test set

# NOTE: feature scaling is deliberately done after this split so that no
# information from the test set leaks into the scaler.

# 20% of the rows are held out for testing; random_state pins the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


In [8]:
# Print each of the four splits, one after another, in the same "<name> is: <array>" format
for split_name, split_values in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(f'{split_name} is: {split_values} \n')

X_train is: [[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]] 

X_test is: [[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]] 

y_train is: [0 1 0 0 1 1 0 1] 

y_test is: [0 1] 



In [9]:
# Feature scaling (done after splitting the train and test data to avoid information leakage)

sc = StandardScaler()

# NOTE: do not scale the one-hot dummy columns. Only the last two columns
# (the numerical Age and Salary features) are standardised.

# Learn the mean and standard deviation from the training set only, and scale it.
X_train[:, -2:] = sc.fit_transform(X_train[:, -2:])

# Scale the test set with the statistics learned from the training set.
# Calling fit_transform here would re-fit the scaler on the test data, putting
# train and test on different scales and defeating the leakage precaution above.
X_test[:, -2:] = sc.transform(X_test[:, -2:])

X_train

array([[0.0, 0.0, 1.0, -0.19159184384578545, -1.0781259408412425],
       [0.0, 1.0, 0.0, -0.014117293757057777, -0.07013167641635372],
       [1.0, 0.0, 0.0, 0.566708506533324, 0.633562432710455],
       [0.0, 0.0, 1.0, -0.30453019390224867, -0.30786617274297867],
       [0.0, 0.0, 1.0, -1.9018011447007988, -1.420463615551582],
       [1.0, 0.0, 0.0, 1.1475343068237058, 1.232653363453549],
       [0.0, 1.0, 0.0, 1.4379472069688968, 1.5749910381638885],
       [1.0, 0.0, 0.0, -0.7401495441200351, -0.5646194287757332]],
      dtype=object)