**Importing the Libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

**Importing the Dataset**

In [2]:
# Load the raw dataset. The path is relative, so data.csv must be in the
# notebook's working directory.
df = pd.read_csv('data.csv')

In [3]:
# Show the whole frame (10 rows). Note the missing Salary in row 4 and the
# missing Age in row 6; both are handled in the imputation step further down.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
# Feature matrix: every row, first three columns (Country, Age, Salary).
# df.iloc[:, :-1] ("everything except the last column") selects the same data here.
X = df.iloc[:, :3].to_numpy()
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [5]:
# Target: every row of the last column (Purchased).
# The slice -1: (rather than the scalar -1) keeps the result two-dimensional,
# i.e. a single column; df.iloc[:, 3:] selects the same column here.
y = df.iloc[:, -1:].to_numpy()
y

array([['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes']], dtype=object)

**Taking Care of Missing Data**

In [6]:
from sklearn.impute import SimpleImputer

# Age and Salary (columns 1 and 2 of X) contain NaNs; replace each NaN with the
# mean of its own column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Learn the column means and fill the gaps in one step, writing the result back
# into X in place.
# NOTE(review): the means are computed over all rows, including those that
# train_test_split later assigns to the test set.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [7]:
# Inspect X after imputation: the former NaN entries now hold the column means.
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

**Encoding Categorical Data**

In [8]:
# Encoding the independent variable
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [9]:
# One-hot encode the Country column (index 0) and pass the remaining columns
# (Age, Salary) through untouched.
# sparse_threshold=0 forces a dense ndarray result: with the default (0.3) a
# low-density result comes back as a SciPy sparse matrix, which np.array()
# would wrap in a 0-d object array instead of converting it.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)

In [11]:
# Fit the column transformer and replace X with the encoded matrix.
# Caution: X is overwritten, so this cell is not idempotent. Running it a second
# time would one-hot encode column 0 of the already-encoded array. Re-create X
# from df before re-running.
X = np.array(ct.fit_transform(X))

In [12]:
# Inspect the encoded matrix: three one-hot Country columns
# (France, Germany, Spain) followed by Age and Salary.
X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [22]:
from sklearn.preprocessing import LabelEncoder

# Dependent variable: map the Purchased labels to integers (No -> 0, Yes -> 1).
# y is a single-column 2-D array at this point, so flatten it to 1-D before
# handing it to the encoder.
le = LabelEncoder()
y = le.fit_transform(y.reshape(-1))

In [23]:
# Inspect the encoded target (0 = No, 1 = Yes).
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

**Splitting the Dataset into the Training Set and Test Set**

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set and train on the remaining 80%.
# The fixed random_state makes the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Training features: 8 of the 10 rows.
X_train

array([[1.0, 0.0, 0.0, 35.0, 58000.0],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0]], dtype=object)

In [26]:
# Test features: the 2 held-out rows.
X_test

array([[0.0, 1.0, 0.0, 50.0, 83000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0]], dtype=object)

In [27]:
# Training labels, in the same row order as X_train.
y_train

array([1, 0, 1, 0, 1, 1, 0, 0])

In [28]:
# Test labels, in the same row order as X_test.
y_test

array([0, 1])

**Feature Scaling**

In [29]:
# Standardization
# For linear regression, SVM, etc.

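# Normalization (min-max scaling to a fixed range)
# KNN, K-Means, etc. Caution: min-max scaling is itself sensitive to outliers,
# so for data with outliers consider a robust scaler instead.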

In [35]:
from sklearn.preprocessing import StandardScaler

# Standardize only Age and Salary (columns 3 onward); the one-hot Country
# columns in front of them are left as 0/1.
# The scaler learns its mean and standard deviation from the training rows only.
sc = StandardScaler().fit(X_train[:, 3:])

# Written back in place: run this cell once per split, because re-running it
# would refit the scaler on already-standardized values.
X_train[:, 3:] = sc.transform(X_train[:, 3:])

In [36]:
# Inspect the standardized Age and Salary columns of the training set.
X_train[:, 3:]

array([[-0.7529426005471072, -0.6260377781240918],
       [1.008453807952985, 1.0130429500553495],
       [1.7912966561752484, 1.8325833141450703],
       [-1.7314961608249362, -1.0943465576039322],
       [-0.3615211764359756, 0.42765697570554906],
       [0.22561095973072184, 0.05040823668012247],
       [-0.16581046438040975, -0.27480619351421154],
       [-0.013591021670525094, -1.3285009473438525]], dtype=object)

In [39]:
# Scale the test set with the scaler already fitted on the training data
# (transform only, no refit), so both sets share the same mean and std.
# Written back in place: running this cell twice would scale the values twice.
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [40]:
# Inspect the standardized Age and Salary columns of the test set.
X_test[:, 3:]

array([[2.1827180802863797, 2.3008920936249107],
       [-2.3186282969916334, -1.7968097268236927]], dtype=object)