In [38]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer # applies the one-hot encoder to selected columns
from sklearn.impute import SimpleImputer # needed for missing numbers (replacement for Imputer)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # needed for categorical variables
from sklearn.preprocessing import StandardScaler

try:
    from sklearn.preprocessing import Imputer # needed for missing numbers
except ImportError:
    # Imputer was removed in scikit-learn 0.22; SimpleImputer above supersedes it.
    pass

### Get the data

In [6]:
# Load the raw preprocessing dataset into a DataFrame and display it.
data_path = "../datasets/Data_Preprocessing/Data.csv"
data = pd.read_csv(data_path)
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### Extracting features

In [30]:
# Feature matrix: the three predictor columns as a NumPy array.
# The result is object-dtype because Country holds strings.
feature_cols = ["Country", "Age", "Salary"]
X = data[feature_cols].values
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

### Extracting responses

In [56]:
# Response vector: the Purchased column as a 1-d NumPy array of 'Yes'/'No' strings.
target_col = 'Purchased'
y = data[target_col].values
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

### Fix missing values

In [31]:
# Fill missing Age/Salary values (columns 1-2 of X) with the column mean.
# SimpleImputer supersedes preprocessing.Imputer, which was removed in
# scikit-learn 0.22. It always imputes column-wise, so there is no axis
# argument, and the missing-value marker is np.nan rather than the string "NaN".
imputer_x = SimpleImputer(missing_values=np.nan, strategy="mean")
# The slice is object-dtype because column 0 of X holds strings; cast it so the
# imputer receives plain floats.
X[:, 1:3] = imputer_x.fit_transform(X[:, 1:3].astype(float))
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

### Handling categorical variables

In [33]:
# Guard against re-running this cell: a second pass would one-hot encode the
# first dummy column again and silently add columns to X.
assert X.shape[1] == 3, "X is already encoded; re-run the feature extraction cell first"
# Encode the country names as integer codes
le_x = LabelEncoder()
X[:, 0] = le_x.fit_transform(X[:, 0])
# One hot encoding of column 0; the remaining columns (Age, Salary) are passed
# through unchanged and placed after the dummy columns.
# OneHotEncoder(categorical_features=...) was removed in scikit-learn 0.22;
# ColumnTransformer is the replacement for encoding selected columns.
# sparse_threshold=0 forces a dense array, which the old code got via .toarray().
ohe_x = ColumnTransformer(
    [("country", OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,
)
# The passthrough columns come from an object-dtype array, so cast the stacked
# result to float to match the float matrix the old encoder returned.
X = ohe_x.fit_transform(X).astype(float)

### Handling y

In [57]:
# Encode y ('No'/'Yes') as integer class labels; label encoder needs a 1-d array
le_y = LabelEncoder()
y = le_y.fit_transform(y)
# Impute y. SimpleImputer replaces preprocessing.Imputer, which was removed in
# scikit-learn 0.22 (np.nan instead of "NaN", no axis argument).
# NOTE(review): the saved output contains only 0 and 1, so this step appears to
# do no more than turn y into a float column vector.
y = y.reshape(-1, 1) # a col vec is required
imputer_y = SimpleImputer(missing_values=np.nan, strategy="mean")
y = imputer_y.fit_transform(y)
y

array([[0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.]])

### Train Test split

In [40]:
# Hold out 20% of the rows for testing. random_state pins the shuffle so that
# Restart & Run All reproduces the same split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Feature scaling
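Note that the scaler below is applied to every column of X, including the one-hot encoded country columns. Scaling categorical (dummy) variables will result in a loss of interpretability, because they no longer take the values 0 and 1.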

In [41]:
# Learn the per-column mean and standard deviation from the training rows only,
# then apply that same transformation to both splits.
sc_x = StandardScaler().fit(X_train)
X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)

In [59]:
# Standardise the response with statistics learned from the training split only.
# NOTE(review): y holds encoded 'No'/'Yes' class labels; scaling a
# classification target is unusual — confirm this is intended.
sc_y = StandardScaler().fit(y_train)
y_train = sc_y.transform(y_train)
y_test = sc_y.transform(y_test)

In [60]:
# NOTE(review): the output saved with this cell is a 6-column float array, not
# the DataFrame that `data` was bound to when Data.csv was loaded, so it was
# presumably produced by a different expression or by out-of-order execution.
# Re-run the notebook from a fresh kernel before trusting this output.
data

array([[0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        4.40000000e+01, 7.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        2.70000000e+01, 4.80000000e+04],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        3.00000000e+01, 5.40000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        3.80000000e+01, 6.10000000e+04],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        4.00000000e+01, 6.37777778e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        3.50000000e+01, 5.80000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        3.87777778e+01, 5.20000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        4.80000000e+01, 7.90000000e+04],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        5.00000000e+01, 