In [78]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 

# Data Preprocessing 

In [95]:
# Import data using Pandas
data = pd.read_csv('Data.csv')
data.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [96]:
X = data.iloc[:,:-1].values # Features 
y = data.iloc[:,-1].values # target cat --> encode to 0/1 

In [97]:
X.shape

(10, 3)

In [98]:
# Check the nan data in dataset 
print(data.isna().sum()),  print(data.isnull().sum())

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64
Country      0
Age          1
Salary       1
Purchased    0
dtype: int64


(None, None)

In [99]:
# Handle the missing data 
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
imputer.fit(X[:,1:3])
X[:,1:3]= imputer.transform(X[:,1:3])
X # Imputed X 

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [100]:
# Encode the country name 
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
countries_encoded = encoder.fit_transform(X[:,0].reshape(-1,1))

In [101]:
print(countries_encoded.toarray())

[[1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]


In [102]:
X_new = np.concatenate((countries_encoded.toarray(), X[:,1:3]), axis=1)
X_new

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [103]:
# Encode y to 0/1
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [104]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [110]:
# split dataset to training vs testing 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

In [116]:
X_train , X_test

(array([[1.0, 0.0, 0.0, -0.7529426005471072, -0.6260377781240918],
        [1.0, 0.0, 0.0, 1.008453807952985, 1.0130429500553495],
        [1.0, 0.0, 0.0, 1.7912966561752484, 1.8325833141450703],
        [0.0, 1.0, 0.0, -1.7314961608249362, -1.0943465576039322],
        [1.0, 0.0, 0.0, -0.3615211764359756, 0.42765697570554906],
        [0.0, 1.0, 0.0, 0.22561095973072184, 0.05040823668012247],
        [0.0, 0.0, 1.0, -0.16581046438040975, -0.27480619351421154],
        [0.0, 0.0, 1.0, -0.013591021670525094, -1.3285009473438525]],
       dtype=object),
 array([[0.0, 1.0, 0.0, 50.0, 83000.0],
        [0.0, 0.0, 1.0, 27.0, 48000.0]], dtype=object))

In [119]:
# Feature Scaling 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train[:,3:] = scaler.fit_transform(X_train[:,3:])


In [124]:
# Have Bug in Stadardizer 
X_test[:,3:] = scaler.fit_transform(X_test[:,3:])
X_test


array([[0.0, 1.0, 0.0, 1.0, 1.0],
       [0.0, 0.0, 1.0, -1.0, -1.0]], dtype=object)