In [0]:
import pandas as pd
import numpy as np

In [0]:
# Load the sample dataset from the working directory and display it.
data = pd.read_csv(filepath_or_buffer='pre-process_datasample.csv')
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [0]:
# scikit-learn works on array-likes, so pull plain NumPy arrays out of the DataFrame.
# The feature matrix must be 2-D: (n_samples, n_features).
# Step 1: separate the data into the feature matrix and the label array.
features = data.iloc[:, :3].values      # first three columns, by position
label = data['Purchased'].values        # target column, one entry per row

In [0]:
features.shape # 2-D array: (n_rows, n_columns)

(10, 3)

In [0]:
label.shape # 1-D array (one entry per row), not a scalar

(10,)

In [0]:
# Turn the 1-D label array into a single-column 2-D array.
# NOTE(review): scikit-learn estimators generally expect y as 1-D (n_samples,);
# a column vector may trigger a DataConversionWarning at fit time - confirm this
# reshape is really wanted.
label = np.reshape(label, (-1, 1))
label.shape  # evaluated but not displayed: only the cell's last expression is shown
label

array([['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes']], dtype=object)

In [0]:
# Inspect the feature array before any preprocessing (it still contains nan entries).
features

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [0]:
# Dealing with missing data: mean imputation of the numeric columns.
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed
# in 0.22; `sklearn.impute.SimpleImputer` is its replacement.
# Docs: https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
from sklearn.impute import SimpleImputer

# SimpleImputer always imputes column-wise (what `axis=0` used to select), and the
# missing-value marker is the real np.nan rather than the string 'NaN'.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Apply the imputer to columns 1 and 2 only (column 0 is the country name).
# `features` is an object array, so cast the numeric slice to float explicitly.
features[:, 1:3] = imputer.fit_transform(features[:, 1:3].astype(float))
features

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [0]:
# Dealing with categorical data: replace each country name in column 0
# with an integer code.
from sklearn.preprocessing import LabelEncoder
encode = LabelEncoder()
encode.fit(features[:, 0])
features[:, 0] = encode.transform(features[:, 0])
features
# Problem: the integer codes suggest an ordering/magnitude between countries
# that does not exist (the "mathematical weightage" issue).
# One-hot encoding removes that issue.

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [0]:
# Categories learned by the encoder; each one's position is its integer code.
encode.classes_

array(['France', 'Germany', 'Spain'], dtype=object)

In [0]:
# One-hot encode the country column (index 0) and keep the numeric columns as they are.
# `OneHotEncoder(categorical_features=...)` was deprecated in scikit-learn 0.20 and
# removed in 0.22; column selection is now done by ColumnTransformer.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
hotencode = ColumnTransformer(
    transformers=[('country', OneHotEncoder(), [0])],  # column holding the LabelEncoder output
    remainder='passthrough',  # untouched columns are appended after the dummy columns
    sparse_threshold=1.0,     # keep the result sparse so the later `.toarray()` call still applies
)
features = hotencode.fit_transform(features)
features

<10x5 sparse matrix of type '<class 'numpy.float64'>'
	with 30 stored elements in COOrdinate format>

In [0]:
# Convert the sparse one-hot result into a regular dense NumPy array.
# Guarded so the cell can be re-run safely: once `features` is already a dense
# ndarray it has no `.toarray()` and an unguarded call would raise AttributeError.
if hasattr(features, 'toarray'):
    features = features.toarray()
features

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])

In [0]:
# Inspect the label array: still the original 'Yes'/'No' strings, one per row.
label

array([['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes']], dtype=object)

In [0]:
#scikit-learn classifiers accept string class labels such as 'Yes'/'No' directly,
#so the label array needs no extra encoding here.

In [0]:
# Feature scaling with StandardScaler: applied to every column of `features`.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split further down, so test-row statistics influence the scaling - consider
# fitting on the training rows only.
from sklearn.preprocessing import StandardScaler
scale = StandardScaler().fit(features)
features = scale.transform(features)
features

array([[ 1.22474487e+00, -6.54653671e-01, -6.54653671e-01,
         7.58874362e-01,  7.49473254e-01],
       [-8.16496581e-01, -6.54653671e-01,  1.52752523e+00,
        -1.71150388e+00, -1.43817841e+00],
       [-8.16496581e-01,  1.52752523e+00, -6.54653671e-01,
        -1.27555478e+00, -8.91265492e-01],
       [-8.16496581e-01, -6.54653671e-01,  1.52752523e+00,
        -1.13023841e-01, -2.53200424e-01],
       [-8.16496581e-01,  1.52752523e+00, -6.54653671e-01,
         1.77608893e-01,  6.63219199e-16],
       [ 1.22474487e+00, -6.54653671e-01, -6.54653671e-01,
        -5.48972942e-01, -5.26656882e-01],
       [-8.16496581e-01, -6.54653671e-01,  1.52752523e+00,
         0.00000000e+00, -1.07356980e+00],
       [ 1.22474487e+00, -6.54653671e-01, -6.54653671e-01,
         1.34013983e+00,  1.38753832e+00],
       [-8.16496581e-01,  1.52752523e+00, -6.54653671e-01,
         1.63077256e+00,  1.75214693e+00],
       [ 1.22474487e+00, -6.54653671e-01, -6.54653671e-01,
        -2.58340208e-01

In [0]:
# `features` is now fully numeric and scaled, so it can be used to build a model.
# Split the data into two parts:
#   Training set -> used by the ML algorithm to create the model
#   Testing set  -> used to check the model's prediction accuracy
# Commonly used train/test ratios are 65%-35%, 75%-25% and 80%-20%;
# this use-case goes with 80% - 20%.
from sklearn.model_selection import train_test_split
# Return order: 80% features, 20% features, 80% labels, 20% labels.
X_train, X_test, y_train, y_test = train_test_split(
    features, label,
    random_state=5,  # fixed seed so the same rows are selected on every run
    test_size=0.2,   # hold out 20% of the rows for testing
)

In [0]:
# Feature rows assigned to the training set.
X_train

array([[-8.16496581e-01,  1.52752523e+00, -6.54653671e-01,
        -1.27555478e+00, -8.91265492e-01],
       [-8.16496581e-01,  1.52752523e+00, -6.54653671e-01,
         1.77608893e-01,  6.63219199e-16],
       [ 1.22474487e+00, -6.54653671e-01, -6.54653671e-01,
         1.34013983e+00,  1.38753832e+00],
       [-8.16496581e-01, -6.54653671e-01,  1.52752523e+00,
        -1.71150388e+00, -1.43817841e+00],
       [ 1.22474487e+00, -6.54653671e-01, -6.54653671e-01,
         7.58874362e-01,  7.49473254e-01],
       [-8.16496581e-01,  1.52752523e+00, -6.54653671e-01,
         1.63077256e+00,  1.75214693e+00],
       [-8.16496581e-01, -6.54653671e-01,  1.52752523e+00,
         0.00000000e+00, -1.07356980e+00],
       [-8.16496581e-01, -6.54653671e-01,  1.52752523e+00,
        -1.13023841e-01, -2.53200424e-01]])

In [0]:
# Feature rows held out for testing.
X_test

array([[ 1.22474487, -0.65465367, -0.65465367, -0.25834021,  0.29371249],
       [ 1.22474487, -0.65465367, -0.65465367, -0.54897294, -0.52665688]])

In [0]:
# Labels that go with the training rows.
y_train

array([['No'],
       ['Yes'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['No'],
       ['No'],
       ['No']], dtype=object)

In [0]:
# Labels that go with the held-out test rows.
y_test

array([['Yes'],
       ['Yes']], dtype=object)