In [1]:
# Data Preprocessing
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Importing DataSet
dataset = pd.read_csv('MachineLearningAZ/Part1_DataPreprocessing/Section2_Part1_DataPreProcessing/Python/Data.csv')

In [3]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [17]:
# Use iloc to separate features and labels
X = dataset.iloc[:, :-1].values  # iloc is used to address DataFrame using rows and columns

In [18]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [19]:
y = dataset.iloc[:, -1].values
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [20]:
# Handling missing data
# 1. Either remove the complete row, which could be very dangerous as you might remove very important training data in the other columns
# 2. Replace missing data by the mean of the data for that column We'll do step 2

In [21]:
import sklearn
sklearn.__version__

'0.22.2.post1'

In [22]:
from sklearn.impute import SimpleImputer

In [23]:
# The SimpleImputer class provides basic strategies for imputing missing values, like
# using the mean of the column or row values
# using the median of the column or row values
# using the mode of the column or row in which the missing values are located.
# This class also allows for different missing values encodings.
imputer = SimpleImputer(missing_values=np.NaN, strategy='mean')
# Apply the imputer using the fit function.
# Here the fit method, when applied to the training dataset,learns the model parameters (for example, mean and standard deviation).
# We then need to apply the transform method on the training dataset to get the transformed (scaled) training dataset.
# We could also perform both of this steps in one step by applying fit_transform on the training dataset.
# Then why do we need 2 separate methods - fit and transform ?
# In practice we need to have a separate training and testing dataset and that is where having a separate fit and transform method helps.
# We apply fit on the training dataset and use the transform method on both - the training dataset and the test dataset.
# Thus the training as well as the test dataset are then transformed(scaled) using the model parameters that were learnt on applying
# the fit method the training dataset.
imputer = imputer.fit(X[:, 1:3])

In [24]:
# Finally impute using transform, transform also takes a parameter of what to impute. impute(mean, median, mode) values 
# are already computed at the fit() step
# X[:, 1:3] = imputer.transform(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [141]:
# Next, we need to encode categorical data to quantitative data ie columns like Country and Purchased.
# Machine Learning algo's are based on equations so all categorical data needs to be converted to quantitative data

In [142]:
# LabelEncoder is very trivial. Just a simple class to give numeric values from 0 to n-1 (where there are n different
# label types).
from sklearn.preprocessing import LabelEncoder
le_X = LabelEncoder()

In [143]:
# Fit the labels, ie let the LabelEncoder know about the labels
le_X.fit(X[:, 0])

# View the labels
print(le_X.classes_)

['France' 'Germany' 'Spain']


In [144]:
# Now transform, xform will return an array of the numeric values from 0 to n-1 assigned to each label.
le_X.transform(X[:, 0])

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0])

In [145]:
# Inverse_transform
le_X.inverse_transform([0, 2, 1, 2, 1, 0, 2, 0, 1, 0])

array(['France', 'Spain', 'Germany', 'Spain', 'Germany', 'France', 'Spain',
       'France', 'Germany', 'France'], dtype=object)

In [146]:
# You can also fit and transform in the same step. Infact a lot of classes have the fit(), tranform() and fit_transform()
# api's available. You can use fit_transform(X, y=None), in cases where the data to fit and the data to xform are the
# same; or both are known before hand.
le_X = LabelEncoder()
X[:, 0] = le_X.fit_transform(X[:, 0])

In [147]:
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [148]:
# Big Problem Above, The countries values are encoded as 0, 1, 2. This will make ML algo's think that these values 
# are comparable, ie 0(France) is less than 2(Spain) and 1(Germany) is greater than 0(France). If these were some
# other categorical data like size of shirts (Small, Medium, Large), it would have made sense, but here it doesnt
# make sense. To solve this, after we convert categorical data to quantitative, we have to separate the Countries
# into 3 columns (ie into as many columns as many country
# names). The value in these three column will be binary 0 or 1. So wherever the original country is France, the
# France column will have value 1 or other columns will have value 0. We use another Encoder for that next called
# OneHotEncoder

In [149]:
# OneHotEncoder is also very trivial. You give it a list of columns which contain categorical data. or 'all' if all
# columns in the array are categorical. For each categorical column it will create as many new columns as the no. of
# different categorical values in that column. For example, if col 0 has values [1, 2, 3, 1, 2, 3], then it will
# create 3 new columns for 1, 2 and 3. In each row, the value will be 0 or 1 to represent if it is 1, 2, or 3.
from sklearn.preprocessing import OneHotEncoder

In [150]:
ohe = OneHotEncoder(categorical_features=[0])

In [151]:
type(X)

numpy.ndarray

In [152]:
X = ohe.fit_transform(X)

In [153]:
# attributes of ohe
print(ohe.n_values_)  # array of categorical value count for each categorical column specified in the c'tor
print(ohe.feature_indices_) # an array of ints specifying the bin (start, end) for each categorical column

[3]
[0 3]


In [154]:
type(X)

scipy.sparse.coo.coo_matrix

In [155]:
X = X.toarray()

In [156]:
type(X)

numpy.ndarray

In [157]:
X

array([[  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.40000000e+01,   7.20000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          2.70000000e+01,   4.80000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          3.00000000e+01,   5.40000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.80000000e+01,   6.10000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          4.00000000e+01,   6.37777778e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          3.50000000e+01,   5.80000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.87777778e+01,   5.20000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.80000000e+01,   7.90000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          5.00000000e+01,   8.30000000e+04],
       [  1.00000000e+00,   0.0000000

In [158]:
# Lets encode the label to also quantitative data. We dont need OneHotEncoder as ML algo already know that Labels
# are unrelated so it never compares them.
le_y = LabelEncoder()
y = le_y.fit_transform(y)

In [159]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [160]:
# Now lets split the data set into training and test set
from sklearn.cross_validation import train_test_split

In [162]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [164]:
print('X_train \n%s' % X_train)
print('X_test \n%s' % X_test)
print('y_train \n%s' % y_train)
print('y_test \n%s' % y_test)

X_train 
[[  0.00000000e+00   1.00000000e+00   0.00000000e+00   4.00000000e+01
    6.37777778e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.70000000e+01
    6.70000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   2.70000000e+01
    4.80000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.87777778e+01
    5.20000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   4.80000000e+01
    7.90000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.80000000e+01
    6.10000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   4.40000000e+01
    7.20000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.50000000e+01
    5.80000000e+04]]
X_test 
[[  0.00000000e+00   1.00000000e+00   0.00000000e+00   3.00000000e+01
    5.40000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   5.00000000e+01
    8.30000000e+04]]
y_train 
[1 1 1 0 1 0 0 1]
y_test 
[0 0]


In [172]:
# Feature scaling - VERY VERY IMPORTANT LESSON. 
# In ML, its very important that all features be on the same scale (range). Here we have age and salaries on different
# scales
print('Age range %s to %s' % (dataset['Age'].min(), dataset['Age'].max()))
print('Salary range %s to %s' % (dataset['Salary'].min(), dataset['Salary'].max()))

# A number of ML algos are dependent on the euclidian distance (ED) between two points. ED is the square root of
# summation of squares of the distance between the x points and the y points. So if we draw a plot with age on the
# x-axis and salary on the y-axis, the range of square of salaries will be even higher then the range of square of ages
# so it will be almost like the age's are non-existant compared to salaries. So its very important that they be in the
# same scale. By scaling we bring all features in the range like -1 to 1 or 0 to 1 (like that).

# Two methods of scaling exist:
# 1. Standardization - i.e. each value is changed by it's z-score
# 2. Normalization - i.e each value is reduced to (x-x.min()/(x.max()-x.min()))
# Lets attempt to do feature scaling the age and salary now.


Age range 27.0 to 50.0
Salary range 48000.0 to 83000.0


In [173]:
# again for feature scaling, we use StandardScalar from sklearn preprocessing module
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()

In [174]:
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)  # no need to fit here as we already fitted based on training data above.

# A good question is whether or not we should feature scale the dummy variables (0,1) from the OneHotEncoder(). The
# answer is that it always depends on context. It depends on how much interpretation you want in your model. in our case
# if we scale the dummy variables, then we loose the meaning of which country does a value like 0.5 represent. 
# But anyway, scaling will never break your model, so we'll do it here anyway. 

In [176]:
print('X_train\n %s' % X_train)
print('X_test\n%s' % X_test)

X_train
 [[-1.          2.64575131 -0.77459667  0.26306757  0.12381479]
 [ 1.         -0.37796447 -0.77459667 -0.25350148  0.46175632]
 [-1.         -0.37796447  1.29099445 -1.97539832 -1.53093341]
 [-1.         -0.37796447  1.29099445  0.05261351 -1.11141978]
 [ 1.         -0.37796447 -0.77459667  1.64058505  1.7202972 ]
 [-1.         -0.37796447  1.29099445 -0.0813118  -0.16751412]
 [ 1.         -0.37796447 -0.77459667  0.95182631  0.98614835]
 [ 1.         -0.37796447 -0.77459667 -0.59788085 -0.48214934]]
X_test
[[-1.          2.64575131 -0.77459667 -1.45882927 -0.90166297]
 [-1.          2.64575131 -0.77459667  1.98496442  2.13981082]]


In [177]:
# Even if sometimes your ML does not depend on ED, even then we need feature scaling as they will converge much fatser
# like for DecisionTree ML algos; otherwise they will take very long to run.

# And we dont need scaling on label (or dependent variable), as its a categorical type will value as 0 or 1
# in some cases we might need it (like regression models), but we'll see that later.