In [None]:
# Data Preprocessing
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Load the raw course data set from CSV into a pandas DataFrame.
dataset = pd.read_csv(
    'MachineLearningAZ/Part1_DataPreprocessing/Section2_Part1_DataPreProcessing/Python/Data.csv'
)

In [None]:
# Show the loaded DataFrame (the notebook renders the cell's last expression).
dataset

In [None]:
# Separate the features from the label with iloc, which addresses a DataFrame
# by integer row/column position: every row, every column except the last.
# .values returns the selection as a NumPy array.
X = dataset.iloc[:, :-1].values

In [None]:
# Inspect the feature matrix.
X

In [None]:
# The label (dependent variable) is the last column of the DataFrame.
y = dataset.iloc[:, -1].values
# Inspect the label vector.
y

In [None]:
# Handling missing data
# 1. Either remove the complete row, which can be dangerous because you might discard important training data held in the other columns
# 2. Or replace the missing data with the mean of the data for that column. We'll use option 2.

In [None]:
# Record which scikit-learn version this notebook runs against.
import sklearn
sklearn.__version__

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# SimpleImputer provides basic strategies for filling in missing values.
# It always works column by column, replacing each missing entry with
#   - the mean of its column      (strategy='mean')
#   - the median of its column    (strategy='median')
#   - the mode of its column      (strategy='most_frequent')
#   - a constant value            (strategy='constant')
# The missing_values argument selects which placeholder counts as "missing".
# Use np.nan here: the np.NaN alias was removed in NumPy 2.0 and raises AttributeError.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit() only learns the statistic (here, the mean of each column) from the data it is given.
# transform() then uses the learnt values to fill in the gaps.
# fit_transform() performs both steps in a single call.
# Why keep fit and transform separate? In practice there is a separate training and test set:
# we call fit on the training set only and then call transform on both the training and the
# test set, so both are imputed with parameters learnt from the training data alone.
# Only columns 1 and 2 are imputed (presumably Age and Salary -- confirm against Data.csv).
imputer = imputer.fit(X[:, 1:3])

In [None]:
# Finally impute using transform(), which takes the data to impute as its argument.
# The fill values (mean / median / mode) were already computed in the fit() step.
X[:, 1:3] = imputer.transform(X[:, 1:3])
# Inspect the result: columns 1 and 2 should no longer contain missing values.
X

In [None]:
# Next, we need to encode categorical data as quantitative data, i.e. columns like Country and Purchased.
# Machine learning algorithms are based on equations, so all categorical data needs to be converted to quantitative data.

In [None]:
# LabelEncoder is a very simple class: it assigns each distinct label an integer
# from 0 to n-1 (where n is the number of different labels).
# NOTE(review): the scikit-learn docs describe LabelEncoder as intended for the target y
# rather than for feature columns; it is used on a feature here for demonstration.
from sklearn.preprocessing import LabelEncoder
le_X = LabelEncoder()

In [None]:
# Let the LabelEncoder learn the labels present in column 0 and view them.
# fit() returns the encoder itself, so classes_ can be read straight off the result.
print(le_X.fit(X[:, 0]).classes_)

In [None]:
# Now transform: this returns an array holding the integer code (0 to n-1) assigned to each label.
# X itself is not modified here; the result is only displayed.
le_X.transform(X[:, 0])

In [None]:
# inverse_transform maps integer codes back to the original labels.
le_X.inverse_transform([0, 2, 1, 2, 1, 0, 2, 0, 1, 0])

In [None]:
# Many scikit-learn classes expose fit(), transform() and fit_transform().
# fit_transform(X, y=None) is a shortcut for the case where the data to fit on and the
# data to transform are the same (or both are known beforehand).
# The two steps are spelled out here: learn the labels with a fresh encoder, then
# overwrite column 0 with the resulting integer codes.
le_X = LabelEncoder().fit(X[:, 0])
X[:, 0] = le_X.transform(X[:, 0])

In [None]:
# Inspect X: column 0 now holds integer codes instead of the original labels.
X

In [None]:
# Big problem above: the country values are now encoded as 0, 1, 2. This makes ML algorithms think that these values
# are numerically comparable, i.e. something like 0 (France) is less than 2 (Spain) and 1 (Germany) is greater than 0 (France).
# If this were some other categorical data, like shirt sizes (Small, Medium, Large), an ordering would make sense, but
# here it doesn't. To solve this, after converting the categorical data to quantitative data, we separate the
# countries into 3 columns (i.e. as many columns as there are categories of this feature). The value in each of these
# three columns is binary, 0 or 1: wherever the original country is France, the France column has the value 1 and
# the other columns have the value 0. For that we use another encoder, called OneHotEncoder.

In [None]:
# In fact we don't use OneHotEncoder directly; we use the ColumnTransformer class, a higher-level class that can
# transform multiple columns, each with the transformer you give it.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# One-hot encode column 0 and pass every other column through untouched.
# sparse_threshold=0 forces a dense NumPy array as output. With the default (0.3) the
# result silently becomes a SciPy sparse matrix once the overall density drops below the
# threshold (e.g. with many more categories), while the later cells slice, print and
# assign into X as a dense array. For this data set the result is unchanged.
column_transformer = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = column_transformer.fit_transform(X)

In [None]:
# Print the transformed feature matrix as plain text.
print(X)

In [None]:
# Encode the label as quantitative data too. A plain LabelEncoder is enough here;
# no OneHotEncoder is applied to the dependent variable.
from sklearn.preprocessing import LabelEncoder
# Learn the label classes first, then replace y with their integer codes.
le_y = LabelEncoder().fit(y)
y = le_y.transform(y)

In [None]:
# Print the integer-encoded labels.
print(y)

In [None]:
# Now lets split the data set into training and test set
from sklearn.model_selection import train_test_split

In [None]:
# Keep 80% of the rows for training and hold out the rest for testing.
# random_state pins the shuffle so that re-running the notebook reproduces exactly the
# same train/test rows; without it every run yields a different split.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [None]:
# Inspect the held-out test features.
X_test

In [None]:
# Feature scaling - VERY VERY IMPORTANT LESSON.
# In ML, it's very important that all features are on the same scale (range).
# That way the ML algorithm gives comparable weight to all features.
# Here we have ages and salaries on very different scales:
print('Age range %s to %s' % (dataset['Age'].min(), dataset['Age'].max()))
print('Salary range %s to %s' % (dataset['Salary'].min(), dataset['Salary'].max()))

# A number of ML algorithms depend on the Euclidean distance (ED) between two points. ED is the square root of
# the sum of the squared differences along each axis. So if we draw a plot with age on the x-axis and salary on the
# y-axis, the squared salary differences are so much larger than the squared age differences that it is almost as
# if the ages did not exist compared to the salaries. So it's very important that they are on the same scale.
# By scaling we bring all features into a comparable range, such as roughly -1 to 1 or 0 to 1.

# Two methods of scaling exist:
# 1. Standardization - i.e. each value is replaced by its z-score: (x - x.mean()) / x.std()
# 2. Normalization - i.e. each value is rescaled to (x - x.min()) / (x.max() - x.min())
# Let's attempt to feature scale the age and salary now.


In [None]:
# Again, for feature scaling we use a class from the sklearn preprocessing module: StandardScaler.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()

In [None]:
# Inspect the test features once more before they are scaled.
X_test

In [None]:
# Learn the scaling parameters from the training features only, then apply that same
# scaling to both sets. The test set is never fitted on: it reuses what was learnt
# from the training data.
# Only columns 3 onwards are scaled; the leading columns are the 0/1 dummy variables.
sc_X.fit(X_train[:, 3:])
X_train[:, 3:] = sc_X.transform(X_train[:, 3:])
X_test[:, 3:] = sc_X.transform(X_test[:, 3:])

# A good question is whether we should also feature scale the dummy variables (0, 1) produced by OneHotEncoder().
# The answer is that it depends on context, and on how much interpretability you want to keep in your model. In our
# case, if we scaled the dummy variables, we would lose the meaning of which country a value like 0.5 represents.

In [None]:
# Print the scaled training and test features.
print('X_train\n %s' % X_train)
print('X_test\n%s' % X_test)
# Even when an ML algorithm does not depend on ED, feature scaling can still help: algorithms trained with
# gradient descent typically converge much faster on scaled features and may otherwise take very long to run.
# NOTE(review): tree-based models such as decision trees are generally insensitive to feature scale.

# We don't need scaling on the labels (the dependent variable), as it is categorical with values 0 or 1.
# In some cases we might need it (like regression models), but we'll see that later.