In [48]:
'''
1. Data Pre-Processing
    1.1 Import the Libraries
    1.2 Import the Dataset
    1.3 Take care of missing values
    1.4 Encode Categorical data - Independent and Dependent Variables
    1.5 Splitting the dataset into the Training and Test set
    1.6 Feature Scaling
'''

'\n1. Data Pre-Processing\n    1.1 Import the Libraries\n    1.2 Import the Dataset\n    1.3 Take care of missing values\n    1.4 Encode Categorical data - Independent and Dependent Variables\n    1.5 Splitting the dataset into the Training and Test set\n    1.6 Feature Scaling \n'

In [40]:
# STEP 1: to import the required libraries

import numpy as np
# to work with arrays, np is a shortcut, usefull to call later
import matplotlib.pyplot  as plt
# to draw charts and graphs, pyplot specefically
import pandas as pd
# to import the dataset and use matrix of features

In [41]:
# STEP2 :is to import the required dataset

dataset = pd.read_csv('Data.csv')     # this will create the data frame,
#you know, all the values inside this dataset. And this data frame will
#be exactly this dataset variable.

# A dataset will always have features and dependent varibales(which we are
#training and later on used to train the model to predict, always the last
#column)

X =  dataset.iloc[:, :-1].values
# for matrix of features ,iloc stands for locate indexes, : stands for everyrow,
#:-1 stands for every column upto last but excluded as the upper bound gets
#excluded

Y = dataset.iloc[:, -1].values # for the dependent variable vector,
#-1 stands for the lsat column

print(X)
print(Y)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [42]:
# STEP3 :is to take care of missing data, by replacing it with the average

df = pd.read_csv('Data.csv')
missing_data = df.isnull().sum() # Identify missing data (assumes that missing
#data is represented as NaN)
print("Missing data: \n", missing_data) # Print the number of missing entries
#in each column


from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# an object created to replace all the missing values with the mean

imputer.fit(X[:,1:3]) # fitting that object in our dataset, connecting
#our imputer to our matrix of features.

X[:,1:3] = imputer.transform(X[:,1:3]) # this transform method will exactly do
#that replacement of the missing salary and age here by the mean of the salaries

print(X)

Missing data: 
 Country      0
Age          1
Salary       1
Purchased    0
dtype: int64
[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [43]:
# STEP4 :is to encode categorical data

# to make understand the model that there is no relation between the
#categorical data, i.e. France, Spain and germany

# for that we do "one hot encoding", i.e splitting the dataset into


In [44]:
# Step 4.1 Encoding the independent variable (France, Spain, Germany)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(), [0] )],
                       remainder='passthrough')

X = np.array(ct.fit_transform(X))

print(X)

#France encoded as 1.0.0, Spain as 0.0.1, Gerany as 0.1.0 vector below


[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [46]:
# Step 4.2 Encoding the dependent variable  (Yes,No)

#use another class called "Label Encoder" which will exactly encode these
#no's and yes's into zeros and ones.

from sklearn.preprocessing import LabelEncoder
le =LabelEncoder()
Y = le.fit_transform(Y)

print(Y)

[0 1 0 0 1 1 0 1 0 1]


In [47]:
# Step 5 Splitting the dataset into the Training and Test set

from sklearn.model_selection import train_test_split
X_train, X_test, Y_test, Y_train = train_test_split(X,Y,test_size=0.2,
                                                    random_state=1)

print(X_train)
print(Y_train)
print(X_test)
print(Y_test)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]
[0 1]
[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]
[0 1 0 0 1 1 0 1]


In [49]:
# Step6: Feature Scaling

'''
apply feature scaling after the split, to avoid indeed information leakage
because simply the test set is supposed to be something new

why use feaure scaling? because for some of the machinery models
that's in order to avoid some features to be dominated by other features
in such a way that the dominated features are not even considered
by the machinery model.

Two types of Feature scaling are : Normalization and Standardization

Normalization is   x' = (x - x_min) / (x_max - x_min), which will
end up in a new column where the values range from 0 to 1

Normalization is recommended when you have
a normal distribution in most of your features.


Standardization is x' = (x- mean) / standard_deviation, which will
end up in a new column where the values range from -3 to 3,
if outliers present, can go beyond

Standardization actually works well ***ALL THE TIME***.

'''

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])
'''
Most frequently asked Q : do we have to apply feature scaling
standardization to the dummy variables in the matrix of features?

The answer is no, because simply, well remember the goal  of standardization or
feature scaling in general, it is to have all the  values of the features
in the same range.
'''

print(X_test)
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
