# Data Preprocessing Tools

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Load the raw table from disk.
dataset = pd.read_csv('Data.csv')

# Features (independent variables): every row, every column except the last.
# to_numpy() is used instead of .values because it always returns a NumPy
# array, which the later slice assignments on X rely on.
X = dataset.iloc[:, :-1].to_numpy()

# Target (dependent variable we want to predict): every row, last column only.
y = dataset.iloc[:, -1].to_numpy()

In [3]:
# Feature matrix as loaded; missing values show up as nan.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [4]:
# Target vector as loaded: 'Yes' / 'No' strings.
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [5]:
from sklearn.impute import SimpleImputer

# Replace every missing numeric entry (np.nan) with the mean of its column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# X is a NumPy array, so X[:, 1:3] selects all rows of columns 1 and 2 (the end
# index 3 is excluded); column 0 holds the country names and is left untouched.
# fit_transform() computes the column means (fit) and returns the slice with
# the gaps filled in (transform); the result is written back into X.
# NOTE(review): the means are computed over all rows, including the rows that
# are later held out as the test set.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [6]:
# Feature matrix after imputation: the former nan entries now hold column means.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

### Encoding the Independent Variable

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# A model cannot consume strings, so the country column must become numeric.
# Plain integer codes (0, 1, 2) would suggest an ordering between countries
# that does not exist, so the column is one-hot encoded instead: each of the
# three countries gets its own 0/1 indicator column and exactly one of them is
# 1 per row (France -> 1 0 0, Germany -> 0 1 0, Spain -> 0 0 1).
ct = ColumnTransformer(
    transformers=[
        # (step name, transformer, column indices): one-hot encode column 0.
        ('encoder', OneHotEncoder(), [0]),
    ],
    # Keep all remaining columns unchanged instead of dropping them.
    remainder='passthrough',
)

# ColumnTransformer fits and transforms in a single call; wrapping the result
# in np.array keeps X a NumPy array for the following cells.
X = np.array(ct.fit_transform(X))

In [8]:
# Feature matrix after one-hot encoding: three indicator columns first,
# followed by the two numeric columns.
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [9]:
from sklearn.preprocessing import LabelEncoder

# The dependent variable y is also text ('Yes' / 'No'), so it has to be encoded
# numerically before it can be fed to a model. With only two categories a
# single column of 0s and 1s is enough, which is what LabelEncoder produces;
# its fit_transform() learns the classes and returns the integer codes in one
# call.
le = LabelEncoder()
y = le.fit_transform(y)

In [10]:
# Encoded target: 'No' became 0 and 'Yes' became 1.
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

#### We have to split the dataset into a training set and a test set *before* feature scaling. The reason is that feature scaling computes the mean and the standard deviation of each feature from the data it is fitted on. The test set must stay unseen until evaluation, so fitting the scaler on the full dataset before splitting would leak test-set information into the training set, and we should avoid that.

In [11]:
from sklearn.model_selection import train_test_split

# test_size=0.2 holds out 20% of the rows for testing and keeps the other 80%
# for training. random_state=1 fixes the seed of the shuffle so the same split
# is produced on every run (useful for teaching and for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [12]:
# Training features (8 of the 10 rows).
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [13]:
# Test features (the remaining 2 rows).
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [14]:
# Training labels.
print(y_train)

[0 1 0 0 1 1 0 1]


In [15]:
# Test labels.
print(y_test)

[0 1]


## Feature Scaling

In [17]:
from sklearn.preprocessing import StandardScaler

# Feature scaling is done with standardization (StandardScaler): each scaled
# feature is centred on its mean and divided by its standard deviation.
# Standardization is a reasonable general-purpose default. Min-max
# normalization, the usual alternative, rescales each feature to a fixed range
# and does not require the values to be normally distributed.
sc = StandardScaler()
# fit_transform() computes the mean and standard deviation of each selected
# training column (fit) and then applies them (transform). The first three
# columns are the one-hot country indicators, which are already 0/1 and need
# no scaling, so only the columns from index 3 onward are standardized.
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])

# The test set is only transformed, never fitted: it must be scaled with the
# mean and standard deviation learned from the training set features
# (this is very important).
X_test[:, 3:] = sc.transform(X_test[:, 3:])


SyntaxError: invalid syntax (<ipython-input-17-1ff7646f9e93>, line 6)

In [17]:
# Training features after standardization: only the last two columns changed,
# the one-hot columns are untouched.
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [18]:
# Test features scaled with the statistics learned from the training set.
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
