# Data Preprocessing

## Importing the libraries

In [74]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from prettytable import PrettyTable

## Importing the dataset we're going to work with

In [80]:
# Load the raw dataset from a CSV file located relative to the working directory
data = pd.read_csv('Data.csv')
# Preview the first rows of the frame
data.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [81]:
# (number of rows, number of columns) of the loaded frame
data.shape

(10, 4)

In [82]:
# Helper that renders a DataFrame as a plain-text (ASCII) table
def generate_ascii_table(df):
    """Print ``df`` as an ASCII table and return the table object.

    The column labels of ``df`` become the table header, and each row of
    ``df.values`` is appended as one table row. The table is printed as a
    side effect; the PrettyTable instance is also returned to the caller.
    """
    table = PrettyTable()
    header = df.columns.tolist()
    table.field_names = header
    for record in df.values:
        table.add_row(record)
    print(table)
    return table

In [94]:
# The helper already prints the table; the trailing semicolon keeps the
# returned PrettyTable object from being echoed as a bare repr under it.
generate_ascii_table(data);

+---------+------+---------+-----------+
| Country | Age  |  Salary | Purchased |
+---------+------+---------+-----------+
|  France | 44.0 | 72000.0 |     No    |
|  Spain  | 27.0 | 48000.0 |    Yes    |
| Germany | 30.0 | 54000.0 |     No    |
|  Spain  | 38.0 | 61000.0 |     No    |
| Germany | 40.0 |   nan   |    Yes    |
|  France | 35.0 | 58000.0 |    Yes    |
|  Spain  | nan  | 52000.0 |     No    |
|  France | 48.0 | 79000.0 |    Yes    |
| Germany | 50.0 | 83000.0 |     No    |
|  France | 37.0 | 67000.0 |    Yes    |
+---------+------+---------+-----------+


<prettytable.PrettyTable at 0x7faf116b7160>

In [84]:
# Shape of the frame after tabulating it (same value as before the call)
data.shape

(10, 4)

In any dataset we have the *features (independent variables)* and the *dependent variable*. The features are the columns we use to predict the dependent variable. In this case, the dependent variable we want to predict is 'Purchased', because the company wants to predict whether customers are going to purchase certain products based on the information we have in the other columns.

In [85]:
# Features: all the rows and all the columns except the last one
x = data.iloc[:, :-1].values
# Dependent variable: all the rows of the last column only
y = data.iloc[:, -1].values
print(x)
print('\n', y)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]

 ['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


### Missing data treatment
There are many ways to treat missing data; the two most common are: drop the rows with missing data (only recommended when we have a large dataset and a low percentage of missing values), or replace each missing value with the average of its column (mainly recommended when the dataset we're working with is not large enough to discard data).
In this case we choose the second method and replace the missing values with the column average, because our dataset has only a few rows.

In [86]:
# SimpleImputer is the scikit-learn class used to fill in missing data
from sklearn.impute import SimpleImputer
# Every NaN is replaced with the mean of its column
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Only the numerical columns are passed to the imputer: here Age and Salary (columns 1 and 2)
# NOTE(review): the imputer is fitted on all rows, before the train/test split further down,
# so the rows that later form the test set contribute to the imputed means — confirm this is intended.
imputer.fit(x[:,1:3])
x[:,1:3] = imputer.transform(x[:,1:3])
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

In [87]:
# Independent variables (features)
# One-hot encoding is used to encode the categorical column at index 0 (Country)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# remainder='passthrough' keeps the columns that are not encoded; in the printed
# result they appear after the one-hot columns
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
x = np.array(ct.fit_transform(x))
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [88]:
# Dependent variable: encode the 'No'/'Yes' labels as integers (0/1 in the output below)
le = LabelEncoder()
y = le.fit_transform(y)
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting dataset into the training set and the test set
Through this process, we split the dataset into two parts: one part is used to train our model, while the other part is used to test the performance of the trained model.

In [89]:
from sklearn.model_selection import train_test_split
# A commonly recommended split is 80% of the data for the training set and the remaining 20% for the test set.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1)
# random_state seeds the internal random number generator that decides which rows go to the
# training set and which to the test set, so the split is reproducible between runs.

In [90]:
# Feature rows used to train the model: 8 customers taken at random from the dataset
print(x_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [91]:
# Feature rows used to test the model: 2 customers taken at random from the dataset
print(x_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [92]:
# Purchase decisions (encoded labels) of the x_train customers, in the same order
print(y_train)

[0 1 0 0 1 1 0 1]


In [93]:
# Purchase decisions (encoded labels) of the x_test customers, in the same order
print(y_test)

[0 1]


## Feature scaling
This step always comes after splitting the dataset into the training set and the test set: the scaler is fitted on the training set only, so no information from the test set leaks into the training process.

#### Standardization
This method works well in the majority of the cases.


$$x_{\text{stand}} = \frac{x - \text{mean}(x)}{\text{standard deviation}(x)}$$

#### Normalization
This method is recommended when we have a normal distribution of our dataset

$$x_{\text{norm}} = \frac{x - \min(x)}{\max(x) - \min(x)}$$

In [95]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# One thing to consider: it is not necessary to standardize the dummy columns
# produced by one-hot encoding, so only the columns from index 3 onwards are scaled.
# The scaler is fitted on the training set only and then reused, unchanged, on the test set.
x_train[:,3:] = sc.fit_transform(x_train[:,3:]) # the last two columns
x_test[:,3:] = sc.transform(x_test[:,3:])

In [96]:
# Training features after scaling: the last two columns are standardized, the dummy columns are untouched
print(x_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [97]:
# Test features after scaling with the scaler fitted on the training set
print(x_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
