# Data Preprocessing

In [1]:
# Import Python libraries
import pandas as pd # pandas can read and manage datasets

### Import the Dataset - The data is given in .csv format; make sure the file is in the same directory as this notebook.

#### Different ways to print the dataset
1. print(dataset) # prints raw data, no table.
2. dataset.head() # prints the first 5 rows, or can take a number n as a parameter to print the first n rows.
3. dataset # print with variable which will display in an organized table format.

In [2]:
# Load the dataset
# pandas parses the CSV file into a DataFrame
dataset = pd.read_csv(filepath_or_buffer='Data.csv')
dataset # a bare variable on the last line of a cell renders the whole table

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


Note: the Country (index 0) and Purchased (index 3) columns hold categorical values, i.e. each column contains a small set of repeated labels. Index 0 has three categories: France, Spain and Germany, and index 3 has two categories: Yes and No.

In [3]:
# Build the feature matrix x as a NumPy array.
# In iloc[rows, columns] the first slice (:) keeps every row and the second
# slice (:-1) keeps every column except the last one (the Purchased column).
x = dataset.iloc[:, :-1].values
# Show the selected columns: Country, Age and Salary
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [4]:
# Build the target vector y from the last column (Purchased).
# Index -1 always means "the last column", which mirrors the :-1 slice used
# for x, so the features and the target stay complementary even if columns
# are added to the dataset later.
y = dataset.iloc[:, -1].values
print(y) # prints only the last column (Purchased Column)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Dealing with missing data

In [5]:
# Now let's take care of the missing data (NaN).
# SimpleImputer is scikit-learn's transformer for filling in missing values.
# It replaces sklearn.preprocessing.Imputer, which was deprecated in
# scikit-learn 0.20 and removed in 0.22.
from sklearn.impute import SimpleImputer

# strategy='mean' fills each gap with the mean of its own column.
# SimpleImputer always works column-wise (what axis=0 meant on the old Imputer)
# and treats NaN as the missing-value marker by default.
imputer = SimpleImputer(strategy = 'mean')
# Learn the column means of Age and Salary. The slice 1:3 selects column
# indexes 1 and 2 only, because the upper bound of a slice is exclusive.
imputer = imputer.fit(x[:, 1:3])
# Write the imputed values back into the same two columns of x.
x[:, 1:3] = imputer.transform(x[:, 1:3])
print(x) # print x values

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


As you can see from the output above, the mean age of 38.777...8 (index 1, the second column) replaces the missing NaN value, and the mean salary of $63,777.777...8 (index 2, the third column) replaces the other NaN value.

Note: depending on the dataset, the mean may not always be the right approach; with sklearn we can also fill the gaps with the median or the most frequent value of each column.

## Categorical Data

In [6]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the Country column (index 0), then display the integer
# code assigned to each row. x itself is not modified in this cell.
labelencoder_x = LabelEncoder().fit(x[:, 0])
labelencoder_x.transform(x[:, 0])

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0], dtype=int64)

In [7]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder

labelencoder_x = LabelEncoder()
labelencoder_x.fit(x[:, 0])
# This time the integer codes are written back into column 0 of x,
# replacing each country string with its respective integer value.
x[:, 0] = labelencoder_x.transform(x[:, 0])
print(x)

[[0 44.0 72000.0]
 [2 27.0 48000.0]
 [1 30.0 54000.0]
 [2 38.0 61000.0]
 [1 40.0 63777.77777777778]
 [0 35.0 58000.0]
 [2 38.77777777777778 52000.0]
 [0 48.0 79000.0]
 [1 50.0 83000.0]
 [0 37.0 67000.0]]


France = 0; Germany = 1; Spain = 2 (LabelEncoder assigns the codes in alphabetical order of the labels).
However, since these are integer values, a model may assume that Spain (2) is greater than Germany (1) and that Germany is greater than France (0), even though countries have no natural order. Bugs can come out of this; therefore, we add dummy variables: each category gets its own column, and each row holds a 1 in the column that matches its country from index 0 and a 0 in the other columns.

In [8]:
# Encoding categorical data with OneHotEncoder
# The categorical_features argument of OneHotEncoder was removed in
# scikit-learn 0.22; ColumnTransformer is the supported way to one-hot encode
# a single column while leaving the other columns alone.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# OneHotEncoder creates one dummy column per category of the Country column
# (index 0). It orders the categories itself, so no extra LabelEncoder pass is
# needed here.
onehotencoder = OneHotEncoder()
columntransformer = ColumnTransformer(
    transformers = [('country', onehotencoder, [0])],
    remainder = 'passthrough',  # keep Age and Salary unchanged, after the dummy columns
    sparse_threshold = 0,       # always return a dense array instead of a sparse matrix
)
# x holds mixed Python objects up to this point; the encoded result is purely
# numeric, so convert it to a float array.
x = columntransformer.fit_transform(x).astype(float)
print(x) # call x variable

[[  1.00000000e+00   0.00000000e+00   0.00000000e+00   4.40000000e+01
    7.20000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   2.70000000e+01
    4.80000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   3.00000000e+01
    5.40000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.80000000e+01
    6.10000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   4.00000000e+01
    6.37777778e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.50000000e+01
    5.80000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.87777778e+01
    5.20000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   4.80000000e+01
    7.90000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   5.00000000e+01
    8.30000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.70000000e+01
    6.70000000e+04]]


In [9]:
# Encode the Purchased column (the target y) as integers
labelencoder_y = LabelEncoder().fit(y)
y = labelencoder_y.transform(y)
# Show the encoded target: one integer code per row
print(y)

[0 1 0 0 1 1 0 1 0 1]


# Splitting the Dataset

In [10]:
# Split the dataset into a training set and a test set
from sklearn.model_selection import train_test_split

# Split the feature matrix x and the target y with the same row assignment.
# test_size = 0.2 holds out 20% of the dataset for testing, and
# random_state = 0 fixes the seed used for the split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.2,
    random_state = 0,
)

In [11]:
# Show the training part of the feature matrix produced by the split
print(x_train)

[[  0.00000000e+00   1.00000000e+00   0.00000000e+00   4.00000000e+01
    6.37777778e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.70000000e+01
    6.70000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   2.70000000e+01
    4.80000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.87777778e+01
    5.20000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   4.80000000e+01
    7.90000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.80000000e+01
    6.10000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   4.40000000e+01
    7.20000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.50000000e+01
    5.80000000e+04]]


In [12]:
# Show the test part of the feature matrix produced by the split
print(x_test)

[[  0.00000000e+00   1.00000000e+00   0.00000000e+00   3.00000000e+01
    5.40000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   5.00000000e+01
    8.30000000e+04]]


In [13]:
# Show the target values that belong to the training rows
print(y_train)

[1 1 1 0 1 0 0 1]


In [14]:
# Show the target values that belong to the test rows
print(y_test)

[0 0]


# Feature Scaling
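Scaling the age column and the salary column. Note that the code below standardizes every column of x, so the dummy-variable columns are rescaled as well.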

In [15]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# scaler to both the training set and the test set.
sc_x = StandardScaler().fit(x_train)
x_train = sc_x.transform(x_train)
x_test = sc_x.transform(x_test)
# The first three columns are the rescaled dummy variables and can be ignored.
print(x_train)

[[-1.          2.64575131 -0.77459667  0.26306757  0.12381479]
 [ 1.         -0.37796447 -0.77459667 -0.25350148  0.46175632]
 [-1.         -0.37796447  1.29099445 -1.97539832 -1.53093341]
 [-1.         -0.37796447  1.29099445  0.05261351 -1.11141978]
 [ 1.         -0.37796447 -0.77459667  1.64058505  1.7202972 ]
 [-1.         -0.37796447  1.29099445 -0.0813118  -0.16751412]
 [ 1.         -0.37796447 -0.77459667  0.95182631  0.98614835]
 [ 1.         -0.37796447 -0.77459667 -0.59788085 -0.48214934]]


In [16]:
# Show the test features after they were transformed by the scaler fitted on x_train
print(x_test)

[[-1.          2.64575131 -0.77459667 -1.45882927 -0.90166297]
 [-1.          2.64575131 -0.77459667  1.98496442  2.13981082]]
