# Data Preprocessing Template

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Read the raw table from the CSV file into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer="Data.csv")

In [3]:
# Feature matrix: every row, every column except the last one.
# iloc indexes by position as [rows, columns]; ':' selects everything and
# ':-1' selects everything but the final column (which holds the dependent variable).
X = dataset.iloc[:, :-1].to_numpy()

In [4]:
# Dependent-variable vector: the last column of the dataset, one label per row.
# to_numpy() always yields a NumPy ndarray, whereas .values may hand back a
# pandas ExtensionArray when the column is stored with an extension dtype.
y = dataset.iloc[:, -1].to_numpy()

In [5]:
# Show each feature row next to its label.
print([(features, label) for features, label in zip(X, y)])

[(array(['France', 44.0, 72000.0], dtype=object), 'No'), (array(['Spain', 27.0, 48000.0], dtype=object), 'Yes'), (array(['Germany', 30.0, 54000.0], dtype=object), 'No'), (array(['Spain', 38.0, 61000.0], dtype=object), 'No'), (array(['Germany', 40.0, nan], dtype=object), 'Yes'), (array(['France', 35.0, 58000.0], dtype=object), 'Yes'), (array(['Spain', nan, 52000.0], dtype=object), 'No'), (array(['France', 48.0, 79000.0], dtype=object), 'Yes'), (array(['Germany', 50.0, 83000.0], dtype=object), 'No'), (array(['France', 37.0, 67000.0], dtype=object), 'Yes')]


## Handling Missing Data

In [6]:
from sklearn.impute import SimpleImputer

# Replace every NaN with the mean of the column it sits in.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
# Only columns 1 and 2 are imputed; column 0 is left out of the slice.
# fit_transform learns the per-column means and fills the gaps in one call;
# the filled columns are written back into the same slice of X.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encode Categorical data

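Why encode categorical data? Most machine-learning models operate on numbers only, so string categories (here the country names in the first column) must be converted to a numeric form first. One-hot encoding turns each category into its own 0/1 column, which avoids implying any ordering between the categories.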

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Each transformer entry is a tuple: (name, transformer object, columns to apply it to).
# remainder='passthrough' keeps the columns that are not listed instead of dropping them.
ct = ColumnTransformer(
    [("encoder", OneHotEncoder(), [0])],
    remainder="passthrough",
)
# Fit and apply in one step, then wrap the result in a NumPy array.
X = np.array(ct.fit_transform(X))
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


## Encode Dependent Variable

In [8]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of distinct labels in y, then replace each label with its integer code.
le = LabelEncoder().fit(y)
y = le.transform(y)
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; random_state pins the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [11]:
# Display the four arrays produced by the split, each prefixed with its name.
print(f"X_train: {X_train}")
print(f"X_test: {X_test}")
print(f"y_train: {y_train}")
print(f"y_test: {y_test}")

X_train: [[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]
X_test: [[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]
y_train: [0 1 0 0 1 1 0 1]
y_test: [0 1]


## Feature Scaling

Question - What is feature scaling? -> The goal of feature scaling is to bring all feature values into a comparable range, so that no feature dominates the others purely because of its scale.

### Standardization
$X_{stand} = {{X - mean(X)} \over {\sigma(X)}}$

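Take each value, subtract the mean, then divide by the standard deviation of X. The result has mean 0 and standard deviation 1; most values typically fall between roughly -3 and 3, but they are not strictly bounded.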

- Can be used all the time (a safe general-purpose choice)

### Normalization
$X_{norm} = {{X - min(X)} \over {max(X) - min(X)}}$

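Because min(X) is subtracted and the result is divided by the range max(X) - min(X), this always yields values between 0 and 1 for the data the minimum and maximum were computed from, regardless of the sign of X.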
- recommended for Normal distribution

## Regressions

### Simple Linear Regression
$y = b_{0} + b_{1}x_{1}$

### Multiple Linear Regression
$y = b_{0} + b_{1}x_{1} + b_{2}x_{2} + ... + b_{n}x_{n}$

### Polynomial Linear Regression
$y = b_{0} + b_{1}x_{1} + b_{2}x_{1}^{2} + ... + b_{n}x_{1}^{n}$

In [12]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# Only the columns from index 3 onward are standardized; the first three
# columns are the one-hot dummy columns and are left as they are.
# The scaler is fitted on the training rows only, so its mean and standard
# deviation are learned without looking at the test set.
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
# Why split before scaling? The test set stands in for unseen data, so it must
# not influence the scaling statistics. It is therefore transformed with the
# mean / standard deviation learned from the training set (transform, not
# fit_transform), which also keeps both sets on the same scale.
X_test[:, 3:] = sc.transform(X_test[:, 3:])