# This is the template for preprocessing the CSV data before you feed it into any Machine Learning algorithm

In [3]:
"""Data_Preprocessing_Template.ipynb: This is the template code for preprocessing the CSV Data
before you use it to feed into any Machine Learning Algorithm."""

# Notebook metadata: authorship, licensing, version and maturity.
__author__ = "Deepak Saini"
__copyright__ = "Copyright 2019"
__license__ = "GPL"
__version__ = "1.0.0"
__maintainer__ = "Deepak Saini"
__email__ = "sainideepakdl@gmail.com"
__status__ = "Prototype"

## Import the required libraries

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Read the CSV file and load it into a pandas DataFrame

In [5]:
# Load the raw data; the path is relative to the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer="Data.csv")

#### Create the matrix of features
#### Read all the rows and all the columns except the last one. The last column is the dependent variable (the value to be predicted) and is stored separately in `y`.

In [6]:
# Feature matrix: every row, every column except the last one.
X = dataset.iloc[:, :-1].to_numpy()
# Target vector: every row of the last column.
y = dataset.iloc[:, -1].to_numpy()

### The feature data consists of information related to the Retail Customers.
### Features - 
### 1. **Country** is the country to which the customer belongs. It is of Categorical type.
### 2. **Age** of the Customer. It is of type numeric.
### 3. **Estimated Salary** of the Customer. It is of type numeric.

In [7]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

#### The Predicted Values - Whether the customer purchases the Product or Not.  It is of Categorical type.

In [8]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## How to take care of the missing data?

#### We can see one row with a missing Age and another row with a missing Salary.
#### There are many options. A few of them are:
#### 1. Ignore the rows with missing values. This is not the best option here because the training data is already very limited.
#### 2. Replace missing data with the average/median of the respective column's values.
#### 3. Replace missing data with the most frequently appearing value in the respective column.
#### ...and many more.

In [9]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

#### Here we are going to replace the missing value with the mean of the other values in that column

In [10]:
from sklearn.impute import SimpleImputer

# Imputer that fills every NaN entry with the mean of its column.
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)

In [11]:
X[:, 1:]

array([[44.0, 72000.0],
       [27.0, 48000.0],
       [30.0, 54000.0],
       [38.0, 61000.0],
       [40.0, nan],
       [35.0, 58000.0],
       [nan, 52000.0],
       [48.0, 79000.0],
       [50.0, 83000.0],
       [37.0, 67000.0]], dtype=object)

In [12]:
# Learn the column means of columns 1 onward (Age and Salary) and fill their
# missing entries in a single step; column 0 (Country) is left untouched.
# NOTE(review): the means are computed over all rows, before the train/test
# split performed further down, so the test rows contribute to the statistics.
X[:, 1:] = imputer.fit_transform(X[:, 1:])

In [13]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


### Encoding categorical data.
#### Specifically, we want to encode the **Country** feature here, because ML algorithms work with numeric values.

In [14]:
X[:,0]

array(['France', 'Spain', 'Germany', 'Spain', 'Germany', 'France',
       'Spain', 'France', 'Germany', 'France'], dtype=object)

#### We will use one-hot encoding. It will create a binary vector for each country.

In [15]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [16]:
# One-hot encode column 0 (Country); every other column is passed through
# unchanged instead of being dropped.
clTrans = ColumnTransformer(
    transformers=[
        ("encoder", OneHotEncoder(), [0]),
    ],
    remainder="passthrough",
)

In [17]:
type(X)

numpy.ndarray

In [18]:
# Replace X with its encoded form: the one-hot Country columns come first,
# followed by the passed-through Age and Salary columns.
# NOTE(review): X is overwritten, so re-running this cell would feed the
# already-encoded matrix back into the encoder.
X = clTrans.fit_transform(X)

In [19]:
type(X)

numpy.ndarray

In [20]:
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


#### We should now encode the dependent variable as well. It currently has the values Yes and No

In [21]:
from sklearn.preprocessing import LabelEncoder
# Encoder used to turn the 'Yes'/'No' target labels into integers.
labelEncoder = LabelEncoder()
# Show the container type of y before encoding.
type(y)

numpy.ndarray

In [22]:
# Overwrite y with its integer encoding; in this dataset 'No' becomes 0 and
# 'Yes' becomes 1.
y = labelEncoder.fit_transform(y)
# Show the container type of y after encoding.
type(y)

numpy.ndarray

In [23]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


#### Splitting the dataset into the training and test data.

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [26]:
X_train

array([[0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 37.0, 67000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [27]:
X_test

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0]], dtype=object)

### Feature Scaling
#### Scaling can be done in two ways -
#### 1. Standardization -  Xstnd =  (x-mean(x))/stddev(x)
#### The standardized values would mostly be in the range -3 to 3. Standardization works well all the time.
#### 2. Normalization -  Xnorm = (x-min(x))/(max(x) - min(x))
#### The normalized values would be in the range 0 to 1. Normalization is recommended when you have a normal distribution in most of the features.


### Applying Standardization on the features

In [29]:
from sklearn.preprocessing import StandardScaler
# Scaler used below to standardize the numeric feature columns.
scaler =  StandardScaler()

#### Scaling is not required on the encoded features. The values of these features are already either 0 or 1.

In [32]:
# Columns 0-2 hold the one-hot Country encoding and are left as they are;
# only the columns from index 3 onward (Age, Salary) are standardized.
# The scaling statistics are learned from the training rows only ...
scaler.fit(X_train[:, 3:])
# ... and the same fitted scaler is then applied to both splits.
X_train[:, 3:] = scaler.transform(X_train[:, 3:])
X_test[:, 3:] = scaler.transform(X_test[:, 3:])

In [37]:
print(X_train)

[[0.0 1.0 0.0 0.2630675731713538 0.1238147854838185]
 [1.0 0.0 0.0 -0.25350147960148617 0.4617563176278856]
 [0.0 0.0 1.0 -1.9753983221776195 -1.5309334063940294]
 [0.0 0.0 1.0 0.05261351463427101 -1.1114197802841526]
 [1.0 0.0 0.0 1.6405850472322605 1.7202971959575162]
 [0.0 0.0 1.0 -0.08131179534387283 -0.16751412153692966]
 [1.0 0.0 0.0 0.9518263102018072 0.9861483502652316]
 [1.0 0.0 0.0 -0.5978808481167128 -0.48214934111933727]]


In [36]:
print(X_test)

[[0.0 1.0 0.0 -1.4588292694047795 -0.9016629672292141]
 [0.0 1.0 0.0 1.984964415747487 2.139810822067393]]
