# This is the template for preprocessing the CSV data before you feed it into any Machine Learning algorithm

In [110]:
"""Data_Preprocessing_Template.ipynb: This is the template code for preprocessing the CSV data
before you feed it into any Machine Learning algorithm."""

# Notebook metadata: authorship, copyright/licensing, version and release status.
__author__ = "Deepak Saini"
__copyright__ = "Copyright 2019"
__license__ = "GPL"
__version__ = "1.0.0"
__maintainer__ = "Deepak Saini"
__email__ = "sainideepakdl@gmail.com"
__status__ = "Prototype"

## Import the required libraries

In [111]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Read the CSV file into a pandas DataFrame

In [112]:
# Load the raw data. The path is relative, so it is resolved against the
# kernel's current working directory.
dataset = pd.read_csv("Data.csv")

#### Create the matrix of features
#### Read all the rows and all the columns except the last one into `X`. The last column, the dependent variable (the value to be predicted), goes into `y`.

In [113]:
# Feature matrix: every row, every column except the last one.
# `to_numpy()` is the accessor pandas recommends over `.values`; it yields the
# same NumPy array.
X = dataset.iloc[:, :-1].to_numpy()
# Target vector: every row of the last column only.
y = dataset.iloc[:, -1].to_numpy()

### The feature data consists of information related to the Retail Customers.
### Features - 
### 1. **Country** is the country to which the customer belongs. It is of Categorical type.
### 2. **Age** of the Customer. It is of type numeric.
### 3. **Estimated Salary** of the Customer. It is of type numeric.

In [114]:
# Display the raw feature matrix; the missing entries show up as nan.
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

#### The dependent variable (target) - whether the customer purchased the product or not. It is of Categorical type.

In [115]:
# Display the raw target labels, before any encoding.
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## How to take care of the missing data?

#### We can see one row with missing Age and another row with missing Salary. 
#### There are many options. A few of them are:
#### 1. Ignore the rows with missing values. This is not the best option here because there is already very little training data.
#### 2. Replace missing data with the mean/median of the respective column.
#### 3. Replace missing data with the most frequently occurring value in the respective column.
#### ...and many more.

In [116]:
# X is unchanged since it was last displayed; the nan entries are the values
# that still need to be filled in.
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

#### Here we are going to replace the missing value with the mean of the other values in that column

In [117]:
from sklearn.impute import SimpleImputer

# Mean imputation: once fitted, the imputer replaces every NaN with the mean
# of the column it appears in.
imputer = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
)

In [118]:
# Preview the slice the imputer will be applied to: every column after the
# first one (Age and Estimated Salary).
X[:, 1:]

array([[44.0, 72000.0],
       [27.0, 48000.0],
       [30.0, 54000.0],
       [38.0, 61000.0],
       [40.0, nan],
       [35.0, 58000.0],
       [nan, 52000.0],
       [48.0, 79000.0],
       [50.0, 83000.0],
       [37.0, 67000.0]], dtype=object)

In [119]:
# Learn the per-column means and fill the missing entries in a single step,
# writing the result back into the numeric columns of X in place.
# NOTE(review): the means are computed over all rows, before the train/test
# split performed later in this notebook, so the held-out rows influence the
# imputed values.
X[:, 1:] = imputer.fit_transform(X[:, 1:])

In [120]:
# Check the result: the two entries that were nan now hold their column means.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


### Encoding categorical data.
#### Specifically, we want to encode the **Country** feature here, because ML algorithms work with numeric values.

In [121]:
# The Country column: string labels that still need a numeric encoding.
X[:,0]

array(['France', 'Spain', 'Germany', 'Spain', 'Germany', 'France',
       'Spain', 'France', 'Germany', 'France'], dtype=object)

#### We will use one-hot encoding, which creates a binary vector for each country.

In [122]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [123]:
# One-hot encode column 0 (Country); the remaining columns are passed through
# unchanged.
clTrans = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)

In [124]:
# Container type of X before the column transformer is applied.
type(X)

numpy.ndarray

In [125]:
# Fit the column transformer and replace X with the encoded matrix.
# NOTE(review): X is overwritten with its own transformed version, so running
# this cell a second time would feed the already-encoded array back into the
# transformer and one-hot encode its first column again. Re-create X from
# `dataset` (and re-run the imputation) before re-running this cell.
X = clTrans.fit_transform(X)

In [126]:
# Container type of X after the transformation (a dense NumPy array here).
type(X)

numpy.ndarray

In [127]:
# The encoded matrix: three indicator columns for Country, followed by Age and
# Estimated Salary.
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


#### We should now encode the dependent variable as well. It currently has the values Yes and No

In [128]:
from sklearn.preprocessing import LabelEncoder
# Encoder used below to turn the labels in y into integer codes.
labelEncoder = LabelEncoder()
# Container type of y before encoding.
type(y)

numpy.ndarray

In [129]:
# Replace the 'No'/'Yes' labels in y with integer codes.
y = labelEncoder.fit_transform(y)
# Container type of y after encoding.
type(y)

numpy.ndarray

In [130]:
# Encoded target: 'No' became 0 and 'Yes' became 1.
print(y)

[0 1 0 0 1 1 0 1 0 1]


#### Splitting the dataset into the training and test data.

In [131]:
from sklearn.model_selection import train_test_split

In [138]:
# Hold out 20% of the rows as the test set. The fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [139]:
# The rows selected for training (8 of the 10 samples).
X_train

array([[0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 37.0, 67000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [140]:
# The held-out test rows (2 of the 10 samples).
X_test

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0]], dtype=object)