# Data Preprocessing


## Data Reading and Matrix Conversion

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle


# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('Data.csv')
print("data=\n",df)

# Shuffle the rows with a fixed seed so that a fresh "Restart & Run All"
# reproduces the same row order (and therefore the same downstream outputs).
# reset_index(drop=True) discards the original row labels after shuffling.
df = shuffle(df, random_state=0).reset_index(drop=True)
print("Shuffled data =\n",df)

# Feature matrix: every column except the last, as a NumPy array.
X = df.iloc[:,:-1].values
print("X=\n",X)

# Target vector: the last column, as a NumPy array.
y = df.iloc[:,-1].values
print("y=\n",y)

data=
    Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes
Shuffled data =
    Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1  Germany  50.0  83000.0        No
2   France  48.0  79000.0       Yes
3   France  35.0  58000.0       Yes
4    Spain  38.0  61000.0        No
5    Spain   NaN  52000.0        No
6  Germany  40.0      NaN       Yes
7    Spain  27.0  48000.0       Yes
8   France  37.0  67000.0       Yes
9  Germany  30.0  54000.0        No
X=
 [['France' 44.0 72000.0]
 ['Germany' 50.0 83000.0]
 ['France' 48.0 79000.0]
 ['France' 35.0 58000.0]
 ['Spain' 38.0 61000.0]
 ['Spain' nan 52000.0]
 ['Germany' 40.0 nan]
 ['Spain' 

## Missing Data

In [27]:
# Replace missing numeric values with the mean of their column.
# `sklearn.preprocessing.Imputer` has been removed from scikit-learn;
# `sklearn.impute.SimpleImputer` is its replacement. It always imputes
# column-wise, so the old `axis=0` argument is no longer needed, and the
# missing-value marker is given as np.nan rather than the string 'NaN'.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Only columns 1 and 2 are passed to the imputer; column 0 holds text and
# cannot be averaged.
imputer = imputer.fit(X[:, 1:3])

# Write the imputed values back into the same two columns of X.
X[:, 1:3] = imputer.transform(X[:, 1:3])
print("Transformed X = \n",X)





Transformed X = 
 [['Spain' 38.77777777777778 52000.0]
 ['Germany' 40.0 63777.77777777778]
 ['Germany' 30.0 54000.0]
 ['France' 44.0 72000.0]
 ['France' 37.0 67000.0]
 ['Germany' 50.0 83000.0]
 ['France' 48.0 79000.0]
 ['Spain' 38.0 61000.0]
 ['Spain' 27.0 48000.0]
 ['France' 35.0 58000.0]]


## Categorical Data

In [28]:
# Convert text categories into integer codes so they can be used in computation.
from sklearn.preprocessing import LabelEncoder

# Column 0 of X (country names) -> integer codes. le_X keeps the fitted
# mapping, so le_X.inverse_transform can recover the names later.
le_X = LabelEncoder()
X[:,0] = le_X.fit_transform(X[:,0])
print("Categorized X = \n", X)

# Target labels -> integer codes, using a dedicated encoder. Fitting le_X on
# y here would overwrite the country mapping learned above.
le_y = LabelEncoder()
y = le_y.fit_transform(y)
print("Categorized y = \n",y)

Categorized X = 
 [[2 38.77777777777778 52000.0]
 [1 40.0 63777.77777777778]
 [1 30.0 54000.0]
 [0 44.0 72000.0]
 [0 37.0 67000.0]
 [1 50.0 83000.0]
 [0 48.0 79000.0]
 [2 38.0 61000.0]
 [2 27.0 48000.0]
 [0 35.0 58000.0]]
Categorized y = 
 [0 1 0 0 1 0 1 0 1 1]


## Dummy Encoding

In [29]:
# One-hot ("dummy") encode the country column so the model does not read an
# ordering into the integer codes assigned to the countries.
# The `categorical_features` argument of OneHotEncoder has been removed from
# scikit-learn; ColumnTransformer is the supported way to encode only
# selected columns.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
encoder = ColumnTransformer(
    transformers=[('country', OneHotEncoder(), [0])],  # encode column 0 only
    remainder='passthrough',  # keep the other columns, placed after the dummy columns
    sparse_threshold=0,       # always return a dense array
)
# Cast to float so X is a plain numeric matrix, as the previous
# OneHotEncoder(...).toarray() call produced.
X = np.asarray(encoder.fit_transform(X), dtype=float)
print("Encoded X = \n", X)

Encoded X = 
 [[0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01
  5.20000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01
  6.37777778e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01
  5.40000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01
  7.20000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01
  6.70000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01
  8.30000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01
  7.90000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01
  6.10000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01
  4.80000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01
  5.80000000e+04]]


## Training/Test set data

In [30]:
# Hold out 20% of the rows as a test set; random_state pins the split so it is
# the same on every run.
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module has been removed from scikit-learn.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print("X Training = \n",X_train)
print("X Test = \n",X_test)
print("y Training = \n",y_train)
print("y Test = \n",y_test)

X Training = 
 [[1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01
  6.70000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01
  5.80000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01
  6.37777778e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01
  7.90000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01
  6.10000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01
  7.20000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01
  5.20000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01
  8.30000000e+04]]
X Test = 
 [[0.0e+00 1.0e+00 0.0e+00 3.0e+01 5.4e+04]
 [0.0e+00 0.0e+00 1.0e+00 2.7e+01 4.8e+04]]
y Training = 
 [1 1 1 1 0 0 0 0]
y Test = 
 [0 1]


## Feature Scaling

In [31]:
# Why scale: a Euclidean distance is dominated by whichever feature has the
# largest numeric range (here, Salary), and unscaled features can also make a
# model slow to converge.
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training rows only.
sc_X = StandardScaler().fit(X_train)

# Apply the learned scaling to the training set...
X_train = sc_X.transform(X_train)
print("StandarScaled X Training Data = \n",X_train)

# ...and the same scaling (not re-fitted) to the test set.
X_test = sc_X.transform(X_test)
print("StandarScaled X Test Data = \n",X_test)

StandarScaled X Training Data = 
 [[ 1.         -0.57735027 -0.57735027 -0.85883097  0.00281791]
 [ 1.         -0.57735027 -0.57735027 -1.25394809 -0.91018421]
 [-1.          1.73205081 -0.57735027 -0.26615528 -0.3240594 ]
 [ 1.         -0.57735027 -0.57735027  1.3143132   1.22015407]
 [-1.         -0.57735027  1.73205081 -0.66127241 -0.60585017]
 [ 1.         -0.57735027 -0.57735027  0.52407896  0.51004131]
 [-1.         -0.57735027  1.73205081 -0.50761575 -1.5188523 ]
 [-1.          1.73205081 -0.57735027  1.70943033  1.62593279]]
StandarScaled X Test Data = 
 [[-1.          1.73205081 -0.57735027 -2.24174089 -1.31596294]
 [-1.         -0.57735027  1.73205081 -2.83441657 -1.92463102]]
