In [1]:
#import 3 crucial libraries for data preprocessing
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


In [2]:
dataset = pd.read_csv('Data.csv')

In [3]:
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [4]:
X = dataset.iloc[:,:-1].values
X_header = dataset.iloc[:,:-1]
y = dataset.iloc[:,-1].values
print(X)
print('\n')
print(X_header)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


   Country   Age   Salary
0   France  44.0  72000.0
1    Spain  27.0  48000.0
2  Germany  30.0  54000.0
3    Spain  38.0  61000.0
4  Germany  40.0      NaN
5   France  35.0  58000.0
6    Spain   NaN  52000.0
7   France  48.0  79000.0
8  Germany  50.0  83000.0
9   France  37.0  67000.0


In [5]:
#taking care of missing data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)

In [6]:
imputer = imputer.fit(X[:, 1:3])
X[: , 1:3] = imputer.transform(X[:, 1:3])

In [7]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [8]:
#encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


In [9]:
label_encoder_X = LabelEncoder()  #takes no parameters
X[:, 0] = label_encoder_X.fit_transform(X[:, 0])

#onehotencoder = OneHotEncoder(categorical_features = [0])
#X = onehotencoder.fit_transform(X)
#now ML will think that Spain is greater than france or vice-versa
# to prevent his we're gonna use dummy variables, import onehot encoder

In [10]:
onehotencoder = OneHotEncoder(categorical_features = [0])
X = onehotencoder.fit_transform(X).toarray()

In [12]:
print(X)

[[1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01
  7.20000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01
  4.80000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01
  5.40000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01
  6.10000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01
  6.37777778e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01
  5.80000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01
  5.20000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01
  7.90000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01
  8.30000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01
  6.70000000e+04]]


In [13]:
label_encoder_y = LabelEncoder()  #takes no parameters
y = label_encoder_y.fit_transform(y)


In [14]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


In [15]:
#experimental
X_frame=pd.DataFrame(X)
y_frame = pd.DataFrame(y)
pd.concat([X_frame,y_frame],axis=1).rename(columns={0 : 'a'})
#experimental

Unnamed: 0,a,1,2,3,4,a.1
0,1.0,0.0,0.0,44.0,72000.0,0
1,0.0,0.0,1.0,27.0,48000.0,1
2,0.0,1.0,0.0,30.0,54000.0,0
3,0.0,0.0,1.0,38.0,61000.0,0
4,0.0,1.0,0.0,40.0,63777.777778,1
5,1.0,0.0,0.0,35.0,58000.0,1
6,0.0,0.0,1.0,38.777778,52000.0,0
7,1.0,0.0,0.0,48.0,79000.0,1
8,0.0,1.0,0.0,50.0,83000.0,0
9,1.0,0.0,0.0,37.0,67000.0,1


In [16]:
#importing the library to split the dataset into Training and Test sets
from sklearn.cross_validation import train_test_split



In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Feature Scaling

### features are not in the same scale:
age 27 -50, salary 48k - 83k
euclidean distance will be dominated by salary

### standardisation

x_stand = (x-mean(x))/st.dev(x)

### normalization

x_norm = (x - min(x)) / (max(x) - min(x))


In [18]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)

## Even if the model doesn't depend on euclidean distances, we still need to do feature scaling because the algorithm will converge much faster

In [19]:
print(X_train)
print(X_test)

[[-1.          2.64575131 -0.77459667  0.26306757  0.12381479]
 [ 1.         -0.37796447 -0.77459667 -0.25350148  0.46175632]
 [-1.         -0.37796447  1.29099445 -1.97539832 -1.53093341]
 [-1.         -0.37796447  1.29099445  0.05261351 -1.11141978]
 [ 1.         -0.37796447 -0.77459667  1.64058505  1.7202972 ]
 [-1.         -0.37796447  1.29099445 -0.0813118  -0.16751412]
 [ 1.         -0.37796447 -0.77459667  0.95182631  0.98614835]
 [ 1.         -0.37796447 -0.77459667 -0.59788085 -0.48214934]]
[[ 0.  0.  0. -1. -1.]
 [ 0.  0.  0.  1.  1.]]


In [20]:
X

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])