In [1]:
import pandas as pd
# Imputer - handle missing values
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.cross_validation import train_test_split



In [2]:
dataset = pd.read_csv('Data.csv')

In [3]:
dataset.isnull()

Unnamed: 0,Country,Age,Salary,Purchased
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,True,False
5,False,False,False,False
6,False,True,False,False
7,False,False,False,False
8,False,False,False,False
9,False,False,False,False


In [20]:
X = dataset.iloc[:,0:-1].values
y = dataset.iloc[:,-1].values

# Handle Missing Values

In [21]:
im = Imputer(missing_values='NaN', strategy='mean')

In [22]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [23]:
X[:,1:3] = im.fit_transform(X[:,1:3])

In [24]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

# Label Encoding

In [25]:
encode = LabelEncoder()

In [26]:
X[:,0] = encode.fit_transform(X[:,0])

In [27]:
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [12]:
onehot = OneHotEncoder(categorical_features=[0])

In [13]:
X = onehot.fit_transform(X)

In [14]:
X

<10x5 sparse matrix of type '<class 'numpy.float64'>'
	with 30 stored elements in COOrdinate format>

In [15]:
X.toarray()

array([[  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.40000000e+01,   7.20000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          2.70000000e+01,   4.80000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          3.00000000e+01,   5.40000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.80000000e+01,   6.10000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          4.00000000e+01,   6.37777778e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          3.50000000e+01,   5.80000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.87777778e+01,   5.20000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.80000000e+01,   7.90000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          5.00000000e+01,   8.30000000e+04],
       [  1.00000000e+00,   0.0000000

# Feature Scaling

In [28]:
scaler = StandardScaler()

In [29]:
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [30]:
X = scaler.fit_transform(X)



In [31]:
X

array([[ -1.08347268e+00,   7.58874362e-01,   7.49473254e-01],
       [  1.32424438e+00,  -1.71150388e+00,  -1.43817841e+00],
       [  1.20385853e-01,  -1.27555478e+00,  -8.91265492e-01],
       [  1.32424438e+00,  -1.13023841e-01,  -2.53200424e-01],
       [  1.20385853e-01,   1.77608893e-01,   6.63219199e-16],
       [ -1.08347268e+00,  -5.48972942e-01,  -5.26656882e-01],
       [  1.32424438e+00,   0.00000000e+00,  -1.07356980e+00],
       [ -1.08347268e+00,   1.34013983e+00,   1.38753832e+00],
       [  1.20385853e-01,   1.63077256e+00,   1.75214693e+00],
       [ -1.08347268e+00,  -2.58340208e-01,   2.93712492e-01]])

In [32]:
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [33]:
x_train.shape

(8, 3)

In [35]:
x_test.shape

(2, 3)