In [1]:
# Data preprocessing

# Import library
import pandas as pd

# Load the dataset from 'Data.csv' (path is relative to the working directory)
# into a pandas DataFrame.
dataset = pd.read_csv('Data.csv')
# The call form of print with a single argument prints the same text on
# Python 2 and Python 3; the bare `print dataset` statement only parses on Python 2.
print(dataset)

   Country   Age   Salary Purchased
0   France   NaN  72000.0        No
1    Spain  27.0      NaN       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


In [2]:
# Create the matrix of features (X) and the target vector (Y)
# X: every column except the last one, as a NumPy array.
X = dataset.iloc[:, :-1].values
# Y: the last column only. Index -1 (instead of the hard-coded 3) guarantees
# that Y is exactly the column X leaves out, whatever the column count is.
Y = dataset.iloc[:, -1].values
# Call form of print: identical output for a single argument on Python 2 and 3.
print(X)
print(Y)

[['France' nan 72000.0]
 ['Spain' 27.0 nan]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [12]:
# Taking care of missing data
# Using the Imputer class of sklearn to handle this issue
# Doc: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Imputer.html
# NOTE(review): newer scikit-learn releases provide sklearn.impute.SimpleImputer
# instead of this class -- confirm against the installed version.
from sklearn.preprocessing import Imputer
# Placeholder value that marks an entry as missing
missingValues = 'NaN'
# How to fill in the missing entries:
# mean          : use the average along the axis
# median        : use the median along the axis
# most_frequent : use the most frequent value along the axis
# 'mean' is the intended strategy: it matches the "replace by the mean" step
# below and the filled-in values shown in the output (38.125 and 65750.0 are
# the column means of Age and Salary).
strategy = 'mean'
# axis: 0 = impute along columns, 1 = impute along rows (default: 0)
axis = 0
# Build the imputer from the settings above so they are defined in one place.
imputer = Imputer(missing_values = missingValues, strategy = strategy, axis = axis)
# Fit the imputer on the numeric columns only (indices 1 and 2);
# column 0 holds the country names.
imputer = imputer.fit(X[:, 1:3])
# Replace the missing entries by the mean of their column
X[:, 1:3] = imputer.transform(X[:, 1:3])
# Call form of print: identical output for a single argument on Python 2 and 3.
print(X)

[['France' 38.125 72000.0]
 ['Spain' 27.0 65750.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 65750.0]
 ['France' 35.0 58000.0]
 ['Spain' 38.125 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [13]:
# Categorical variables
# Library: LabelEncoder, OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# One encoder object per variable, so each keeps its own fitted labels
labelEncoderX = LabelEncoder()
labelEncoderY = LabelEncoder()
# Replace the strings in column 0 of X and in Y by integer codes
# (see the printed output for the resulting values).
X[:, 0] = labelEncoderX.fit_transform(X[:, 0])
Y = labelEncoderY.fit_transform(Y)
# Call form of print: identical output for a single argument on Python 2 and 3.
print(X)
print(Y)

[[0 38.125 72000.0]
 [2 27.0 65750.0]
 [1 30.0 54000.0]
 [2 38.0 61000.0]
 [1 40.0 65750.0]
 [0 35.0 58000.0]
 [2 38.125 52000.0]
 [0 48.0 79000.0]
 [1 50.0 83000.0]
 [0 37.0 67000.0]]
[0 1 0 0 1 1 0 1 0 1]


In [14]:
# Create dummy variables so the integer codes in column 0 are not read as
# "one value is better than another": each category gets its own 0/1 column.
# France Germany Spain
#   1       0      0
#   0       1      0
#   0       0      1
# Create the encoder object; only column 0 is treated as categorical.
# NOTE(review): the `categorical_features` argument belongs to older
# scikit-learn releases -- confirm against the installed version.
oneHot = OneHotEncoder(categorical_features = [0])
# fit_transform returns a sparse result here; toarray() makes it a dense array.
X = oneHot.fit_transform(X).toarray()
# Call form of print: identical output for a single argument on Python 2 and 3.
print(X)

[[  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.81250000e+01
    7.20000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   2.70000000e+01
    6.57500000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   3.00000000e+01
    5.40000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.80000000e+01
    6.10000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   4.00000000e+01
    6.57500000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.50000000e+01
    5.80000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.81250000e+01
    5.20000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   4.80000000e+01
    7.90000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   5.00000000e+01
    8.30000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.70000000e+01
    6.70000000e+04]]


In [15]:
# Split the dataset into a training set and a test set
# The test set is held back to check how the result adapts to new data.
# Library: train_test_split
try:
    # Preferred location of train_test_split
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for older scikit-learn releases that only have this module
    from sklearn.cross_validation import train_test_split
# test_size = 0.2 is a good choice: 20% of the rows go to the test set.
# random_state = 0 fixes the seed passed to the splitter.
XTrain, XTest, YTrain, YTest = train_test_split(X, Y, test_size = 0.2, random_state = 0)
# Call form of print: identical output for a single argument on Python 2 and 3.
print(XTrain)
print(XTest)
print(YTrain)
print(YTest)

[[  0.00000000e+00   1.00000000e+00   0.00000000e+00   4.00000000e+01
    6.57500000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.70000000e+01
    6.70000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   2.70000000e+01
    6.57500000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.81250000e+01
    5.20000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   4.80000000e+01
    7.90000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.80000000e+01
    6.10000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.81250000e+01
    7.20000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.50000000e+01
    5.80000000e+04]]
[[  0.00000000e+00   1.00000000e+00   0.00000000e+00   3.00000000e+01
    5.40000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   5.00000000e+01
    8.30000000e+04]]
[1 1 1 0 1 0 0 1]
[0 0]




In [None]:
# Feature scaling
# Problem: the variables are not on the same scale, e.g. Salary: 15,000 -> 90,000 $, Age: 25 -> 60
# Scaling methods: standardisation, normalisation
# x_std  = (x - mean(x)) / standard_deviation(x)
# x_norm = (x - min(x)) / (max(x) - min(x))