In [26]:
import pandas
import matplotlib.pyplot as plt
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder
# Test more robust imputation methods
# from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute

%matplotlib inline

In [61]:
# Toy customer table, declared column by column. It holds exactly one missing
# Age and one missing Salary (np.nan); the explicit `columns=` pins the order.
dataset = pandas.DataFrame(
    {
        'Country': ['France', 'Spain', 'Germany', 'Spain', 'Germany',
                    'France', 'Spain', 'France', 'Germany', 'France'],
        'Age': [44, 27, 30, 38, 40, 35, np.nan, 48, 50, 37],
        'Salary': [72000.0, 48000.0, 54000.0, 61000.0, np.nan,
                   58000.0, 52000.0, 79000.0, 83000.0, 67000.0],
        'Purchased': ['No', 'Yes', 'No', 'No', 'Yes',
                      'Yes', 'No', 'Yes', 'No', 'Yes'],
    },
    columns=['Country', 'Age', 'Salary', 'Purchased'])

In [62]:
# Separate predictors from the target and drop down to plain NumPy arrays.
# Because the predictor columns mix strings and numbers, `features` comes out
# as an object-dtype array.
feature_columns = ['Country', 'Age', 'Salary']
features = dataset.loc[:, feature_columns].values
response = dataset.loc[:, 'Purchased'].values

## Imputation

In [63]:
# Fill the gaps in the two numeric columns (indices 1 and 2: Age, Salary) with
# that column's mean; the Country column (index 0) is left untouched.
# NOTE(review): `Imputer` no longer exists in scikit-learn >= 0.22
# (`sklearn.impute.SimpleImputer` replaces it) -- confirm the pinned version.
numeric_cols = slice(1, 3)
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
features[:, numeric_cols] = imputer.fit_transform(features[:, numeric_cols])

In [64]:
# Display the feature array after mean imputation (no NaN should remain).
features

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Encode Categorical Data

In [65]:
# --- Target -----------------------------------------------------------------
# Map the 'No' / 'Yes' strings to integer class ids (0 / 1).
response_label_encoder = LabelEncoder()
response_labels = response_label_encoder.fit_transform(response)

# --- Country ----------------------------------------------------------------
# Step 1: strings -> integer codes. On their own these codes would imply a
# spurious ordering between countries (e.g. Spain=2 > France=0).
labelencoder = LabelEncoder()
features[:, 0] = labelencoder.fit_transform(features[:, 0])

# Step 2: expand column 0 into one indicator column per country, which removes
# that ordering. The indicator columns come first, followed by Age and Salary.
# NOTE(review): `categorical_features` was removed in scikit-learn 0.22
# (`ColumnTransformer` is the replacement) -- confirm the pinned version.
# NOTE(review): this rebinds `features`, so re-running the cell on an already
# encoded array silently produces a different matrix; re-run from the top.
one_hot_encoder = OneHotEncoder(categorical_features=[0])
encoded_sparse = one_hot_encoder.fit_transform(features)
features = encoded_sparse.toarray()

In [66]:
# Display the integer-encoded response, one entry per row of the dataset.
response_labels

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

### Split train/test set

In [67]:
# Hold out half of the rows for testing. Pinning `random_state` makes the
# shuffle, and therefore every downstream result, identical after a kernel
# restart; without it each run selects different train/test rows.
features_train, features_test, resp_train, resp_test = train_test_split(
    features,
    response_labels,
    test_size=0.5,
    random_state=0)

In [68]:
# Display the training half of the encoded feature matrix.
features_train

array([[0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04]])