# Data Preparation for XGBoost
Example of options to best handle missing data This example uses the Horse Colic dataset, which contain approx 30% missing data. Binary classification.

In [1]:
import numpy as np
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Imputer

## Load data

In [2]:
# Download data from https://archive.ics.uci.edu/ml/datasets/Horse+Colic
dataframe = read_csv("horse-colic.csv", delim_whitespace=True, header=None)
dataset = dataframe.values

In [3]:
# See a sample of the dataset (first 5 rows)
dataset[0:5,]

array([['2', 1, 530101, '38.50', '66', '28', '3', '3', '?', '2', '5',
        '4', '4', '?', '?', '?', '3', '5', '45.00', '8.40', '?', '?',
        '2', 2, 11300, 0, 0, 2],
       ['1', 1, 534817, '39.2', '88', '20', '?', '?', '4', '1', '3', '4',
        '2', '?', '?', '?', '4', '2', '50', '85', '2', '2', '3', 2, 2208,
        0, 0, 2],
       ['2', 1, 530334, '38.30', '40', '24', '1', '1', '3', '1', '3',
        '3', '1', '?', '?', '?', '1', '1', '33.00', '6.70', '?', '?',
        '1', 2, 0, 0, 0, 1],
       ['1', 9, 5290409, '39.10', '164', '84', '4', '1', '6', '2', '2',
        '4', '4', '1', '2', '5.00', '3', '?', '48.00', '7.20', '3',
        '5.30', '2', 1, 2208, 0, 0, 1],
       ['2', 1, 530255, '37.30', '104', '35', '?', '?', '6', '2', '?',
        '?', '?', '?', '?', '?', '?', '?', '74.00', '7.40', '?', '?',
        '2', 2, 4300, 0, 0, 2]], dtype=object)

## Separate into X (features) and y (label)

In [4]:
X = dataset[:,0:27]
y = dataset[:,27]

## Option 1: Set missing values to zero

In [5]:
X_zero = np.copy(X)
X_zero[X_zero == '?'] = 0
X_zero = X_zero.astype('float32')

In [6]:
# Check a missing value to ensure it's now zero
X_zero[0,8]

0.0

## Option 2: Set missing values to non-zero (e.g. one)

In [7]:
X_nonzero = np.copy(X)
X_nonzero[X_nonzero == '?'] = 1
X_nonzero = X_nonzero.astype('float32')

In [8]:
# Check a missing value to ensure it's now one
X_nonzero[0,8]

1.0

## Option 3: Set missing values to NaN

In [9]:
X_nan = np.copy(X)
X_nan[X_nan == '?'] = np.nan
X_nan = X_nan.astype('float32')

In [10]:
# Check a missing value to ensure it's now NaN
X_nan[0,8]

nan

## Option 4: Impute missing values as the mean

In [11]:
imputer = Imputer()
X_imputed = imputer.fit_transform(X_nan)

In [12]:
# Check a missing value to ensure it's now the mean value
X_imputed[0,8]

2.853755

## Encode the y-label as integers

In [13]:
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(y)
label_encoded_y = label_encoder.transform(y)

## Configure the train-test split 80/20

In [14]:
seed = 8
test_size = 0.2

# Option 1
X_zero_train, X_zero_test, y_zero_train, y_zero_test = train_test_split(X_zero, label_encoded_y, test_size=test_size, random_state=seed)

# Option 2
X_nonzero_train, X_nonzero_test, y_nonzero_train, y_nonzero_test = train_test_split(X_nonzero, label_encoded_y, test_size=test_size, random_state=seed)

# Option 3
X_nan_train, X_nan_test, y_nan_train, y_nan_test = train_test_split(X_nan, label_encoded_y, test_size=test_size, random_state=seed)

# Option 4
X_imputed_train, X_imputed_test, y_imputed_train, y_imputed_test = train_test_split(X_imputed, label_encoded_y, test_size=test_size, random_state=seed)

## Fit the model to the training data

In [15]:
# Option 1
model_zero = XGBClassifier()
model_zero.fit(X_zero_train, y_zero_train)

# Option 2
model_nonzero = XGBClassifier()
model_nonzero.fit(X_nonzero_train, y_nonzero_train)

# Option 3
model_nan = XGBClassifier()
model_nan.fit(X_nan_train, y_nan_train)

# Option 4
model_imputed = XGBClassifier()
model_imputed.fit(X_imputed_train, y_imputed_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

## Make predictions

In [16]:
# Option 1
y_zero_pred = model_zero.predict(X_zero_test)
zero_predictions = [round(value) for value in y_zero_pred]

# Option 2
y_nonzero_pred = model_nonzero.predict(X_nonzero_test)
nonzero_predictions = [round(value) for value in y_nonzero_pred]

# Option 3
y_nan_pred = model_nan.predict(X_nan_test)
nan_predictions = [round(value) for value in y_nan_pred]

# Option 4
y_imputed_pred = model_imputed.predict(X_imputed_test)
imputed_predictions = [round(value) for value in y_imputed_pred]

  if diff:
  if diff:
  if diff:
  if diff:


## Evaluate model

In [17]:
# Option 1
tn, fp, fn, tp = confusion_matrix(y_zero_test, zero_predictions).ravel()

accuracy = (tn+tp)/(tn+tp+fp+fn)
precision = (tp)/(fp+tp)
recall = (tp)/(fn+tp)

print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("Precision: %.2f%%" % (precision * 100.0))
print("Recall: %.2f%%" % (recall * 100.0))

Accuracy: 88.33%
Precision: 90.24%
Recall: 92.50%


In [18]:
# Option 2
tn, fp, fn, tp = confusion_matrix(y_nonzero_test, nonzero_predictions).ravel()

accuracy = (tn+tp)/(tn+tp+fp+fn)
precision = (tp)/(fp+tp)
recall = (tp)/(fn+tp)

print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("Precision: %.2f%%" % (precision * 100.0))
print("Recall: %.2f%%" % (recall * 100.0))

Accuracy: 83.33%
Precision: 87.50%
Recall: 87.50%


In [19]:
# Option 3
tn, fp, fn, tp = confusion_matrix(y_nan_test, nan_predictions).ravel()

accuracy = (tn+tp)/(tn+tp+fp+fn)
precision = (tp)/(fp+tp)
recall = (tp)/(fn+tp)

print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("Precision: %.2f%%" % (precision * 100.0))
print("Recall: %.2f%%" % (recall * 100.0))

Accuracy: 86.67%
Precision: 88.10%
Recall: 92.50%


In [20]:
# Option 4
tn, fp, fn, tp = confusion_matrix(y_imputed_test, imputed_predictions).ravel()

accuracy = (tn+tp)/(tn+tp+fp+fn)
precision = (tp)/(fp+tp)
recall = (tp)/(fn+tp)

print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("Precision: %.2f%%" % (precision * 100.0))
print("Recall: %.2f%%" % (recall * 100.0))

Accuracy: 86.67%
Precision: 92.11%
Recall: 87.50%


It's worthwhile to try both approaches (automatic handling and imputing) to find which option gives the best result.