# Data Preparation for XGBoost
Example on preparing datasets that contain categorical data. Binary classification for breast cancer dataset (label and one hot encoded).


In [1]:
from numpy import column_stack
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

## Load data

In [2]:
# Dataset available at http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/
data = read_csv('breast-cancer.csv', header=None)
dataset = data.values

In [3]:
# See a sample of the dataset (first 5 rows)
dataset[0:5,]

array([['no-recurrence-events', '30-39', 'premeno', '30-34', '0-2', 'no',
        3, 'left', 'left_low', 'no'],
       ['no-recurrence-events', '40-49', 'premeno', '20-24', '0-2', 'no',
        2, 'right', 'right_up', 'no'],
       ['no-recurrence-events', '40-49', 'premeno', '20-24', '0-2', 'no',
        2, 'left', 'left_low', 'no'],
       ['no-recurrence-events', '60-69', 'ge40', '15-19', '0-2', 'no', 2,
        'right', 'left_up', 'no'],
       ['no-recurrence-events', '40-49', 'premeno', '0-4', '0-2', 'no',
        2, 'right', 'right_low', 'no']], dtype=object)

## Separate into X (features) and y (label)

In [4]:
X = dataset[:,0:9]
y = dataset[:,9]

## Encode string-based features as integers

In [5]:
columns = []

# for each feature
for i in range(0, X.shape[1]):
    
    # convert the categorical (string) feature to an integer numpy array
    label_encoder = LabelEncoder()
    feature = label_encoder.fit_transform(X[:,i])
    
    # reshape the numpy array so that it has multiple rows and 1 column
    feature = feature.reshape(X.shape[0], 1)
    
    # convert to one-hot encoding
    onehot_encoder = OneHotEncoder(sparse=False)
    feature = onehot_encoder.fit_transform(feature)
    
    # append to list of one-hot encoded features
    columns.append(feature)

# convert list to array
features_encoded_x = column_stack(columns)

In [6]:
# See a sample of the one-hot encoded features
features_encoded_x[0:5,]

array([[1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0.]])

In [7]:
# Encode the y-label as integers
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(y)
label_encoded_y = label_encoder.transform(y)

In [8]:
# See a sample of the encoded y-label
label_encoded_y[0:5,]

array([0, 0, 0, 0, 0])

## Configure the train-test split 80/20

In [9]:
seed = 8
test_size = 0.2

X_train, X_test, y_train, y_test = train_test_split(features_encoded_x, label_encoded_y, test_size=test_size, random_state=seed)

## Fit the model to the training data

In [10]:
model = XGBClassifier()
model.fit(X_train, y_train)
print(model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)


## Make predictions

In [11]:
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

  if diff:


## Evaluate model

In [12]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

In [13]:
accuracy = (tn+tp)/(tn+tp+fp+fn)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 72.41%


In [14]:
precision = (tp)/(fp+tp)
print("Precision: %.2f%%" % (precision * 100.0))

Precision: 30.77%


In [15]:
recall = (tp)/(fn+tp)
print("Recall: %.2f%%" % (recall * 100.0))

Recall: 36.36%
