# Data Preparation for Gradient Boosting

XGBoost models represent every problem as a regression predictive modeling problem that only takes numerical values as input. Therefore, we need to convert the data into the expected format.

## 1. Label Encode String Class Values

Here, we use the iris dataset as sample data.

We can easily convert the string values to integer values using the ***LabelEncoder***

In [11]:
# Load packages
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [9]:
# Load data: iris.csv is read without a header row
data = pd.read_csv('iris.csv', header=None)
dataset = data.to_numpy()

# Split data into X (columns 0-3) and y (column 4)
X, y = dataset[:, :4], dataset[:, 4]

In [12]:
# Encode string class values as integers; the fitted encoder is kept so the
# integer codes can be mapped back to the original labels later
label_encoder = LabelEncoder().fit(y)
label_encoded_y = label_encoder.transform(y)

In [24]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    label_encoded_y,
    test_size=0.3,
    random_state=666,
)

# Fit the model with its default settings and show its configuration
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
print(model)

XGBClassifier(objective='multi:softprob')


In [25]:
# Predict the held-out rows, then round every prediction to a whole number
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))

In [26]:
# Evaluate: compare the rounded predictions against the true test labels
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(accuracy)

0.9777777777777777


## 2. One Hot Encode Categorical Data

Each categorical input feature is first label encoded, and the resulting integer codes are then one-hot encoded.

In [27]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [28]:
# Load the data: breast-cancer.csv is read without a header row
data = pd.read_csv('breast-cancer.csv', header=None)
dataset = data.to_numpy()

# Split data into X (columns 0-8, cast to strings) and Y (column 9)
X = dataset[:, :9].astype(str)
Y = dataset[:, 9]

In [30]:
# One-hot encode every input column.
# Each column is label encoded to integer codes first; those codes are then
# expanded into one indicator column per distinct value.
encoded_columns = []
for column in X.T:
    label_encoder = LabelEncoder()
    feature = label_encoder.fit_transform(column).reshape(-1, 1)
    # Densify with toarray() instead of passing sparse=False: that keyword
    # was renamed to sparse_output in newer scikit-learn releases, so this
    # form does not depend on the installed version.
    onehot_encoder = OneHotEncoder(categories="auto")
    feature = onehot_encoder.fit_transform(feature).toarray()
    encoded_columns.append(feature)

# Join all per-column blocks in a single call rather than re-copying the
# growing array on every loop iteration.
encoded_x = np.concatenate(encoded_columns, axis=1)
print("X shape: ", encoded_x.shape)

X shape:  (286, 43)


In [32]:
# Encode string class values as integers (fit and transform in one step)
label_encoder = LabelEncoder()
label_encoded_y = label_encoder.fit_transform(Y)


In [33]:
# Split data into train and test sets (33% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    encoded_x, label_encoded_y, test_size=0.33, random_state=1
)

# Fit the model and show its configuration
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
print(model)

# Make predictions for the test data, rounded to whole numbers
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))

# Evaluate predictions and report the accuracy as a percentage
accuracy = accuracy_score(y_test, predictions)
accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % (accuracy_pct,))

XGBClassifier()
Accuracy: 71.58%


## 3. Support for Missing Data

XGBoost can automatically learn how to best handle missing data. 

Missing data is handled the same way that sparse or zero values are handled, by minimizing the loss function

In [37]:
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
# load data: fields are separated by runs of whitespace, no header row
# (sep=r"\s+" replaces the deprecated delim_whitespace=True and parses the same way)
dataframe = read_csv("horse-colic.csv", sep=r"\s+", header=None)
dataset = dataframe.values
# split data into X (columns 0-26) and Y (column 27)
X = dataset[:, 0:27]
Y = dataset[:, 27]
# set missing values (marked with '?' in the file) to 0
X[X == '?'] = 0
# convert to numeric
X = X.astype('float32')
# encode Y class values as integers
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(Y)
label_encoded_y = label_encoder.transform(Y)
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X, label_encoded_y, test_size=test_size, random_state=seed
)
# fit model on training data
model = XGBClassifier()
model.fit(X_train, y_train)
print(model)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

XGBClassifier()
Accuracy: 83.84%


Use ***SimpleImputer*** to handle missing data; the default strategy is the mean.

In [38]:

# binary classification, missing data, impute with mean
import numpy
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
# load data: fields are separated by runs of whitespace, no header row
# (sep=r"\s+" replaces the deprecated delim_whitespace=True and parses the same way)
dataframe = read_csv("horse-colic.csv", sep=r"\s+", header=None)
dataset = dataframe.values
# split data into X (columns 0-26) and Y (column 27)
X = dataset[:, 0:27]
Y = dataset[:, 27]
# mark missing values ('?' in the file) as NaN so the imputer can find them
X[X == '?'] = numpy.nan
# convert to numeric
X = X.astype('float32')
# impute missing values as the mean (SimpleImputer is used with its defaults)
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so test rows contribute to the imputed values; fitting on X_train only would
# avoid that but would change the recorded accuracy -- confirm before changing.
imputer = SimpleImputer()
imputed_x = imputer.fit_transform(X)
# encode Y class values as integers
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(Y)
label_encoded_y = label_encoder.transform(Y)
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    imputed_x, label_encoded_y, test_size=test_size, random_state=seed
)
# fit model on training data
model = XGBClassifier()
model.fit(X_train, y_train)
print(model)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

XGBClassifier()
Accuracy: 79.80%


Reference:

https://machinelearningmastery.com/data-preparation-gradient-boosting-xgboost-python/