# Data Preparation for Gradient Boosting with XGBoost in Python

## 0. Introduction

This notebook walks through three data-preparation techniques, each followed by fitting an `XGBClassifier`:
  1. Label encoding on iris data
  2. One-hot encoding on breast-cancer data
  3. Missing value imputation on horse-colic data

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [31]:
# Locations of the three CSV datasets used below; all of them live in the same
# GitHub directory and differ only in the file name.
iris, breast_cancer, horse_colic = (
    f"https://raw.githubusercontent.com/jbrownlee/Datasets/master/{name}.csv"
    for name in ("iris", "breast-cancer", "horse-colic")
)

## 1. Label encoding on iris data

In [10]:
# NOTE(review): the saved output of this cell is identical to the horse-colic
# preview shown in section 3, and the cell that defines `iris` carries a later
# execution count than this one -- the output looks stale. Re-run the notebook
# from a fresh kernel to confirm what this cell actually displays.
data = pd.read_csv(
    iris,
    header=None,    # no header row is read; pandas assigns integer column labels
    na_values='?',  # '?' entries are parsed as missing values
)
# Preview the first five rows.
data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,2.0,1,530101,38.5,66.0,28.0,3.0,3.0,,2.0,...,45.0,8.4,,,2.0,2,11300,0,0,2
1,1.0,1,534817,39.2,88.0,20.0,,,4.0,1.0,...,50.0,85.0,2.0,2.0,3.0,2,2208,0,0,2
2,2.0,1,530334,38.3,40.0,24.0,1.0,1.0,3.0,1.0,...,33.0,6.7,,,1.0,2,0,0,0,1
3,1.0,9,5290409,39.1,164.0,84.0,4.0,1.0,6.0,2.0,...,48.0,7.2,3.0,5.3,2.0,1,2208,0,0,1
4,2.0,1,530255,37.3,104.0,35.0,,,6.0,2.0,...,74.0,7.4,,,2.0,2,4300,0,0,2


In [11]:
# Inputs are every column except the last one; the last column is the target.
X = data.iloc[:, :-1]

# Encode the target column as integer class codes.
le = LabelEncoder()
y = le.fit_transform(data.iloc[:, -1])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Show the shapes of the four pieces as a sanity check.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((240, 27), (60, 27), (240,), (60,))

In [12]:
# Inspect the integer-encoded training labels produced by the LabelEncoder.
print(y_train)

[1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 0 0
 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 0 1 1 1 1 1 0 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1
 1 1 0 1 1 0 1 0 0 0 0 1 0 1 1 1 1 1 0 1 1 1 0 1 1 1 0 0 1 1 1 0 0 1 1 0 1
 1 1 0 1 0 1 0 1 1 0 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 0 0 0 1 1 1 1 0 1 1 0 1
 0 0 1 1 1 0 0 0 1 0 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0
 1 0 1 1 0 1 0 1 1 1 1 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 0 0 1 0 0 1 0 1 0 0 1
 1 1 1 0 1 1 0 0 0 0 1 1 1 1 0 0 1 1]


In [15]:
# Fit an XGBoost classifier with default hyper-parameters on the training split.
model = XGBClassifier().fit(X_train, y_train)

# Score the held-out split: fraction of test rows predicted correctly.
y_preds = model.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=y_preds)
print(f"Accuracy: {score*100:.2f}%")

Accuracy: 90.00%


## 2. One-hot encoding on breast-cancer data

In [16]:
# Load the breast-cancer CSV; header=None means no header row is read, so the
# columns get integer labels.
data = pd.read_csv(
    breast_cancer,
    header=None,
)
# Preview the first five rows.
data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,'40-49','premeno','15-19','0-2','yes','3','right','left_up','no','recurrence-events'
1,'50-59','ge40','15-19','0-2','no','1','right','central','no','no-recurrence-events'
2,'50-59','ge40','35-39','0-2','no','2','left','left_low','no','recurrence-events'
3,'40-49','premeno','35-39','0-2','yes','3','right','left_low','yes','no-recurrence-events'
4,'40-49','premeno','30-34','3-5','yes','2','left','right_up','no','recurrence-events'


In [27]:
# Every column except the last is an input feature ...
X = data.iloc[:, :-1]
# ... and the last column is the class label to predict.
y = data.iloc[:, -1]

In [28]:
# One-hot encode every input column independently and place the resulting
# indicator blocks side by side, in the original column order.
encoded_blocks = []
for i in range(X.shape[1]):
  # Map the column's values to integer codes, shaped as a single-column 2-D
  # array because OneHotEncoder expects 2-D input.
  le = LabelEncoder()
  codes = le.fit_transform(X.iloc[:, i]).reshape(-1, 1)
  # The `sparse=False` keyword was deprecated in scikit-learn 1.2 and removed
  # in 1.4 (replaced by `sparse_output`). Keeping the encoder's default sparse
  # output and densifying with .toarray() yields the same dense array on both
  # old and new releases.
  oh = OneHotEncoder(categories='auto')
  encoded_blocks.append(oh.fit_transform(codes).toarray())

# Concatenate once at the end instead of re-copying a growing array on every
# iteration. As before, encode_x stays None when X has no columns.
encode_x = np.concatenate(encoded_blocks, axis=1) if encoded_blocks else None



In [29]:
# Learn the set of class labels, then convert the target column to the
# corresponding integer codes.
le = LabelEncoder().fit(y)
encode_y = le.transform(y)

In [30]:
# Hold out 20% of the one-hot encoded rows for evaluation (fixed seed for a
# reproducible split).
X_train, X_test, y_train, y_test = train_test_split(
    encode_x,
    encode_y,
    test_size=0.2,
    random_state=42,
)

# Fit a default XGBoost classifier and report accuracy on the held-out rows.
model = XGBClassifier().fit(X_train, y_train)
y_preds = model.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=y_preds)
print(f"Accuracy: {score*100:.2f}%")

Accuracy: 72.41%


## 3. Missing value imputation on horse-colic data

In [33]:
# Load the horse-colic CSV. No header row is read (integer column labels), and
# '?' entries are parsed as missing values so they show up as NaN.
data = pd.read_csv(
    horse_colic,
    header=None,
    na_values='?',
)
# Preview the first five rows.
data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,2.0,1,530101,38.5,66.0,28.0,3.0,3.0,,2.0,...,45.0,8.4,,,2.0,2,11300,0,0,2
1,1.0,1,534817,39.2,88.0,20.0,,,4.0,1.0,...,50.0,85.0,2.0,2.0,3.0,2,2208,0,0,2
2,2.0,1,530334,38.3,40.0,24.0,1.0,1.0,3.0,1.0,...,33.0,6.7,,,1.0,2,0,0,0,1
3,1.0,9,5290409,39.1,164.0,84.0,4.0,1.0,6.0,2.0,...,48.0,7.2,3.0,5.3,2.0,1,2208,0,0,1
4,2.0,1,530255,37.3,104.0,35.0,,,6.0,2.0,...,74.0,7.4,,,2.0,2,4300,0,0,2


In [36]:
# Inputs are every column except the last; the last column is the target.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Replace each missing input value with the mean of its column.
# NOTE(review): the imputer is fitted on all rows here, before the train/test
# split performed in the following cell, so held-out rows contribute to the
# column means -- consider fitting on the training split only.
si = SimpleImputer(strategy='mean').fit(X)
imputed_X = si.transform(X)

# Encode the target column as integer class codes.
le = LabelEncoder().fit(y)
encoded_y = le.transform(y)

In [37]:
# Hold out 20% of the imputed rows for evaluation (fixed seed for a
# reproducible split).
X_train, X_test, y_train, y_test = train_test_split(
    imputed_X,
    encoded_y,
    test_size=0.2,
    random_state=42,
)

# Fit a default XGBoost classifier and report accuracy on the held-out rows.
model = XGBClassifier().fit(X_train, y_train)
y_preds = model.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=y_preds)
print(f"Accuracy: {score*100:.2f}%")

Accuracy: 88.33%
