# Classification using XGBClassifier()

In [0]:
# Colab-only step: open the browser upload dialog so the user can supply
# pima-diabetes.csv, which the following cells read by that file name.
from google.colab import files
# The return value is kept in `uploaded`; the cells shown below do not use it
# and instead read the file from disk by name.
uploaded = files.upload()

In [0]:
# Preview the first five lines of the CSV (header row plus the first data rows)
# to confirm the upload worked and to see the column layout.
! head -n 5 pima-diabetes.csv

In [0]:
import numpy as np

# Load the CSV as a numeric array, skipping the single header line.
# skip_header is a line count, so pass 1 explicitly rather than True.
dataset = np.genfromtxt('pima-diabetes.csv', delimiter=",", skip_header=1)

# Shuffle the rows in place using a seeded generator. An unseeded shuffle
# would give a different row order on every run, which in turn makes the
# fixed-random_state train/validation split below differ between runs.
np.random.default_rng(123).shuffle(dataset)

In [0]:
# Split the table column-wise:
#   X -> every column except the last (the model inputs)
#   Y -> the last column only (diabetes / not), the value to predict
X, Y = dataset[:, :-1], dataset[:, -1]

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; random_state pins the split.
# Naming: *tr = training part, *va = validation part.
xtr, xva, ytr, yva = train_test_split(
    X,
    Y,
    random_state=123,
    test_size=0.2,
)

### Build and fit a model

In [0]:
import xgboost as xgb

# Some important parameters to experiment with:
#   n_estimators (e.g. 100)
#   max_depth    (e.g. 3)
# Here n_estimators is set to 1, with each tree limited to depth 3.
xg_cla = xgb.XGBClassifier(
    max_depth=3,
    n_estimators=1,
)

# Exercise: try adding eval_set=[(xva, yva)] to the fit() call below.
xg_cla.fit(X=xtr, y=ytr)

### Evaluate on the training set itself

In [0]:
# Predict on the training inputs themselves, so the output can be compared
# against ytr to see how well the model fits data it was trained on.
# NOTE: `preds` is reassigned by the validation cell further down; re-run this
# cell before re-running the training-set print cell.
preds = xg_cla.predict(xtr)

In [0]:
# Show float array entries with one decimal place, then print the first ten
# training labels on one line and the first ten predictions on the next.
np.set_printoptions(formatter=dict(float='{: 0.1f}'.format))
print(ytr[:10], preds[:10], sep='\n')

### Evaluate on the validation set

In [0]:
# Predict on the held-out validation inputs.
# NOTE: this overwrites the `preds` produced for the training set above.
preds = xg_cla.predict(xva)

In [0]:
# One-decimal formatting for float arrays (repeated here so this cell also
# works when run on its own).
float_fmt = '{: 0.1f}'.format
np.set_printoptions(formatter={'float': float_fmt})

# First ten validation labels, then the first ten predictions for comparison.
print(yva[:10], preds[:10], sep='\n')

**ToDo: Implement evaluation based on precision, accuracy, and recall**  

### Visualize one of the trees

In [0]:
import matplotlib.pyplot as plt

# Use a very wide default figure (50 x 10) for every figure created from
# here on, so the tree plots below have room for their nodes.
plt.rcParams.update({'figure.figsize': [50, 10]})

In [0]:
# Draw the tree at index 0 of the fitted model. No feature map is passed
# here, so the plot does not use the CSV column names.
xgb.plot_tree(xg_cla, num_trees=0)

### Visualize a tree with appropriate feature names

In [0]:
# Build fmap.txt, an XGBoost feature-map file, from the CSV header so that
# the tree plot can label splits with the real column names.

# Read only the header line; the context manager closes the file even if
# reading fails.
with open('pima-diabetes.csv') as csv_file:
    header_line = csv_file.readline()

cols = header_line.strip().split(',')
print(cols)
# Remove the output column
del cols[-1]

# One line per input feature: "<index>\t<name>\tq".
# 'q' is the feature-map type code for a quantitative feature.
with open('fmap.txt', 'w') as fmap_file:
    for i, col in enumerate(cols):
        # The feature-map format is whitespace-delimited, so a blank inside a
        # column name would be read as a field separator; use '_' instead.
        safe_name = '_'.join(col.split())
        fmap_file.write('{}\t{}\tq\n'.format(i, safe_name))

# Print the generated feature-map file to check its contents.
! cat fmap.txt

In [0]:
# Draw the tree at index 0 again, this time passing fmap.txt so the feature
# names written there are used in the plot.
xgb.plot_tree(xg_cla, num_trees=0, fmap='fmap.txt')