## Decision Tree

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import subprocess



In [2]:
from IPython.display import Image
from collections import Counter
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import log_loss,classification_report, confusion_matrix, accuracy_score 
from sklearn.tree import DecisionTreeClassifier, export_graphviz


In [3]:
seed = 104
X, y = make_classification(n_samples=1000, n_features=20, n_informative=8, n_repeated=2, random_state=seed)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=seed)
print("Train label distribution:", Counter(y_train))
#help(Counter)
#?Counter
#X_train
print("Test label distribution", Counter(y_test)) 


Train label distribution: Counter({1: 402, 0: 398})
Test label distribution Counter({0: 101, 1: 99})


In [4]:
X.shape

(1000, 20)

In [5]:
y.shape

(1000,)

In [6]:
X_train.shape

(800, 20)

In [7]:
y_train.shape

(800,)

In [8]:
y_test.shape

(200,)

In [9]:
dt = DecisionTreeClassifier(random_state=seed)
dt.fit(X_train, y_train)
pred = dt.predict(X_test)


In [10]:
dt_acc = accuracy_score(y_test, pred)
dt_log_loss = log_loss(y_test, pred)
print("accuracy acore".format(dt_acc))
print("Log loss".format(dt_log_loss))


accuracy acore
Log loss


In [11]:
dt_acc


0.83

In [12]:
dt_log_loss


5.871679942852138

In [13]:
dt.tree_.node_count #Huge number of nodes


155

In [14]:
y_test[:5,]


array([0, 1, 0, 0, 1])

In [15]:
pred[:5,]

array([0, 1, 0, 0, 1])

In [16]:
dt_prob = dt.predict_proba(X_test)
dt_prob[:5,]

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.]])

In [17]:
confusion_matrix(y_test,pred)

array([[79, 22],
       [12, 87]])

In [18]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.87      0.78      0.82       101
           1       0.80      0.88      0.84        99

    accuracy                           0.83       200
   macro avg       0.83      0.83      0.83       200
weighted avg       0.83      0.83      0.83       200



In [19]:
accuracy_score(y_test,pred)

0.83

In [20]:
log_loss(y_test,pred)

5.871679942852138

## Adaboost

In [21]:
adaboost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                             algorithm='SAMME',
                             n_estimators=1000,
                             random_state=seed)
# We have 1000 trees

In [22]:
adaboost.fit(X_train,y_train)

AdaBoostClassifier(algorithm='SAMME',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                               

In [23]:
y_pred= adaboost.predict(X_test)

In [24]:
y_pred_prob = adaboost.predict_proba(X_test)

In [25]:
accuracy_score(y_test,y_pred)

0.9

In [26]:
confusion_matrix(y_test,y_pred)

array([[87, 14],
       [ 6, 93]])

In [27]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.94      0.86      0.90       101
           1       0.87      0.94      0.90        99

    accuracy                           0.90       200
   macro avg       0.90      0.90      0.90       200
weighted avg       0.90      0.90      0.90       200



In [28]:
log_loss(y_test,y_pred)

3.4539336113111823

In [29]:
y_test[:10,]

array([0, 1, 0, 0, 1, 0, 0, 0, 0, 0])

In [30]:
y_pred[:10,]

array([0, 1, 0, 0, 1, 0, 0, 0, 0, 0])

In [31]:
y_pred_prob[:10,]

array([[0.51038507, 0.48961493],
       [0.4945453 , 0.5054547 ],
       [0.50161204, 0.49838796],
       [0.50244857, 0.49755143],
       [0.4812046 , 0.5187954 ],
       [0.50208086, 0.49791914],
       [0.50024514, 0.49975486],
       [0.50840377, 0.49159623],
       [0.50666863, 0.49333137],
       [0.50128578, 0.49871422]])

In [32]:
adaboost.estimator_errors_[100] #Error obtained by tree 100

0.47162019912211595

In [33]:
adaboost.estimator_weights_[300] #Relative importance of tree 100

0.1606775278795549

## Gradient Boosted Trees

In [34]:
gbc =  GradientBoostingClassifier(max_depth=1,
                                 n_estimators=1000,
                                 warm_start=True,
                                 random_state=seed)

In [35]:
gbc.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=1,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=1000,
                           n_iter_no_change=None, presort='auto',
                           random_state=104, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0, warm_start=True)

In [44]:
gbc_pred = gbc.predict(X_test)

In [45]:
gbc_pred_prob = gbc.predict_proba(X_test)

In [47]:
#print(y_test)
#print(gbc_pred)

In [48]:
print(classification_report(y_test, gbc_pred))

              precision    recall  f1-score   support

           0       0.95      0.85      0.90       101
           1       0.86      0.95      0.90        99

    accuracy                           0.90       200
   macro avg       0.90      0.90      0.90       200
weighted avg       0.90      0.90      0.90       200



In [49]:
confusion_matrix(y_test, gbc_pred)

array([[86, 15],
       [ 5, 94]])

In [50]:
accuracy_score(y_test, gbc_pred)

0.9

## Using standard interface

In [51]:
import numpy as np
import xgboost as xgb