In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings('ignore')

import shutil
import tempfile
import urllib

#import datasets
temp_data = tempfile.mkdtemp()
path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data'
dataset = temp_data + '/balance-scale.data'
urllib.request.urlretrieve(path,dataset)
dataset_new = pd.read_csv(dataset,sep=',',header=None)
shutil.rmtree(temp_data)
dataset_new.head()

Unnamed: 0,0,1,2,3,4
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [2]:
dataset_new.describe()

Unnamed: 0,1,2,3,4
count,625.0,625.0,625.0,625.0
mean,3.0,3.0,3.0,3.0
std,1.415346,1.415346,1.415346,1.415346
min,1.0,1.0,1.0,1.0
25%,2.0,2.0,2.0,2.0
50%,3.0,3.0,3.0,3.0
75%,4.0,4.0,4.0,4.0
max,5.0,5.0,5.0,5.0


In [3]:
dataset_new.shape

(625, 5)

In [4]:
#Splitting Data into train and Test data sets
X = dataset_new.values[:,1:4]
Y = dataset_new.values[:,0]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 100)
print('X Values:',X,'\n')
print('Y Values:',Y,'\n')
print('X Train Set:',X_train,'\n')
print('X Test Set:',X_test,'\n')
print('Y Train Set:',Y_train,'\n')
print('Y Test Set:',Y_test,'\n')

X Values: [[1 1 1]
 [1 1 1]
 [1 1 1]
 ...
 [5 5 5]
 [5 5 5]
 [5 5 5]] 

Y Values: ['B' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R'
 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'L' 'B' 'R' 'R' 'R' 'B' 'R' 'R' 'R' 'R' 'R'
 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'L' 'L' 'B' 'R'
 'R' 'L' 'R' 'R' 'R' 'R' 'B' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R'
 'R' 'R' 'R' 'L' 'L' 'L' 'B' 'R' 'L' 'B' 'R' 'R' 'R' 'L' 'R' 'R' 'R' 'R'
 'B' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'L' 'L' 'L' 'L' 'B' 'L' 'L' 'R'
 'R' 'R' 'L' 'R' 'R' 'R' 'R' 'L' 'R' 'R' 'R' 'R' 'B' 'R' 'R' 'R' 'R' 'L'
 'B' 'R' 'R' 'R' 'B' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R'
 'R' 'R' 'R' 'R' 'R' 'R' 'L' 'L' 'L' 'B' 'R' 'L' 'B' 'R' 'R' 'R' 'L' 'R'
 'R' 'R' 'R' 'B' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'L' 'L' 'L' 'L' 'L'
 'L' 'L' 'B' 'R' 'R' 'L' 'B' 'R' 'R' 'R' 'L' 'R' 'R' 'R' 'R' 'L' 'R' 'R'
 'R' 'R' 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'B' 'R' 'L' 'L' 'R' 'R' 'R' 'L'
 'B' 'R' 'R' 'R' 'L' 'R' 'R' 'R' 'R' 'L' '

In [5]:
#gini impurity classifier
gini = DecisionTreeClassifier(random_state= 100,max_depth=3,min_samples_leaf=5) #default criterion is gini
gini.fit(X_train,Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

In [6]:
#Entropy 
entropy = DecisionTreeClassifier(criterion='entropy',random_state= 100,max_depth=3,min_samples_leaf=5)
entropy.fit(X_train,Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

In [7]:
#prediction
y_pred_gini = gini.predict(X_test)
y_pred_entropy = entropy.predict(X_test)
print(y_pred_entropy,'\n')
print(y_pred_gini,'\n')

['L' 'L' 'R' 'L' 'R' 'R' 'R' 'L' 'R' 'R' 'L' 'R' 'R' 'L' 'R' 'R' 'L' 'L'
 'L' 'L' 'R' 'L' 'R' 'L' 'R' 'L' 'R' 'L' 'L' 'L' 'R' 'R' 'R' 'L' 'L' 'L'
 'R' 'L' 'R' 'R' 'R' 'L' 'R' 'R' 'R' 'R' 'L' 'L' 'L' 'L' 'L' 'R' 'R' 'L'
 'R' 'L' 'R' 'R' 'R' 'R' 'R' 'R' 'L' 'R' 'R' 'R' 'R' 'R' 'L' 'R' 'R' 'R'
 'R' 'L' 'R' 'L' 'R' 'R' 'R' 'L' 'R' 'R' 'R' 'R' 'L' 'R' 'R' 'L' 'R' 'L'
 'R' 'R' 'L' 'L' 'L' 'R' 'R' 'L' 'R' 'R' 'R' 'R' 'L' 'R' 'R' 'R' 'R' 'R'
 'L' 'L' 'R' 'L' 'R' 'R' 'R' 'R' 'R' 'L' 'R' 'R' 'L' 'R' 'R' 'R' 'L' 'L'
 'L' 'L' 'L' 'R' 'R' 'L' 'R' 'L' 'R' 'R' 'R' 'L' 'L' 'R' 'L' 'L' 'L' 'R'
 'L' 'R' 'R' 'L' 'R' 'R' 'L' 'L' 'R' 'R' 'R' 'R' 'L' 'R' 'L' 'L' 'R' 'R'
 'R' 'L' 'R' 'L' 'L' 'R' 'R' 'R' 'L' 'R' 'R' 'R' 'R' 'R' 'L' 'R' 'L' 'R'
 'R' 'R' 'L' 'R' 'L' 'R' 'R' 'R'] 

['L' 'L' 'R' 'R' 'R' 'R' 'R' 'L' 'R' 'L' 'L' 'R' 'R' 'L' 'R' 'R' 'L' 'L'
 'L' 'L' 'R' 'L' 'R' 'L' 'R' 'L' 'R' 'L' 'L' 'L' 'L' 'R' 'R' 'L' 'L' 'L'
 'R' 'R' 'L' 'R' 'R' 'L' 'R' 'R' 'R' 'R' 'L' 'L' 'L' 'L' 'R' 'R' 'R' 'R'
 'R' 'L' 'R' 'R

In [8]:
print(gini.feature_importances_)
print(entropy.feature_importances_)

[0.35376318 0.30633381 0.33990301]
[0.36988243 0.34582392 0.28429365]


In [9]:
print(gini.score(X_train,Y_train))
print(entropy.score(X_train,Y_train))

0.7528604118993135
0.7551487414187643


In [10]:
#Confusion matrix
print('Confusion Matrix Gini:',confusion_matrix(Y_test,y_pred_gini))
print('Confusion Matrix Entropy:',confusion_matrix(Y_test,y_pred_entropy))

Confusion Matrix Gini: [[ 0  5  8]
 [ 0 56 29]
 [ 0  7 83]]
Confusion Matrix Entropy: [[ 0  5  8]
 [ 0 58 27]
 [ 0 10 80]]


In [11]:
#accuracy check
print('Accuracy_Entropy:', accuracy_score(Y_test,y_pred_entropy))
print('Accuracy_Gini:', accuracy_score(Y_test,y_pred_gini))

Accuracy_Entropy: 0.7340425531914894
Accuracy_Gini: 0.7393617021276596


In [12]:
#Report
print('Classification Report Entropy',classification_report(Y_test,y_pred_entropy))
print('Classification Report Gini',classification_report(Y_test,y_pred_gini))

Classification Report Entropy               precision    recall  f1-score   support

           B       0.00      0.00      0.00        13
           L       0.79      0.68      0.73        85
           R       0.70      0.89      0.78        90

   micro avg       0.73      0.73      0.73       188
   macro avg       0.50      0.52      0.50       188
weighted avg       0.69      0.73      0.71       188

Classification Report Gini               precision    recall  f1-score   support

           B       0.00      0.00      0.00        13
           L       0.82      0.66      0.73        85
           R       0.69      0.92      0.79        90

   micro avg       0.74      0.74      0.74       188
   macro avg       0.51      0.53      0.51       188
weighted avg       0.70      0.74      0.71       188

