## Import

In [31]:
from sklearn import datasets
from sklearn import model_selection
import pandas as pd
import warnings
import numpy as np
#from sklearn.tree import DecisionTreeClassifier
import time 
import matplotlib.pyplot as plt
import graphviz
from enum import Enum
import sklearn as skl
from sklearn import tree
import math
warnings.filterwarnings('ignore')

In [32]:
constLeft: str = "Left"
constRight: str = "Right"
constSpace: str = "--"
constLabel: str = "Label"

constAccuracy: str = "Accuracy"
constPrecision: str = "Precision"
constRecall: str = "Recall"
constF1score: str = "F1score"
constTimeTaken: str = "TimeTaken"

constOriginal: str = "Original"
constLibrary: str = "Library"

constModelName: str = "ModelName"  #Model 1, 2, 3....
constModelType: str = "ModelType" #Original or Library
constCriterion: str ="Criterion" #Gini or entropy
constMaxDepth: str = "MaxDepth" #user input
constTreeDepth:str =  "treeDepth" #final
constRows:str = "Rows"
constColumns:str = "Features"
constDataset:str = "Dataset"
constNumberOfLeaves:str = "NumberOfLeaves"

# Decision Tree


## Helper methods for Decision tree



*italicized text*


In [1]:
#@title

#Calculation of gini value
def giniValue(localtrainingSet, dataset) -> float:
  giniValue = sumOfProbabilities = 0 
  classes = dataset[constLabel].unique()
  lenLTS = len(localtrainingSet)

  if len(classes)>1 and lenLTS > 0:
    for className in classes:
      localTreeClass = localtrainingSet[localtrainingSet[constLabel] == className] #Filter for each class
      sumOfProbabilities+= (len(localTreeClass) / lenLTS) ** 2 
    giniValue = 1 - sumOfProbabilities
  return giniValue #return the Gini Index value for this split

#find the best partition that minimizes the Gini index:
def getGiniIndexValue( splitPoints: list, colName: str, dataset, columnType) -> tuple(): #returns tuple with (Gini index value, the split value)
  localLeftTree = localRightTree = [] #= giniIndexes = []
  D = len(dataset)
  lowestImpurity = 1; giniIndex = None

  for midVal in splitPoints:
    if columnType == "str":
      localLeftTree = dataset[dataset[colName] == midVal] 
      localRightTree = dataset[dataset[colName] != midVal]  
    else:
      localLeftTree = dataset[dataset[colName] <= midVal] 
      localRightTree = dataset[dataset[colName] > midVal]  

    D1 = len(localLeftTree); D2 = len(localRightTree);

    if D >0 and D1>0 and D2>0:        #weighted average of gini impurities
        localGiniImpurity = ( (D1/ D) * (giniValue(localLeftTree, dataset)) + (D2/ D) * (giniValue(localRightTree, dataset)))
        if localGiniImpurity < lowestImpurity:
          lowestImpurity = localGiniImpurity
          giniIndex = (localGiniImpurity, midVal)
  
  if giniIndex != None:
    return giniIndex# returns the tuple with the smallest gini index value
  
  return False #Split could not be determined


In [3]:

#Calculation of entropy value
def entropyValue(localtrainingSet, dataset) -> float:
  entropyValue = sumOfLocalEntropies = 0 
  classes = dataset[constLabel].unique()
  lenLTS = len(localtrainingSet)

  if len(classes)>1 and lenLTS > 0:
    for className in classes:
      localTreeClass = localtrainingSet[localtrainingSet[constLabel] == className] #Filter for each class
      probLocalTreeClass = len(localTreeClass) / lenLTS
      if probLocalTreeClass>0:
        sumOfLocalEntropies = probLocalTreeClass *  math.log2(probLocalTreeClass)
    entropyValue = sumOfLocalEntropies #1 - sumOfProbabilities
  return entropyValue

#find the best partition that maximizes Information Gain:
def getBestInfoGainSplit( splitPoints: list, colName: str, dataset) -> tuple(): #returns tuple with (Entropy value, the split value)
  localLeftTree = localRightTree = [] 
  highestInfoGain = 0; infoGainIndex = None

  for midVal in splitPoints:
      localLeftTree = dataset[dataset[colName] <= midVal] 
      localRightTree = dataset[dataset[colName] > midVal]  
      D1 = len(localLeftTree); D2 = len(localRightTree); D = len(dataset)

      if D >0 and D1>0 and D2>0:        #weighted average of entropies
          postSplitEntropy = ( (D1/ D) * (entropyValue(localLeftTree, dataset)) + (D2/ D) * (entropyValue(localRightTree, dataset)))
          preSplitEntropy = entropyValue(dataset, dataset)
          infoGain = preSplitEntropy - postSplitEntropy
          if infoGain >0 and infoGain> highestInfoGain:
              highestInfoGain = infoGain
              infoGainIndex = (infoGain, midVal)

  if infoGainIndex!=None:
      return infoGainIndex #returns the tuple with the highest info gain values
  return False #Split could not be determined

#@title
def getBestEntropySplit( splitPoints: list, colName: str, dataset) -> tuple(): #returns tuple with (Entropy value, the split value)
  localLeftTree = localRightTree = [] = entropySplits = []
  D = len(dataset)
  for midVal in splitPoints:
      localLeftTree = dataset[dataset[colName] <= midVal] 
      localRightTree = dataset[dataset[colName] > midVal]  
      D1 = len(localLeftTree); D2 = len(localRightTree); 

      if D >0 and D1>0 and D2>0:        #weighted average of entropies
          localEntropy = ( (D1/ D) * (entropyValue(localLeftTree, dataset)) + (D2/ D) * (entropyValue(localRightTree, dataset)))
          entropySplits.append((localEntropy, midVal))

  if len(entropySplits) > 0 :
      return sorted(entropySplits)[0]# returns the tuple with the smallest entropy
  return False #Split could not be determined

#@title
def handleNumericalCols(columnData):
    splitPoints =[]
    columnData = columnData.unique()
    for i in range(columnData.shape[0] - 1):
        splitPoints.append((columnData[i] + columnData[i+1])/2)
    return splitPoints;

def handleCategoricalCols(columnData):
    return columnData.unique()

def showTree(node, spaces, side):
  if node.nodeType == NodeTypes.LEAF:
    print(f"{spaces}{side} [{NodeTypes.LEAF.value}]: {constLabel} = {node.label}") 
  elif node.nodeType == NodeTypes.PARENT:
    print(f"{spaces}{side} [{NodeTypes.PARENT.value}]: {node} ")
  elif node.nodeType == NodeTypes.ROOT:
    print(f"[{NodeTypes.ROOT.value}]: {node} ")

  if node.left != None:
    showTree(node.left, spaces*2, constLeft)
  if node.right != None:
    showTree(node.right, spaces*2, constRight)
  return;



In [5]:
#@title
def getBestEntropySplit( splitPoints: list, colName: str, dataset) -> tuple(): #returns tuple with (Entropy value, the split value)
  localLeftTree = localRightTree = [] = entropySplits = []
  for midVal in splitPoints:
      localLeftTree = dataset[dataset[colName] <= midVal] 
      localRightTree = dataset[dataset[colName] > midVal]  
      D1 = len(localLeftTree); D2 = len(localRightTree); D = len(dataset)

      if D >0 and D1>0 and D2>0:        #weighted average of entropies
          localEntropy = ( (D1/ D) * (entropyValue(localLeftTree, dataset)) + (D2/ D) * (entropyValue(localRightTree, dataset)))
          entropySplits.append((localEntropy, midVal))

  if len(entropySplits) > 0 :
      return sorted(entropySplits)[0]# returns the tuple with the smallest entropy
  return False #Split could not be determined


In [6]:
#@title
def handleNumericalCols(columnData):
    splitPoints =[]
    columnData = columnData.unique()
    for i in range(columnData.shape[0] - 1):
        splitPoints.append((columnData[i] + columnData[i+1])/2)
    return splitPoints;

In [7]:
#@title
def handleCategoricalCols(columnData):
    splitPoints =[]
    columnData = columnData.unique()
    return splitPoints;

In [8]:
#@title
def showTree(node, spaces, side):
  if node.nodeType == NodeTypes.LEAF:
    print(f"{spaces}{side} [{NodeTypes.LEAF.value}]: {constLabel} = {node.label}") 
  elif node.nodeType == NodeTypes.PARENT:
    print(f"{spaces}{side} [{NodeTypes.PARENT.value}]: {node} ")
  elif node.nodeType == NodeTypes.ROOT:
    print(f"[{NodeTypes.ROOT.value}]: {node} ")

  if node.left != None:
    showTree(node.left, spaces*2, constLeft)
  if node.right != None:
    showTree(node.right, spaces*2, constRight)
  return;

## Helper Classes

In [7]:
#@title
class NodeTypes():
  LEAF = "Leaf"
  PARENT = "Decision"
  ROOT = "Root"

In [8]:
#@title
class CriterionTypes:
  GINI = "gini"
  ENTROPY = "entropy"

In [9]:
#@title
class Node:  #class definition
    def __init__(self, feature, splitValue, testCriteria, nodeType=None, criteriaValue = None, left=None, right=None):  
        #Question - column, value
        self.feature = feature
        self.testCriteria = testCriteria
        self.splitValue = splitValue 
        self.nodeType = nodeType # (Root, parent, child) 
        self.label = None
        self.criteriaValue = criteriaValue
        self.left = left
        self.right = right

    def __repr__(self):
        # This is just a helper method to print the question in a readable format.
        return "Is %s %s %s?" % (
            self.feature, self.testCriteria, self.splitValue)

In [10]:
#@title
class Tree:
  def __init__(self, rootNode: Node = None):
    self.root = rootNode

In [72]:
feature1 = ["fd"]
target1 = ["rr"]
dTreeOrg = DecisionTree(maxDepth = 0)
dTreeOrg.modelFit(feature1, target1)

maxDepth of Decision Tree is set to 0. Please change and try again


In [83]:
"""## Synthetic Train and Test datasets"""
datasetS2 = pd.DataFrame([[1,2,1],[3,2,1],[5,2,1],[8,3,1],[11,3,1],[12,3,1],[15,4,1]], columns=["feature1", "feature2", constLabel])


dfFeaturesS2 = datasetS2.iloc[:,:2]
dfTargetS2 = datasetS2.iloc[:,2].to_frame()
trainFeatureSetS2, testFeatureSetS2, trainTargetSetS2, testTargetSetS2 = model_selection.train_test_split(dfFeaturesS2, dfTargetS2,
                                                                                                      test_size = 0.3,
                                                                                                      random_state = 42)


## TC7 - All labels belong to one class

dTreeOrgS2 = DecisionTree()
dTreeOrgS2.modelFit(trainFeatureSetS2, trainTargetSetS2)
dTreeOrgS2.modelPredict(testFeatureSetS2)

Dataset has only one label and doesn't need Classification.


[1, 1, 1]

## Decision Tree

In [81]:
#@title

class DecisionTree:     #Declaring DecisionTree
  #Constructor
  def __init__(self, criterion: str = "gini", maxDepth: int = 10) -> None:
    self.criterion = criterion
    self.maxDepth = maxDepth
    self.treeDepth = 0
    self.numberLeafNodes = 0
    
  def _prechecksOnDataSet(self, featureSet, targetSet):
    if len(featureSet)==0 or len(targetSet) == 0:
      print(f"One of the input datasets is empty")
      return False
    elif type(self.maxDepth) != int:
      print("Error: Max depth needs to be an integer")
      return False
    elif self.maxDepth <1:
      print(f"maxDepth of Decision Tree is set to {self.maxDepth}. Please change and try again")
      return False
    elif self.criterion != "" and (self.criterion!= CriterionTypes.ENTROPY and self.criterion != CriterionTypes.GINI):
      print("Error: Criteria Type is etiher 'entropy' or 'gini'")
      return False
    elif type(targetSet) != pd.DataFrame or type(featureSet) != pd.DataFrame:
      self.featureSet = pd.DataFrame(featureSet)
      self.targetSet = pd.DataFrame(targetSet)
    

    if len(targetSet[constLabel].value_counts()) <2: #Atleast 2 classes in the target
      print("Dataset has only one label and doesn't need Classification.")
    elif self.maxDepth <1:
      print(f"maxDepth of Decision Tree is set to {self.maxDepth}. Please change and try again")
    # elif self.targetSet.isna().sum()>0 or self.featureSet.isna().sum()>0:
    #     print("NA values found. Please fix before proceeding.")
    return True
      
  def modelFit(self, featureSet, targetSet=None):
      self.featureSet = featureSet
      self.targetSet = targetSet
      
      if (self._prechecksOnDataSet(featureSet, targetSet)) :
          self._train(self.featureSet.join(self.targetSet))
      

  def _train(self, trainingSet):
      rootNode = self._findBestSplit(trainingSet)
      rootNode.nodeType = NodeTypes.ROOT
      self.Tree = Tree(rootNode)

  def _findBestSplit(self, trainingSet, treeDepth = 0) -> Node:
    if self.treeDepth< treeDepth: 
      self.treeDepth = treeDepth 
    if len(trainingSet[constLabel].unique()) == 1 or (treeDepth >= self.maxDepth):
        leafNode = Node(feature=None, splitValue= None, testCriteria=None
                        , criteriaValue = 0, nodeType = NodeTypes.LEAF, left = None, right=None)
        leafNode.nodeType = NodeTypes.LEAF
        self.numberLeafNodes  = self.numberLeafNodes+1
        leafNode.label = trainingSet[constLabel].mode()[0] #gives the most probable label by taking the label with highest count
        return leafNode        
        
    listOfAllColsWithInfo = []   
    besNode = None
    #sortedData = trainingSet.copy()
    
    for columnName in trainingSet:
      if columnName == constLabel:
        break; 
      sortedData = trainingSet.sort_values(by=columnName)
      columnData = sortedData[columnName]       
      columnType = columnData.dtype.name
      testCriteriaVar = "<="

      if (columnType == "float64" or  columnType == "int64" ): # continuous integer features, sort it in ascending order 
          splitPoints = handleNumericalCols(columnData)
      elif columnType == "str":
          testCriteriaVar = "=="
          splitPoints = handleCategoricalCols(columnData)
      else:
          splitPoints = []
          
      if len(splitPoints) == 0:
          continue;

      #Get Gini Index value and the split point value
      if self.criterion == CriterionTypes.ENTROPY:
          entropyVal = getBestEntropySplit(splitPoints, columnName, sortedData)
          if entropyVal == False:
              continue
          else:
              impurityValue, split = entropyVal
      else:
          giniIndexVal= getGiniIndexValue(splitPoints, columnName, sortedData, columnType)
          if giniIndexVal == False:
              continue
          else:  
              impurityValue, split = giniIndexVal
      
      #if impurityValue < currentImpurity:
      #  currentImpurity = impurityValue
      node = Node(criteriaValue=impurityValue, feature=columnName, splitValue=split, 
                        testCriteria=testCriteriaVar, nodeType = NodeTypes.PARENT)
              
      listOfAllColsWithInfo.append(node)

    #We use the split point value calculated to get the left and right branches of the decision tree
    bestNode:Node = sorted(listOfAllColsWithInfo, key = lambda x: x.criteriaValue)[0]
    if testCriteriaVar == "==":
      leftTree = trainingSet[trainingSet[bestNode.feature] == bestNode.splitValue]
    else:
      leftTree = trainingSet[trainingSet[bestNode.feature] <= bestNode.splitValue]
    bestNode.left = self._findBestSplit(leftTree, treeDepth + 1 )   

    if testCriteriaVar == "==":
      rightTree = trainingSet[trainingSet[bestNode.feature] != bestNode.splitValue]
    else:
      rightTree = trainingSet[trainingSet[bestNode.feature] > bestNode.splitValue]
    bestNode.right  = self._findBestSplit(rightTree, treeDepth + 1 )
    
    return bestNode

  def modelPredict(self, testFeatureSet):
      if len(testFeatureSet) == 0:
          print(f"Empty dataset. Please check again")
          return;
      predictedClassesList = []
      for rowIndex in range(testFeatureSet.shape[0]):
        if self.treeDepth == 0:
          predictedClass = self.Tree.root.label
        else:
          predictedClass = self._traverseTree(testFeatureSet.iloc[rowIndex,:], self.Tree.root)
        predictedClassesList.append(predictedClass)
      return predictedClassesList       
      
  def _traverseTree(self, row, node:Node):
    if node.nodeType == NodeTypes.LEAF:
        return node.label
    if node.testCriteria == "<=":
      if row[node.feature] <= node.splitValue:
          return self._traverseTree(row, node.left)
      else:
          return self._traverseTree(row, node.right)
            
  def showTree(self):
    showTree(self.Tree.root,"--", "root")


encoding

Ref: https://www.kaggle.com/prashant111/decision-tree-classifier-tutorial#12.-Feature-Engineering-


In [15]:
datasetS1 = pd.DataFrame([[1,2,0],[3,2,0],[5,2,0],[8,3,1],[11,3,1],[12,3,1],[15,4,1]], columns=["feature1", "feature2", constLabel])

#@title
dfFeaturesS1 = datasetS1.ix[]

In [46]:
tree.DecisionTreeClassifier(max_depth="dsfld")

DecisionTreeClassifier(max_depth='dsfld')

[link text](https://archive.ics.uci.edu/ml/datasets/wine)

## Helper methods for evaluation

In [16]:
def getEvaluationMetrics(testTargetSet, predictedTargetSet): 
  accuracy = skl.metrics.accuracy_score(y_true = testTargetSet, y_pred=predictedTargetSet)
  precision = skl.metrics.precision_score(y_true = testTargetSet, y_pred=predictedTargetSet, average='weighted')
  recall = skl.metrics.recall_score(y_true = testTargetSet, y_pred=predictedTargetSet, average='weighted')
  f1score = skl.metrics.f1_score(y_true = testTargetSet, y_pred=predictedTargetSet, average='weighted')
  #classificationReport = skl.metrics.classification_report(testTargetSet, predictedTargetSet)
  return {
      constAccuracy: round(accuracy,5),
      constPrecision: round(precision,5),
      constRecall: round(recall,5),
      constF1score: round(f1score,5)
      }

In [17]:
def printEvaluationMetrics(predictedTargetSet):
  evalMetrics = getEvaluationMetrics(predictedTargetSet)
  print (evalMetrics)
  print(f"Accuracy score of model: {evalMetrics[constAccuracy]}")
  print(f"Precision score of model: {evalMetrics['precision']}")
  print(f"Recall score of model: {evalMetrics['recall']}")
  print(f"F1 of model: {evalMetrics['f1score']}")

In [18]:
#@title
def getTimeTaken(startTime, endTime):
  return round((endTime - startTime), 5)

In [19]:
#@title
def getAllMetrics(modelName, modelType, modelCriteria, maxDepth, treeDepth, rows, columns, evalMetrics, timeTaken, numberOfLeaves, dataSetName):
  return {
      constModelName: modelName,
      constModelType: modelType,
      constCriterion: modelCriteria,
      constMaxDepth: maxDepth,
      constTreeDepth: treeDepth,
      constRows: rows,
      constColumns: columns,
      constAccuracy: evalMetrics[constAccuracy],
      constPrecision: evalMetrics[constPrecision],
      constRecall: evalMetrics[constRecall],
      constF1score: evalMetrics[constF1score], 
      constTimeTaken: timeTaken,
      constNumberOfLeaves: numberOfLeaves,
      constDataset: dataSetName
  }

In [20]:
#@title
dfEvaluationMetrics = pd.DataFrame(columns=[ constModelName, constModelType, constCriterion, constMaxDepth, constTreeDepth,
                                            constRows, constColumns, constAccuracy, constPrecision,
                                            constRecall, constF1score, constTimeTaken])

# UCI Datasets

## TC1 - Wine Dataset

In [21]:
#@title
wineDataset = datasets.load_wine()
featureSet1, targetSet1= wineDataset.data, wineDataset.target
dfFeatures1 = pd.DataFrame(featureSet1, columns = wineDataset.feature_names)
dfTarget1 = pd.DataFrame(targetSet1, columns = [constLabel])

"""## Train and Test datasets"""
trainFeatureSet1, testFeatureSet1, trainTargetSet1, testTargetSet1 = model_selection.train_test_split(dfFeatures1, dfTarget1,
                                                                                                      train_size = 0.7, test_size = 0.3,
                                                                                                      random_state=42)
  

### Self-made Model (Model1)

In [22]:
#@title
#Train
dTreeOrg1 = DecisionTree()
dTreeOrgStartTime = time.time()
dTreeOrg1.modelFit(trainFeatureSet1, trainTargetSet1)
dTreeOrgEndTime = time.time()
dTreeOrgTimeTaken = getTimeTaken(dTreeOrgStartTime, dTreeOrgEndTime) 

In [23]:
#Test
predictedValues1 = dTreeOrg1.modelPredict(testFeatureSet1)

In [24]:
eval1=getEvaluationMetrics(testTargetSet1, predictedTargetSet = pd.DataFrame(predictedValues1, columns=[constLabel]) )

In [25]:
newRow = getAllMetrics("Model1",constOriginal, dTreeOrg1.criterion, dTreeOrg1.maxDepth, dTreeOrg1.treeDepth,
                       trainFeatureSet1.shape[0],trainFeatureSet1.shape[1], eval1, dTreeOrgTimeTaken, dTreeOrg1.numberLeafNodes, "Wine")

In [26]:
dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

In [27]:
dTreeOrg1.showTree()

[Root]: Is color_intensity <= 3.82? 
----Left [Decision]: Is ash <= 3.0700000000000003? 
--------Left [Decision]: Is od280/od315_of_diluted_wines <= 3.8200000000000003? 
----------------Left [Leaf]: Label = 1
----------------Right [Leaf]: Label = 0
--------Right [Leaf]: Label = 0
----Right [Decision]: Is flavanoids <= 1.4? 
--------Left [Leaf]: Label = 2
--------Right [Decision]: Is proline <= 724.5? 
----------------Left [Decision]: Is alcohol <= 13.145? 
--------------------------------Left [Leaf]: Label = 1
--------------------------------Right [Leaf]: Label = 0
----------------Right [Leaf]: Label = 0


### sklearn model (Model2)

In [28]:
dTreeSkl1 = tree.DecisionTreeClassifier(criterion='gini', random_state=42) 

In [29]:
#Train
dTreeSklStartTime = time.time()
dTreeSkl1.fit(trainFeatureSet1, trainTargetSet1)
dTreeSklEndTime = time.time()
dTreeSklTimeTaken = getTimeTaken(dTreeSklStartTime, dTreeSklEndTime) 
print(f"Time taken to train Decision Tree:", dTreeSklTimeTaken)

Time taken to train Decision Tree: 0.01357


In [30]:
#Test
predictedValuesSkl1 = dTreeSkl1.predict(testFeatureSet1)

In [31]:
eval2 = getEvaluationMetrics(testTargetSet1, predictedTargetSet = pd.DataFrame(predictedValuesSkl1, columns=[constLabel]))

In [32]:
newRow = getAllMetrics("Model2",constLibrary, dTreeSkl1.criterion, dTreeSkl1.max_depth, dTreeSkl1.get_depth(),
                       trainFeatureSet1.shape[0],trainFeatureSet1.shape[1], eval2, dTreeSklTimeTaken, dTreeSkl1.get_n_leaves(),"Wine")

In [33]:
dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

In [41]:
print(tree.export_text(dTreeSkl1, feature_names=wineDataset.feature_names))

|--- color_intensity <= 3.82
|   |--- proline <= 1010.00
|   |   |--- ash <= 3.07
|   |   |   |--- class: 1
|   |   |--- ash >  3.07
|   |   |   |--- class: 0
|   |--- proline >  1010.00
|   |   |--- class: 0
|--- color_intensity >  3.82
|   |--- flavanoids <= 1.40
|   |   |--- class: 2
|   |--- flavanoids >  1.40
|   |   |--- proline <= 724.50
|   |   |   |--- alcohol <= 13.14
|   |   |   |   |--- class: 1
|   |   |   |--- alcohol >  13.14
|   |   |   |   |--- class: 0
|   |   |--- proline >  724.50
|   |   |   |--- class: 0



## TC 1.1 - Wine DataSet with Max Depth changes

### Self-made Model (Model3)



```
get
```



In [34]:
#Train
dTreeOrg11 = DecisionTree(maxDepth=2)
dTreeOrgStartTime = time.time()
dTreeOrg11.modelFit(trainFeatureSet1, trainTargetSet1)
dTreeOrgEndTime = time.time()
dTreeOrgTimeTaken = getTimeTaken(dTreeOrgStartTime, dTreeOrgEndTime) 
print(f"Time taken to train Decision Tree:", dTreeOrgTimeTaken)

Time taken to train Decision Tree: 12.0581


In [35]:
#Test
predictedValues11 = dTreeOrg11.modelPredict(testFeatureSet1)

In [36]:
eval1 = getEvaluationMetrics(testTargetSet1, predictedTargetSet = pd.DataFrame(predictedValues11, columns=[constLabel]))

In [37]:
newRow = getAllMetrics("Model3",constOriginal, dTreeOrg11.criterion, dTreeOrg11.maxDepth, dTreeOrg11.treeDepth,
                       trainFeatureSet1.shape[0],trainFeatureSet1.shape[1], eval1, dTreeOrgTimeTaken,dTreeOrg11.numberLeafNodes,"Wine")

dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

### sklearn model (Model4)

In [38]:
dTreeSkl11 = tree.DecisionTreeClassifier(criterion='gini', random_state=42, max_depth=2)

In [39]:
#Train
dTreeSklStartTime = time.time()
dTreeSkl11.fit(trainFeatureSet1, trainTargetSet1)
dTreeSklEndTime = time.time()
dTreeSklTimeTaken = getTimeTaken(dTreeSklStartTime, dTreeSklEndTime) 
print(f"Time taken to train Decision Tree:", dTreeSklTimeTaken)

Time taken to train Decision Tree: 0.00761


In [40]:
#Test
predictedValuesSkl11 = dTreeSkl11.predict(testFeatureSet1)

In [41]:
eval2 = getEvaluationMetrics(predictedValuesSkl1, testTargetSet1)
newRow = getAllMetrics("Model4", constLibrary, dTreeSkl11.criterion, dTreeSkl11.max_depth, dTreeSkl11.get_depth(),
                       trainFeatureSet1.shape[0],trainFeatureSet1.shape[1], eval2, dTreeSklTimeTaken,dTreeSkl11.get_n_leaves(), "Wine")

dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

## TC 1.2 Wine dataset with Entropy Criteria

### Self- made model (Model5)

In [42]:
#Train
dTreeOrg12 = DecisionTree(criterion = CriterionTypes.ENTROPY)
dTreeOrgStartTime = time.time()
dTreeOrg12.modelFit(trainFeatureSet1, trainTargetSet1)
dTreeOrgEndTime = time.time()
dTreeOrgTimeTaken = getTimeTaken(dTreeOrgStartTime, dTreeOrgEndTime) 
print(f"Time taken to train Decision Tree:", dTreeOrgTimeTaken)

Time taken to train Decision Tree: 68.31401


In [43]:
#Test
predictedValues12 = dTreeOrg12.modelPredict(testFeatureSet1)

In [44]:
eval1 = getEvaluationMetrics(testTargetSet1, predictedValues12)

In [45]:
newRow = getAllMetrics("Model5",constOriginal, dTreeOrg12.criterion, dTreeOrg12.maxDepth, dTreeOrg12.treeDepth,
                       trainFeatureSet1.shape[0],trainFeatureSet1.shape[1], eval1, dTreeOrgTimeTaken, dTreeOrg12.numberLeafNodes, "Wine")

dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

### sklearn model (Model6)

In [46]:
dTreeSkl12 = tree.DecisionTreeClassifier(criterion=CriterionTypes.ENTROPY, random_state=42)
dTreeSkl12.fit(trainFeatureSet1, trainTargetSet1)

DecisionTreeClassifier(criterion='entropy', random_state=42)

In [47]:
#Train
dTreeSklStartTime = time.time()
dTreeSkl12.fit(trainFeatureSet1, trainTargetSet1)
dTreeSklEndTime = time.time()
dTreeSklTimeTaken = getTimeTaken(dTreeSklStartTime, dTreeSklEndTime) 
print(f"Time taken to train Decision Tree:", dTreeSklTimeTaken)

Time taken to train Decision Tree: 0.00624


In [48]:
#Test
predictedValuesSkl12 = dTreeSkl12.predict(testFeatureSet1)

In [49]:
getEvaluationMetrics(predictedValuesSkl12, testTargetSet1)

{'Accuracy': 0.85185,
 'F1score': 0.85296,
 'Precision': 0.85995,
 'Recall': 0.85185}

In [50]:
eval2 = getEvaluationMetrics(predictedValuesSkl12, testTargetSet1)
newRow = getAllMetrics("Model6",constLibrary, dTreeSkl12.criterion, dTreeSkl12.max_depth, dTreeSkl12.get_depth(),
                       trainFeatureSet1.shape[0],trainFeatureSet1.shape[1], eval2, dTreeSklTimeTaken, dTreeSkl12.get_n_leaves(),"Wine")

dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

## TC 1.3 Wine dataset with Entropy Criteria and  MaxDepth = 3

### Self- made model (Model11)

In [51]:
#Train
dTreeOrg12 = DecisionTree(criterion = CriterionTypes.ENTROPY, maxDepth=3)
dTreeOrgStartTime = time.time()
dTreeOrg12.modelFit(trainFeatureSet1, trainTargetSet1)
dTreeOrgEndTime = time.time()
dTreeOrgTimeTaken = getTimeTaken(dTreeOrgStartTime, dTreeOrgEndTime) 
print(f"Time taken to train Decision Tree:", dTreeOrgTimeTaken)

Time taken to train Decision Tree: 23.23381


In [52]:
#Test
predictedValues12 = dTreeOrg12.modelPredict(testFeatureSet1)

In [53]:
eval1 = getEvaluationMetrics(testTargetSet1, predictedValues12)

In [54]:
newRow = getAllMetrics("Model11",constOriginal, dTreeOrg12.criterion, dTreeOrg12.maxDepth, dTreeOrg12.treeDepth,
                       trainFeatureSet1.shape[0],trainFeatureSet1.shape[1], eval1, dTreeOrgTimeTaken, dTreeOrg12.numberLeafNodes ,"Wine")

dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

### sklearn model (Model12)

In [55]:
dTreeSkl12 = tree.DecisionTreeClassifier(criterion=CriterionTypes.ENTROPY, max_depth=3, random_state=42)
dTreeSkl12.fit(trainFeatureSet1, trainTargetSet1)

DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=42)

In [56]:
#Train
dTreeSklStartTime = time.time()
dTreeSkl12.fit(trainFeatureSet1, trainTargetSet1)
dTreeSklEndTime = time.time()
dTreeSklTimeTaken = getTimeTaken(dTreeSklStartTime, dTreeSklEndTime) 
#print(f"Time taken to train Decision Tree:", dTreeSklTimeTaken)

In [57]:
#Test
predictedValuesSkl12 = dTreeSkl12.predict(testFeatureSet1)

In [58]:
eval2 = getEvaluationMetrics(predictedValuesSkl12, testTargetSet1)

In [59]:
newRow = getAllMetrics("Model12",constLibrary, dTreeSkl12.criterion, dTreeSkl12.max_depth, dTreeSkl12.get_depth(),
                       trainFeatureSet1.shape[0],trainFeatureSet1.shape[1], eval2, dTreeSklTimeTaken, dTreeSkl12.get_n_leaves(),"Wine")

dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

## TC2 - Audit Dataset

In [60]:
auditData = pd.read_csv("audit_risk.csv")

In [61]:
auditData = auditData[~auditData['LOCATION_ID'].isin(['LOHARU', 'NUH', 'SAFIDON'])]
auditData = auditData.astype({'LOCATION_ID': "float64"})

In [62]:
auditData.dropna()

Unnamed: 0,Sector_score,LOCATION_ID,PARA_A,Score_A,Risk_A,PARA_B,Score_B,Risk_B,TOTAL,numbers,Score_B.1,Risk_C,Money_Value,Score_MV,Risk_D,District_Loss,PROB,RiSk_E,History,Prob,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Detection_Risk,Audit_Risk,Risk
0,3.89,23.0,4.18,0.6,2.508,2.50,0.2,0.500,6.68,5.0,0.2,1.0,3.38,0.2,0.676,2,0.2,0.4,0,0.2,0.0,2.4,8.574,0.4,0.5,1.7148,1
1,3.89,6.0,0.00,0.2,0.000,4.83,0.2,0.966,4.83,5.0,0.2,1.0,0.94,0.2,0.188,2,0.2,0.4,0,0.2,0.0,2.0,2.554,0.4,0.5,0.5108,0
2,3.89,6.0,0.51,0.2,0.102,0.23,0.2,0.046,0.74,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.548,0.4,0.5,0.3096,0
3,3.89,6.0,0.00,0.2,0.000,10.80,0.6,6.480,10.80,6.0,0.6,3.6,11.75,0.6,7.050,2,0.2,0.4,0,0.2,0.0,4.4,17.530,0.4,0.5,3.5060,1
4,3.89,6.0,0.00,0.2,0.000,0.08,0.2,0.016,0.08,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.416,0.4,0.5,0.2832,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
771,55.57,9.0,0.49,0.2,0.098,0.40,0.2,0.080,0.89,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.578,0.4,0.5,0.3156,0
772,55.57,16.0,0.47,0.2,0.094,0.37,0.2,0.074,0.84,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.568,0.4,0.5,0.3136,0
773,55.57,14.0,0.24,0.2,0.048,0.04,0.2,0.008,0.28,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.456,0.4,0.5,0.2912,0
774,55.57,18.0,0.20,0.2,0.040,0.00,0.2,0.000,0.20,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.440,0.4,0.5,0.2880,0


In [63]:
auditDataShape = auditData.shape
dfFeatures2 = auditData.iloc[:,:auditDataShape[1]-1]
dfTarget2 = auditData.iloc[:,-1].to_frame(name=constLabel)

In [64]:
trainFeatureSet2, testFeatureSet2, trainTargetSet2, testTargetSet2 = model_selection.train_test_split(dfFeatures2, dfTarget2,
                                                                                                      test_size = 0.3, random_state=42)

### Self-Made Model (Model13)

In [65]:
#Train
dTreeOrg2 = DecisionTree(criterion = CriterionTypes.ENTROPY, maxDepth=3)
dTreeOrgStartTime = time.time()
dTreeOrg2.modelFit(trainFeatureSet2, trainTargetSet2)
dTreeOrgEndTime = time.time()
dTreeOrgTimeTaken = getTimeTaken(dTreeOrgStartTime, dTreeOrgEndTime) 
print(f"Time taken to train Decision Tree:", dTreeOrgTimeTaken)

Time taken to train Decision Tree: 60.65805


In [66]:
#Test
predictedValues2 = dTreeOrg2.modelPredict(testFeatureSet2)

In [67]:
eval1 = getEvaluationMetrics(testTargetSet2, predictedValues2)

In [68]:
newRow = getAllMetrics("Model13",constOriginal, dTreeOrg2.criterion, dTreeOrg2.maxDepth, dTreeOrg2.treeDepth,
                       trainFeatureSet2.shape[0],trainFeatureSet2.shape[1], eval1, dTreeOrgTimeTaken, dTreeOrg2.numberLeafNodes, "Audit")

dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

### sklearn (Model14)

In [69]:
# dTreeSkl2 = tree.DecisionTreeClassifier(random_state=42)

# dTreeSklStartTime = time.time()
# dTreeSkl2.fit(trainFeatureSet2, trainTargetSet2)
# dTreeSklEndTime = time.time()
# dTreeSklTimeTaken = getTimeTaken(dTreeSklStartTime, dTreeSklEndTime) 
# print(f"Time taken to train Decision Tree:", dTreeSklTimeTaken)

In [70]:
#trainFeatureSet2.isinf().sum()
#trainFeatureSet2.dtypes

In [71]:
#Test
#predictedValuesSkl2 = dTreeSkl2.predict(testFeatureSet2)

In [72]:
#eval2 = getEvaluationMetrics(predictedValuesSkl2, testTargetSet2)

In [73]:
# newRow = getAllMetrics("Model14",constLibrary, dTreeSkl2.criterion, dTreeSkl2.max_depth, dTreeSkl2.get_depth(),
#                        trainFeatureSet2.shape[0],trainFeatureSet2.shape[1], eval2, dTreeSklTimeTaken,"Audit")

# dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

## TC 2.1 - Audit Dataset with Max Depth=3

### Self-Made Model (Model15)

In [74]:
#Train
dTreeOrg21 = DecisionTree( maxDepth=3)
dTreeOrgStartTime = time.time()
dTreeOrg21.modelFit(trainFeatureSet2, trainTargetSet2)
dTreeOrgEndTime = time.time()
dTreeOrgTimeTaken = getTimeTaken(dTreeOrgStartTime, dTreeOrgEndTime) 
print(f"Time taken to train Decision Tree:", dTreeOrgTimeTaken)

Time taken to train Decision Tree: 10.40143


In [75]:
#Test
predictedValues21 = dTreeOrg21.modelPredict(testFeatureSet2)

In [76]:
eval1 = getEvaluationMetrics(testTargetSet2, predictedValues21)

In [77]:
newRow = getAllMetrics("Model15",constOriginal, dTreeOrg21.criterion, dTreeOrg21.maxDepth, dTreeOrg21.treeDepth,
                       trainFeatureSet2.shape[0],trainFeatureSet2.shape[1], eval1, dTreeOrgTimeTaken, dTreeOrg21.numberLeafNodes,"Audit")

dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

### sklearn (Model16)

In [78]:
# dTreeSkl21 = tree.DecisionTreeClassifier(max_depth=3, random_state=42)

# dTreeSklStartTime = time.time()
# #dTreeSkl21.fit(trainFeatureSet2, trainTargetSet2)
# dTreeSklEndTime = time.time()
# dTreeSklTimeTaken = getTimeTaken(dTreeSklStartTime, dTreeSklEndTime) 
# print(f"Time taken to train Decision Tree:", dTreeSklTimeTaken)

In [79]:
# #Test
# predictedValuesSkl21 = dTreeSkl21.predict(testFeatureSet2)

In [80]:
# eval2 = getEvaluationMetrics(predictedValuesSkl21, testTargetSet2)

In [81]:
# newRow = getAllMetrics("Model16",constLibrary, dTreeSkl21.criterion, dTreeSkl21.max_depth, dTreeSkl21.get_depth(),
#                        trainFeatureSet2.shape[0],trainFeatureSet2.shape[1], eval2, dTreeSklTimeTaken,"Audit")

# dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

## TC3 - Breast Cancer Dataset

In [174]:
breastCancerDataset = datasets.load_breast_cancer()
featureSet3, targetSet3 = breastCancerDataset.data, breastCancerDataset.target
dfFeatures3 = pd.DataFrame(featureSet3, columns = breastCancerDataset.feature_names)
dfTarget3 = pd.DataFrame(targetSet3, columns = [constLabel])

"""## Train and Test datasets"""
trainFeatureSet3, testFeatureSet3, trainTargetSet3, testTargetSet3 = model_selection.train_test_split(dfFeatures3, dfTarget3,
                                                                                                      train_size = 0.7, test_size = 0.3, random_state=42)

In [176]:
trainFeatureSet3.shape

(398, 30)

### Self-made model (Model7)

In [153]:
#Train
dTreeOrg3 = DecisionTree()
dTreeOrg3StartTime = time.time()
dTreeOrg3.modelFit(trainFeatureSet3, trainTargetSet3)
dTreeOrg3EndTime = time.time()
dTreeOrg3TimeTaken = getTimeTaken(dTreeOrg3StartTime, dTreeOrg3EndTime) 
print(f"Time taken to train Decision Tree:", dTreeOrg3TimeTaken)

Time taken to train Decision Tree: 186.61167


In [84]:
#Test
predictedValues3 = dTreeOrg3.modelPredict(testFeatureSet3)

In [85]:
#Metrics
eval1 = getEvaluationMetrics(predictedValues3, testTargetSet3)

In [86]:
newRow = getAllMetrics("Model7",constOriginal, dTreeOrg3.criterion, dTreeOrg3.maxDepth, dTreeOrg3.treeDepth,
                       trainFeatureSet3.shape[0],trainFeatureSet3.shape[1], eval1, dTreeOrg3TimeTaken, dTreeOrg3.numberLeafNodes, "BreastCancer")

dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

### sklearn model (Model8)

In [87]:
dTreeSkl3 = tree.DecisionTreeClassifier(criterion=CriterionTypes.GINI, random_state=42)

dTreeSklStartTime = time.time()
dTreeSkl3.fit(trainFeatureSet3, trainTargetSet3)
dTreeSklEndTime = time.time()
dTreeSklTimeTaken = getTimeTaken(dTreeSklStartTime, dTreeSklEndTime) 
print(f"Time taken to train Decision Tree:", dTreeSklTimeTaken)

Time taken to train Decision Tree: 0.01358


In [88]:
predictedValuesS3 = dTreeSkl3.predict(testFeatureSet3)

In [89]:
eval2 = getEvaluationMetrics(predictedValuesS3, testTargetSet3)

In [90]:
#@title
newRow = getAllMetrics("Model8",constLibrary, dTreeSkl3.criterion, dTreeSkl3.max_depth, dTreeSkl3.get_depth(),
                       trainFeatureSet3.shape[0],trainFeatureSet3.shape[1], eval2, dTreeSklTimeTaken, dTreeSkl3.get_n_leaves() ,"BreastCancer")

dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

## TC 3.1 Breast cancer dataset with max depth =3

### Self-made model (Model31)

In [91]:
#Train
dTreeOrg3 = DecisionTree(maxDepth=3)
dTreeOrg3StartTime = time.time()
dTreeOrg3.modelFit(trainFeatureSet3, trainTargetSet3)
dTreeOrg3EndTime = time.time()
dTreeOrg3TimeTaken = getTimeTaken(dTreeOrg3StartTime, dTreeOrg3EndTime) 
print(f"Time taken to train Decision Tree:", dTreeOrg3TimeTaken)

Time taken to train Decision Tree: 140.35681


In [92]:
#Test
predictedValues3 = dTreeOrg3.modelPredict(testFeatureSet3)

In [93]:
#Metrics
eval1 = getEvaluationMetrics(predictedValues3, testTargetSet3)

$$GINI(x) = 1- \sum_{i=1}^{n}p_i^2 $$


In [95]:
newRow = getAllMetrics("Model31",constOriginal, dTreeOrg3.criterion, dTreeOrg3.maxDepth, dTreeOrg3.treeDepth,
                       trainFeatureSet3.shape[0],trainFeatureSet3.shape[1], eval1, dTreeOrg3TimeTaken, dTreeOrg3.numberLeafNodes, "BreastCancer")

dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

### sklearn model (Model32)

In [96]:
dTreeSkl3 = tree.DecisionTreeClassifier(criterion=CriterionTypes.GINI, max_depth=3, random_state=42)

dTreeSklStartTime = time.time()
dTreeSkl3.fit(trainFeatureSet3, trainTargetSet3)
dTreeSklEndTime = time.time()
dTreeSklTimeTaken = getTimeTaken(dTreeSklStartTime, dTreeSklEndTime) 
print(f"Time taken to train Decision Tree:", dTreeSklTimeTaken)

Time taken to train Decision Tree: 0.01525


In [97]:
predictedValuesS3 = dTreeSkl3.predict(testFeatureSet3)

In [98]:
eval2 = getEvaluationMetrics(predictedValuesS3, testTargetSet3)

In [99]:
#@title
newRow = getAllMetrics("Model31",constLibrary, dTreeSkl3.criterion, dTreeSkl3.max_depth, dTreeSkl3.get_depth(),
                       trainFeatureSet3.shape[0],trainFeatureSet3.shape[1], eval2, dTreeSklTimeTaken, dTreeSkl3.get_n_leaves() ,"BreastCancer")

dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

## TC 3.2 Breast cancer dataset with Entropy Criteria

### Self-made model (Model33)

In [100]:
#Train
dTreeOrg3 = DecisionTree(criterion=CriterionTypes.ENTROPY)
dTreeOrg3StartTime = time.time()
dTreeOrg3.modelFit(trainFeatureSet3, trainTargetSet3)
dTreeOrg3EndTime = time.time()
dTreeOrg3TimeTaken = getTimeTaken(dTreeOrg3StartTime, dTreeOrg3EndTime) 
print(f"Time taken to train Decision Tree:", dTreeOrg3TimeTaken)

Time taken to train Decision Tree: 721.21652


In [101]:
#Test
predictedValues3 = dTreeOrg3.modelPredict(testFeatureSet3)

In [102]:
#Metrics
eval1 = getEvaluationMetrics(predictedValues3, testTargetSet3)

In [103]:
newRow = getAllMetrics("Model33",constOriginal, dTreeOrg3.criterion, dTreeOrg3.maxDepth, dTreeOrg3.treeDepth,
                       trainFeatureSet3.shape[0],trainFeatureSet3.shape[1], eval1, dTreeOrg3TimeTaken, dTreeOrg3.numberLeafNodes, "BreastCancer")

dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

### sklearn model (Model34)

In [104]:
dTreeSkl3 = tree.DecisionTreeClassifier(criterion=CriterionTypes.ENTROPY, random_state=42)

dTreeSklStartTime = time.time()
dTreeSkl3.fit(trainFeatureSet3, trainTargetSet3)
dTreeSklEndTime = time.time()
dTreeSklTimeTaken = getTimeTaken(dTreeSklStartTime, dTreeSklEndTime) 
print(f"Time taken to train Decision Tree:", dTreeSklTimeTaken)

Time taken to train Decision Tree: 0.01626


In [105]:
predictedValuesS3 = dTreeSkl3.predict(testFeatureSet3)

In [106]:
eval2 = getEvaluationMetrics(predictedValuesS3, testTargetSet3)

In [107]:
#@title
newRow = getAllMetrics("Model34",constLibrary, dTreeSkl3.criterion, dTreeSkl3.max_depth, dTreeSkl3.get_depth(),
                       trainFeatureSet3.shape[0],trainFeatureSet3.shape[1], eval2, dTreeSklTimeTaken, dTreeSkl3.get_n_leaves() ,"BreastCancer")

dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

##TC 3.3 Breast cancer DS with max depth = 3 and entropy criteria

### Self-made model (Model35)

In [108]:
#Train
dTreeOrg3 = DecisionTree(maxDepth=3, criterion= CriterionTypes.ENTROPY)
dTreeOrg3StartTime = time.time()
dTreeOrg3.modelFit(trainFeatureSet3, trainTargetSet3)
dTreeOrg3EndTime = time.time()
dTreeOrg3TimeTaken = getTimeTaken(dTreeOrg3StartTime, dTreeOrg3EndTime) 
print(f"Time taken to train Decision Tree:", dTreeOrg3TimeTaken)

Time taken to train Decision Tree: 205.61709


In [109]:
#Test
predictedValues3 = dTreeOrg3.modelPredict(testFeatureSet3)

In [110]:
#Metrics
eval1 = getEvaluationMetrics(predictedValues3, testTargetSet3)

In [111]:
newRow = getAllMetrics("Model35",constOriginal, dTreeOrg3.criterion, dTreeOrg3.maxDepth, dTreeOrg3.treeDepth,
                       trainFeatureSet3.shape[0],trainFeatureSet3.shape[1], eval1, dTreeOrg3TimeTaken, dTreeOrg3.numberLeafNodes, "BreastCancer")

dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

### sklearn model (Model36)

In [112]:
dTreeSkl3 = tree.DecisionTreeClassifier(criterion=CriterionTypes.ENTROPY, max_depth=3, random_state=42)

dTreeSklStartTime = time.time()
dTreeSkl3.fit(trainFeatureSet3, trainTargetSet3)
dTreeSklEndTime = time.time()
dTreeSklTimeTaken = getTimeTaken(dTreeSklStartTime, dTreeSklEndTime) 
print(f"Time taken to train Decision Tree:", dTreeSklTimeTaken)

Time taken to train Decision Tree: 0.01508


In [113]:
predictedValuesS3 = dTreeSkl3.predict(testFeatureSet3)

In [114]:
eval2 = getEvaluationMetrics(predictedValuesS3, testTargetSet3)

In [115]:
#@title
newRow = getAllMetrics("Model36",constLibrary, dTreeSkl3.criterion, dTreeSkl3.max_depth, dTreeSkl3.get_depth(),
                       trainFeatureSet3.shape[0],trainFeatureSet3.shape[1], eval2, dTreeSklTimeTaken, dTreeSkl3.get_n_leaves() ,"BreastCancer")

dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

# Synthetic Datasets

## TC1 - Numerical data

In [116]:
datasetS1 = pd.DataFrame([[1,2,0],[3,2,0],[5,2,0],[8,3,1],[11,3,1],[12,3,1],[15,4,1]], columns=["feature1", "feature2", constLabel])

In [117]:
#@title
dfFeaturesS1 = datasetS1.iloc[:,:2]
dfTargetS1 = datasetS1.iloc[:,2].to_frame()
"""## Train and Test datasets"""
trainFeatureSetS1, testFeatureSetS1, trainTargetSetS1, testTargetSetS1 = model_selection.train_test_split(dfFeaturesS1, dfTargetS1,
                                                                                                      train_size = 0.7, test_size = 0.3,
                                                                                                      random_state = 42)
   

### Self-made model (Model9)

In [118]:
#Train
dTreeOrgS1 = DecisionTree()
dTreeOrgS1StartTime = time.time()
dTreeOrgS1.modelFit(trainFeatureSetS1, trainTargetSetS1)
dTreeOrgS1EndTime = time.time()
dTreeOrgS1TimeTaken = getTimeTaken(dTreeOrgS1StartTime, dTreeOrgS1EndTime) 
print(f"Time taken to train Decision Tree:", dTreeOrgS1TimeTaken)

Time taken to train Decision Tree: 0.02339


In [119]:
predictedValuesS1 = dTreeOrgS1.modelPredict(testFeatureSetS1)

In [120]:
eval1 = getEvaluationMetrics(predictedValuesS1, testTargetSetS1)

In [121]:
newRow = getAllMetrics("Model9",constOriginal, dTreeOrgS1.criterion, dTreeOrgS1.maxDepth, dTreeOrgS1.treeDepth,
                       trainFeatureSet1.shape[0],trainFeatureSet1.shape[1], eval1, dTreeOrgS1TimeTaken, dTreeOrgS1.numberLeafNodes, "Synthetic1")

dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

### sklearn model (Model10)

In [122]:
dTreeSkl3 = tree.DecisionTreeClassifier(criterion=CriterionTypes.GINI, random_state=42)

dTreeSklStartTime = time.time()
dTreeSkl3.fit(trainFeatureSetS1, trainTargetSetS1)
dTreeSklEndTime = time.time()
dTreeSklTimeTaken = getTimeTaken(dTreeSklStartTime, dTreeSklEndTime) 
#print(f"Time taken to train Decision Tree:", dTreeSklTimeTaken)

In [123]:
predictedValuesS3 = dTreeSkl3.predict(testFeatureSetS1)

In [124]:
eval2 = getEvaluationMetrics(predictedValuesS3, testTargetSetS1)
newRow = getAllMetrics("Model8",constLibrary, dTreeSkl3.criterion, dTreeSkl3.max_depth, dTreeSkl3.get_depth(),
                       trainFeatureSet1.shape[0],trainFeatureSet1.shape[1], eval2, dTreeSklTimeTaken, dTreeSkl3.get_n_leaves(), "Synthetic1")

dfEvaluationMetrics = dfEvaluationMetrics.append(newRow, ignore_index=True)

# Test Cases for original model

## TC1 - Empty dataset

In [29]:
maxDepth: int = "ddd"

dTreeOrg = DecisionTree(criterion = maxDepth)


Error: Criteria Type is etiher 'entropy' or 'gini'


## TC2 - All labels belong to one class

In [36]:
datasetS2 = pd.DataFrame([[1,2,1],[3,2,1],[5,2,1],[8,3,1],[11,3,1],[12,3,1],[15,4,1]], columns=["feature1", "feature2", constLabel])

In [37]:
#@title
dfFeaturesS2 = datasetS2.iloc[:,:2]
dfTargetS2 = datasetS2.iloc[:,2].to_frame()
"""## Train and Test datasets"""
trainFeatureSetS2, testFeatureSetS2, trainTargetSetS2, testTargetSetS2 = model_selection.train_test_split(dfFeaturesS2, dfTargetS2,
                                                                                                      test_size = 0.3,
                                                                                                      random_state = 42)
   

In [51]:
#Train
dTreeOrgS2 = DecisionTree(criterion= 'fmdfd')
startTime = time.time()
#dTreeOrgS2.modelFit(trainFeatureSetS2, trainTargetSetS2)
endTime = time.time()

Error: Criteria Type is etiher 'entropy' or 'gini'


In [128]:
dTreeOrgS2.modelPredict(testFeatureSetS2)

[1, 1, 1]

In [129]:
dTreeSklS2 = tree.DecisionTreeClassifier()
dTreeSklS2.fit(trainFeatureSetS2, trainTargetSetS2)
dTreeSklS2.predict(testFeatureSetS2)

array([1, 1, 1])

##  TC3 - Categorical data

In [130]:
training_data_1 = [
    ['Green', 3, 'Apple'],
    ['Yellow', 3, 'Apple'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
]
header = ["color", "diameter", "label"]
testing_data_1 = [
        ['Green', 3, 'Apple'],
        ['Yellow', 4, 'Apple'],
        ['Red', 2, 'Grape'],
        ['Red', 1, 'Grape'],
        ['Yellow', 3, 'Lemon'],
    ]

In [131]:
#pd.DataFrame(training_data_1)[0].unique()

## TC4 - All feature values are same

### Self-made Model (Model19)

In [132]:
#Train

In [133]:
#Test


In [134]:
#Metrics


### Sklearn Model (Model20)

In [135]:
dTreeSkl = tree.DecisionTreeClassifier(criterion='gini', random_state=42) 

In [136]:
#Train
dTreeSklStartTime = time.time()
dTreeSkl.fit(trainFeatureSet1, trainTargetSet1)
dTreeSklEndTime = time.time()
dTreeSklTimeTaken = getTimeTaken(dTreeSklStartTime, dTreeSklEndTime) 
print(f"Time taken to train Decision Tree:", dTreeSklTimeTaken)

Time taken to train Decision Tree: 0.01059


In [137]:
## Test


In [138]:
dfEvaluationMetrics.index = dfEvaluationMetrics.index+1

In [139]:
dfEvaluationMetrics.to_csv('Evaluation Metrics.csv')

# Rough notes (Ignore)

In [140]:
#Node("ddd", 5, "<=")

In [141]:
# dot_data = skl.tree.export_graphviz(dTreeSkl, out_file=None,  
#                 filled=True, rounded=False,
#                 special_characters=True,
#                 feature_names = wineDataset.feature_names,
#                 class_names = wineDataset.target_names)
# graph = graphviz.Source(dot_data)  
# graph

In [142]:
# plt.figure(figsize=(12, 6))
# tree.plot_tree(dTreeSkl, feature_names= dfFeatures.columns)
# plt.show()
# print(tree.export_text(dTreeSkl))

In [143]:
#@title
# from sklearn.tree import _tree

# def tree_to_code(tree, feature_names):
#     tree_ = tree.tree_
#     feature_name = [
#         feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
#         for i in tree_.feature
#     ]
#     print "def tree({}):".format(", ".join(feature_names))

#     def recurse(node, depth):
#         indent = "  " * depth
#         if tree_.feature[node] != _tree.TREE_UNDEFINED:
#             name = feature_name[node]
#             threshold = tree_.threshold[node]
#             print "{}if {} <= {}:".format(indent, name, threshold)
#             recurse(tree_.children_left[node], depth + 1)
#             print "{}else:  # if {} > {}".format(indent, name, threshold)
#             recurse(tree_.children_right[node], depth + 1)
#         else:
#             print "{}return {}".format(indent, tree_.value[node])

#     recurse(0, 1)

## Chefboost

In [144]:
#pip install chefboost

In [145]:
#from chefboost import Chefboost as chef

In [146]:
#df = dfFeatures.join(dfTarget)
config = {'algorithm': 'CART'} #Algo selection 
#Other options for classification- CART, C4.5, CHAID, ID3,
#df['Class'] = df['Class'].astype(object) #requires 'Decision'/ Label column to be object dtype


In [147]:
#model2 = chef.fit(df, config = config, target_label = 'Class')

In [148]:
#prediction = chef.predict(model2, param = testFeatureSet.to_numpy())

In [149]:
#testFeatureSet.to_numpy()[0,12] <=746.8932584269663

In [150]:
prabha_cute = True
while prabha_cute:
  print('💕')
  break

💕


#### Visualizing

In [151]:
# dot_data = skl.tree.export_graphviz(dTreeSkl, out_file=None,  
#                 filled=True, rounded=False,
#                 special_characters=True,
#                 feature_names = wineDataset.feature_names,
#                 class_names = wineDataset.target_names)
# graph = graphviz.Source(dot_data)  
# graph

In [152]:
# plt.figure(figsize=(12, 6))
# tree.plot_tree(dTreeSkl, feature_names= trainFeatureSet1.columns)
# plt.show()
# #print(tree.export_text(dTreeSkl))
