In [1]:
#Decision Tree Implementation with "Pseudo" Feature Embedding
#Reference: https://machinelearningmastery.com/implement-decision-tree-algorithm-scratch-python/ 

#Importing dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#### Decision Tree Classification Model from scratch

In [2]:
#Node Class
#A decision node holds a feature index, a threshold, its left and right child nodes, and the info gain of its split; a leaf node holds only the predicted value
class Node:
    """One node of the decision tree.

    A decision node is built with featureIndex, threshold, left, right and
    infoGain; a leaf node is built with value only. Every field that is not
    supplied stays None.
    """

    def __init__(self, featureIndex=None, threshold=None, left=None, right=None, infoGain=None, value=None):
        #leaf payload (stays None on decision nodes)
        self.value = value

        #split rule and the gain it achieved (stay None on leaf nodes)
        self.featureIndex, self.threshold = featureIndex, threshold
        self.infoGain = infoGain

        #child nodes reached when the split test passes (left) or fails (right)
        self.left, self.right = left, right

In [3]:
#Tree Class
#Include functions that help configure and build a decision tree
class DecisionTreeClassifier():
    """Binary decision tree classifier built from scratch.

    Every decision node tests ``row[featureIndex] <= threshold``; rows that
    pass go to the left child, the others go to the right child. Leaves store
    the most frequent label of the training rows that reached them.

    Parameters
    ----------
    minSamplesSplit : int
        A node is only considered for splitting when it holds at least this
        many rows.
    maxDepth : int
        A node is only considered for splitting while its depth is
        <= maxDepth. The root has depth 0, so leaves can sit at depth
        maxDepth + 1.
    criterion : {"gini", "entropy"}, default "gini"
        Impurity measure used to score candidate splits.

    Raises
    ------
    ValueError
        If criterion is neither "gini" nor "entropy".
    """

    #initializing the tree
    def __init__(self, minSamplesSplit, maxDepth, criterion="gini"):
        if criterion not in ("gini", "entropy"):
            raise ValueError('criterion must be "gini" or "entropy", got %r' % (criterion,))

        self.root = None
        self.minSamplesSplit = minSamplesSplit
        self.maxDepth = maxDepth
        self.criterion = criterion

    #predict only one data point
    def onePrediction(self, x, tree):
        """Return the leaf value reached by row x when walking down from node tree."""
        node = tree

        #a node is a leaf exactly when it carries a value (identity check, so
        #falsy labels such as 0 are still treated as leaves)
        while node.value is None:
            if x[node.featureIndex] <= node.threshold:
                node = node.left
            else:
                node = node.right

        return node.value

    #test/predict new dataset
    def predict(self, X):
        """Return a list with one predicted label per row of X."""
        predictions = [self.onePrediction(x, self.root) for x in X]
        return predictions

    #output leaf node
    def calculateLeafValue(self, Y):
        """Return the most frequent label in Y (ties go to the label seen first)."""
        counts = {}
        for label in Y:
            counts[label] = counts.get(label, 0) + 1

        #dicts keep insertion order and max() returns the first maximum, so a
        #tie resolves to the label that appears first in Y
        return max(counts, key=counts.get)

    #class frequencies shared by the impurity measures
    def _classCounts(self, y):
        """Return the number of occurrences of each distinct label in y as Python ints."""
        _, counts = np.unique(y, return_counts=True)
        return counts.tolist()

    #output gini value
    def getGini(self, y):
        """Return the Gini impurity of the label array y."""
        total = len(y)
        sumSquares = 0

        for count in self._classCounts(y):
            pCls = count / total
            sumSquares += pCls ** 2

        return 1 - sumSquares

    #output entropy value
    def entropy(self, y):
        """Return the base-2 entropy of the label array y."""
        total = len(y)
        entropy = 0

        for count in self._classCounts(y):
            pCls = count / total
            entropy += -pCls * np.log2(pCls)

        return entropy

    #output info gain value
    def informationGain(self, parent, leftChild, rightChild, mode="entropy"):
        """Return the impurity of parent minus the size-weighted impurity of its two children.

        mode "gini" scores with getGini; any other value scores with entropy.
        """
        impurity = self.getGini if mode == "gini" else self.entropy

        weightLeft = len(leftChild) / len(parent)
        weightRight = len(rightChild) / len(parent)

        return impurity(parent) - (weightLeft * impurity(leftChild) + weightRight * impurity(rightChild))

    #splitting the data left and right
    def split(self, dataset, featureIndex, threshold):
        """Return (rows with feature <= threshold, rows with feature > threshold)."""
        dataset = np.asarray(dataset)
        featureColumn = dataset[:, featureIndex]

        #boolean masks select the rows in one vectorized pass each
        datasetLeft = dataset[featureColumn <= threshold]
        datasetRight = dataset[featureColumn > threshold]

        return datasetLeft, datasetRight

    #finding the best split after trying different features and thresholds
    def getBestSplit(self, dataset, numFeatures):
        """Try every observed value of every feature as a threshold and return the best split.

        The last column of dataset is the label. The result is a dict with the
        keys featureIndex, threshold, datasetLeft, datasetRight and infoGain,
        or an empty dict when no threshold separates the rows into two
        non-empty parts.
        """
        bestSplit = {}
        maxInfoGain = -float("inf")
        y = dataset[:, -1]

        for featureIndex in range(numFeatures):
            possibleThresholds = np.unique(dataset[:, featureIndex])

            for threshold in possibleThresholds:
                datasetLeft, datasetRight = self.split(dataset, featureIndex, threshold)

                #a split that leaves one side empty separates nothing
                if len(datasetLeft) == 0 or len(datasetRight) == 0:
                    continue

                currInfoGain = self.informationGain(y, datasetLeft[:, -1], datasetRight[:, -1], self.criterion)

                #strict comparison keeps the first split found among equals
                if currInfoGain > maxInfoGain:
                    bestSplit = {
                        "featureIndex": featureIndex,
                        "threshold": threshold,
                        "datasetLeft": datasetLeft,
                        "datasetRight": datasetRight,
                        "infoGain": currInfoGain,
                    }
                    maxInfoGain = currInfoGain

        return bestSplit

    #Recursion function using other functions to build an optimal tree
    def buildTree(self, dataset, currDepth=0):
        """Recursively grow the tree for dataset (features followed by one label column).

        Returns a decision Node when a split with positive info gain exists and
        the stopping rules allow it, otherwise a leaf Node.
        """
        X, Y = dataset[:, :-1], dataset[:, -1]
        numSamples, numFeatures = np.shape(X)

        if numSamples >= self.minSamplesSplit and currDepth <= self.maxDepth:
            bestSplit = self.getBestSplit(dataset, numFeatures)

            #getBestSplit returns {} when every feature is constant on these
            #rows; treat that as "no gain" instead of failing on a missing key
            if bestSplit.get("infoGain", 0) > 0:
                leftSubtree = self.buildTree(bestSplit["datasetLeft"], currDepth + 1)
                rightSubtree = self.buildTree(bestSplit["datasetRight"], currDepth + 1)

                return Node(bestSplit["featureIndex"], bestSplit["threshold"], leftSubtree, rightSubtree, bestSplit["infoGain"])

        leafValue = self.calculateLeafValue(Y)

        return Node(value=leafValue)

    #train decision tree model
    def fit(self, X, Y):
        """Build the tree from feature matrix X and labels Y.

        Y may be a column of shape (n, 1) or a flat sequence of n labels; it is
        appended to X as the last column before the tree is grown.
        """
        X = np.asarray(X)
        Y = np.asarray(Y).reshape(-1, 1)

        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.buildTree(dataset)

    #print decision tree
    def printTree(self, tree=None, indent=" "):
        """Print the subtree under tree (the fitted root when omitted), doubling indent per level."""
        if tree is None:
            tree = self.root

        if tree.value is not None:
            print(tree.value)

        else:
            print("featureIndex X"+str(tree.featureIndex), "<=", tree.threshold, "? infoGain:", tree.infoGain)
            print("%sLeft:" % (indent), end="")
            self.printTree(tree.left, indent + indent)
            print("%sRight:" % (indent), end="")
            self.printTree(tree.right, indent + indent)

#### Training and Testing

In [4]:
#Reading the pre-split training and testing datasets
trainingData = pd.read_csv("../Data/TrainingData.csv")
testingData = pd.read_csv("../Data/TestingData.csv")

#Label column predicted by the model
TARGET_COLUMN = "home_team_result"
#Columns excluded from the model inputs in both datasets (the label itself is one of them)
NON_FEATURE_COLUMNS = ["home_team_name", "away_team_name", "home_team_goal_count", "away_team_goal_count", "home_team_result"]
#Also excluded from the inputs; dropped only where present so that the training and
#testing matrices end up with the same columns whether or not a CSV contains it
OPTIONAL_NON_FEATURE_COLUMNS = ["winner_encoded"]

def extractFeaturesAndTargets(frame):
    """Return (feature matrix, label column of shape (n, 1)) for one dataset.

    Raises KeyError when TARGET_COLUMN or any of NON_FEATURE_COLUMNS is missing
    from frame; columns in OPTIONAL_NON_FEATURE_COLUMNS are dropped only if present.
    """
    labelColumn = frame[TARGET_COLUMN].values.reshape(-1, 1)
    featureMatrix = (
        frame
        .drop(NON_FEATURE_COLUMNS, axis=1)
        .drop(OPTIONAL_NON_FEATURE_COLUMNS, axis=1, errors="ignore")
        .values
    )
    return featureMatrix, labelColumn

# Training Data
featuresTrain, targetsTrain = extractFeaturesAndTargets(trainingData)

# Testing Data
featuresTest, targetsTest = extractFeaturesAndTargets(testingData)

#The tree addresses features by column position, so both matrices must have the same width
assert featuresTrain.shape[1] == featuresTest.shape[1], "training and testing feature matrices have different numbers of columns"

#Scratch names kept with their previous final (testing) values for any cell that still reads them
features, targets = featuresTest, targetsTest


In [5]:
#Training: a node needs at least 20 samples to be split, and splitting stops beyond depth 20
dtree = DecisionTreeClassifier(minSamplesSplit=20, maxDepth=20)
dtree.fit(X=featuresTrain, Y=targetsTrain)

#Testing: predict the testing rows and report the share of correct predictions as a percentage
test = dtree.predict(X=featuresTest)
print('Accuracy: ', 100 * accuracy_score(y_true=targetsTest, y_pred=test), '%')

Accuracy:  56.25 %


In [6]:
#Print the fitted decision tree: decision nodes show their feature index, threshold and info gain; leaf nodes show the predicted value
dtree.printTree()

featureIndex X9 <= 2.0 ? infoGain: 0.10018631436314374
 Left:1.0
 Right:featureIndex X10 <= 7.0 ? infoGain: 0.0455988179079273
  Left:featureIndex X6 <= 9.0 ? infoGain: 0.08368241770947288
    Left:0.0
    Right:1.0
  Right:0.0
