# Notebook cell In [9]:
# Code For Building Hybrid Classification Tree with 2-Means clustering in each Decision Node and Majority Voting in the Leaf Nodes
import numpy as np
import pandas as pd
import os
import errno
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans

class C_Node:
    """One slot of the tree's node array.

    Index-like fields start at -1 and the remaining fields start at None;
    they are filled in later by whoever builds the tree.
    """

    def __init__(self):
        # Position of the node inside the tree (-1 = not assigned yet).
        self.m_NodeIndx = self.m_NodeDepth = self.m_ParentNodeIndx = -1
        # Child links (-1 = no child assigned).
        self.m_LeftChildIndx = self.m_RightChildIndx = -1
        # Decision/leaf flag and the label used for majority voting.
        self.m_IsDecisionNode = self.m_Label = None
        # Cluster centroids and the number of data points of the node.
        self.m_Centroids = self.m_DataLength = None

    def setNode(self, nodeIndx, nodeDepth, parentNodeIndx):
        """Record the node's own index, its depth and its parent's index."""
        self.m_NodeIndx, self.m_NodeDepth, self.m_ParentNodeIndx = (
            nodeIndx,
            nodeDepth,
            parentNodeIndx,
        )

class C_Tree:
    """Hybrid classification tree: 2-means split in decision nodes, majority vote in leaves.

    Nodes are kept in a pre-allocated array of ``maxNodeNum`` ``C_Node`` objects.
    The training rows that reach node ``k`` are stored on disk as
    ``<path>/data_<k>.csv`` (feature columns first, label in the last column).
    """

    def __init__(self,maxDepth,maxNodeNum,path,dataNumThresh): # Defining the Tree
        """Store the hyper-parameters and pre-allocate the node array.

        maxDepth      -- nodes at this depth (or deeper) are never split.
        maxNodeNum    -- size of the node array; the tree never grows beyond it.
        path          -- directory that receives the per-node CSV files.
        dataNumThresh -- nodes holding this many rows or fewer become leaves.
        """
        self.m_MaxDepth = maxDepth
        self.m_MaxNodeNum = maxNodeNum
        self.m_CurrNodeNum = 0
        self.m_NodeArray = [C_Node() for _ in range(self.m_MaxNodeNum)]
        self.m_Path = path
        self.m_DataNumThresh = dataNumThresh

    def _dataFilePath(self, nodeIndx):
        """Return the CSV path that holds the rows of node ``nodeIndx``."""
        return os.path.join(self.m_Path, "data_" + str(nodeIndx) + ".csv")

    def _ensureDataDir(self):
        """Create the data directory if needed (an empty path means the CWD)."""
        if self.m_Path:
            os.makedirs(self.m_Path, exist_ok=True)

    def twoMeans(self,dataFileName):  # Function to Perform K-Means Clustering
        """Split the rows of ``dataFileName`` into two groups with 2-means.

        Returns ``(data_left, data_right, centroids)``. ``data_left`` holds the
        full rows (label column included) assigned to cluster 0 and
        ``data_right`` those assigned to cluster 1, so the row groups line up
        with ``centroids[0]`` / ``centroids[1]`` as used by ``decisionRule``.
        Either group may be empty when KMeans finds only one distinct cluster.

        Raises ValueError when the file holds fewer than two rows.
        """
        datalist = np.genfromtxt(dataFileName, delimiter=',')
        if datalist.ndim != 2 or datalist.shape[0] < 2:
            raise ValueError(
                "twoMeans needs at least two data rows in " + str(dataFileName))

        # The last column is the class label; it must not influence clustering.
        X = datalist[:, :-1]
        kmeans = KMeans(n_clusters=2).fit(X)
        cluster_labels = kmeans.labels_

        # Boolean masks keep 2-D shapes even for empty or one-row groups.
        data_left = datalist[cluster_labels == 0]
        data_right = datalist[cluster_labels == 1]
        return data_left, data_right, kmeans.cluster_centers_

    def decisionRule(self,x,node): # Decision for moving in the tree for a test point
        """Return 0 (go left) if ``x`` is at least as close to the node's first
        centroid as to its second one (Euclidean distance), else 1 (go right)."""
        dist1 = np.linalg.norm(x - node.m_Centroids[0], 2)
        dist2 = np.linalg.norm(x - node.m_Centroids[1], 2)
        return 0 if dist1 <= dist2 else 1

    def splitDataFile(self,node_obj,data_left,data_right):  # For splitting the Data into two
        """Write the two row groups to the CSV files of the node's children."""
        self._ensureDataDir()
        np.savetxt(self._dataFilePath(node_obj.m_LeftChildIndx), data_left, delimiter=",")
        np.savetxt(self._dataFilePath(node_obj.m_RightChildIndx), data_right, delimiter=",")

    def checkTerminationCondition(self,node,datafilename): # For Checking the Termination of Node
        """Decide whether ``node`` is a leaf or a decision node.

        Returns ``(IsDecisionNode, dataLength, Label, data_left, data_right,
        centroids)``. For a leaf, ``Label`` is the majority label and the last
        three entries are ``None``; for a decision node ``Label`` is ``None``
        and the last three entries are the result of ``twoMeans``.

        A node becomes a leaf when it holds a single row, holds at most
        ``m_DataNumThresh`` rows, sits at ``m_MaxDepth`` or deeper, has no room
        left in the node array for two children, or cannot be split into two
        non-empty clusters.
        """
        datalist = np.genfromtxt(datafilename, delimiter=',')

        if datalist.ndim == 1:
            # genfromtxt collapses a one-row file to 1-D: a single sample whose
            # last entry is its label. Nothing to cluster.
            return False, 1, datalist[-1], None, None, None

        dataLength = datalist.shape[0]
        label, label_count = np.unique(datalist[:, -1], return_counts=True)
        majorityLabel = label[np.argmax(label_count)]

        isLeaf = (dataLength <= self.m_DataNumThresh
                  or node.m_NodeDepth >= self.m_MaxDepth
                  or self.m_CurrNodeNum + 2 > self.m_MaxNodeNum)

        data_left = data_right = centroids = None
        if not isLeaf:
            # Cluster only when the node may actually be split.
            data_left, data_right, centroids = self.twoMeans(datafilename)
            if len(data_left) == 0 or len(data_right) == 0:
                # Degenerate split (e.g. all rows identical): keep it a leaf.
                isLeaf = True
                data_left = data_right = centroids = None

        if isLeaf:
            return False, dataLength, majorityLabel, data_left, data_right, centroids
        return True, dataLength, None, data_left, data_right, centroids

    def fit(self,X_train,y_train):  # For Building the Overall Tree
        """Build the tree breadth-first from the training data.

        ``X_train`` is a 2-D array-like of features and ``y_train`` the matching
        1-D labels. Any previously trained tree held by this object is discarded.
        Progress is printed for every processed node.

        Raises ValueError when ``maxNodeNum`` leaves no room for a root node.
        """
        if self.m_MaxNodeNum < 1:
            raise ValueError("maxNodeNum must be at least 1 to hold the root node")

        # Append the labels as the last column (plain ndarray, no np.matrix).
        labelColumn = np.asarray(y_train).reshape(-1, 1)
        train_data = np.hstack((np.asarray(X_train), labelColumn))
        self._ensureDataDir()
        pd.DataFrame(train_data).to_csv(self._dataFilePath(0), index=False, header=False)

        # Start from a clean node array so that fit() can be called repeatedly.
        self.m_NodeArray = [C_Node() for _ in range(self.m_MaxNodeNum)]
        self.m_NodeArray[0].setNode(0,0,-1)   # Setting Root Node of the Tree
        self.m_CurrNodeNum = 1

        # Children always receive larger indices than their parent, so walking
        # the array in order visits every created node exactly once.
        nodeCount = 0
        while nodeCount < self.m_CurrNodeNum:
            node = self.m_NodeArray[nodeCount]
            dataFileName = self._dataFilePath(node.m_NodeIndx)

            isDecisionNode,dataPointNum,label,data_left,data_right,centroids = \
                self.checkTerminationCondition(node,dataFileName)

            node.m_DataLength = dataPointNum
            node.m_Label = label
            node.m_Centroids = centroids
            node.m_IsDecisionNode = isDecisionNode

            if isDecisionNode:
                lci = self.m_CurrNodeNum
                rci = lci+1
                node.m_LeftChildIndx = lci
                node.m_RightChildIndx = rci
                print(str(node.m_NodeIndx)+"-----node index")
                print(str(node.m_Label)+"-------node Label")
                print(str(node.m_NodeDepth)+"------node Depth")
                print(str(dataPointNum)+"----- no. of datapoints")
                print("--------------")

                childDepth = node.m_NodeDepth+1
                self.m_NodeArray[lci].setNode(lci,childDepth,node.m_NodeIndx)
                self.m_NodeArray[rci].setNode(rci,childDepth,node.m_NodeIndx)

                self.splitDataFile(node,data_left,data_right)
                self.m_CurrNodeNum = self.m_CurrNodeNum+2
            else:
                print(nodeCount)
                print("label-"+str(node.m_Label))
                print("depth-"+str(node.m_NodeDepth))
                print("point_count-"+str(dataPointNum))
                print("----------------------")

            nodeCount += 1

        print("Tree Model Trained!!!!!!!!")

    def predict(self,X_test):  # Predicting the Labels of the Unknown Test Point
        """Return an ``(n_samples, 1)`` float array with one predicted label per row.

        Each row is routed from the root through the decision nodes via
        ``decisionRule`` until a non-decision node is reached; that node's
        label is the prediction.
        """
        X_test = np.asarray(X_test)
        pred = np.empty((X_test.shape[0],1))
        for i, x in enumerate(X_test):
            nodeCount = 0
            while (nodeCount < self.m_MaxNodeNum
                   and self.m_NodeArray[nodeCount].m_IsDecisionNode):
                currNode = self.m_NodeArray[nodeCount]
                if self.decisionRule(x,currNode) == 0:
                    nodeCount = currNode.m_LeftChildIndx
                else:
                    nodeCount = currNode.m_RightChildIndx

            pred[i][0] = self.m_NodeArray[nodeCount].m_Label
        return pred

##################################################################################################################

# Tree hyper-parameters and the folder that receives the per-node CSV files.
dataNumThresh, depth = 20, 5
path = os.getcwd()

# Every column but the last is used as a feature; the last column is the label.
Data = np.genfromtxt("./pendigits.csv", delimiter=',')
X, y = Data[:, :-1], Data[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 2**(depth+1)-1 is the node count of a complete binary tree of that depth.
clf = C_Tree(
    maxDepth=depth,
    maxNodeNum=2**(depth+1)-1,
    path=path,
    dataNumThresh=dataNumThresh,
)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred, normalize=True)
print("Accuracy",accuracy)
print("*******************************************")

0-----node index
None-------node Label
0------node Depth
8793----- no. of datapoints
--------------
1-----node index
None-------node Label
1------node Depth
6670----- no. of datapoints
--------------
2-----node index
None-------node Label
1------node Depth
2123----- no. of datapoints
--------------
3-----node index
None-------node Label
2------node Depth
2097----- no. of datapoints
--------------
4-----node index
None-------node Label
2------node Depth
4573----- no. of datapoints
--------------
5-----node index
None-------node Label
2------node Depth
1035----- no. of datapoints
--------------
6-----node index
None-------node Label
2------node Depth
1088----- no. of datapoints
--------------
7-----node index
None-------node Label
3------node Depth
1014----- no. of datapoints
--------------
8-----node index
None-------node Label
3------node Depth
1083----- no. of datapoints
--------------
9-----node index
None-------node Label
3------node Depth
2039----- no. of datapoints
--------------
