In [6]:
class Node:
    """One vertex of a binary decision tree.

    An internal vertex routes a sample by comparing the value at index
    ``feature`` against ``threshold``; a vertex without children is a leaf.
    ``answer`` holds the prediction associated with the vertex.
    """

    def __init__(self, feature, threshold, answer=0, left=None, right=None):
        """Store the split description, the prediction and optional children."""
        self.feature, self.threshold = feature, threshold
        self.answer = answer
        self.left, self.right = left, right

    def setleft(self, left):
        """Attach ``left`` as the left child."""
        self.left = left

    def setright(self, right):
        """Attach ``right`` as the right child."""
        self.right = right

In [7]:
import numpy as np
from sklearn.preprocessing import LabelEncoder   

def accuracy_score(true, pred):
    """Return the fraction of positions where ``true`` and ``pred`` agree.

    Parameters
    ----------
    true, pred : array-like
        Equal-length sequences of labels (lists, tuples, arrays, Series).

    Returns
    -------
    float
        Mean of the element-wise equality, i.e. a value in ``[0, 1]``.
    """
    # Coerce to arrays first: comparing two plain lists with `==` produces a
    # single bool rather than an element-wise mask, and `.mean()` then fails.
    return (np.asarray(true) == np.asarray(pred)).mean()

def cvtToFeasibleDtype(colArray):
    """Cast one column to a dtype the tree code can work with.

    * ``int32`` / ``int64`` columns are returned unchanged (same object).
    * Anything else is cast to ``float`` when every value converts.
    * If the float cast fails, the column is cast to ``str`` instead.

    Parameters
    ----------
    colArray : numpy.ndarray
        The values of a single column.

    Returns
    -------
    numpy.ndarray
        The original array, a float array, or a unicode-string array.
    """
    if colArray.dtype == np.dtype('int32') or colArray.dtype == np.dtype('int64'):
        return colArray
    try:
        return colArray.astype(float)
    except (ValueError, TypeError):
        # Only conversion failures mean "this column is not numeric".
        # Anything else (e.g. KeyboardInterrupt) must propagate.
        return colArray.astype(str)

def squaredSumOfFractions(yi):
    """Return the sum of squared class proportions of ``yi``.

    ``1 - squaredSumOfFractions(yi)`` is the Gini impurity of the labels.

    Parameters
    ----------
    yi : array-like
        Labels of the samples in one node.

    Returns
    -------
    float
        ``sum(p_k ** 2)`` over the distinct labels; ``0.0`` for empty input.
    """
    yi = np.asarray(yi)
    _, counts = np.unique(yi, return_counts=True)
    # For empty input `counts` is empty too, so the division by zero never
    # touches an element and the dot product below is 0.0.
    fractions = counts / yi.shape[0]
    # Sum the squares directly: squaring a Euclidean norm goes through a
    # square root and adds rounding error (a 50/50 node gave 0.5000000000000001).
    return np.dot(fractions, fractions)
    
def computeGini(x, y, threshold):
    """Weighted Gini impurity of splitting ``y`` by ``x <= threshold``.

    Each side's impurity (``1 - squaredSumOfFractions``) is weighted by the
    share of samples that fall on that side.
    """
    total = len(x)
    goesLeft = x <= threshold
    goesRight = x > threshold

    leftShare = np.count_nonzero(goesLeft) / total
    rightShare = np.count_nonzero(goesRight) / total

    leftImpurity = 1 - squaredSumOfFractions(y[goesLeft])
    rightImpurity = 1 - squaredSumOfFractions(y[goesRight])
    return leftShare * leftImpurity + rightShare * rightImpurity
    
class DecisionTree:
    """Binary classification tree grown greedily on Gini impurity.

    Internally the data is kept feature-major: after ``preprocess`` the array
    has one row per feature and one column per sample, so ``X[f]`` is a
    feature column and ``X[:, mask]`` selects samples.

    Parameters
    ----------
    maxHeight : int
        Depth at which growth stops and a leaf is emitted.
    minSamplesSplit : int
        Nodes holding fewer samples than this become leaves.
    """

    def __init__(self, maxHeight=10, minSamplesSplit=3):
        self.maxHeight = maxHeight
        self.minSamplesSplit = minSamplesSplit

    def selectFeature(self, X, y):
        """Find the (feature, threshold) pair with the lowest weighted Gini.

        Candidate thresholds for a feature are the midpoints between its
        consecutive distinct values; a constant feature contributes its single
        value as the only candidate.

        Returns
        -------
        tuple
            ``(feature_index, threshold, weighted_gini)`` of the best split.
        """
        featurewiseImpurity = []
        thresholds = []

        for feature in range(X.shape[0]):
            column = X[feature]
            # np.unique already returns its values sorted.
            uniquevals = np.unique(column)

            if len(uniquevals) == 1:
                possiblethresholds = uniquevals[:1]
            else:
                possiblethresholds = (uniquevals[1:] + uniquevals[:-1]) / 2

            thresholdwiseImpurity = [computeGini(column, y, threshold)
                                     for threshold in possiblethresholds]

            whichthreshold = np.argmin(thresholdwiseImpurity)
            thresholds.append(possiblethresholds[whichthreshold])
            featurewiseImpurity.append(thresholdwiseImpurity[whichthreshold])

        whichfeature = np.argmin(featurewiseImpurity)
        return whichfeature, thresholds[whichfeature], featurewiseImpurity[whichfeature]

    def build_tree(self, X, y, giniIndex, height=0):
        """Recursively grow the subtree for the samples in ``X`` / ``y``.

        Parameters
        ----------
        X : numpy.ndarray
            Feature-major data, shape ``(n_features, n_samples)``.
        y : numpy.ndarray
            Non-negative integer class codes, one per sample.
        giniIndex : float
            Gini impurity of this node's own labels; a split whose weighted
            impurity exceeds it is rejected.
        height : int
            Depth of this node (root is 0).

        Returns
        -------
        Node
            Root of the grown subtree. Leaves carry ``''`` as feature and
            threshold; every node stores its majority class in ``answer``.
        """
        majority = np.bincount(y).argmax()

        if (np.unique(y).shape[0] == 1
                or height >= self.maxHeight
                or X.shape[1] < self.minSamplesSplit):
            return Node('', '', majority)

        feature, threshold, newGini = self.selectFeature(X, y)
        # A constant best feature means no split can separate the samples.
        if newGini > giniIndex or np.unique(X[feature]).shape[0] == 1:
            return Node('', '', majority)

        root = Node(feature, threshold, majority)
        leftRows, rightRows = X[feature] <= threshold, X[feature] > threshold

        for rows, attach in ((leftRows, root.setleft), (rightRows, root.setright)):
            childLabels = y[rows]
            # Give each child the impurity of *its own* labels as baseline.
            # Passing down the parent's weighted split impurity (an average
            # over both children) made the impurer child stop splitting early.
            childGini = 1 - squaredSumOfFractions(childLabels)
            attach(self.build_tree(X[:, rows], childLabels, childGini,
                                   height=height + 1))
        return root

    def preprocess(self, X, y=None, fit=True):
        """Convert tabular input to a numeric, feature-major array.

        Every column is passed through ``cvtToFeasibleDtype``; columns that end
        up as unicode strings are label-encoded. With ``fit=True`` the encoders
        (and, if ``y`` is given, the label encoder ``self.le``) are fitted and
        stored; with ``fit=False`` the stored encoders are reused.

        Returns
        -------
        numpy.ndarray or tuple
            The encoded array of shape ``(n_features, n_samples)``, or
            ``(X, encoded_y)`` when ``y`` is supplied.
        """
        X = np.array(X).T
        X = [cvtToFeasibleDtype(X[col]) for col in range(len(X))]

        if not fit:
            X = np.array([self.encoders[col].transform(X[col])
                          if col in self.encoders else X[col]
                          for col in range(len(X))])
            if y is not None:
                return X, self.le.transform(y)
            return X

        self.dtypes = [col.dtype for col in X]
        # Only string-typed columns need an encoder.
        self.encoders = {i: LabelEncoder() for i, dtype in enumerate(self.dtypes)
                         if dtype.char == 'U'}
        X = np.array([self.encoders[col].fit_transform(X[col])
                      if dtype.char == 'U' else X[col]
                      for col, dtype in enumerate(self.dtypes)])
        if y is not None:
            self.le = LabelEncoder()
            return X, self.le.fit_transform(y)
        return X

    def fit(self, X, y):
        """Fit the encoders and grow the tree on samples ``X`` with labels ``y``."""
        X, y = self.preprocess(X, y)
        # The root is compared against the impurity of the full label set.
        self.root = self.build_tree(X, y, 1 - squaredSumOfFractions(y))

    def predict(self, X):
        """Return the encoded class code predicted for every row of ``X``."""
        X = self.preprocess(X, fit=False).T
        predictions = []
        for x in X:
            node = self.root
            # Internal nodes always have both children; leaves have none.
            while node.left:
                if x[node.feature] <= node.threshold:
                    node = node.left
                else:
                    node = node.right
            predictions.append(node.answer)
        return np.array(predictions)

    def predict_classes(self, X):
        """Return predictions mapped back to the original label values."""
        return self.le.inverse_transform(self.predict(X))

    def score(self, X, y):
        """Return the accuracy of the tree on ``X`` against raw labels ``y``."""
        return accuracy_score(self.predict(X), self.le.transform(y))

In [11]:
import random

class RandomForest():
    """Bagged ensemble of ``DecisionTree`` classifiers with majority voting.

    Each tree is trained on a bootstrap sample of the rows (drawn with
    replacement) restricted to a random subset of the columns (drawn without
    replacement).

    Parameters
    ----------
    num_of_trees : int
        Number of trees to grow.
    samples_per_tree : int or None
        Rows drawn for each tree. ``None`` keeps the original default of
        ``int(sqrt(n_rows))``.
    features_per_tree : int or None
        Columns drawn for each tree; must not exceed the number of columns.
        ``None`` keeps the original default of ``ceil(log2(n_columns + 1))``.
    random_state : int or None
        Seed for a private random generator. ``None`` draws from the global
        ``random`` module, so ``random.seed`` still controls the result.
    """

    def __init__(self, num_of_trees=200, samples_per_tree=None,
                 features_per_tree=None, random_state=None):
        self.num_of_trees = num_of_trees
        self.samples_per_tree = samples_per_tree
        self.features_per_tree = features_per_tree
        self.random_state = random_state

    def preprocess(self, X, y=None, fit=True):
        """Convert tabular input to a numeric, feature-major array.

        Every column is passed through ``cvtToFeasibleDtype``; columns that end
        up as unicode strings are label-encoded. With ``fit=True`` the encoders
        (and, if ``y`` is given, the label encoder ``self.le``) are fitted and
        stored; with ``fit=False`` the stored encoders are reused.

        Returns
        -------
        numpy.ndarray or tuple
            The encoded array of shape ``(n_features, n_samples)``, or
            ``(X, encoded_y)`` when ``y`` is supplied.
        """
        X = np.array(X).T
        X = [cvtToFeasibleDtype(X[col]) for col in range(len(X))]

        if not fit:
            X = np.array([self.encoders[col].transform(X[col])
                          if col in self.encoders else X[col]
                          for col in range(len(X))])
            if y is not None:
                return X, self.le.transform(y)
            return X

        self.dtypes = [col.dtype for col in X]
        # Only string-typed columns need an encoder.
        self.encoders = {i: LabelEncoder() for i, dtype in enumerate(self.dtypes)
                         if dtype.char == 'U'}
        X = np.array([self.encoders[col].fit_transform(X[col])
                      if dtype.char == 'U' else X[col]
                      for col, dtype in enumerate(self.dtypes)])
        if y is not None:
            self.le = LabelEncoder()
            return X, self.le.fit_transform(y)
        return X

    def fit(self, X, y):
        """Grow ``num_of_trees`` trees, each on its own row/column sample."""
        X, y = self.preprocess(X, y)
        X = X.T  # back to one row per sample
        n_rows, n_cols = X.shape

        if self.samples_per_tree is None:
            num_of_samples = int(np.sqrt(n_rows))
        else:
            num_of_samples = self.samples_per_tree
        if self.features_per_tree is None:
            num_of_features = int(np.ceil(np.log2(n_cols + 1)))
        else:
            num_of_features = self.features_per_tree

        # The `random` module and a `random.Random` instance both expose
        # `sample` and `choices`, so either can serve as the generator.
        rng = random if self.random_state is None else random.Random(self.random_state)

        self.trees = []
        self.features = []
        for _ in range(self.num_of_trees):
            features = rng.sample(range(n_cols), k=num_of_features)
            indicies = rng.choices(range(n_rows), k=num_of_samples)
            tree = DecisionTree()
            tree.fit(X[indicies][:, features], y[indicies])
            self.trees.append(tree)
            # Remember the columns so predict() feeds the tree the same ones.
            self.features.append(features)

    def predict(self, X):
        """Return the majority-vote encoded class code for every row of ``X``."""
        X = self.preprocess(X, fit=False).T
        # Each tree was fitted on the forest's encoded labels, so its
        # predict_classes() yields forest-level integer codes.
        results = [tree.predict_classes(X[:, feature])
                   for tree, feature in zip(self.trees, self.features)]
        return np.apply_along_axis(lambda polls: np.bincount(polls).argmax(),
                                   0, np.array(results))

    def predict_classes(self, X):
        """Return predictions mapped back to the original label values."""
        return self.le.inverse_transform(self.predict(X))

    def score(self, X, y):
        """Return the accuracy of the forest on ``X`` against raw labels ``y``."""
        # predict() returns encoded codes, so the raw labels must be encoded
        # before comparing; DecisionTree.score does the same.
        return accuracy_score(self.predict(X), self.le.transform(y))

In [14]:
from sklearn.metrics import classification_report
import pandas as pd
# RandomForest draws its bootstrap rows and feature subsets from the stdlib
# `random` module; seed it so re-running the notebook rebuilds the same forest.
SEED = 42
random.seed(SEED)

# `income` is the class label; every other column is used as a feature.
df=pd.read_csv('wealth.csv')
X=df.drop(columns=['income'])
y=df['income']

cls=RandomForest()
cls.fit(X, y)
# NOTE: predictions are made on the same rows the forest was fitted on, so
# this report measures training fit, not generalisation.
pred=cls.predict_classes(X)
print(classification_report(y, pred))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

middle-class       0.76      1.00      0.86     24720
  rich-class       0.00      0.00      0.00      7841

    accuracy                           0.76     32561
   macro avg       0.38      0.50      0.43     32561
weighted avg       0.58      0.76      0.66     32561



In [13]:
# Display the prediction array.
# NOTE(review): this cell's execution count (13) precedes the training cell's
# (14), and the recorded output holds integer codes rather than label strings,
# so the output is probably stale. Re-run after the training cell to confirm.
pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)