# Code 

In [1]:
#import python module
import math

import numpy as np

### To code a Random Forest, we need two classes: a decision tree and the random forest that combines many such trees.

In [4]:
#creating decision tree
class DecisionTree():
    '''
    A regression tree node that recursively grows its own subtree.

    x: independent variables of the training set (pandas DataFrame; rows are
       addressed by position through x.values)
    y: dependent variable of the training set (converted to a numpy array so
       that rows are addressed by position)
    n_features: number of features drawn at random for each child node
    f_idxs: indices of the columns this node may consider for its split
    idxs: positional indices of the rows of x / y that belong to this node
    depth: remaining depth budget; a node with depth <= 0 is a leaf
    min_leaf: size guard for the two sides of a split (see
              find_better_split for the exact bound)
    '''
    def __init__(self, x, y, n_features, f_idxs, idxs, depth=7, min_leaf=4):
        '''
        Decision tree constructor: stores the node's data, computes its
        prediction value and immediately grows the subtree below it.
        '''
        self.x, self.min_leaf, self.f_idxs = x, min_leaf, f_idxs
        # Arrays guarantee positional (not label-based) fancy indexing below.
        self.y = np.asarray(y)
        self.idxs = np.asarray(idxs)
        self.depth = depth
        self.n_features = n_features
        self.n, self.c = len(self.idxs), x.shape[1]
        # Prediction of this node: mean target of the rows it contains.
        self.val = np.mean(self.y[self.idxs])
        # Stays at infinity when no valid split is found (node is a leaf).
        self.score = float('inf')
        self.find_varsplit()

    def find_varsplit(self):
        '''
        Finds the best split among the candidate columns in f_idxs and, unless
        this node is a leaf, creates the left and right child nodes.
        '''
        for i in self.f_idxs:
            self.find_better_split(i)
        if self.is_leaf:
            return

        col = self.split_col
        lhs = np.nonzero(col <= self.split)[0]
        rhs = np.nonzero(col > self.split)[0]
        # Every child draws its own random subset of candidate features.
        lf_idxs = np.random.permutation(self.c)[:self.n_features]
        rf_idxs = np.random.permutation(self.c)[:self.n_features]
        self.lhs = DecisionTree(self.x, self.y, self.n_features, lf_idxs,
                                self.idxs[lhs], depth=self.depth - 1,
                                min_leaf=self.min_leaf)
        self.rhs = DecisionTree(self.x, self.y, self.n_features, rf_idxs,
                                self.idxs[rhs], depth=self.depth - 1,
                                min_leaf=self.min_leaf)

    def find_better_split(self, var_idx):
        '''
        Finds the best possible split in column var_idx and records it in
        var_idx / score / split if it beats the best split found so far.

        The score of a split is the count-weighted sum of the standard
        deviations of the target on both sides; lower is better.
        '''
        x, y = self.x.values[self.idxs, var_idx], self.y[self.idxs]
        sort_idx = np.argsort(x)
        sort_y, sort_x = y[sort_idx], x[sort_idx]
        # Running aggregates: every row starts on the right-hand side and is
        # moved to the left-hand side one at a time.
        rhs_cnt, rhs_sum, rhs_sum2 = self.n, sort_y.sum(), (sort_y**2).sum()
        lhs_cnt, lhs_sum, lhs_sum2 = 0, 0., 0.

        # The loop bound and the i < min_leaf guard below together keep at
        # least min_leaf + 1 rows on each side of any accepted split.
        for i in range(0, self.n - self.min_leaf - 1):
            xi, yi = sort_x[i], sort_y[i]
            lhs_cnt += 1
            rhs_cnt -= 1
            lhs_sum += yi
            rhs_sum -= yi
            lhs_sum2 += yi**2
            rhs_sum2 -= yi**2
            # Cannot split between two equal values of the feature.
            if i < self.min_leaf or xi == sort_x[i + 1]:
                continue
            lhs_std = std_agg(lhs_cnt, lhs_sum, lhs_sum2)
            rhs_std = std_agg(rhs_cnt, rhs_sum, rhs_sum2)
            curr_score = lhs_std*lhs_cnt + rhs_std*rhs_cnt
            if curr_score < self.score:
                self.var_idx, self.score, self.split = var_idx, curr_score, xi

    # Properties below keep the rest of the code concise.
    @property
    def split_name(self):
        '''
        Returns the name of the column this node splits on.
        '''
        return self.x.columns[self.var_idx]

    @property
    def split_col(self):
        '''
        Returns the values of the split column for the rows of this node.
        '''
        return self.x.values[self.idxs, self.var_idx]

    @property
    def is_leaf(self):
        '''
        True when no valid split was found or the depth budget is used up.
        '''
        return self.score == float('inf') or self.depth <= 0

    def predict(self, x):
        '''
        Predicts one value per row of x (2-D array or DataFrame whose columns
        are in the same order as the training data).
        '''
        # np.asarray makes a DataFrame iterate row by row instead of yielding
        # its column labels.
        return np.array([self.predict_row(xi) for xi in np.asarray(x)])

    def predict_row(self, xi):
        '''
        Predicts a single row by walking down the tree to a leaf.
        '''
        if self.is_leaf:
            return self.val
        t = self.lhs if xi[self.var_idx] <= self.split else self.rhs
        return t.predict_row(xi)


In [7]:
#creating random forest class
class RandomForest():
    '''
    Random forest regressor: averages the predictions of several decision
    trees, each grown on its own random sample of rows.

    x: independent variables of training set (pandas DataFrame)
    y: dependent variable of training set (converted to a numpy array)
    n_trees: number of trees to grow
    n_features: number of features considered per split, or 'sqrt' / 'log2'
                to derive it from the number of columns of x
    sample_size: number of rows drawn (without replacement) for each tree
    depth: depth of each decision tree
    min_leaf: minimum number of rows in a node
    '''

    def __init__(self, x, y, n_trees, n_features, sample_size, depth=7, min_leaf=4):
        '''
        Stores the hyper-parameters and grows n_trees decision trees.

        NOTE: this seeds numpy's global random generator with 10 so that the
        forest is reproducible.
        '''
        np.random.seed(10)
        n_cols = x.shape[1]
        if n_features == 'sqrt':
            self.n_features = max(1, int(np.sqrt(n_cols)))
        elif n_features == 'log2':
            # log2(1) == 0 would leave the trees without any candidate feature.
            self.n_features = max(1, int(np.log2(n_cols)))
        else:
            self.n_features = n_features
        self.x, self.depth, self.min_leaf = x, depth, min_leaf
        # An array guarantees positional indexing when sampling rows.
        self.y = np.asarray(y)
        self.sample_sz = sample_size
        self.trees = [self.create_tree() for _ in range(n_trees)]

    def create_tree(self):
        '''
        Creates one decision tree on a random sample of rows, starting from a
        random subset of features.
        '''
        idxs = np.random.permutation(len(self.y))[:self.sample_sz]
        f_idxs = np.random.permutation(self.x.shape[1])[:self.n_features]
        # len(idxs) (not sample_sz) keeps the row indices in range when the
        # requested sample is larger than the training set.
        return DecisionTree(self.x.iloc[idxs], self.y[idxs], self.n_features, f_idxs,
                            idxs=np.arange(len(idxs)), depth=self.depth,
                            min_leaf=self.min_leaf)

    def predict(self, x):
        '''
        Predicts one value per row of x as the mean of the trees' predictions.
        '''
        # Convert once so a DataFrame is handed to the trees as rows.
        rows = np.asarray(x)
        return np.mean([t.predict(rows) for t in self.trees], axis=0)

def std_agg(cnt, s1, s2):
    '''
    Computes the (population) standard deviation from running aggregates.

    cnt: number of values
    s1: sum of the values
    s2: sum of the squared values
    '''
    variance = (s2/cnt) - (s1/cnt)**2
    # Floating-point cancellation can push a near-zero variance slightly
    # below zero, which would make math.sqrt raise ValueError.
    return math.sqrt(max(variance, 0.0))