#Random Forest  
##Algorithm   
###Input:   
Training data D with shape (n, m) where n is the number of samples and m is the number of features.   
###Steps:
1. Sample q datasets, each of shape (n, m), say $D_1, D_2, \ldots, D_q$, with replacement from D.   
2. For each dataset $D_j$, train a full decision tree $h_j(x)$, randomly selecting u out of the m features (where u <= m) as split candidates before each split.   
3. The final predictor is:   
  * For regression, an average output from q regressors is assigned to the new example.   
  $$ h(x) = \frac{1}{q} \sum_{j=1}^{q} h_j(x)   
  $$   
  * For classification, a majority vote is taken and the class label with the maximum number of votes is assigned to the new example.

In [None]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.tree import DecisionTreeClassifier

###Random Forest Bagging   
Define a function bag to create bootstrap samples $D_1, D_2, \ldots, D_q$ from the original dataset $D$.   
The key step is np.random.choice with size = n_samples and replace=True, which ensures that each bootstrap sample has the same number of samples as the original dataset and is obtained by sampling with replacement.

In [None]:
def bag(X, y, random_state=None):
  """Draw one bootstrap sample: n_samples rows chosen with replacement.

  Args:
    X: Feature array whose first axis indexes samples; it must expose
      ``.shape`` and support NumPy integer-array indexing.
    y: Label array aligned with X along the first axis, indexable the same way.
    random_state: Optional seed or ``np.random.Generator`` for a reproducible
      draw. ``None`` (the default) uses NumPy's global random state, so
      successive calls give different samples. Note that passing the same
      integer seed on every call yields the same sample every time.

  Returns:
    Tuple ``(X_sample, y_sample)`` containing the same randomly chosen rows
    of X and y, each with as many rows as the input.
  """
  n_samples = X.shape[0]

  # np.random.choice accepts no ``random_state`` keyword (passing one raises
  # TypeError), so seeding is routed through a Generator instead.
  if random_state is None:
    indices = np.random.choice(n_samples, size=n_samples, replace=True)
  else:
    rng = np.random.default_rng(random_state)
    indices = rng.choice(n_samples, size=n_samples, replace=True)

  return X[indices], y[indices]

###Majority Voting

In [None]:
def most_common_label(y):
  """Return the label that occurs most often in ``y`` (the majority vote).

  Args:
    y: Iterable of hashable labels.

  Returns:
    The label with the highest count; ties are resolved by whichever entry
    ``Counter.most_common`` ranks first.

  Raises:
    IndexError: If ``y`` is empty.
  """
  tally = Counter(y)
  winner, _votes = tally.most_common(1)[0]
  return winner

###Random Forest Class

In [None]:
class RandomForest:
  """Bagged ensemble of decision-tree classifiers combined by majority vote.

  Each tree is a ``DecisionTreeClassifier`` trained on its own bootstrap
  sample of the training data (drawn by ``bag``); ``predict`` returns, for
  every sample, the label chosen by most trees (``most_common_label``).

  Args:
    n_trees: Number of trees to train.
    min_sapmle_split: Passed to each tree as ``min_samples_split``. The
      misspelled name is kept so existing keyword callers keep working.
    max_depth: Passed to each tree as ``max_depth``.
    max_features: Passed to each tree as ``max_features``.
  """

  def __init__(self, n_trees=10, min_sapmle_split=2, max_depth=100, max_features=None):
    self.n_trees = n_trees
    self.min_sapmle_split = min_sapmle_split
    self.max_depth = max_depth
    self.max_features = max_features
    self.trees = []  # fitted trees; empty until fit() is called

  @staticmethod
  def _to_array(data):
    """Turn pandas objects and plain sequences into ndarrays; pass others through."""
    if hasattr(data, "to_numpy"):
      return data.to_numpy()
    if isinstance(data, (list, tuple)):
      return np.asarray(data)
    return data

  def fit(self, X, y):
    """Train ``n_trees`` trees, each on a fresh bootstrap sample of (X, y).

    Any previously fitted trees are discarded.
    """
    # Bootstrap sampling indexes rows positionally, which pandas objects and
    # lists do not support directly, so normalise the inputs first.
    X = self._to_array(X)
    y = self._to_array(y)

    self.trees = []
    for _ in range(self.n_trees):
      tree = DecisionTreeClassifier(min_samples_split=self.min_sapmle_split,
                                    max_depth=self.max_depth,
                                    max_features=self.max_features
                                    )
      X_samples, y_samples = bag(X, y)
      tree.fit(X_samples, y_samples)
      self.trees.append(tree)

  def predict(self, X):
    """Return the majority-vote label for every row of X.

    Raises:
      ValueError: If no trees have been fitted yet.
    """
    if not self.trees:
      # Without this guard the transpose below fails with an obscure axis error.
      raise ValueError(
          "RandomForest has no fitted trees; call fit() with n_trees >= 1 "
          "before predict()."
      )
    X = self._to_array(X)
    # Shape (n_trees, n_samples): one row of predictions per tree.
    tree_predict = np.array([tree.predict(X) for tree in self.trees])
    # Transpose so that each row holds all trees' votes for one sample.
    votes_per_sample = np.swapaxes(tree_predict, 0, 1)
    y_predict = [most_common_label(votes) for votes in votes_per_sample]
    return np.array(y_predict)

### Random Forest Implementation:

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

def accuracy(y_true, y_pred):
  """Return the fraction of predictions that equal the true labels.

  Args:
    y_true: Ground-truth labels (array-like).
    y_pred: Predicted labels (array-like) with the same shape as ``y_true``.

  Returns:
    Share of matching positions as a float in [0, 1]; NaN for empty input.

  Raises:
    ValueError: If the two inputs have different shapes.
  """
  # Convert first: comparing two plain lists with == yields a single bool,
  # not an element-wise result, which would give a wrong score.
  y_true = np.asarray(y_true)
  y_pred = np.asarray(y_pred)
  if y_true.shape != y_pred.shape:
    raise ValueError(
        f"y_true and y_pred must have the same shape, "
        f"got {y_true.shape} and {y_pred.shape}"
    )
  return np.mean(y_pred == y_true)

# Load scikit-learn's bundled breast cancer dataset as features and labels.
data = datasets.load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Train a 10-tree forest and predict labels for the held-out samples.
clf = RandomForest(n_trees=10, max_depth=100, max_features='sqrt')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Report the share of test samples that were classified correctly.
acc = accuracy(y_test, y_pred)
print('Accuracy: ', acc)

Accuracy:  0.9473684210526315


In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the held-out labels against the forest's predictions
# (scikit-learn convention: rows are true classes, columns are predicted classes).
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[36  6]
 [ 0 72]]


In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall, F1-score and support for the test-set predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.86      0.92        42
           1       0.92      1.00      0.96        72

    accuracy                           0.95       114
   macro avg       0.96      0.93      0.94       114
weighted avg       0.95      0.95      0.95       114

