In [61]:
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
import pandas as pd

In [62]:
def bootstrap_sample(X, y):
    """Draw a bootstrap resample of a dataset.

    Picks ``X.shape[0]`` row positions uniformly at random *with* replacement
    (using NumPy's global random state) and indexes both inputs with the same
    positions, so every sampled row keeps its own label.

    Args:
        X: Array exposing ``.shape`` and supporting integer-array indexing;
            the first axis enumerates samples.
        y: Array indexable by the same positions as ``X``.

    Returns:
        Tuple ``(X_sample, y_sample)`` holding the resampled rows and labels.
    """
    n_rows = X.shape[0]
    picked = np.random.choice(n_rows, size=n_rows, replace=True)
    X_sample = X[picked]
    y_sample = y[picked]
    return X_sample, y_sample
def most_common_label(y):
    """Return the label that occurs most often in ``y``.

    Args:
        y: Iterable of hashable labels.

    Returns:
        The label with the highest count, as reported by
        ``Counter.most_common``.

    Raises:
        IndexError: If ``y`` is empty, because there is no top entry to read.
    """
    ranked = Counter(y).most_common(1)
    top_label, _ = ranked[0]
    return top_label


In [63]:
def entropy(y):
    """Shannon entropy, in bits, of the empirical label distribution of ``y``.

    Class frequencies are counted with ``np.unique`` rather than
    ``np.bincount`` so that any label encoding works: negative integers
    (e.g. -1/+1), strings, or sparse large integers. ``np.bincount`` only
    accepts non-negative integers.

    Args:
        y: Array-like of class labels.

    Returns:
        ``-sum(p * log2(p))`` over the observed classes. An empty input has
        no uncertainty and yields ``0.0``.
    """
    labels = np.asarray(y)
    if labels.size == 0:
        return 0.0
    # Only labels that actually occur are returned, so every p is > 0 and
    # log2 never sees a zero.
    _, counts = np.unique(labels, return_counts=True)
    probs = counts / labels.size
    return -np.sum(probs * np.log2(probs))

In [64]:
class Node:
    """One node of a binary decision tree.

    An internal node stores the feature index and threshold it tests together
    with its two children. A leaf stores only ``value``, the label it predicts.
    ``value`` is keyword-only so it cannot be set positionally by accident.
    """

    def __init__(
        self, feature=None, threshold=None, left=None, right=None, *, value=None
    ):
        # Split description (left as None on leaves).
        self.feature, self.threshold = feature, threshold
        # Child subtrees (left as None on leaves).
        self.left, self.right = left, right
        # Predicted label (left as None on internal nodes).
        self.value = value

    def is_leaf_node(self):
        """Return True when this node carries a prediction (``value`` is set)."""
        has_no_value = self.value is None
        return not has_no_value

In [82]:
class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers with majority voting.

    Every tree is trained on its own bootstrap resample of the training data,
    and predictions are the most common label across the trees.

    Args:
        n_trees: Number of trees to grow in ``fit``.
        min_samples_split: Forwarded to every ``DecisionTree``.
        max_depth: Forwarded to every ``DecisionTree``.
        n_feats: Forwarded to every ``DecisionTree``. When left as ``None``
            each tree considers all features at every split, so the only
            randomness comes from the bootstrap resampling.
    """

    def __init__(self, n_trees=10, min_samples_split=2, max_depth=100, n_feats=None):
        self.n_trees = n_trees
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.trees = []

    def predict(self, X):
        """Predict one label per row of ``X`` by majority vote over the trees.

        Args:
            X: Array-like of shape ``(n_samples, n_features)``.

        Returns:
            1-D array with one voted label per sample.

        Raises:
            ValueError: If the forest has no trees (``fit`` was never called
                or ``n_trees`` was 0).
        """
        if not self.trees:
            # Without this guard the failure is an opaque numpy axis error.
            raise ValueError(
                "RandomForest has no trees; call fit() before predict()."
            )
        # Iterating a DataFrame yields column names, so go through an array.
        X = np.asarray(X)
        # Per-tree predictions are (n_trees, n_samples); transpose so each row
        # collects all the votes for a single sample.
        votes = np.array([tree.predict(X) for tree in self.trees]).T
        return np.array([most_common_label(sample_votes) for sample_votes in votes])

    def fit(self, X, y):
        """Grow ``n_trees`` trees, each on a fresh bootstrap resample.

        Any previously fitted trees are discarded.

        Args:
            X: Array-like of shape ``(n_samples, n_features)``.
            y: Array-like of ``n_samples`` class labels.
        """
        # Bootstrap sampling indexes by position. A pandas Series/DataFrame
        # would be indexed by label instead (wrong rows or KeyError once the
        # index has been shuffled), so coerce to plain arrays first.
        X = np.asarray(X)
        y = np.asarray(y)

        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTree(
                min_samples_split=self.min_samples_split,
                max_depth=self.max_depth,
                n_feats=self.n_feats,
            )
            X_samp, y_samp = bootstrap_sample(X, y)
            tree.fit(X_samp, y_samp)
            self.trees.append(tree)

    

    

In [83]:
class DecisionTree:
    """Binary classification tree grown greedily by information gain.

    Args:
        min_samples_split: A node with fewer samples than this becomes a leaf.
        max_depth: Nodes at this depth (root is depth 0) become leaves.
        n_feats: How many randomly chosen features to consider at each split.
            ``None`` (or 0) means all features.
    """

    def __init__(self, min_samples_split=2, max_depth=100, n_feats=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.root = None

    def fit(self, X, y):
        """Build the tree from a 2-D feature array ``X`` and label array ``y``.

        Note that ``self.n_feats`` is overwritten with the effective feature
        count (clamped to the number of columns in ``X``).
        """
        self.n_feats = X.shape[1] if not self.n_feats else min(self.n_feats, X.shape[1])
        self.root = self._grow_tree(X, y)

    def predict(self, X):
        """Return an array holding the predicted label for each row of ``X``."""
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``(X, y)``."""
        if depth >= self.max_depth or len(np.unique(y)) == 1 or X.shape[0] < self.min_samples_split:
            return Node(value=self._most_common_label(y))

        # greedily select the best split according to information gain,
        # looking only at a random subset of the features
        feat_idxs = np.random.choice(X.shape[1], self.n_feats, replace=False)
        best_feat, best_thresh = self._best_criteria(X, y, feat_idxs)

        # Every sampled feature is constant here, so nothing can separate the
        # samples: stop with a majority-vote leaf instead of recursing into an
        # empty child (which used to crash in _most_common_label).
        if best_feat is None:
            return Node(value=self._most_common_label(y))

        # grow the children that result from the split
        left_idxs, right_idxs = self._split(X[:, best_feat], best_thresh)
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            # Safety net for splits that still leave one side empty
            # (e.g. NaN feature values that satisfy neither comparison).
            return Node(value=self._most_common_label(y))
        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth + 1)
        return Node(best_feat, best_thresh, left, right)

    def _best_criteria(self, X, y, feat_idxs):
        """Find the (feature, threshold) pair with the highest information gain.

        Returns:
            ``(split_idx, split_thresh)``, or ``(None, None)`` when none of the
            candidate features has two distinct values to split between.
        """
        best_gain = -1
        split_idx, split_thresh = None, None
        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            # np.unique is sorted ascending. The largest value is not a usable
            # threshold because `<= max` sends every sample to the left child.
            for threshold in np.unique(X_column)[:-1]:
                gain = self._information_gain(y, X_column, threshold)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_thresh = threshold

        return split_idx, split_thresh

    def _information_gain(self, y, X_column, split_thresh):
        """Entropy reduction obtained by splitting ``y`` at ``split_thresh``.

        Returns 0 when the split leaves one side empty.
        """
        # generate split
        left_idxs, right_idxs = self._split(X_column, split_thresh)
        n_l, n_r = len(left_idxs), len(right_idxs)
        # A one-sided split separates nothing; bail out before paying for the
        # entropy computations.
        if n_l == 0 or n_r == 0:
            return 0

        # compute the weighted avg. of the loss for the children
        e_l, e_r = entropy(y[left_idxs]), entropy(y[right_idxs])
        child_entropy = (n_l / len(y)) * e_l + (n_r / len(y)) * e_r

        # information gain is difference in loss before vs. after split
        return entropy(y) - child_entropy

    def _split(self, X_column, split_thresh):
        """Return positions with value ``<= split_thresh`` and ``> split_thresh``."""
        left_idxs = np.argwhere(X_column <= split_thresh).flatten()
        right_idxs = np.argwhere(X_column > split_thresh).flatten()
        return left_idxs, right_idxs

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x``; return its label."""
        if node.is_leaf_node():
            return node.value

        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    def _most_common_label(self, y):
        """Return the most frequent label in ``y`` (IndexError if ``y`` is empty)."""
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common

In [84]:
def accuracy(y_true, y_pred, show_confusion=True):
    """Fraction of predictions that match the true labels.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted labels, same length as ``y_true``.
        show_confusion: When True (the default), also print the confusion
            matrix and display it as a matplotlib figure. Pass False to get
            just the score, e.g. inside a loop.

    Returns:
        The share of positions where ``y_true`` and ``y_pred`` agree.
    """
    # Compare element-wise: with plain lists `y_true == y_pred` would be a
    # single bool and the score would silently be wrong.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if show_confusion:
        cm = confusion_matrix(y_true, y_pred)
        print(cm)

        plt.matshow(cm)
        plt.title('Confusion matrix')
        plt.colorbar()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.show()

    # Named `score` so the local no longer shadows this function's own name.
    score = np.sum(y_true == y_pred) / len(y_true)
    return score

In [85]:
if __name__ == "__main__":
    # Fix the seeds so the train/test split and the forest's bootstrap and
    # feature sampling are repeatable from a fresh kernel.
    SEED = 42
    np.random.seed(SEED)

    df = pd.read_csv("data3.csv")
    X = df.drop('species', axis=1).to_numpy()

    # Encode the species names as integer class ids with a lookup table.
    # Writing element by element into `Series.to_numpy()` is fragile: the
    # array can be a read-only view of the DataFrame (pandas Copy-on-Write).
    species_codes = {'Iris-setosa': 1, 'Iris-versicolor': 2, 'Iris-virginica': 3}
    y = df['species'].map(species_codes)
    if y.isna().any():
        unknown = df.loc[y.isna(), 'species'].unique().tolist()
        raise ValueError(f"Unexpected species labels in data3.csv: {unknown}")
    y = y.to_numpy().astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=SEED)

    clf = RandomForest(n_trees=8, max_depth=10)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy(y_test, y_pred)

    print("Accuracy:", acc)

Accuracy: 0.8666666666666667


In [99]:
if __name__ == "__main__":
    # Fix the seeds so the train/test split and every forest's bootstrap and
    # feature sampling are repeatable from a fresh kernel.
    SEED = 42
    np.random.seed(SEED)

    df = pd.read_csv("data3.csv")
    X = df.drop('species', axis=1).to_numpy()

    # Encode the species names as integer class ids with a lookup table.
    # Writing element by element into `Series.to_numpy()` is fragile: the
    # array can be a read-only view of the DataFrame (pandas Copy-on-Write).
    species_codes = {'Iris-setosa': 1, 'Iris-versicolor': 2, 'Iris-virginica': 3}
    y = df['species'].map(species_codes)
    if y.isna().any():
        unknown = df.loc[y.isna(), 'species'].unique().tolist()
        raise ValueError(f"Unexpected species labels in data3.csv: {unknown}")
    y = y.to_numpy().astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=SEED)

    # Sweep the ensemble size from 1 to 11 trees on the same split to see how
    # test accuracy responds to adding trees.
    for n_trees in range(1, 12):
        clf = RandomForest(n_trees=n_trees, max_depth=10)

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        acc = accuracy(y_test, y_pred)

        print("Accuracy  for {} tree is : {} ".format(n_trees, acc))

Accuracy  for 1 tree is : 0.9 
Accuracy  for 2 tree is : 0.9 
Accuracy  for 3 tree is : 0.9 
Accuracy  for 4 tree is : 0.9333333333333333 
Accuracy  for 5 tree is : 0.9333333333333333 
Accuracy  for 6 tree is : 0.9333333333333333 
Accuracy  for 7 tree is : 0.9 
Accuracy  for 8 tree is : 0.9666666666666667 
Accuracy  for 9 tree is : 0.9333333333333333 
Accuracy  for 10 tree is : 0.9333333333333333 
Accuracy  for 11 tree is : 0.9333333333333333 
