In [5]:
import numpy as np
import pandas as pd

# Load the Iris dataset from a CSV file located in the current working directory.
data = pd.read_csv('Iris.csv')
# Show the first rows as a quick sanity check of the column names and values.
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
from sklearn.preprocessing import LabelEncoder

# Encoder used to turn the species names into integer class codes.
label_encoder = LabelEncoder()

# Overwrite the 'Species' column in place with the encoded integer labels.
data['Species']= label_encoder.fit_transform(data['Species'])

# Display the distinct codes that are present after encoding.
data['Species'].unique()

array([0, 1, 2])

In [20]:
import random
import math
from collections import Counter


# Calculate Entropy
def entropy(y: np.ndarray) -> np.float64:
    """
    Calculate the Shannon entropy (in bits) of a set of labels.

    The entropy is ``-sum(p_k * log2(p_k))`` where ``p_k`` is the relative
    frequency of the k-th distinct label in ``y``.

    Parameters:
    y (array-like): Labels array. Any input accepted by ``np.asarray``.

    Returns:
    np.float64: Entropy value. ``0.0`` for an empty input or when all
    labels are identical.
    """
    labels = np.asarray(y)

    # An empty label set carries no uncertainty; also avoids dividing by zero.
    if labels.size == 0:
        return np.float64(0.0)

    _, counts = np.unique(labels, return_counts=True)
    probabilities = counts / counts.sum()

    # Every count returned by np.unique is positive, so log2 is always finite.
    # Adding 0.0 normalises the "-0.0" produced for a single-class input.
    return np.float64(-np.sum(probabilities * np.log2(probabilities)) + 0.0)


In [21]:
# Create Node
class Node:
    """One vertex of a binary decision tree: either a split or a leaf."""

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        """
        Build a tree node.

        Parameters:
        feature (int or str): Feature this node splits on (unused for leaves).
        threshold (float): Split threshold associated with ``feature``.
        left (Node): Child reached through the left branch.
        right (Node): Child reached through the right branch.
        value (int): Class label stored in a leaf; keyword-only. ``None``
            marks the node as an internal (non-leaf) node.

        Returns:
        None
        """
        # Split description (meaningful for internal nodes).
        self.feature, self.threshold = feature, threshold
        # Sub-trees hanging below this node.
        self.left, self.right = left, right
        # Prediction payload (meaningful for leaves).
        self.value = value

    def is_leaf_node(self):
        """
        Tell whether this node is a leaf.

        Returns:
        bool: True when a ``value`` has been stored on the node, False otherwise.
        """
        has_no_value = self.value is None
        return not has_no_value

In [22]:
#Decision Tree
class DecissionTree:
    """
    Binary classification tree that chooses splits by information gain.

    At every internal node a random subset of the feature columns is drawn
    (see ``feature_sampling``) and, among those columns, the threshold with
    the highest information gain is used to partition the rows into a
    ``<= threshold`` branch and a ``> threshold`` branch.

    The class relies on the module-level ``Node`` class and ``entropy``
    function defined elsewhere in this notebook.
    """

    def __init__(self, min_samples_split=2, max_depth=100, n_feats=None, max_features='auto'):
        """
        Initialize a DecisionTree object.

        Parameters:
        min_samples_split (int): Minimum number of samples required to split a node.
        max_depth (int or None): Maximum depth of the tree. ``None`` disables
            the depth limit.
        n_feats (int): Upper bound on the feature count; capped to the number
            of columns of the training data in ``fit``.
        max_features (str or int or float): Strategy to select the number of
            features examined at each split (see ``feature_sampling``).

        Returns:
        None
        """
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.root = None
        self.max_features = max_features

    def fit(self, X, y):
        """
        Fit the decision tree to the training data.

        Parameters:
        X (DataFrame): Training data features.
        y (array-like): Target values, in the same row order as ``X``.

        Returns:
        None

        Raises:
        ValueError: If ``X`` has no rows or ``X`` and ``y`` differ in length.
        """
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a decision tree on an empty training set.")
        if len(y) != X.shape[0]:
            raise ValueError("X and y must contain the same number of samples.")

        self.n_feats = X.shape[1] if not self.n_feats else min(self.n_feats, X.shape[1])
        # Column order seen during training; predict() re-uses it to locate features.
        self.cols = list(X.columns)
        self.root = self.grow_tree(X, y)

    def grow_tree(self, X, y, depth=0):
        """
        Recursively grow the decision tree.

        Parameters:
        X (DataFrame): Feature rows that reached this node.
        y (array-like): Target values, positionally aligned with the rows of ``X``.
        depth (int): Current depth of the tree.

        Returns:
        Node: Root node of the (sub)tree built from ``X`` and ``y``.
        """
        # Labels are handled positionally everywhere, so drop any pandas index.
        y = np.asarray(y)

        n_samples = X.shape[0]
        n_labels = len(np.unique(y))

        # stopping criteria (a max_depth of None means "no depth limit")
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or n_labels == 1 or n_samples < self.min_samples_split:
            return Node(value=self.most_common_label(y))

        # random subset of the columns considered for this split
        feats_idxs = list(self.feature_sampling(X, self.max_features).columns)

        best_feat, best_thresh = self.best_criteria(X, y, feats_idxs)

        # None of the sampled columns can separate the rows: stop here.
        if best_feat is None:
            return Node(value=self.most_common_label(y))

        goes_left = (X[best_feat] <= best_thresh).to_numpy()

        # Never recurse into an empty child; that would leave a branch with no
        # labels to vote on (and the other branch identical to this node).
        if goes_left.all() or not goes_left.any():
            return Node(value=self.most_common_label(y))

        left = self.grow_tree(X[goes_left], y[goes_left], depth + 1)
        right = self.grow_tree(X[~goes_left], y[~goes_left], depth + 1)

        return Node(best_feat, best_thresh, left, right)

    def best_criteria(self, X, y, feats_idxs):
        """
        Find the best splitting criterion for the decision tree.

        Parameters:
        X (DataFrame): Training data features.
        y (array-like): Target values, positionally aligned with ``X``.
        feats_idxs (list): Column names of ``X`` to evaluate.

        Returns:
        tuple: ``(feature, threshold)`` with the highest information gain, or
        ``(None, None)`` when no candidate column has two distinct values.
        """
        best_gain = -1
        split_idx, split_tresh = None, None

        y = np.asarray(y)

        for feats_idx in feats_idxs:
            X_col = X[feats_idx].to_numpy()

            # Each distinct value is tried once, in ascending order. The
            # largest value is skipped because "<= max" sends every row left.
            for val in np.unique(X_col)[:-1]:
                gain = self.information_gain(y, X_col, val)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feats_idx
                    split_tresh = val

        return split_idx, split_tresh

    def information_gain(self, y, X_col, thresh):
        """
        Calculate the information gain for a split.

        Parameters:
        y (array-like): Target values.
        X_col (array-like): Feature values, positionally aligned with ``y``.
        thresh (float): Threshold value for splitting.

        Returns:
        float: Parent entropy minus the size-weighted entropy of the two
        children; ``0`` when either child would be empty.
        """
        y = np.asarray(y)
        X_col = np.asarray(X_col)

        parent_entropy = entropy(y)

        left, right = self.split(X_col, thresh)

        if len(left) == 0 or len(right) == 0:
            return 0

        n = len(y)
        n_l, n_r = len(left), len(right)
        e_l, e_r = entropy(y[left]), entropy(y[right])

        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r

        return parent_entropy - child_entropy

    def split(self, X_col, split_tresh):
        """
        Split the data based on a threshold.

        Parameters:
        X_col (array-like): Feature values.
        split_tresh (float): Threshold value for splitting.

        Returns:
        tuple: Positions of the samples with ``X_col <= split_tresh`` (left)
        and with ``X_col > split_tresh`` (right).
        """
        left_idxs = np.argwhere(X_col <= split_tresh).flatten()
        right_idxs = np.argwhere(X_col > split_tresh).flatten()

        return left_idxs, right_idxs

    def most_common_label(self, y):
        """
        Find the most common label in a set of labels.

        Parameters:
        y (array-like): Target values; must not be empty.

        Returns:
        int: Most common label.
        """
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common

    def predict(self, X):
        """
        Make predictions using the fitted decision tree.

        Parameters:
        X (DataFrame): Test data features; must contain the training columns.

        Returns:
        np.ndarray: Predicted target values, one per row of ``X``.

        Raises:
        RuntimeError: If the tree has not been fitted yet.
        """
        if self.root is None:
            raise RuntimeError("This DecissionTree is not fitted yet; call fit() first.")

        # Select the columns in training order so that the positional lookup
        # in traverse_tree() hits the right feature whatever order X uses.
        rows = X[self.cols].to_numpy().tolist()
        return np.array([self.traverse_tree(row, self.root) for row in rows])

    def traverse_tree(self, x, node):
        """
        Traverse the decision tree to make a prediction for a single sample.

        Parameters:
        x (array-like): Feature values for a single sample, in training column order.
        node (Node): Current node in the decision tree.

        Returns:
        int: Predicted target value.
        """
        if node.is_leaf_node():
            return node.value

        index = int(self.cols.index(node.feature))

        if x[index] <= node.threshold:
            return self.traverse_tree(x, node.left)

        return self.traverse_tree(x, node.right)

    def feature_sampling(self, data, val):
        """
        Perform feature sampling based on the specified strategy.

        Parameters:
        data (DataFrame): Input data.
        val (int or float or str): ``int`` = number of columns, ``float`` =
            fraction of the columns, ``'auto'``/``'sqrt'`` = square root of
            the column count, ``'log2'`` = base-2 logarithm of the column
            count. Any other value keeps every column.

        Returns:
        DataFrame: ``data`` restricted to the randomly chosen columns. The
        number of columns drawn is clamped to ``[1, number of columns]`` so a
        split always has at least one candidate feature.

        Raises:
        ValueError: If a numeric ``val`` is not positive.
        """
        columns = data.columns.tolist()
        n_cols = len(columns)

        # Nothing to sample from; booleans are not treated as counts.
        if n_cols == 0 or isinstance(val, bool):
            return data

        if isinstance(val, (int, np.integer, float, np.floating)):
            if val <= 0:
                raise ValueError("max_features must be positive, got %r" % (val,))
            if isinstance(val, (int, np.integer)):
                n_pick = int(val)
            else:
                n_pick = int(val * n_cols)
        elif val == 'auto' or val == 'sqrt':
            n_pick = int(math.sqrt(n_cols))
        elif val == 'log2':
            n_pick = int(math.log2(n_cols))
        else:
            return data

        # Rounding may give 0 and an int request may exceed the column count.
        n_pick = max(1, min(n_pick, n_cols))
        return data[random.sample(columns, n_pick)]

In [27]:
class randomforestclassifier:
    """
    Random forest built from ``DecissionTree`` estimators.

    Each tree is trained on a row sample of the training data (see
    ``row_sampling``) and the forest predicts by majority vote over the trees.
    The ``criterion`` and ``oob_score`` settings are stored but are not used
    by this implementation.
    """

    def __init__(self, n_estimators=100, criterion='entropy', max_depth=None, min_samples_split=2, bootstrap=True, max_samples=None,
                 max_features='auto', oob_score=False):
        """
        Initialize a RandomForestClassifier object.

        Parameters:
        n_estimators (int): Number of trees in the forest.
        criterion (str): Splitting criterion for decision trees (stored only).
        max_depth (int or None): Maximum depth of the trees; ``None`` means unlimited.
        min_samples_split (int): Minimum number of samples required to split a node.
        bootstrap (bool): Whether each tree is trained on rows drawn with
            replacement. When False every tree sees the full training set.
        max_samples (int or float): Number (int) or fraction (float) of rows
            drawn for each tree. ``None`` draws as many rows as the training set has.
        max_features (str or int or float): Strategy to select the number of features.
        oob_score (bool): Stored only; out-of-bag scoring is not implemented.

        Returns:
        None
        """
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.bootstrap = bootstrap
        self.max_samples = max_samples
        self.max_features = max_features
        self.oob_score = oob_score
        # Fitted trees; stays empty until fit() is called.
        self.trees = []

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """
        Fit the random forest classifier to the training data.

        Parameters:
        X_train (DataFrame): Training data features.
        y_train (array-like): Target values, in the same row order as ``X_train``.

        Returns:
        None

        Raises:
        ValueError: If the training set is empty or the lengths differ.
        """
        n_rows = X_train.shape[0]
        if n_rows == 0:
            raise ValueError("Cannot fit a random forest on an empty training set.")

        labels = np.asarray(y_train)
        if len(labels) != n_rows:
            raise ValueError("X_train and y_train must contain the same number of samples.")

        # A 0..n-1 index lets the sampled frame's index double as row positions
        # into `labels`, keeping features and targets paired after sampling.
        features = X_train.reset_index(drop=True)
        self.cols = list(X_train.columns)

        # The tree compares `depth >= max_depth`; infinity expresses "no limit".
        tree_depth = float('inf') if self.max_depth is None else self.max_depth

        if self.bootstrap:
            sample_size = n_rows if self.max_samples is None else self.max_samples
        else:
            sample_size = None  # row_sampling() returns the data unchanged

        self.trees = []
        for _ in range(self.n_estimators):
            sampled = self.row_sampling(features, sample_size)
            positions = sampled.index.to_numpy()

            tree = DecissionTree(min_samples_split=self.min_samples_split,
                                 max_depth=tree_depth,
                                 max_features=self.max_features)
            tree.fit(sampled.reset_index(drop=True), labels[positions])
            self.trees.append(tree)

        return None

    def predict(self, X_test: pd.DataFrame) -> list:
        """
        Make predictions using the fitted random forest classifier.

        Parameters:
        X_test (DataFrame): Test data features; must contain the training columns.

        Returns:
        list: Predicted target value for each row (majority vote of the trees).

        Raises:
        RuntimeError: If the forest has not been fitted yet.
        """
        trees = getattr(self, 'trees', None)
        if not trees:
            raise RuntimeError("This randomforestclassifier is not fitted yet; call fit() first.")

        # Present the columns in training order, as the trees expect.
        ordered = X_test[self.cols]

        # One row per tree, one column per test sample.
        votes = np.array([tree.predict(ordered) for tree in trees])

        return [Counter(sample_votes.tolist()).most_common(1)[0][0] for sample_votes in votes.T]

    def score(self, y_true=None, y_pred=None):
        """
        Calculate the accuracy score of the classifier.

        Parameters:
        y_true (array-like): True target values.
        y_pred (array-like): Predicted target values.

        Returns:
        float: Fraction of positions where ``y_pred`` equals ``y_true``.

        Raises:
        ValueError: If an argument is missing, the inputs are empty, or their
        shapes differ.
        """
        if y_true is None or y_pred is None:
            raise ValueError("Both y_true and y_pred are required.")

        # Compare by position so that differing pandas indexes do not matter.
        truth = np.asarray(y_true)
        guess = np.asarray(y_pred)
        if truth.shape != guess.shape:
            raise ValueError("y_true and y_pred must have the same shape.")
        if truth.size == 0:
            raise ValueError("Cannot compute accuracy of empty inputs.")

        acc = np.sum(truth == guess) / len(truth)
        return acc

    def row_sampling(self, data, val):
        """
        Perform row sampling based on the specified strategy.

        Parameters:
        data (DataFrame): Input data.
        val (int or float or None): ``int`` = number of rows, ``float`` =
            fraction of the rows, ``None`` = return ``data`` unchanged.

        Returns:
        DataFrame: Rows drawn from ``data`` with replacement (at least one row
        when ``data`` is not empty), or ``data`` itself when ``val`` is None.

        Raises:
        ValueError: If ``val`` is of an unsupported type or is not positive.
        """
        if val is None:
            return data

        is_fraction = isinstance(val, (float, np.floating))
        is_count = isinstance(val, (int, np.integer)) and not isinstance(val, bool)
        if not (is_fraction or is_count):
            raise ValueError("max_samples must be an int, a float or None, got %r" % (val,))
        if val <= 0:
            raise ValueError("max_samples must be positive, got %r" % (val,))

        n_rows = data.shape[0]
        n_draw = int(val * n_rows) if is_fraction else int(val)
        if n_rows > 0:
            # A small fraction can round down to zero rows; keep at least one.
            n_draw = max(1, n_draw)

        return data.sample(n_draw, replace=True)

In [None]:
if __name__ == '__main__':
    from sklearn.model_selection import train_test_split

    # Seed every random source used below so the run is reproducible:
    # `random` drives the per-split feature sampling and NumPy's global state
    # drives `DataFrame.sample`, which performs the row sampling.
    SEED = 42
    random.seed(SEED)
    np.random.seed(SEED)

    # Keep only the measurements as features. 'Id' appears to be a plain row
    # counter (1, 2, 3, ... in the preview above); with rows ordered by
    # species it would leak the label. 'Unnamed: 0' is dropped as well in case
    # the CSV carries a saved index column; errors='ignore' skips it otherwise.
    x = data.drop(columns=['Species', 'Id', 'Unnamed: 0'], errors='ignore')
    y = data.Species

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=SEED)

    # NOTE: the name `re` hides the standard-library `re` module if it is
    # imported in this notebook; it is kept because later cells refer to it.
    re = randomforestclassifier(n_estimators=10, max_depth=5, min_samples_split=5, max_samples=50, max_features=3)

    re.fit(X_train, y_train)

In [12]:
# Predict a label for every row of the held-out test features with the fitted forest.
y_pred = re.predict(X_test)

In [None]:
# Accuracy: fraction of test samples whose predicted label equals the true label.
sc = re.score(y_test, y_pred)
print('Accuracy :', sc)