In [25]:
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [26]:
class DecisionTree:
    """Binary classification tree grown greedily on weighted Gini impurity.

    An internal node is a dict with the keys 'feature_idx', 'threshold',
    'left' and 'right'; a leaf is stored as the bare majority label of the
    training samples that reached it. A sample goes left when
    ``sample[feature_idx] <= threshold`` and right otherwise.

    Parameters
    ----------
    max_depth : int
        A node at this depth is always turned into a leaf.
    min_samples_split : int
        A node holding fewer samples than this is turned into a leaf.
    max_features : int or None
        Number of features drawn (without replacement) as split candidates at
        each node. ``None`` (default) considers every feature. If none of the
        drawn features admits a split, the node becomes a leaf.
    random_state : int or None
        Seed for the per-node feature draw. ``None`` (default) draws from
        NumPy's global random state, so ``np.random.seed`` controls the order.
    """

    def __init__(self, max_depth=10, min_samples_split=2, max_features=None, random_state=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.random_state = random_state
        self.tree = None
        # Source of randomness for the feature draw; re-created on every fit().
        self._rng = np.random

    def fit(self, X, y):
        """Grow the tree from a 2-D feature matrix ``X`` and 1-D labels ``y``.

        Any array-like is accepted (lists, ndarrays, DataFrames/Series).
        Labels may be of any sortable type, not only non-negative integers.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``y`` is not 1-D, their lengths differ, the
            training set is empty, or ``max_features`` is smaller than 1.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if y.ndim != 1:
            raise ValueError(f"y must be 1-dimensional, got {y.ndim} dimension(s)")
        if X.shape[0] != y.shape[0]:
            raise ValueError(f"X has {X.shape[0]} rows but y has {y.shape[0]} labels")
        if X.shape[0] == 0:
            raise ValueError("cannot fit a tree on an empty training set")
        if self.max_features is not None and self.max_features < 1:
            raise ValueError("max_features must be None or a positive integer")

        if self.random_state is None:
            self._rng = np.random
        else:
            self._rng = np.random.RandomState(self.random_state)
        self.tree = self._grow_tree(X, y, depth=0)

    def _grow_tree(self, X, y, depth):
        """Return the subtree (dict) or leaf label for the samples in ``X``/``y``."""
        num_samples, num_features = X.shape
        labels = y.tolist()
        is_pure = len(set(labels)) == 1
        if depth >= self.max_depth or num_samples < self.min_samples_split or is_pure:
            return Counter(labels).most_common(1)[0][0]  # Leaf node

        if self.max_features is None:
            num_candidates = num_features
        else:
            num_candidates = min(int(self.max_features), num_features)

        feature_idx, threshold = self._best_split(X, y, num_candidates)
        if feature_idx is None:
            # No candidate feature separates the samples: fall back to a leaf.
            return Counter(labels).most_common(1)[0][0]

        left_mask = X[:, feature_idx] <= threshold
        right_mask = ~left_mask

        return {
            'feature_idx': feature_idx,
            'threshold': threshold,
            'left': self._grow_tree(X[left_mask], y[left_mask], depth + 1),
            'right': self._grow_tree(X[right_mask], y[right_mask], depth + 1)
        }

    def _best_split(self, X, y, num_features):
        """Find the (feature index, threshold) pair with the lowest weighted Gini.

        ``num_features`` features are drawn without replacement and every
        distinct value of each drawn column is tried as a threshold. Ties keep
        the first candidate found. Returns ``(None, None)`` when no candidate
        leaves samples on both sides.
        """
        best_gini = float('inf')
        best_split = None
        num_samples = X.shape[0]

        for feature_idx in self._rng.choice(X.shape[1], num_features, replace=False):
            column = X[:, feature_idx]
            for threshold in np.unique(column):
                left_mask = column <= threshold
                # count_nonzero stays in C; builtin sum() would loop in Python.
                num_left = np.count_nonzero(left_mask)
                if num_left == 0 or num_left == num_samples:
                    continue

                gini = self._gini_impurity(y[left_mask], y[~left_mask])
                if gini < best_gini:
                    best_gini = gini
                    best_split = (feature_idx, threshold)

        return best_split if best_split is not None else (None, None)

    def _gini_impurity(self, left, right):
        """Return the size-weighted mean Gini impurity of two label arrays."""
        def gini(labels):
            # np.unique counts arbitrary labels; np.bincount only accepts
            # non-negative integers.
            _, counts = np.unique(labels, return_counts=True)
            probs = counts / np.sum(counts)
            return 1 - np.sum(probs ** 2)

        left_size, right_size = len(left), len(right)
        total_size = left_size + right_size
        return (left_size / total_size) * gini(left) + (right_size / total_size) * gini(right)

    def predict(self, X):
        """Return an array holding the predicted label for every row of ``X``."""
        X = np.asarray(X)
        return np.array([self._traverse_tree(sample, self.tree) for sample in X])

    def _traverse_tree(self, sample, node):
        """Walk from ``node`` down to a leaf for one sample and return its label."""
        while isinstance(node, dict):
            if sample[node['feature_idx']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node


In [27]:
class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers combined by majority vote.

    Parameters
    ----------
    n_trees : int
        Number of trees to grow.
    max_depth : int
        Passed to every ``DecisionTree``.
    min_samples_split : int
        Passed to every ``DecisionTree``.
    max_features : int or None
        When given (and smaller than the number of columns), every tree is
        trained on its own random subset of this many columns, drawn without
        replacement. ``None`` (default) trains every tree on all columns.

    Attributes
    ----------
    trees : list
        The fitted trees, in training order.
    feature_subsets : list
        For each tree, the sorted column indices it was trained on, or
        ``None`` when it saw every column.

    Randomness (bootstrap rows, column subsets) comes from NumPy's global
    random state; call ``np.random.seed`` beforehand for reproducible forests.
    """

    def __init__(self, n_trees=10, max_depth=10, min_samples_split=2, max_features=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.trees = []
        self.feature_subsets = []

    def fit(self, X, y):
        """Train ``n_trees`` trees, each on a bootstrap resample of the rows.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``y`` is not 1-D, their lengths differ, the
            training set is empty, or ``max_features`` is smaller than 1.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if y.ndim != 1:
            raise ValueError(f"y must be 1-dimensional, got {y.ndim} dimension(s)")
        num_samples, num_features = X.shape
        if num_samples != y.shape[0]:
            raise ValueError(f"X has {num_samples} rows but y has {y.shape[0]} labels")
        if num_samples == 0:
            raise ValueError("cannot fit a forest on an empty training set")
        if self.max_features is not None and self.max_features < 1:
            raise ValueError("max_features must be None or a positive integer")

        # Only sub-sample columns when it actually removes some; otherwise no
        # extra random numbers are consumed.
        use_subsets = self.max_features is not None and self.max_features < num_features

        self.trees = []
        self.feature_subsets = []
        for _ in range(self.n_trees):
            bootstrap_indices = np.random.choice(num_samples, num_samples, replace=True)
            X_bootstrap, y_bootstrap = X[bootstrap_indices], y[bootstrap_indices]

            columns = None
            if use_subsets:
                columns = np.sort(
                    np.random.choice(num_features, int(self.max_features), replace=False)
                )
                X_bootstrap = X_bootstrap[:, columns]

            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            tree.fit(X_bootstrap, y_bootstrap)
            self.trees.append(tree)
            self.feature_subsets.append(columns)

    def predict(self, X):
        """Return the majority-vote label for every row of ``X``.

        Ties go to the label predicted by the earliest tree among those tied.

        Raises
        ------
        RuntimeError
            If the forest holds no trees (``fit`` was never called).
        """
        if not self.trees:
            raise RuntimeError("RandomForest has no trees; call fit() before predict()")

        X = np.asarray(X)
        per_tree = []
        for position, tree in enumerate(self.trees):
            # Trees without a recorded subset are given every column.
            if position < len(self.feature_subsets):
                columns = self.feature_subsets[position]
            else:
                columns = None
            per_tree.append(tree.predict(X if columns is None else X[:, columns]))

        tree_predictions = np.array(per_tree)  # one row per tree, one column per sample

        # np.apply_along_axis sizes its output from the first result, which
        # truncates longer string labels; vote per sample explicitly instead.
        winners = [
            Counter(votes.tolist()).most_common(1)[0][0]
            for votes in tree_predictions.T
        ]
        return np.array(winners, dtype=tree_predictions.dtype)


In [28]:
# fetching data
DATA_PATH = "/content/drive/MyDrive/ML/Dataset/hr_analytics_dataset.csv"

# Integer code assigned to each category label, per encoded column.
CATEGORY_CODES = {
    "Gender": {"Male": 1, "Female": 2},
    "Department": {"HR": 1, "R&D": 2, "Marketing": 3, "Finance": 4,
                   "Operations": 5, "Sales": 6, "IT": 7},
    "JobRole": {"Sales Representative": 1, "Manager": 2, "Scientist": 3,
                "Analyst": 4, "Engineer": 5, "Executive": 6, "HR Specialist": 7},
    "Attrition": {"Yes": 1, "No": 0},
}

# EmployeeID is dropped before modelling; drop() returns a new frame, so no
# in-place mutation is needed.
df = pd.read_csv(DATA_PATH).drop(columns=["EmployeeID"])
df = df.assign(**{column: df[column].map(codes) for column, codes in CATEGORY_CODES.items()})

# Series.map turns every label that is absent from its dict into NaN without
# warning; stop here instead of letting NaN flow into the model.
nan_counts = df[list(CATEGORY_CODES)].isna().sum()
assert not nan_counts.any(), f"Missing or unmapped category values: {nan_counts[nan_counts > 0].to_dict()}"

df.head()

Unnamed: 0,Age,Gender,Department,JobRole,JobLevel,YearsAtCompany,YearsInCurrentRole,WorkLifeBalance,JobSatisfaction,PerformanceRating,Attrition
0,42,1,1,1,4,23,5,1,1,1,1
1,27,1,2,2,3,5,0,3,2,3,0
2,56,2,3,3,3,37,8,4,3,1,0
3,28,2,4,1,5,22,10,1,4,2,0
4,51,2,5,4,3,4,0,2,4,2,0


In [29]:
# Separate the feature matrix from the "Attrition" label column, then hold out
# 20% of the rows as a test set (fixed seed so the split is repeatable).
X = df.drop(columns=["Attrition"]).to_numpy()
Y = df["Attrition"].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [30]:
# Train the Random Forest
# Bootstrap sampling and the per-split feature order both draw from NumPy's
# global random state; seed it so the trained forest (and the accuracy reported
# afterwards) is the same on every Restart & Run All.
np.random.seed(42)

rf = RandomForest(n_trees=10, max_depth=5)
rf.fit(X_train, y_train)

In [31]:
# Predict and Evaluate
# accuracy_score is imported in the notebook's top import cell.
y_pred = rf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

Accuracy: 0.81
