# Step 1A: Add the Custom Decision Tree Code

In [1]:
import numpy as np

class CustomDecisionTree:
    """Binary decision-tree classifier that splits on information gain.

    Every internal node tests ``x[feature_idx] <= threshold``; candidate
    thresholds are the distinct values observed for the feature among the
    samples reaching that node. The fitted tree is stored as nested dicts:

    * leaf:     ``{'class': label}``
    * internal: ``{'feature_idx', 'threshold', 'left_tree', 'right_tree'}``

    Class labels may be any values ``np.unique`` can sort (integers, floats,
    strings); they are not required to be non-negative integers.

    Parameters
    ----------
    max_depth : int or None, default None
        Maximum number of split levels. ``None`` grows the tree until every
        leaf is pure or no valid split remains.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """Build the tree from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        CustomDecisionTree
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from
            the length of ``y``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features).")
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a tree on an empty dataset.")
        if y.shape[0] != X.shape[0]:
            raise ValueError("X and y must contain the same number of samples.")

        self.tree = self._build_tree(X, y)
        return self

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for the samples ``(X, y)``."""
        num_features = X.shape[1]
        unique_classes = np.unique(y)

        # Stop if node is pure
        if len(unique_classes) == 1:
            return {'class': unique_classes[0]}

        # Stop if max depth reached
        if self.max_depth is not None and depth >= self.max_depth:
            return {'class': self._majority_class(y)}

        # The parent entropy is the same for every candidate split of this
        # node, so compute it once instead of once per threshold.
        parent_entropy = self._entropy(y)

        best_gain = -1
        best_feature = None
        best_threshold = None

        for feature_idx in range(num_features):
            column = X[:, feature_idx]

            for threshold in np.unique(column):
                left_mask = column <= threshold
                n_left = np.count_nonzero(left_mask)

                # A split that sends every sample to one side is useless.
                if n_left == 0 or n_left == len(y):
                    continue

                gain = self._information_gain(
                    y, y[left_mask], y[~left_mask], parent_entropy
                )

                # Strict '>' keeps the first of several equally good splits.
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature_idx
                    best_threshold = threshold

        # No feature offers a valid split (e.g. all rows are identical).
        if best_feature is None:
            return {'class': self._majority_class(y)}

        left_mask = X[:, best_feature] <= best_threshold
        right_mask = ~left_mask

        return {
            'feature_idx': best_feature,
            'threshold': best_threshold,
            'left_tree': self._build_tree(X[left_mask], y[left_mask], depth + 1),
            'right_tree': self._build_tree(X[right_mask], y[right_mask], depth + 1)
        }

    def _majority_class(self, y):
        """Return the most frequent label in ``y`` (smallest label wins ties)."""
        labels, counts = np.unique(y, return_counts=True)
        return labels[counts.argmax()]

    def _information_gain(self, parent, left, right, parent_entropy=None):
        """Entropy reduction obtained by splitting ``parent`` into ``left``/``right``.

        ``parent_entropy`` may be supplied by the caller to avoid recomputing
        it; when omitted it is derived from ``parent``.
        """
        if parent_entropy is None:
            parent_entropy = self._entropy(parent)

        n_parent = len(parent)
        weighted_entropy = (
            len(left) / n_parent * self._entropy(left) +
            len(right) / n_parent * self._entropy(right)
        )

        return parent_entropy - weighted_entropy

    def _entropy(self, y):
        """Shannon entropy (in bits) of the label array ``y``.

        Only labels that actually occur are counted, so no epsilon is needed
        inside the logarithm and a pure node has entropy exactly 0.
        """
        y = np.asarray(y)
        if y.size == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probs = counts / y.size
        # log2(1/p) is non-negative for every p in (0, 1].
        return float(np.sum(probs * np.log2(1.0 / probs)))

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Returns
        -------
        list
            One predicted label per row, in input order.

        Raises
        ------
        ValueError
            If the tree has not been fitted yet.
        """
        if self.tree is None:
            raise ValueError("CustomDecisionTree is not fitted yet; call fit() first.")
        return [self._predict_single(x, self.tree) for x in X]

    def _predict_single(self, x, tree):
        """Walk ``tree`` from its root to a leaf for one sample ``x``."""
        node = tree
        while 'class' not in node:
            if x[node['feature_idx']] <= node['threshold']:
                node = node['left_tree']
            else:
                node = node['right_tree']
        return node['class']


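**Concepts used**

- `fit()` → builds the tree from the training data
- `_build_tree()` → recursive splitting of the samples
- `_entropy()` → impurity measure of a set of labels
- `_information_gain()` → split quality (entropy reduction)
- `predict()` → uses the fitted tree to classify new data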



# Step 2: Load and Split the Iris Dataset

In [3]:
# Necessary Imports
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Fetch Iris and separate the feature matrix from the class labels
data = load_iris()
X, y = data.data, data.target
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 3: Train and Evaluate the Custom Decision Tree

In [4]:
# Fit the hand-written tree (depth capped at 3) on the training split
custom_tree = CustomDecisionTree(max_depth=3)
custom_tree.fit(X_train, y_train)
# Classify the held-out samples and measure the fraction predicted correctly
y_pred_custom = custom_tree.predict(X_test)
accuracy_custom = accuracy_score(y_true=y_test, y_pred=y_pred_custom)
print(f"Custom Decision Tree Accuracy: {accuracy_custom:.4f}")

Custom Decision Tree Accuracy: 1.0000


# Step 4: Train and Evaluate a Scikit-learn Decision Tree

In [5]:
# Reference model: scikit-learn's tree with the same depth cap as the custom one.
# fit() hands back the estimator, so construction and training are chained.
sklearn_tree = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_train, y_train)
# Classify the held-out samples and measure the fraction predicted correctly
y_pred_sklearn = sklearn_tree.predict(X_test)
accuracy_sklearn = accuracy_score(y_true=y_test, y_pred=y_pred_sklearn)
print(f"Scikit-learn Decision Tree Accuracy: {accuracy_sklearn:.4f}")

Scikit-learn Decision Tree Accuracy: 1.0000


# Step 5: Result Comparison

In [6]:
# Side-by-side test accuracy of the two trees (both trained with max_depth=3).
# The header has no placeholders, so it is a plain string rather than an f-string.
print("Accuracy Comparison:")
print(f"Custom Decision Tree: {accuracy_custom:.4f}")
print(f"Scikit-learn Decision Tree: {accuracy_sklearn:.4f}")

Accuracy Comparison:
Custom Decision Tree: 1.0000
Scikit-learn Decision Tree: 1.0000


# Exercise 3: Ensemble Methods and Hyperparameter Tuning

All parts of this exercise use the Wine dataset from scikit-learn.

## 1. Implement Classification Models

In [7]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score


#Load & Split Wine Dataset

In [8]:
# Fetch Wine and separate the feature matrix from the class labels
wine = load_wine()
X, y = wine.data, wine.target

# 80/20 train-test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows ended up on each side of the split
print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


Training samples: 142
Testing samples: 36


#Train Decision Tree Classifier

In [9]:
# Single decision tree with default hyperparameters; fit() returns the estimator
dt_clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Classify the held-out samples
y_pred_dt = dt_clf.predict(X_test)

# Weighted F1: per-class F1 averaged with weights proportional to class support
f1_dt = f1_score(y_true=y_test, y_pred=y_pred_dt, average='weighted')
print("Decision Tree F1 Score:", f1_dt)


Decision Tree F1 Score: 0.9439974457215836


#Train Random Forest Classifier

In [10]:
# Random forest with default hyperparameters; fit() returns the estimator
rf_clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Classify the held-out samples
y_pred_rf = rf_clf.predict(X_test)

# Same metric as for the single tree so the two scores are comparable
f1_rf = f1_score(y_true=y_test, y_pred=y_pred_rf, average='weighted')
print("Random Forest F1 Score:", f1_rf)


Random Forest F1 Score: 1.0


#Compare Models

In [11]:
# Emit the header and both test-set F1 scores, one per line
print(
    "Model Comparison (F1 Score)",
    "Decision Tree: " + str(f1_dt),
    "Random Forest: " + str(f1_rf),
    sep="\n",
)


Model Comparison (F1 Score)
Decision Tree: 0.9439974457215836
Random Forest: 1.0


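**Classification Model Comparison**

A Decision Tree Classifier and a Random Forest Classifier were trained on the Wine dataset with default hyperparameters.
Both models were evaluated on the held-out test set using the weighted F1-score.
The Random Forest achieved the higher F1-score (1.00 vs. 0.94). This is consistent with ensemble learning: averaging many trees reduces the overfitting of a single tree and improves generalization. Note that the test set contains only 36 samples, so the gap corresponds to a small number of misclassified wines.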

## 2. Hyperparameter Tuning

**Task**

- Choose 3 hyperparameters.
- Optimize the Random Forest using `GridSearchCV`.

**Chosen hyperparameters**

- `n_estimators` – number of trees in the forest
- `max_depth` – maximum depth of each tree
- `min_samples_split` – minimum number of samples required to split a node

#GridSearchCV

In [12]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the three tuned hyperparameters (3 x 3 x 3 = 27 combinations)
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
)

# Exhaustive search; each combination is scored by weighted F1 under 5-fold CV
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
)

grid_search.fit(X_train, y_train)


#Best Parameters & Score

In [13]:
# best_score_ is the mean cross-validated score of the winning combination,
# measured on the training folds rather than on the held-out test set
print(
    "Best Parameters: " + str(grid_search.best_params_),
    "Best F1 Score: " + str(grid_search.best_score_),
    sep="\n",
)


Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Best F1 Score: 0.9782952128219708


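**Hyperparameter Tuning**

`GridSearchCV` was used to tune three hyperparameters of the Random Forest Classifier:
the number of trees, the maximum depth, and the minimum number of samples required to split a node.
5-fold cross-validation on the training set was used to select the best parameter combination; the reported best F1 score (0.978) is the mean cross-validation score, not a test-set score.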

#3. Implement Regression Model:

#Imports for Regression

In [14]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error


#Train Decision Tree Regressor

In [15]:
# Single regression tree with default hyperparameters; fit() returns the estimator
dt_reg = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predict the held-out targets and measure the mean squared error
y_pred_dt_reg = dt_reg.predict(X_test)
mse_dt = mean_squared_error(y_true=y_test, y_pred=y_pred_dt_reg)

print("Decision Tree Regressor MSE:", mse_dt)


Decision Tree Regressor MSE: 0.16666666666666666


#Train Random Forest Regressor

In [16]:
# Random forest regressor with default hyperparameters; fit() returns the estimator
rf_reg = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predict the held-out targets and measure the mean squared error
y_pred_rf_reg = rf_reg.predict(X_test)
mse_rf = mean_squared_error(y_true=y_test, y_pred=y_pred_rf_reg)

print("Random Forest Regressor MSE:", mse_rf)


Random Forest Regressor MSE: 0.06483333333333333


## RandomizedSearchCV (Hyperparameter Tuning)

**Chosen parameters**

- `n_estimators`
- `max_depth`
- `min_samples_split`

In [17]:
# Value pools the randomized search samples from (36 possible combinations)
param_dist = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
)

# Try 10 sampled combinations; each is scored by negated MSE under 5-fold CV
# (scikit-learn maximizes scores, hence the negated error)
random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=42,
)

random_search.fit(X_train, y_train)


#Best Regression Parameters

In [18]:
# The search scored with 'neg_mean_squared_error', so flip the sign to report
# a positive MSE; this is the mean cross-validated value, not a test-set value
print(
    "Best Parameters (Regression): " + str(random_search.best_params_),
    "Best MSE: " + str(-random_search.best_score_),
    sep="\n",
)


Best Parameters (Regression): {'n_estimators': 200, 'min_samples_split': 10, 'max_depth': 20}
Best MSE: 0.04678888653548126


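**Regression Models and Hyperparameter Tuning**

Decision Tree and Random Forest Regressors were trained on the Wine dataset. The target here is the wine class label (0, 1, 2) treated as a numeric value, so the regressors predict a number on that scale rather than a class.
Model performance was evaluated using Mean Squared Error (MSE) on the held-out test set.
`RandomizedSearchCV` was used to optimize three hyperparameters of the Random Forest Regressor; its reported best MSE (0.047) is the mean 5-fold cross-validation error on the training set, so it is not directly comparable with the test-set MSE values above.
The Random Forest achieved a lower test MSE than the single tree (0.065 vs. 0.167), which is consistent with the variance reduction obtained by averaging multiple trees.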