<a href="https://colab.research.google.com/github/reesha-rsh/MLb4/blob/main/Homework/HW8.%20Bagging_Random%20Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# 1. Use any binary classification dataset
2. Define validation strategy and use it for all next steps without changes
3. Train decision tree model and estimate performance on validation

*   Validation approach: We have a small amount of data so I will use a **K Fold** method
*   Metric: I plan to optimize **fbeta score** with beta 0.5 to give more weight for precision, thus minimizing false positives - reducing the prediction that a passenger survived when he actually did not.



In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np


from google.colab import drive
drive.mount('/content/drive')

train_full = pd.read_csv("/content/drive/MyDrive/MLb4/EDA Titanic/train.csv")
test_full = pd.read_csv("/content/drive/MyDrive/MLb4/EDA Titanic/test.csv")

Mounted at /content/drive


In [2]:
def generate(df,age_median,fare_median):
  useless_features = ['Name','Ticket','Cabin']
  data_cleaned = df
  data_cleaned = data_cleaned.drop(columns = useless_features)

  # generate binary values using get_dummies
  data_cleaned = pd.get_dummies(data_cleaned, columns=['Sex'],prefix=["Sex"])
  data_cleaned = pd.get_dummies(data_cleaned, columns=['Embarked'],prefix=["Embarked"])

  # Check for NaN values in the DataFrame
  nan_mask = data_cleaned.isnull()
  # Count the number of NaN values in each column
  nan_count_per_column = data_cleaned.isnull().sum()

  data_cleaned['Age'] = data_cleaned['Age'].fillna(age_median)
  data_cleaned['Fare'] = data_cleaned['Fare'].fillna(fare_median)

  return data_cleaned


In [3]:
features_columns = ['Pclass',	'Age',	'SibSp',	'Parch',	'Fare',	'Sex_female',	'Sex_male',	'Embarked_C',	'Embarked_Q',	'Embarked_S']

In [4]:
# get medians that will fill NaNs in generate func
age_median = train_full['Age'].median()
fare_median = train_full['Fare'].median()

In [5]:
train = generate(train_full,age_median=age_median,fare_median=fare_median)
train

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.2500,0,1,0,0,1
1,2,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,1,3,26.0,0,0,7.9250,1,0,0,0,1
3,4,1,1,35.0,1,0,53.1000,1,0,0,0,1
4,5,0,3,35.0,0,0,8.0500,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,27.0,0,0,13.0000,0,1,0,0,1
887,888,1,1,19.0,0,0,30.0000,1,0,0,0,1
888,889,0,3,28.0,1,2,23.4500,1,0,0,0,1
889,890,1,1,26.0,0,0,30.0000,0,1,1,0,0


In [6]:
random_state = 42

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_validation, y_train, y_validation = train_test_split(train[features_columns], train['Survived'], test_size=0.2, random_state=random_state, stratify=train['Survived'])


In [9]:
# X_train = train[features_columns]
# y_train = train['Survived']

In [10]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, fbeta_score, classification_report, accuracy_score
from sklearn import metrics


In [11]:
# Initialize the StratifiedKFold object
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

In [12]:
# Define the custom scoring function with the desired beta value
beta = 0.5
custom_scorer = make_scorer(fbeta_score, beta=beta)

In [13]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer, fbeta_score



# Define the DecisionTreeClassifier model
classifier = DecisionTreeClassifier(random_state=random_state)

# Define the hyperparameter grid to search over
param_grid = {
    'criterion': ['gini'],
    'splitter': ['random'],
    'max_depth': [None, 3, 5, 7, 9, 11],
    'min_samples_leaf': [1,  3,  5,  7,  9,  11],
    'max_features': [ 1,  3,  5,  7,  9,  10],
    'class_weight': ['balanced']
}


# Initialize the GridSearchCV object with the DecisionTreeClassifier, hyperparameter grid, and custom scorer
grid_search = GridSearchCV(classifier, param_grid, scoring=custom_scorer, cv=skf)

# Perform grid search to find the best hyperparameters
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and the corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_


# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

# Optionally, you can evaluate the model on the full data using the F-beta score with beta=0.1
y_pred = best_model.predict(X_validation)

fbeta = fbeta_score(y_validation, y_pred, beta=beta)
accuracy = accuracy_score(y_validation, y_pred)

print("F-beta Score (beta={}): {:.4f}".format(beta, fbeta))
print("Accuracy: {:.4f}".format(accuracy))
print(metrics.classification_report(y_validation, y_pred))
print(metrics.confusion_matrix(y_validation, y_pred))


Best Hyperparameters: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'max_features': 3, 'min_samples_leaf': 9, 'splitter': 'random'}
F-beta Score (beta=0.5): 0.7736
Accuracy: 0.7989
              precision    recall  f1-score   support

           0       0.78      0.93      0.85       110
           1       0.84      0.59      0.69        69

    accuracy                           0.80       179
   macro avg       0.81      0.76      0.77       179
weighted avg       0.80      0.80      0.79       179

[[102   8]
 [ 28  41]]


# 4. Train bagging model with decision tree as a base model and estimate performance on validation

In [14]:
from sklearn.ensemble import BaggingClassifier


In [15]:
parameters = {
    "max_features": [1,  3,  5,  7,  9,  10],
    "max_samples": [0.7, 0.8, 0.9],
    "estimator__max_depth": [None, 3, 5, 7, 9, 11],
    "estimator__min_samples_leaf": [1,  3,  5,  7,  9,  11],
    'estimator__criterion': ['gini'],
    'estimator__splitter': ['random'],
    'estimator__class_weight': ['balanced']
}

dt = DecisionTreeClassifier(random_state=random_state)
bg = BaggingClassifier(dt, random_state=random_state, n_estimators=25)

bag_grid_search = GridSearchCV(bg, parameters, scoring=custom_scorer, cv=skf)


# Perform grid search to find the best hyperparameters
bag_grid_search.fit(X_train, y_train)

# Get the best hyperparameters and the corresponding model
best_params = bag_grid_search.best_params_
best_model = bag_grid_search.best_estimator_


# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

# Optionally, you can evaluate the model on the full data using the F-beta score with beta=0.1
y_pred = best_model.predict(X_validation)

fbeta = fbeta_score(y_validation, y_pred, beta=beta)
accuracy = accuracy_score(y_validation, y_pred)

print("F-beta Score (beta={}): {:.4f}".format(beta, fbeta))
print("Accuracy: {:.4f}".format(accuracy))
print(metrics.classification_report(y_validation, y_pred))
print(metrics.confusion_matrix(y_validation, y_pred))


Best Hyperparameters: {'estimator__class_weight': 'balanced', 'estimator__criterion': 'gini', 'estimator__max_depth': 7, 'estimator__min_samples_leaf': 1, 'estimator__splitter': 'random', 'max_features': 9, 'max_samples': 0.8}
F-beta Score (beta=0.5): 0.7348
Accuracy: 0.7877
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       110
           1       0.75      0.67      0.71        69

    accuracy                           0.79       179
   macro avg       0.78      0.77      0.77       179
weighted avg       0.79      0.79      0.78       179

[[95 15]
 [23 46]]


# RANDOM FOREST

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
parameters = {
    "max_features": [1,  3,  5,  7,  9,  10],
    "max_samples": [0.7, 0.8, 0.9],
    "max_depth": [None, 3, 5, 7, 9, 11],
    "min_samples_leaf": [1,  3,  5,  7,  9,  11],
    'criterion': ['gini'],
    'class_weight': ['balanced']
}

rfc = RandomForestClassifier(n_estimators=25, random_state=random_state, n_jobs=-1, oob_score=True)
forest_grid_search = GridSearchCV(rfc, parameters, scoring=custom_scorer, cv=skf)

# Perform grid search to find the best hyperparameters
forest_grid_search.fit(X_train, y_train)

# Get the best hyperparameters and the corresponding model
best_params = forest_grid_search.best_params_
best_model = forest_grid_search.best_estimator_


# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

# Optionally, you can evaluate the model on the full data using the F-beta score with beta=0.1
y_pred = best_model.predict(X_validation)

fbeta = fbeta_score(y_validation, y_pred, beta=beta)
accuracy = accuracy_score(y_validation, y_pred)

print("F-beta Score (beta={}): {:.4f}".format(beta, fbeta))
print("Accuracy: {:.4f}".format(accuracy))
print(metrics.classification_report(y_validation, y_pred))
print(metrics.confusion_matrix(y_validation, y_pred))


Best Hyperparameters: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 9, 'max_features': 3, 'max_samples': 0.7, 'min_samples_leaf': 1}
F-beta Score (beta=0.5): 0.7475
Accuracy: 0.7933
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       110
           1       0.78      0.65      0.71        69

    accuracy                           0.79       179
   macro avg       0.79      0.77      0.77       179
weighted avg       0.79      0.79      0.79       179

[[97 13]
 [24 45]]



# 5. Write your own bagging implementation:
  <br>5.1. Define init for our CustomBaggingClassifier
  <br>5.2. Write fit as described in lecture: divide train data on n parts (`n_estimators` in CustomBaggingClassifier), train `base_estimator` on each part and save these models inside class
  <br>5.3. For predictions we should use all saved models and combine their predictions (as voting)


In [18]:
import numpy as np

In [19]:
import pickle

In [20]:
from sklearn.base import clone

In [21]:
def compare_trees(tree1, tree2):
    if hash(tree1.__dict__.values())==hash(tree2.__dict__.values()):
        # the trees have both been trained
        if tree1.tree_ != None and tree2.tree_ != None:
            try: # the tree values are matching arrays
                return np.array_equal(tree1.tree_.value, tree2.tree_.value)
            except: # they do not match
                return False
        elif tree1.tree_ != None or tree2.tree_ != None:
            # XOR of the trees is not trained
            return False
        else: # Neither has been trained
            return True
    else: # the params are different
        return False

In [28]:
class CustomBaggingClassifier:
    def __init__(self, base_estimator, n_estimators, max_samples=1.0, max_features=1.0, random_state=None):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.random_state = random_state
        self.estimators = []



    def bootstrap_sampling(self, X, y):
        # Implement bootstrap sampling to randomly select subsets of data
        num_samples = X.shape[0]
        if isinstance(self.max_samples, int):
            sample_size = self.max_samples
        elif isinstance(self.max_samples, float):
            sample_size = int(self.max_samples * num_samples)
        else:
            raise ValueError("Invalid type for max_samples. It must be int or float.")

        n_features = X.shape[1]
        if isinstance(self.max_features, int):
            max_features = self.max_features
        elif isinstance(self.max_features, float):
            max_features = max(1, int(self.max_features * n_features))
        else:
            raise ValueError("Invalid type for max_features. It must be int or float.")

        sample_indices = np.random.choice(num_samples, size=sample_size, replace=True)
        sample_features = np.random.choice(n_features, size=max_features, replace=True)

        X_sampled = X.iloc[sample_indices,sample_features]
        y_sampled = y.iloc[sample_indices]
        return X_sampled, y_sampled

    def fit(self, X, y):
        for _ in range(self.n_estimators):
            # Get a random subset of the data using bootstrap sampling
            X_sampled, y_sampled = self.bootstrap_sampling(X, y)

            # Train a new base estimator and store it in the list
            estimator = clone(self.base_estimator)  # Clone without the random_state
            estimator.set_params(random_state=self.random_state)  # Set random_state here
            estimator.fit(X_sampled, y_sampled)

            # Check if estimators are the same
            if self.estimators:
              if compare_trees(estimator, self.estimators[-1]):
                raise ValueError("Estimators are the same.")

            self.estimators.append(estimator)


    def predict(self, X):
        # Make predictions using majority voting for classification
        predictions = np.zeros((X.shape[0], len(self.estimators)))

        for index, estimator in enumerate(self.estimators):

            predictions[:, index] = estimator.predict(X.loc[:,estimator.feature_names_in_])

        # Take the majority vote to make the final prediction
        final_predictions = np.apply_along_axis(lambda x: np.bincount(x.astype('int')).argmax(), axis=1, arr=predictions)

        return final_predictions

    def get_params(self, deep=True):
        return {
            'base_estimator': self.base_estimator,
            'n_estimators': self.n_estimators,
            'max_samples': self.max_samples,
            'max_features': self.max_features,
            'random_state': self.random_state
        }

    def set_params(self, **params):
        for param, value in params.items():
            setattr(self, param, value)
        return self


In [23]:
parameters = {
    "max_features": [  3 ],
    "max_samples": [0.7],
    "base_estimator__max_depth": [None],
    "base_estimator__min_samples_leaf": [1],
    'base_estimator__criterion': ['gini'],
    'base_estimator__splitter': ['random'],
    'base_estimatorr__class_weight': ['balanced']
}


dt = DecisionTreeClassifier()
cbg = CustomBaggingClassifier(dt, n_estimators=2)

cbag_grid_search = GridSearchCV(cbg, parameters, scoring=custom_scorer, cv=skf)


# Perform grid search to find the best hyperparameters
cbag_grid_search.fit(X_train, y_train)

# Get the best hyperparameters and the corresponding model
best_params = cbag_grid_search.best_params_
best_model = cbag_grid_search.best_estimator_
best_score = cbag_grid_search.best_score_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)
print("Best score:", best_params)


# Optionally, you can evaluate the model on the full data using the F-beta score with beta=0.1
y_pred = best_model.predict(X_validation)

fbeta = fbeta_score(y_validation, y_pred, beta=beta)
accuracy = accuracy_score(y_validation, y_pred)

print("F-beta Score (beta={}): {:.4f}".format(beta, fbeta))
print("Accuracy: {:.4f}".format(accuracy))
print(metrics.classification_report(y_validation, y_pred))
print(metrics.confusion_matrix(y_validation, y_pred))


Best Hyperparameters: {'base_estimator__criterion': 'gini', 'base_estimator__max_depth': None, 'base_estimator__min_samples_leaf': 1, 'base_estimator__splitter': 'random', 'base_estimatorr__class_weight': 'balanced', 'max_features': 3, 'max_samples': 0.7}
F-beta Score (beta=0.5): 0.3468
Accuracy: 0.6034
              precision    recall  f1-score   support

           0       0.63      0.87      0.73       110
           1       0.46      0.17      0.25        69

    accuracy                           0.60       179
   macro avg       0.54      0.52      0.49       179
weighted avg       0.56      0.60      0.55       179

[[96 14]
 [57 12]]


In [30]:
cbag_grid_search.best_score_

0.816495190847203

In [29]:
parameters = {
    "max_features": [1,  3,  5,  7,  9,  10],
    "max_samples": [0.7, 0.8, 0.9],
    "base_estimator__max_depth": [None, 3, 5, 7, 9, 11],
    "base_estimator__min_samples_leaf": [1,  3,  5,  7,  9,  11],
    'base_estimator__criterion': ['gini'],
    'base_estimator__splitter': ['random'],
    'base_estimatorr__class_weight': ['balanced']
}


dt = DecisionTreeClassifier(random_state=random_state)
cbg = CustomBaggingClassifier(dt, random_state=random_state, n_estimators=25)

cbag_grid_search = GridSearchCV(cbg, parameters, scoring=custom_scorer, cv=skf)


# Perform grid search to find the best hyperparameters
cbag_grid_search.fit(X_train, y_train)

# Get the best hyperparameters and the corresponding model
best_params = cbag_grid_search.best_params_
best_model = cbag_grid_search.best_estimator_


# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

# Optionally, you can evaluate the model on the full data using the F-beta score with beta=0.1
y_pred = best_model.predict(X_validation)

fbeta = fbeta_score(y_validation, y_pred, beta=beta)
accuracy = accuracy_score(y_validation, y_pred)

print("F-beta Score (beta={}): {:.4f}".format(beta, fbeta))
print("Accuracy: {:.4f}".format(accuracy))
print(metrics.classification_report(y_validation, y_pred))
print(metrics.confusion_matrix(y_validation, y_pred))


Best Hyperparameters: {'base_estimator__criterion': 'gini', 'base_estimator__max_depth': None, 'base_estimator__min_samples_leaf': 7, 'base_estimator__splitter': 'random', 'base_estimatorr__class_weight': 'balanced', 'max_features': 3, 'max_samples': 0.7}
F-beta Score (beta=0.5): 0.7347
Accuracy: 0.7709
              precision    recall  f1-score   support

           0       0.76      0.93      0.83       110
           1       0.82      0.52      0.64        69

    accuracy                           0.77       179
   macro avg       0.79      0.72      0.73       179
weighted avg       0.78      0.77      0.76       179

[[102   8]
 [ 33  36]]


1 fits failed out of a total of 3240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "<ipython-input-28-222fa6dc8b39>", line 50, in fit
    raise ValueError("Estimators are the same.")
ValueError: Estimators are the same.

 0.76991312 0.76574839 0.7821288  0.79583939 0.78826578 0.78686575
 0.78624098 0.77716116 0.78047984 0.76422094 0.76489412 0.77761528
 0.49520558 0.51529026 0.5402635  0.76577957 0.76176723 0.76877487
 0.76022502 0.78154191 0.78299504 0.79201829 0.76328682 0.7865638
 0.7757

# 6. Compare performance of sklearn bagging model with your own implementation

**CustomBaggingClassifier**

Best Hyperparameters: {'base_estimator__criterion': 'gini', 'base_estimator__max_depth': None, 'base_estimator__min_samples_leaf': 1, 'base_estimator__splitter': 'random', 'base_estimatorr__class_weight': 'balanced', 'max_features': 5, 'max_samples': 0.9}
F-beta Score (beta=0.5): 0.7935
Accuracy: 0.8103
              precision    recall  f1-score   support

           0       0.79      0.94      0.86       549
           1       0.86      0.60      0.71       342

    accuracy                           0.81       891
   macro avg       0.83      0.77      0.78       891
weighted avg       0.82      0.81      0.80       891

[[516  33]
 [136 206]]

BaggingClassifier

Best Hyperparameters: {'estimator__class_weight': 'balanced', 'estimator__criterion': 'gini', 'estimator__max_depth': None, 'estimator__min_samples_leaf': 1, 'estimator__splitter': 'random', 'max_features': 5, 'max_samples': 0.7}
F-beta Score (beta=0.5): 0.9102
Accuracy: 0.9147
              precision    recall  f1-score   support

           0       0.91      0.96      0.93       549
           1       0.93      0.84      0.88       342

    accuracy                           0.91       891
   macro avg       0.92      0.90      0.91       891
weighted avg       0.92      0.91      0.91       891

[[527  22]
 [ 54 288]]

I have a really big gap between Custom and sklearn bagging classifiers. I don't understand what I am missing in my custom implementation