# Import Packages

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from collections import Counter

# Loading Dataset

In [5]:
# Load the dataset
data = pd.read_csv('our_data.csv')
data.shape

(159256, 12)

# Data Preprocessing

In [6]:
# Feature Selection and Preprocessing
continuous_features = ['height(cm)', 'waist(cm)', 'age']
X = data[continuous_features]
y = data['smoking']

# Normalize continuous features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Data Splitting

In [7]:
# Split the dataset to 70% training, 15% validation & 15% testing
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
print("The size of training features data = ", X_train.shape)
print("The size of validation features data = ", X_val.shape)
print("The size of testing features data = ", X_test.shape)
print("The size of training targets data = ", y_train.shape)
print("The size of validation targets data = ", y_val.shape)
print("The size of testing targets data = ", y_test.shape)

The size of training features data =  (111479, 3)
The size of validation features data =  (23888, 3)
The size of testing features data =  (23889, 3)
The size of training targets data =  (111479,)
The size of validation targets data =  (23888,)
The size of testing targets data =  (23889,)


# Random Forest Algorithm

### Training:
Given the whole dataset:
* Get a subset of the dataset
* Create a decision tree
* Repeat as many as the number of trees
### Testing:
Given a data point:
* Get the predictions from each tree
* Since we are in a classification problem we will take the majority vote


In [141]:
class RandomForest:
    def __init__(self, n_trees=10, max_depth=10, min_samples_split=2):
        self.n_trees = n_trees
        self.max_depth=max_depth
        self.min_samples_split=min_samples_split
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTreeClassifier(max_depth=self.max_depth,
                                          min_samples_split=self.min_samples_split)
            X_sample, y_sample = self._bootstrap_samples(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def _bootstrap_samples(self, X, y):
        n_samples = X.shape[0]
        idxs = np.random.choice(n_samples, size=n_samples, replace=True)
        # Ensure X and y are numpy arrays for correct indexing
        X_sample = X.iloc[idxs].to_numpy() if isinstance(X, pd.DataFrame) else X[idxs]
        y_sample = y.iloc[idxs].to_numpy() if isinstance(y, pd.Series) else y[idxs]
        return X_sample, y_sample

    def _most_common_label(self, y):
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common

    def predict(self, X):
        predictions = np.array([tree.predict(X) for tree in self.trees])
        tree_preds = np.swapaxes(predictions, 0, 1)
        predictions = np.array([self._most_common_label(pred) for pred in tree_preds])
        return predictions    

### Implementing Grid Search for hyperparameters tuning

In [142]:
# hyperparameters of random forest
param_grid = {
    'n_trees': [50, 100, 150],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10]
}
best_accuracy = 0
for n_trees in param_grid['n_trees']:
    for max_depth in param_grid['max_depth']:
        for min_samples_split in param_grid['min_samples_split']:
            # Initialize and train the RandomForest model
            rf_model = RandomForest(
                n_trees=n_trees, 
                max_depth=max_depth, 
                min_samples_split=min_samples_split
            )
            rf_model.fit(X_train, y_train)
            # Make predictions on the validation set
            y_pred = rf_model.predict(X_val)
            # Calculate accuracy
            correct_predictions = sum(y_val == y_pred)  # Count correct predictions
            accuracy = correct_predictions / len(y_val)  # Proportion of correct predictions
            # Update best parameters if the current model is better
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params = {
                    'n_trees': n_trees,
                    'max_depth': max_depth,
                    'min_samples_split': min_samples_split
                }
            # Print progress (optional)
            print(f"Params: n_trees={n_trees}, max_depth={max_depth}, min_samples_split={min_samples_split}, Accuracy={accuracy:.4f}")
# Output the best parameters and accuracy
print("\nBest Parameters:", best_params)
print("Best Accuracy:", best_accuracy)

Params: n_trees=50, max_depth=5, min_samples_split=2, Accuracy=0.7161
Params: n_trees=50, max_depth=5, min_samples_split=5, Accuracy=0.7158
Params: n_trees=50, max_depth=5, min_samples_split=10, Accuracy=0.7165
Params: n_trees=50, max_depth=10, min_samples_split=2, Accuracy=0.7149
Params: n_trees=50, max_depth=10, min_samples_split=5, Accuracy=0.7137
Params: n_trees=50, max_depth=10, min_samples_split=10, Accuracy=0.7145
Params: n_trees=50, max_depth=20, min_samples_split=2, Accuracy=0.7022
Params: n_trees=50, max_depth=20, min_samples_split=5, Accuracy=0.7033
Params: n_trees=50, max_depth=20, min_samples_split=10, Accuracy=0.7045
Params: n_trees=100, max_depth=5, min_samples_split=2, Accuracy=0.7164
Params: n_trees=100, max_depth=5, min_samples_split=5, Accuracy=0.7163
Params: n_trees=100, max_depth=5, min_samples_split=10, Accuracy=0.7163
Params: n_trees=100, max_depth=10, min_samples_split=2, Accuracy=0.7145
Params: n_trees=100, max_depth=10, min_samples_split=5, Accuracy=0.7144
Par

# Adaboost Algorithm

<img src="mQ9Np.png" alt="alt text" width="1000" height="400">

In [10]:
class AdaBoost:
    def __init__(self, n_estimators=10):
        self.n_estimators = n_estimators
        self.alphas = []  # Store the weights of weak classifiers
        self.weak_classifiers = []  # Store the weak classifiers

    def fit(self, X, y):
        n_samples = X.shape[0]
        # Initialize weights equally
        weights = np.ones(n_samples) / n_samples
        for t in range(self.n_estimators):
            # Train a weak classifier (decision stump)
            stump = DecisionTreeClassifier(max_depth=1)
            stump.fit(X, y, sample_weight=weights)
            self.weak_classifiers.append(stump)
            # Predict and compute weighted error
            y_pred = stump.predict(X)
            # Use (0-1) Loss
            misclassified = (y_pred != y)
            error = np.sum(weights * misclassified) / np.sum(weights)
            # Compute alpha (classifier weight) or Performance
            alpha = 0.5 * np.log((1 - error) / (error + 1e-10))
            self.alphas.append(alpha)
            # Update weights
            weights *= np.exp(-alpha * y * y_pred)
            weights /= np.sum(weights)  # Normalize weights

    def predict(self, X):
        # Aggregate predictions from all weak classifiers
        final_prediction = np.zeros(X.shape[0])
        for alpha, classifier in zip(self.alphas, self.weak_classifiers):
            final_prediction += alpha * classifier.predict(X)
        return np.sign(final_prediction)

### Implementing Random Search for hyperparameters tuning

In [11]:
# range of hyperparameters of adaboost
param_dist = {
    'n_estimators': (10,200)
}
best_accuracy = 0
n_iter = 10
for i in range(n_iter):
    # randomly choose number of estimators
    n_estimators = int(np.random.uniform(*param_dist['n_estimators']))
    # Initialize and train the RandomForest model
    ab_model = AdaBoost(
        n_estimators=n_estimators
    )
    ab_model.fit(X_train, y_train)
    # Make predictions on the validation set
    y_pred = ab_model.predict(X_val)
    # Calculate accuracy
    correct_predictions = sum(y_val == y_pred)  # Count correct predictions
    accuracy = correct_predictions / len(y_val)  # Proportion of correct predictions
    # Update best parameters if the current model is better
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = {
            'n_estimators': n_estimators
        }
    # Print progress (optional)
    print(f"Params: n_estimators={n_estimators}, Accuracy={accuracy:.4f}")
# Output the best parameters and accuracy
print("\nBest Parameters:", best_params)
print("Best Accuracy:", best_accuracy)

Params: n_estimators=144, Accuracy=0.7038
Params: n_estimators=133, Accuracy=0.7038
Params: n_estimators=18, Accuracy=0.7038
Params: n_estimators=176, Accuracy=0.7038
Params: n_estimators=174, Accuracy=0.7038
Params: n_estimators=35, Accuracy=0.7038
Params: n_estimators=118, Accuracy=0.7038
Params: n_estimators=122, Accuracy=0.7038
Params: n_estimators=150, Accuracy=0.7038
Params: n_estimators=24, Accuracy=0.7038

Best Parameters: {'n_estimators': 144}
Best Accuracy: 0.7037843268586738


# Bagged Logistic Regression

In [8]:
class BaggingLogisticRegression:
    def __init__(self, n_estimators=10):
        self.n_estimators = n_estimators
        self.models = []

    def fit(self, X, y):
        self.models = []
        for _ in range(self.n_estimators):
            # Create bootstrap sample
            X_bootstrap, y_bootstrap = self._bootstrap_samples(X, y)
            # Train logistic regression on the bootstrap sample
            model = LogisticRegression()
            model.fit(X_bootstrap, y_bootstrap)
            self.models.append(model)

    def _bootstrap_samples(self, X, y):
        n_samples = X.shape[0]
        idxs = np.random.choice(n_samples, size=n_samples, replace=True)
        # Ensure X and y are numpy arrays for correct indexing
        X_sample = X.iloc[idxs].to_numpy() if isinstance(X, pd.DataFrame) else X[idxs]
        y_sample = y.iloc[idxs].to_numpy() if isinstance(y, pd.Series) else y[idxs]
        return X_sample, y_sample

    def _most_common_label(self, y):
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common

    def predict(self, X):
        # Get predictions from all models
        predictions = np.array([model.predict(X) for model in self.models])
        models_preds = np.swapaxes(predictions, 0, 1)
        predictions = np.array([self._most_common_label(pred) for pred in models_preds])
        return predictions 


In [9]:
# hyperparameters of Bagged Logistic Regression
param_grid = {
    'n_estimators': [50, 100, 150]
}
best_accuracy = 0
for n_estimators in param_grid['n_estimators']:
    # Initialize and train the RandomForest model
    BLR_model = BaggingLogisticRegression(
        n_estimators=n_estimators, 
    )
    BLR_model.fit(X_train, y_train)
    # Make predictions on the validation set
    y_pred = BLR_model.predict(X_val)
    # Calculate accuracy
    correct_predictions = sum(y_val == y_pred)  # Count correct predictions
    accuracy = correct_predictions / len(y_val)  # Proportion of correct predictions
    # Update best parameters if the current model is better
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = {
            'n_estimators': n_estimators
        }
    # Print progress (optional)
    print(f"Params: n_estimators={n_estimators}, Accuracy={accuracy:.4f}")
# Output the best parameters and accuracy
print("\nBest Parameters:", best_params)
print("Best Accuracy:", best_accuracy)

Params: n_estimators=50, Accuracy=0.6888
Params: n_estimators=100, Accuracy=0.6888
Params: n_estimators=150, Accuracy=0.6888

Best Parameters: {'n_estimators': 100}
Best Accuracy: 0.6888395847287341
