# Import Packages

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
from collections import Counter
from sklearn.base import clone

# Loading Dataset

In [7]:
# Load the dataset
data_train = pd.read_csv('pca_train_df.csv')
data_test = pd.read_csv('pca_test_df.csv')

# Data Splitting

In [8]:
# Assuming 'data_train' and 'data_test' are already loaded as pandas DataFrames

# Separate features and target variable from training data
X_train = data_train.drop(columns=['smoking'])
y_train = data_train['smoking']

# Separate features and target variable from testing data
X_test_val = data_test.drop(columns=['smoking'])
y_test_val = data_test['smoking']

# Split the training data into training and validation sets
X_test, X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=42)

print("The size of training features data = ", X_train.shape)
print("The size of validation features data = ", X_val.shape)
print("The size of testing features data = ", X_test.shape)
print("The size of training targets data = ", y_train.shape)
print("The size of validation targets data = ", y_val.shape)
print("The size of testing targets data = ", y_test.shape)

The size of training features data =  (91272, 4)
The size of validation features data =  (23889, 4)
The size of testing features data =  (23888, 4)
The size of training targets data =  (91272,)
The size of validation targets data =  (23889,)
The size of testing targets data =  (23888,)


# Random Forest Algorithm

### Training:
Given the whole dataset:
* Get a subset of the dataset
* Create a decision tree
* Repeat as many as the number of trees
### Testing:
Given a data point:
* Get the predictions from each tree
* Since we are in a classification problem we will take the majority vote


In [9]:
class RandomForest:
    def __init__(self, n_trees=10, max_depth=10, min_samples_split=2):
        self.n_trees = n_trees
        self.max_depth=max_depth
        self.min_samples_split=min_samples_split
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTreeClassifier(max_depth=self.max_depth,
                                          min_samples_split=self.min_samples_split)
            X_sample, y_sample = self._bootstrap_samples(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def _bootstrap_samples(self, X, y):
        n_samples = X.shape[0]
        idxs = np.random.choice(n_samples, size=n_samples, replace=True)
        # Ensure X and y are numpy arrays for correct indexing
        X_sample = X.iloc[idxs].to_numpy() if isinstance(X, pd.DataFrame) else X[idxs]
        y_sample = y.iloc[idxs].to_numpy() if isinstance(y, pd.Series) else y[idxs]
        return X_sample, y_sample

    def _most_common_label(self, y):
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common

    def predict(self, X):
        predictions = np.array([tree.predict(X) for tree in self.trees])
        tree_preds = np.swapaxes(predictions, 0, 1)
        predictions = np.array([self._most_common_label(pred) for pred in tree_preds])
        return predictions    

### Implementing Grid Search for hyperparameters tuning

In [10]:
# hyperparameters of random forest
param_grid = {
    'n_trees': [50, 100, 150],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10]
}
best_accuracy = 0
for n_trees in param_grid['n_trees']:
    for max_depth in param_grid['max_depth']:
        for min_samples_split in param_grid['min_samples_split']:
            # Initialize and train the RandomForest model
            rf_model = RandomForest(
                n_trees=n_trees, 
                max_depth=max_depth, 
                min_samples_split=min_samples_split
            )
            rf_model.fit(X_train.values, y_train.values)
            # Make predictions on the validation set
            y_pred = rf_model.predict(X_val.values)
            # Calculate accuracy
            correct_predictions = sum(y_val == y_pred)  # Count correct predictions
            accuracy = correct_predictions / len(y_val)  # Proportion of correct predictions
            # Update best parameters if the current model is better
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params = {
                    'n_trees': n_trees,
                    'max_depth': max_depth,
                    'min_samples_split': min_samples_split
                }
            # Print progress (optional)
            print(f"Params: n_trees={n_trees}, max_depth={max_depth}, min_samples_split={min_samples_split}, Accuracy={accuracy:.4f}")
# Output the best parameters and accuracy
print("\nBest Parameters:", best_params)
print("Best Accuracy:", best_accuracy)

Params: n_trees=50, max_depth=5, min_samples_split=2, Accuracy=0.9832
Params: n_trees=50, max_depth=5, min_samples_split=5, Accuracy=0.9859
Params: n_trees=50, max_depth=5, min_samples_split=10, Accuracy=0.9892
Params: n_trees=50, max_depth=10, min_samples_split=2, Accuracy=0.9859
Params: n_trees=50, max_depth=10, min_samples_split=5, Accuracy=0.9859
Params: n_trees=50, max_depth=10, min_samples_split=10, Accuracy=0.9834
Params: n_trees=50, max_depth=20, min_samples_split=2, Accuracy=0.9846
Params: n_trees=50, max_depth=20, min_samples_split=5, Accuracy=0.9857
Params: n_trees=50, max_depth=20, min_samples_split=10, Accuracy=0.9846
Params: n_trees=100, max_depth=5, min_samples_split=2, Accuracy=0.9858
Params: n_trees=100, max_depth=5, min_samples_split=5, Accuracy=0.9856
Params: n_trees=100, max_depth=5, min_samples_split=10, Accuracy=0.9858
Params: n_trees=100, max_depth=10, min_samples_split=2, Accuracy=0.9859
Params: n_trees=100, max_depth=10, min_samples_split=5, Accuracy=0.9858
Par

# Adaboost Algorithm

<img src="mQ9Np.png" alt="alt text" width="1000" height="400">

In [11]:
class AdaBoost:
    def __init__(self, n_estimators=10):
        self.n_estimators = n_estimators
        self.alphas = []  # Store the weights of weak classifiers
        self.weak_classifiers = []  # Store the weak classifiers

    def fit(self, X, y):
        n_samples = X.shape[0]
        # Initialize weights equally
        weights = np.ones(n_samples) / n_samples
        for t in range(self.n_estimators):
            # Train a weak classifier (decision stump)
            stump = DecisionTreeClassifier(max_depth=1)
            stump.fit(X, y, sample_weight=weights)
            self.weak_classifiers.append(stump)
            # Predict and compute weighted error
            y_pred = stump.predict(X)
            # Use (0-1) Loss
            misclassified = (y_pred != y)
            error = np.sum(weights * misclassified) / np.sum(weights)
            # Compute alpha (classifier weight) or Performance
            alpha = 0.5 * np.log((1 - error) / (error + 1e-10))
            self.alphas.append(alpha)
            # Update weights
            weights *= np.exp(-alpha * y * y_pred)
            weights /= np.sum(weights)  # Normalize weights

    def predict(self, X):
        # Aggregate predictions from all weak classifiers
        final_prediction = np.zeros(X.shape[0])
        for alpha, classifier in zip(self.alphas, self.weak_classifiers):
            final_prediction += alpha * classifier.predict(X)
        return np.sign(final_prediction)

### Implementing Random Search for hyperparameters tuning

In [12]:
# range of hyperparameters of adaboost
param_dist = {
    'n_estimators': (10,200)
}
best_accuracy = 0
n_iter = 10
for i in range(n_iter):
    # randomly choose number of estimators
    n_estimators = int(np.random.uniform(*param_dist['n_estimators']))
    # Initialize and train the RandomForest model
    ab_model = AdaBoost(
        n_estimators=n_estimators
    )
    ab_model.fit(X_train.values, y_train.values)
    # Make predictions on the validation set
    y_pred = ab_model.predict(X_val.values)
    # Calculate accuracy
    correct_predictions = sum(y_val == y_pred)  # Count correct predictions
    accuracy = correct_predictions / len(y_val)  # Proportion of correct predictions
    # Update best parameters if the current model is better
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = {
            'n_estimators': n_estimators
        }
    # Print progress (optional)
    print(f"Params: n_estimators={n_estimators}, Accuracy={accuracy:.4f}")
# Output the best parameters and accuracy
print("\nBest Parameters:", best_params)
print("Best Accuracy:", best_accuracy)

Params: n_estimators=53, Accuracy=0.9711
Params: n_estimators=19, Accuracy=0.9711
Params: n_estimators=199, Accuracy=0.9711
Params: n_estimators=182, Accuracy=0.9711
Params: n_estimators=35, Accuracy=0.9711
Params: n_estimators=80, Accuracy=0.9711
Params: n_estimators=149, Accuracy=0.9711
Params: n_estimators=65, Accuracy=0.9711
Params: n_estimators=24, Accuracy=0.9711
Params: n_estimators=47, Accuracy=0.9711

Best Parameters: {'n_estimators': 53}
Best Accuracy: 0.9710745531416133


# Bagged Logistic Regression

In [13]:
class BaggingLogisticRegression:
    def __init__(self, n_estimators=10):
        self.n_estimators = n_estimators
        self.models = []

    def fit(self, X, y):
        self.models = []
        for _ in range(self.n_estimators):
            # Create bootstrap sample
            X_bootstrap, y_bootstrap = self._bootstrap_samples(X, y)
            # Train logistic regression on the bootstrap sample
            model = LogisticRegression(max_iter=2000)
            model.fit(X_bootstrap, y_bootstrap)
            self.models.append(model)

    def _bootstrap_samples(self, X, y):
        n_samples = X.shape[0]
        idxs = np.random.choice(n_samples, size=n_samples, replace=True)
        # Ensure X and y are numpy arrays for correct indexing
        X_sample = X.iloc[idxs].to_numpy() if isinstance(X, pd.DataFrame) else X[idxs]
        y_sample = y.iloc[idxs].to_numpy() if isinstance(y, pd.Series) else y[idxs]
        return X_sample, y_sample

    def _most_common_label(self, y):
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common

    def predict(self, X):
        # Get predictions from all models
        predictions = np.array([model.predict(X) for model in self.models])
        models_preds = np.swapaxes(predictions, 0, 1)
        predictions = np.array([self._most_common_label(pred) for pred in models_preds])
        return predictions 


In [14]:
# hyperparameters of Bagged Logistic Regression
param_grid = {
    'n_estimators': [50, 100, 150]
}
best_accuracy = 0
for n_estimators in param_grid['n_estimators']:
    # Initialize and train the RandomForest model
    BLR_model = BaggingLogisticRegression(
        n_estimators=n_estimators, 
    )
    BLR_model.fit(X_train.values, y_train.values)
    # Make predictions on the validation set
    y_pred = BLR_model.predict(X_val.values)
    # Calculate accuracy
    correct_predictions = sum(y_val == y_pred)  # Count correct predictions
    accuracy = correct_predictions / len(y_val)  # Proportion of correct predictions
    # Update best parameters if the current model is better
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = {
            'n_estimators': n_estimators
        }
    # Print progress (optional)
    print(f"Params: n_estimators={n_estimators}, Accuracy={accuracy:0.8f}")
# Output the best parameters and accuracy
print("\nBest Parameters:", best_params)
print("Best Accuracy:", best_accuracy)

Params: n_estimators=50, Accuracy=0.99799071
Params: n_estimators=100, Accuracy=0.99799071
Params: n_estimators=150, Accuracy=0.99799071

Best Parameters: {'n_estimators': 50}
Best Accuracy: 0.9979907070199674


In [15]:
class Bagging_Classifier:
    def __init__(self, base_estimator, n_estimators=10):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.models = []

    def fit(self, X, y):
        self.models = []
        for _ in range(self.n_estimators):
            # Create bootstrap sample
            X_bootstrap, y_bootstrap = self._bootstrap_samples(X, y)
            # Clone the base estimator
            model = clone(self.base_estimator)
            model.fit(X_bootstrap, y_bootstrap)
            self.models.append(model)

    def _bootstrap_samples(self, X, y):
        n_samples = X.shape[0]
        idxs = np.random.choice(n_samples, size=n_samples, replace=True)
        # Ensure X and y are numpy arrays for correct indexing
        X_sample = X[idxs] if isinstance(X, np.ndarray) else X.iloc[idxs].to_numpy()
        y_sample = y[idxs] if isinstance(y, np.ndarray) else y.iloc[idxs].to_numpy()
        return X_sample, y_sample

    def _most_common_label(self, labels):
        counter = Counter(labels)
        return counter.most_common(1)[0][0]

    def predict(self, X):
        # Get predictions from all models
        predictions = np.array([model.predict(X) for model in self.models])
        models_preds = np.swapaxes(predictions, 0, 1)
        # Majority vote
        final_predictions = [self._most_common_label(preds) for preds in models_preds]
        return np.array(final_predictions)

## Grid Search over base estimators

In [16]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Define the parameter grid
parameters = {
    'base_estimator': [DecisionTreeClassifier(max_depth=5), LogisticRegression(max_iter=2000), KNeighborsClassifier()],
    'n_estimators': [50, 100, 150]
}

best_accuracy = 0
best_params = None

for base_estimator in parameters['base_estimator']:
    for n_estimators in parameters['n_estimators']:
        # Initialize and train the Bagging_Classifier model
        BC_model = Bagging_Classifier(
            base_estimator=clone(base_estimator),
            n_estimators=n_estimators,
        )
        BC_model.fit(X_train.values, y_train.values)
        # Make predictions on the validation set
        y_pred = BC_model.predict(X_val.values)
        # Calculate accuracy
        accuracy = accuracy_score(y_val, y_pred)
        # Update best parameters if the current model is better
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = {
                'base_estimator': base_estimator.__class__.__name__,
                'n_estimators': n_estimators
            }
        # Print progress (optional)
        print(f"Params: base_estimator={base_estimator.__class__.__name__}, n_estimators={n_estimators}, Accuracy={accuracy:.4f}")

# Output the best parameters and accuracy
print("\nBest Parameters:", best_params)
print("Best Accuracy:", best_accuracy)

Params: base_estimator=DecisionTreeClassifier, n_estimators=50, Accuracy=0.9859
Params: base_estimator=DecisionTreeClassifier, n_estimators=100, Accuracy=0.9859
Params: base_estimator=DecisionTreeClassifier, n_estimators=150, Accuracy=0.9858
Params: base_estimator=LogisticRegression, n_estimators=50, Accuracy=0.9980
Params: base_estimator=LogisticRegression, n_estimators=100, Accuracy=0.9980
Params: base_estimator=LogisticRegression, n_estimators=150, Accuracy=0.9980
Params: base_estimator=KNeighborsClassifier, n_estimators=50, Accuracy=0.9978
Params: base_estimator=KNeighborsClassifier, n_estimators=100, Accuracy=0.9979
Params: base_estimator=KNeighborsClassifier, n_estimators=150, Accuracy=0.9979

Best Parameters: {'base_estimator': 'LogisticRegression', 'n_estimators': 50}
Best Accuracy: 0.9979907070199674


# Voting Classifier

In [20]:
class Voting_Classifier:
    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        for model in self.models:
            model.fit(X, y)

    def predict(self, X):
        # Get predictions from all models
        predictions = np.array([model.predict(X) for model in self.models])
        predictions = np.swapaxes(predictions, 0, 1)
        # Majority vote
        return np.array([Counter(sample_preds).most_common(1)[0][0] for sample_preds in predictions])
    
    def score(self, X, y):
        y_pred = self.predict(X)
        return np.mean(y_pred == y)
    

# Initialize the Voting Classifier with the best models
voting_classifier = Voting_Classifier(
    models=[
        RandomForest(n_trees=50, max_depth=5, min_samples_split=2), 
        AdaBoost(n_estimators=10), 
        BaggingLogisticRegression(n_estimators=50)
    ]
)

# Fit the Voting Classifier
voting_classifier.fit(X_train.values, y_train.values)
# Evaluate the Voting Classifier on the validation set
val_score = voting_classifier.score(X_val.values, y_val.values)
print(f"Validation set accuracy: {val_score:.4f}")
        

Validation set accuracy: 0.9859


In [21]:
class Stacking_Classifier:
    def __init__(self, base_models, meta_model):
        self.base_models = base_models
        self.meta_model = meta_model

    def fit(self, X, y):
        for model in self.base_models:
            model.fit(X, y)
        # Generate predictions for training data using the base models
        base_predictions = np.array([model.predict(X) for model in self.base_models])
        base_predictions = base_predictions.T
        # Train the meta-model on the base predictions
        self.meta_model.fit(base_predictions, y)

    def predict(self, X):
        # Generate predictions for the input data using the base models
        base_predictions = np.array([model.predict(X) for model in self.base_models])
        base_predictions = base_predictions.T
        # Use the meta-model to make final predictions
        return self.meta_model.predict(base_predictions)

    def score(self, X, y):
        y_pred = self.predict(X)
        return np.mean(y_pred == y)

# Initialize the Stacking Classifier with the best models
stacking_classifier = Stacking_Classifier(
    base_models=[
        RandomForest(n_trees=50, max_depth=5, min_samples_split=2),
        AdaBoost(n_estimators=10),
        BaggingLogisticRegression(n_estimators=50)
    ],
    meta_model=LogisticRegression(max_iter=1000)
)
# Fit the Stacking Classifier
stacking_classifier.fit(X_train.values, y_train.values)
# Evaluate the Stacking Classifier on the validation set
val_score = stacking_classifier.score(X_val.values, y_val.values)
print(f"Validation set accuracy: {val_score:.4f}")

Validation set accuracy: 0.9859
