In [27]:
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



# Read the raw table from disk
file_path = 'data.csv'  # Replace with the correct path
df = pd.read_csv(file_path)

# Every column except the last is a feature; the last column is the label
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Hold out 30% of the rows as a test set; the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

def k_fold_cross_validation(model, X, y, k = 5):
    """
    Perform k-fold cross-validation and return the mean validation error.

    X and y are shuffled together once (fixed seed 42) and cut into k
    contiguous folds. Each fold is used as the validation set exactly once
    while the model is refit on the remaining folds. When len(X) is not a
    multiple of k, the first len(X) % k folds receive one extra sample, so
    every sample is validated exactly once.

    Parameters:
    - model : estimator exposing fit(X, y) and predict(X); the same object
      is refit for every fold, so on return it holds the last fold's fit
    - X : Feature matrix
    - y : Target vector
    - k : Number of folds; must satisfy 2 <= k <= len(X)

    Returns: average validation error (1 - accuracy) across the k folds

    Raises:
    - ValueError : if k is smaller than 2 or larger than len(X)
    """
    n_samples = len(X)
    if k < 2 or k > n_samples:
        raise ValueError(
            f"k must be between 2 and the number of samples ({n_samples}), got {k}"
        )

    X, y = shuffle(X, y, random_state=42)

    # Spread the remainder over the leading folds instead of dropping it,
    # so no sample is silently excluded from validation.
    base_size, remainder = divmod(n_samples, k)
    errors = []
    start = 0

    for i in range(k):
        stop = start + base_size + (1 if i < remainder else 0)

        # Validation fold is the slice [start, stop); training is everything else
        X_val = X[start:stop]
        y_val = y[start:stop]
        X_train = np.concatenate([X[:start], X[stop:]], axis=0)
        y_train = np.concatenate([y[:start], y[stop:]], axis=0)

        # Train the model
        model.fit(X_train, y_train)
        # Predict on validation set
        y_pred = model.predict(X_val)
        # Compute error
        error = 1 - accuracy_score(y_val, y_pred)
        errors.append(error)

        start = stop

    return np.mean(errors)


    
    

In [34]:
from sklearn.ensemble import RandomForestClassifier

# Candidate forest sizes to compare
ensemble_sizes = [10, 20, 50, 100, 200, 300, 500]

# Running best; starting at +inf means any finite error replaces it
best_ensemble_size_rf, lowest_error_rf = None, float('inf')

# Score every candidate size with 5-fold cross-validation on the training split
for ensemble_size in ensemble_sizes:
    rf = RandomForestClassifier(
        n_estimators=ensemble_size,
        max_features='sqrt',
        criterion='gini',
        random_state=42,
    )
    avg_error = k_fold_cross_validation(rf, X_train, y_train, k=5)
    print(f"Random Forest (n_estimators={ensemble_size}) - Avg Validation Error: {avg_error}")

    # Strict '<' keeps the earlier candidate when two sizes tie
    if avg_error < lowest_error_rf:
        best_ensemble_size_rf, lowest_error_rf = ensemble_size, avg_error

print(f"Best Random Forest Ensemble Size: {best_ensemble_size_rf} with Avg Validation Error: {lowest_error_rf}")


Random Forest (n_estimators=10) - Avg Validation Error: 0.06005434782608694
Random Forest (n_estimators=20) - Avg Validation Error: 0.052989130434782594
Random Forest (n_estimators=50) - Avg Validation Error: 0.05027173913043479
Random Forest (n_estimators=100) - Avg Validation Error: 0.048641304347826056
Random Forest (n_estimators=200) - Avg Validation Error: 0.047826086956521754
Random Forest (n_estimators=300) - Avg Validation Error: 0.047010869565217404
Random Forest (n_estimators=500) - Avg Validation Error: 0.046739130434782596
Best Random Forest Ensemble Size: 500 with Avg Validation Error: 0.046739130434782596


In [38]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Candidate numbers of boosting rounds to compare
ensemble_sizes = [10, 20, 50, 100, 200, 300, 500]

# Running best; starting at +inf means any finite error replaces it
best_ensemble_size_ada, lowest_error_ada = None, float('inf')

# Score every candidate size with 5-fold cross-validation on the training split
for ensemble_size in ensemble_sizes:
    # Depth-1 tree: a decision stump used as the boosted base learner
    weak_learner = DecisionTreeClassifier(max_depth=1, random_state=42)
    ada = AdaBoostClassifier(
        estimator=weak_learner,
        n_estimators=ensemble_size,
        random_state=42,
    )
    avg_error = k_fold_cross_validation(ada, X_train, y_train, k=5)
    print(f"AdaBoost (n_estimators={ensemble_size}) - Avg Validation Error: {avg_error}")

    # Strict '<' keeps the earlier candidate when two sizes tie
    if avg_error < lowest_error_ada:
        best_ensemble_size_ada, lowest_error_ada = ensemble_size, avg_error

print(f"Best AdaBoost Ensemble Size: {best_ensemble_size_ada} with Avg Validation Error: {lowest_error_ada}")




AdaBoost (n_estimators=10) - Avg Validation Error: 0.06793478260869565




AdaBoost (n_estimators=20) - Avg Validation Error: 0.06304347826086958




AdaBoost (n_estimators=50) - Avg Validation Error: 0.05978260869565215




AdaBoost (n_estimators=100) - Avg Validation Error: 0.054891304347826075




AdaBoost (n_estimators=200) - Avg Validation Error: 0.05027173913043479




AdaBoost (n_estimators=300) - Avg Validation Error: 0.05461956521739131




AdaBoost (n_estimators=500) - Avg Validation Error: 0.052989130434782636
Best AdaBoost Ensemble Size: 200 with Avg Validation Error: 0.05027173913043479


In [43]:
# Random Forest: refit on the whole training split with the size chosen by
# cross-validation, then measure the error on the held-out test split
rf = RandomForestClassifier(
    n_estimators=best_ensemble_size_rf,
    max_features='sqrt',
    criterion='gini',
    random_state=42,
)
rf.fit(X_train, y_train)
rf_test_error = 1 - accuracy_score(y_test, rf.predict(X_test))
print(f"Random Forest Test Error: {rf_test_error}")

# AdaBoost over decision stumps: same procedure with its own selected size
weak_learner = DecisionTreeClassifier(max_depth=1, random_state=42)  # Decision stump
ada = AdaBoostClassifier(
    estimator=weak_learner,
    n_estimators=best_ensemble_size_ada,
    random_state=42,
)
ada.fit(X_train, y_train)
ada_test_error = 1 - accuracy_score(y_test, ada.predict(X_test))
print(f"AdaBoost Test Error: {ada_test_error}")


Random Forest Test Error: 0.06304347826086953




AdaBoost Test Error: 0.06739130434782614
