In [None]:
'''Use the Decision Tree classification algorithm to construct a classifier on two datasets.
Evaluate the classifier's performance by performing ten-fold cross validation. Compare
the performance with that of:
i. Bagging ensemble consisting of 3, 5, 7, 9 Decision tree classifiers
ii. Adaboost ensemble consisting of 3, 5, 7, 9 Decision tree classifiers'''

In [11]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Binarizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import seaborn as sns
from sklearn.impute import SimpleImputer

# Function to evaluate model performance using 10-fold cross-validation
def cross_validation(clf, X, y):
    scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
    return scores.mean(), scores.std()

# Function to print evaluation results
def print_results(name, mean_score, std_dev):
    print(f"{name} - Mean Accuracy: {mean_score:.4f}, Standard Deviation: {std_dev:.4f}")


In [12]:
# Load Iris dataset
iris = load_iris()
X_iris = iris.data
y_iris = iris.target

# Standardize the data
scaler = StandardScaler()
X_iris_standardized = scaler.fit_transform(X_iris)

# Normalize the data
normalizer = MinMaxScaler()
X_iris_normalized = normalizer.fit_transform(X_iris_standardized)

# Discretize the data
binarizer = Binarizer(threshold=0.5)
X_iris_binarized = binarizer.fit_transform(X_iris_normalized)

# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
mean_score, std_dev = cross_validation(dt, X_iris_binarized, y_iris)
print_results("Decision Tree (Iris)", mean_score, std_dev)

# Bagging Ensembles
for n_estimators in [3, 5, 7, 9]:
    bagging = BaggingClassifier(estimator=dt, n_estimators=n_estimators, random_state=42)
    mean_score, std_dev = cross_validation(bagging, X_iris_binarized, y_iris)
    print_results(f"Bagging with {n_estimators} Decision Trees (Iris)", mean_score, std_dev)

# AdaBoost Ensembles with SAMME algorithm
for n_estimators in [3, 5, 7, 9]:
    adaboost = AdaBoostClassifier(estimator=dt, n_estimators=n_estimators, algorithm='SAMME', random_state=42)
    mean_score, std_dev = cross_validation(adaboost, X_iris_binarized, y_iris)
    print_results(f"AdaBoost with {n_estimators} Decision Trees (Iris)", mean_score, std_dev)


Decision Tree (Iris) - Mean Accuracy: 0.7867, Standard Deviation: 0.0884
Bagging with 3 Decision Trees (Iris) - Mean Accuracy: 0.7667, Standard Deviation: 0.1164
Bagging with 5 Decision Trees (Iris) - Mean Accuracy: 0.7867, Standard Deviation: 0.0884
Bagging with 7 Decision Trees (Iris) - Mean Accuracy: 0.7867, Standard Deviation: 0.0884
Bagging with 9 Decision Trees (Iris) - Mean Accuracy: 0.7867, Standard Deviation: 0.0884
AdaBoost with 3 Decision Trees (Iris) - Mean Accuracy: 0.7933, Standard Deviation: 0.0917
AdaBoost with 5 Decision Trees (Iris) - Mean Accuracy: 0.7933, Standard Deviation: 0.0917
AdaBoost with 7 Decision Trees (Iris) - Mean Accuracy: 0.8000, Standard Deviation: 0.0989
AdaBoost with 9 Decision Trees (Iris) - Mean Accuracy: 0.8000, Standard Deviation: 0.0989


In [13]:
# Load Titanic dataset
df = sns.load_dataset("titanic")

# Drop rows with missing 'embarked' values
df.dropna(subset=['embarked'], inplace=True)

# Impute missing 'age' values with mean
imputer = SimpleImputer(strategy='mean')
df['age'] = imputer.fit_transform(df[['age']])

# Convert categorical columns to numeric
df['sex'] = df['sex'].map({'male': 0, 'female': 1})
df = pd.get_dummies(df, columns=['embarked'], drop_first=True)

# Select features and target
X_titanic = df[['age', 'fare', 'sex', 'embarked_Q', 'embarked_S']]
y_titanic = df['survived']

# Standardize the data
X_titanic_standardized = scaler.fit_transform(X_titanic)

# Normalize the data
X_titanic_normalized = normalizer.fit_transform(X_titanic_standardized)

# Discretize the data
X_titanic_binarized = binarizer.fit_transform(X_titanic_normalized)

# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
mean_score, std_dev = cross_validation(dt, X_titanic_binarized, y_titanic)
print_results("Decision Tree (Titanic)", mean_score, std_dev)

# Bagging Ensembles
for n_estimators in [3, 5, 7, 9]:
    bagging = BaggingClassifier(estimator=dt, n_estimators=n_estimators, random_state=42)
    mean_score, std_dev = cross_validation(bagging, X_titanic_binarized, y_titanic)
    print_results(f"Bagging with {n_estimators} Decision Trees (Titanic)", mean_score, std_dev)

# AdaBoost Ensembles with SAMME algorithm
for n_estimators in [3, 5, 7, 9]:
    adaboost = AdaBoostClassifier(estimator=dt, n_estimators=n_estimators, algorithm='SAMME', random_state=42)
    mean_score, std_dev = cross_validation(adaboost, X_titanic_binarized, y_titanic)
    print_results(f"AdaBoost with {n_estimators} Decision Trees (Titanic)", mean_score, std_dev)


Decision Tree (Titanic) - Mean Accuracy: 0.7874, Standard Deviation: 0.0275
Bagging with 3 Decision Trees (Titanic) - Mean Accuracy: 0.7874, Standard Deviation: 0.0297
Bagging with 5 Decision Trees (Titanic) - Mean Accuracy: 0.7874, Standard Deviation: 0.0297
Bagging with 7 Decision Trees (Titanic) - Mean Accuracy: 0.7885, Standard Deviation: 0.0281
Bagging with 9 Decision Trees (Titanic) - Mean Accuracy: 0.7862, Standard Deviation: 0.0291
AdaBoost with 3 Decision Trees (Titanic) - Mean Accuracy: 0.7874, Standard Deviation: 0.0275
AdaBoost with 5 Decision Trees (Titanic) - Mean Accuracy: 0.7874, Standard Deviation: 0.0275
AdaBoost with 7 Decision Trees (Titanic) - Mean Accuracy: 0.7874, Standard Deviation: 0.0275
AdaBoost with 9 Decision Trees (Titanic) - Mean Accuracy: 0.7874, Standard Deviation: 0.0275
