# Balance Scale Data Set

## Train/Test Split

In [50]:
import warnings

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

from sklearn import datasets, linear_model
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, precision_score, f1_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier 

In [54]:
def load_balance_dataset(source=None):
    """
    Load the balance scale dataset into a DataFrame.

    The data file has no header row; each record holds a class label
    followed by four numeric attributes (left weight, left distance,
    right weight, right distance).

    Parameters:
    source (str, path or file-like, optional): Where to read the CSV from.
        Anything accepted by ``pd.read_csv`` works. When omitted, the copy
        hosted by the UCI Machine Learning Repository is downloaded.

    Returns:
    pandas.DataFrame: One row per record with the columns 'class_name',
        'left_weight', 'left_distance', 'right_weight', 'right_distance'.
    """
    default_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data'
    column_names = ['class_name', 'left_weight', 'left_distance', 'right_weight', 'right_distance']

    # Fall back to the UCI copy only when the caller did not supply a source,
    # so the notebook can also run from a local/offline copy of the file.
    dataset = pd.read_csv(default_url if source is None else source,
                          names=column_names, header=None)

    # Print dataset shape and first few rows
    print("Dataset shape:", dataset.shape)
    print("Dataset head:\n", dataset.head())

    return dataset

In [8]:
def split_dataset(dataset, test_size=0.3, random_state=100):
    """
    Split the dataset into training & test sets (70/30 by default).

    The input dataset is expected to have the class label as the first
    column and the feature values as all remaining columns.

    Parameters:
    dataset (pandas.DataFrame): Table with the label in column 0.
    test_size (float): Fraction of rows held out for testing.
    random_state (int): Seed passed to train_test_split so the split is
        repeatable.

    Returns:
    tuple: (features, class_labels, X_train, X_test, y_train, y_test),
        where the first two items are the unsplit arrays.
    """
    values = dataset.values
    # Take every column after the label instead of a fixed 1:5 window, so the
    # slice matches the "remaining columns" contract described above.
    features = values[:, 1:]
    class_labels = values[:, 0]

    X_train, X_test, y_train, y_test = train_test_split(
        features, class_labels, test_size=test_size, random_state=random_state)

    # Print the shapes of the resulting train and test sets
    print(f"Training features shape: {X_train.shape}, labels shape: {y_train.shape}")
    print(f"Test features shape: {X_test.shape}, labels shape: {y_test.shape}")

    return features, class_labels, X_train, X_test, y_train, y_test

In [9]:
# Fetch the raw balance scale table
dataset = load_balance_dataset()

# Derive the full feature/label arrays together with the train/test partition
(features, class_labels,
 X_train, X_test,
 y_train, y_test) = split_dataset(dataset)

Dataset shape: (625, 5)
Dataset head:
   class_name  left_weight  left_distance  right_weight  right_distance
0          B            1              1             1               1
1          R            1              1             1               2
2          R            1              1             1               3
3          R            1              1             1               4
4          R            1              1             1               5
Training features shape: (437, 4), labels shape: (437,)
Test features shape: (188, 4), labels shape: (188,)


## Bagging

In [11]:
X = features
Y = class_labels

# Set the random seed for reproducibility
random_seed = 7

# Define a K-fold cross-validation object
# NOTE: KFold does not shuffle unless asked to, so the 10 folds are
# consecutive slices of the rows in file order.
kfold_cv = KFold(n_splits=10)

# Define the base estimator (classifier) for the bagging model
base_estimator = DecisionTreeClassifier()

# Define the number of trees to use in the bagging model
num_trees = 100

# Define the bagging model with the base estimator, number of trees, and random seed.
# The base estimator is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old spelling
# was removed in 1.4, while the first positional slot is the same in both.
bagging_model = BaggingClassifier(
    base_estimator,
    n_estimators=num_trees,
    random_state=random_seed
)

# Evaluate the bagging model using cross-validation
scores = cross_val_score(
    estimator=bagging_model,
    X=X,
    y=Y,
    cv=kfold_cv
)

# Print the mean accuracy of the bagging model over all cross-validation folds
print(f"Bagging accuracy: {scores.mean()}") 

Bagging accuracy: 0.7985919098822325


In [12]:
# Fit the bagging ensemble on the training partition only
trained_bagging_model = bagging_model.fit(X=X_train, y=y_train)

In [17]:
def calculate_accuracy(y_true, y_pred): 
    """
    Print the confusion matrix, accuracy, and classification report
    for a set of true labels and predicted labels.

    Nothing is returned; all results are written to standard output.

    Parameters:
    y_true (array-like): The true labels of the data.
    y_pred (array-like): The predicted labels for the data.
    """
    # Compute the confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:\n", cm)

    # Compute the accuracy score
    acc = accuracy_score(y_true, y_pred)
    print("Accuracy: {:.2f}%".format(acc*100))

    # Compute and print the classification report.
    # zero_division=0 reports 0.00 for a class that is never predicted without
    # raising UndefinedMetricWarning, so callers need no global warning filter.
    report = classification_report(y_true, y_pred, zero_division=0)
    print("Classification Report:\n", report)

In [18]:
# Predict the held-out test labels with the fitted bagging ensemble
y_pred_bagging = trained_bagging_model.predict(X=X_test)

# Report confusion matrix, accuracy and per-class metrics for those predictions
calculate_accuracy(y_true=y_test, y_pred=y_pred_bagging)

Confusion Matrix:
 [[ 0  7  6]
 [ 8 77  0]
 [ 8  3 79]]
Accuracy: 82.98%
Classification Report:
               precision    recall  f1-score   support

           B       0.00      0.00      0.00        13
           L       0.89      0.91      0.90        85
           R       0.93      0.88      0.90        90

    accuracy                           0.83       188
   macro avg       0.60      0.59      0.60       188
weighted avg       0.85      0.83      0.84       188



## Adaboost

In [26]:
# AdaBoost configuration: ensemble size and seed
num_trees = 30
random_seed = 7

# Boosted ensemble built with the configuration above
model = AdaBoostClassifier(random_state=random_seed, n_estimators=num_trees)

# 10-fold cross-validation splitter
kfold_cv = KFold(n_splits=10)

# Score the model on every fold and report the mean accuracy
scores = cross_val_score(estimator=model, X=X, y=Y, cv=kfold_cv)
print("Adaboost Accuracy: ", scores.mean())

Adaboost Accuracy:  0.9118535586277522


In [27]:
# Fit the AdaBoost ensemble on the training partition only
trained_adaboost_model = model.fit(X=X_train, y=y_train)

In [28]:
# Predict the held-out test labels with the fitted AdaBoost ensemble
y_pred_adaboost = trained_adaboost_model.predict(X=X_test)

# Report confusion matrix, accuracy and per-class metrics for those predictions
calculate_accuracy(y_true=y_test, y_pred=y_pred_adaboost)

Confusion Matrix:
 [[ 3  5  5]
 [ 2 83  0]
 [ 6  0 84]]
Accuracy: 90.43%
Classification Report:
               precision    recall  f1-score   support

           B       0.27      0.23      0.25        13
           L       0.94      0.98      0.96        85
           R       0.94      0.93      0.94        90

    accuracy                           0.90       188
   macro avg       0.72      0.71      0.72       188
weighted avg       0.90      0.90      0.90       188



## Random Forest

In [32]:
# Random forest configuration: forest size, features tried per split, and seed
num_trees = 100
max_features = 3
random_seed = 7

# Forest built with the configuration above
model = RandomForestClassifier(
    random_state=random_seed,
    max_features=max_features,
    n_estimators=num_trees,
)

# 10-fold cross-validation splitter
kfold_cv = KFold(n_splits=10)

# Score the forest on every fold and report the mean accuracy
scores = cross_val_score(estimator=model, X=X, y=Y, cv=kfold_cv)
print("Random Forest Accuracy : ", scores.mean())

Random Forest Accuracy :  0.8050179211469534


In [33]:
# Fit the random forest on the training partition only
trained_random_forest_model = model.fit(X=X_train, y=y_train)

In [34]:
# Predict the held-out test labels with the fitted random forest
y_pred_random_forest = trained_random_forest_model.predict(X=X_test)

# Report confusion matrix, accuracy and per-class metrics for those predictions
calculate_accuracy(y_true=y_test, y_pred=y_pred_random_forest)

Confusion Matrix:
 [[ 0  7  6]
 [ 8 77  0]
 [ 7  5 78]]
Accuracy: 82.45%
Classification Report:
               precision    recall  f1-score   support

           B       0.00      0.00      0.00        13
           L       0.87      0.91      0.89        85
           R       0.93      0.87      0.90        90

    accuracy                           0.82       188
   macro avg       0.60      0.59      0.59       188
weighted avg       0.84      0.82      0.83       188



## Decision Tree Classifier

In [35]:
def train_decision_tree_gini(X_train, X_test, y_train):
    """
    Fit a depth-limited decision tree that splits on the Gini index.

    Args:
    - X_train (array-like): Training input samples
    - X_test (array-like): Accepted for signature symmetry; not used here
    - y_train (array-like): Target values for the training set

    Returns:
    - DecisionTreeClassifier: The classifier after fitting on the training data
    """
    # Fixed hyper-parameters: at most 3 levels, at least 5 samples per leaf
    settings = {
        "criterion": "gini",
        "random_state": 100,
        "max_depth": 3,
        "min_samples_leaf": 5,
    }
    gini_tree = DecisionTreeClassifier(**settings)
    gini_tree.fit(X_train, y_train)
    return gini_tree

In [36]:
def train_using_entropy(X_train, X_test, y_train):
    """
    Fit a depth-limited decision tree that splits on entropy.

    Args:
    - X_train (array-like, shape = [n_samples, n_features]): Training input samples
    - X_test (array-like, shape = [n_samples, n_features]): Test input samples (not used here)
    - y_train (array-like, shape = [n_samples]): Target values for the training set

    Returns:
    - DecisionTreeClassifier: The classifier after fitting on the training data
    """
    # Fixed hyper-parameters: at most 3 levels, at least 5 samples per leaf
    settings = {
        "criterion": "entropy",
        "random_state": 100,
        "max_depth": 3,
        "min_samples_leaf": 5,
    }
    entropy_tree = DecisionTreeClassifier(**settings)
    entropy_tree.fit(X_train, y_train)
    return entropy_tree

In [37]:
def make_predictions(X_test, clf):
    """
    Return the labels that a fitted classifier predicts for the given samples.

    Args:
    - X_test: Samples to classify; handed to ``clf.predict`` unchanged
    - clf: Object exposing a ``predict`` method

    Returns:
    - Whatever ``clf.predict(X_test)`` returns
    """
    return clf.predict(X_test)

In [39]:
# Building phase: fit one tree per split criterion on the same training data
clf_gini = train_decision_tree_gini(X_train=X_train, X_test=X_test, y_train=y_train)
clf_entropy = train_using_entropy(X_train=X_train, X_test=X_test, y_train=y_train)

In [52]:
# Silence only scikit-learn's UndefinedMetricWarning (raised when a class is
# never predicted) rather than every warning, so deprecation and convergence
# notices from later cells stay visible.
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# Make predictions using the gini model
y_pred_gini = make_predictions(X_test, clf_gini)

# Calculate accuracy of the gini model
calculate_accuracy(y_test, y_pred_gini)

Confusion Matrix:
 [[ 0  6  7]
 [ 0 67 18]
 [ 0 19 71]]
Accuracy: 73.40%
Classification Report:
               precision    recall  f1-score   support

           B       0.00      0.00      0.00        13
           L       0.73      0.79      0.76        85
           R       0.74      0.79      0.76        90

    accuracy                           0.73       188
   macro avg       0.49      0.53      0.51       188
weighted avg       0.68      0.73      0.71       188



In [53]:
# Predict the held-out test labels with the entropy-based tree
y_pred_entropy = make_predictions(X_test=X_test, clf=clf_entropy)

# Report confusion matrix, accuracy and per-class metrics for those predictions
calculate_accuracy(y_true=y_test, y_pred=y_pred_entropy)

Confusion Matrix:
 [[ 0  6  7]
 [ 0 63 22]
 [ 0 20 70]]
Accuracy: 70.74%
Classification Report:
               precision    recall  f1-score   support

           B       0.00      0.00      0.00        13
           L       0.71      0.74      0.72        85
           R       0.71      0.78      0.74        90

    accuracy                           0.71       188
   macro avg       0.47      0.51      0.49       188
weighted avg       0.66      0.71      0.68       188



## Reflect on the applicability of the methods studied

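Brief summary of the accuracies obtained in this notebook. The three ensemble figures are mean 10-fold cross-validation accuracies; the two decision-tree figures are hold-out test-set accuracies, because the single trees were not cross-validated:

    Bagging: 79.86% (cross-validation)
    AdaBoost: 91.19% (cross-validation)
    Random Forest: 80.50% (cross-validation)
    Decision Tree (Gini): 73.40% (test set)
    Decision Tree (Entropy): 70.74% (test set)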

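AdaBoost has the highest cross-validation accuracy (91.19%), followed by Random Forest (80.50%) and Bagging (79.86%). The single decision trees, which were limited to a depth of 3 and evaluated only on the hold-out test set, have lower accuracies (73.40% for Gini and 70.74% for entropy), as expected. AdaBoost, Bagging, and Random Forest perform better because they combine the predictions of many decision trees, which improves the overall prediction accuracy and reduces overfitting. When comparing the performance on the test set, AdaBoost again has the highest accuracy (90.43%), ahead of Bagging (82.98%) and Random Forest (82.45%). The confusion matrices and classification reports provide further insight into the performance of each classifier: the minority class B (13 of the 188 test samples) is almost never predicted correctly, and AdaBoost is the only model that recovers any B samples (3 of 13).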

While AdaBoost has the best performance, it's important to consider that the choice of performance metrics depends on the specific requirements. For example, if the cost of misclassifying a certain class is higher than others, it would be preferable to focus on precision, recall or F1-score for that class rather than overall accuracy.