# Machine Learning for Data Science  
# Multi-class and Multi-Label Classification Using Support Vector Machines | K-Means Clustering on a Multi-Class and Multi-Label Data Set

## Rajnandini Thopte 

In [1]:
import numpy as np
import pandas as pd
import random

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans
from sklearn.metrics import hamming_loss
from sklearn.metrics import classification_report, confusion_matrix, f1_score, hamming_loss, silhouette_samples
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold 
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings("ignore")


#### (a) Download the Anuran Calls (MFCCs) Data Set from: https://archive.ics. uci.edu/ml/datasets/Anuran+Calls+%28MFCCs%29. Choose 70% of the data randomly as the training set.

In [2]:
mfcc_data = "../data/Anuran Calls (MFCCs)/Frogs_MFCCs.csv"
mfcc_data = pd.read_csv(mfcc_data)
mfcc_data

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Family,Genus,Species,RecordID
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,-0.108351,-0.077623,-0.009568,0.057684,0.118680,0.014038,Leptodactylidae,Adenomera,AdenomeraAndre,1
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,-0.090974,-0.056510,-0.035303,0.020140,0.082263,0.029056,Leptodactylidae,Adenomera,AdenomeraAndre,1
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,-0.050691,-0.023590,-0.066722,-0.025083,0.099108,0.077162,Leptodactylidae,Adenomera,AdenomeraAndre,1
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954,Leptodactylidae,Adenomera,AdenomeraAndre,1
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.172700,0.266434,...,-0.048885,-0.053074,-0.088550,-0.031346,0.108610,0.079244,Leptodactylidae,Adenomera,AdenomeraAndre,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7190,1.0,-0.554504,-0.337717,0.035533,0.034511,0.443451,0.093889,-0.100753,0.037087,0.081075,...,0.069430,0.071001,0.021591,0.052449,-0.021860,-0.079860,Hylidae,Scinax,ScinaxRuber,60
7191,1.0,-0.517273,-0.370574,0.030673,0.068097,0.402890,0.096628,-0.116460,0.063727,0.089034,...,0.061127,0.068978,0.017745,0.046461,-0.015418,-0.101892,Hylidae,Scinax,ScinaxRuber,60
7192,1.0,-0.582557,-0.343237,0.029468,0.064179,0.385596,0.114905,-0.103317,0.070370,0.081317,...,0.082474,0.077771,-0.009688,0.027834,-0.000531,-0.080425,Hylidae,Scinax,ScinaxRuber,60
7193,1.0,-0.519497,-0.307553,-0.004922,0.072865,0.377131,0.086866,-0.115799,0.056979,0.089316,...,0.051796,0.069073,0.017963,0.041803,-0.027911,-0.096895,Hylidae,Scinax,ScinaxRuber,60


In [3]:
family = mfcc_data['Family']
genus = mfcc_data['Genus']
species = mfcc_data['Species']
import random
num_row = len(mfcc_data)
random.seed(num_row)

r = range(num_row)
los = int(num_row * 0.7)
train_idx = random.sample(r, los)

test_idx = []
for i in range(num_row):
    if i not in train_idx:
        test_idx.append(i)

# split train and test
mfcc_train = mfcc_data.iloc[train_idx, :].reset_index(drop=True)
mfcc_test = mfcc_data.iloc[test_idx, :].reset_index(drop=True)

# split train features and labels
train_x = mfcc_train.iloc[:, :-4]
train_family = mfcc_train['Family']
train_genus = mfcc_train['Genus']
train_species = mfcc_train['Species']

# split test features and labels
test_x = mfcc_test.iloc[:, :-4]
test_family = mfcc_test['Family']
test_genus = mfcc_test['Genus']
test_species = mfcc_test['Species']

In [4]:
mfcc_train.shape

(5036, 26)

In [5]:
mfcc_test.shape

(2159, 26)

#### b) Each instance has three labels: Families, Genus, and Species. Each of the labels has multiple classes. We wish to solve a multi-class and multi-label problem. One of the most important approaches to multi-label classification is to train a classifier for each label (binary relevance). We first try this approach:

#### i. Research exact match and hamming score/ loss methods for evaluating multi- label classification and use them in evaluating the classifiers in this problem.

> **Exact Match**:
Exact match is a measure of classification accuracy that considers whether the model correctly predicts all labels for a given instance.
For multi-label classification problems, where each instance can belong to multiple classes simultaneously, exact match evaluates if the model's prediction is an exact match to the true set of labels.
The exact match metric is binary – either the model's prediction is entirely correct for a given instance, or it is not.
The exact match ratio is calculated as the number of instances where the predicted labels exactly match the true labels divided by the total number of instances.


> **Hamming Loss**:
Hamming Loss is another metric used in multi-label classification tasks to quantify the accuracy of the model's predictions.
It measures the fraction of labels that are incorrectly predicted, averaged over all instances.
Hamming Loss is calculated as the average Hamming distance over all instances. The Hamming distance for a single instance is the number of positions at which the predicted labels are different from the true labels.
The Hamming Loss score ranges from 0 to 1, with 0 indicating perfect predictions and 1 indicating that none of the predicted labels match the true labels.

> In summary, *Exact match* is a metric that measures whether a multilabel classifier correctly predicts all the true labels for each instance. It calculates the proportion of instances where the classifier's output is an exact match to the true labels.
*Hamming Loss* is a metric that quantifies the fraction of incorrectly predicted labels over all the labels in the test set. It calculates the average fraction of incorrect labels for each instance, considering both falsely predicted labels and missed labels.
Both metrics are commonly used in the evaluation of multilabel classification models, providing different perspectives on the model's performance.

In [67]:
def multilabel_evaluation(title, testX, groundTruthY, classifiers):
    # Create an empty DataFrame to store predicted labels
    predictY = pd.DataFrame(columns=groundTruthY.columns)

    # Iterate over each label column
    for label in groundTruthY.columns:
        # Get the classifier for the current label
        clf = classifiers[label]

        # Predict labels for the test data using the classifier
        test_pred = clf.predict(testX)

        # Assign the predicted labels to the corresponding column in the DataFrame
        predictY.loc[:, label] = test_pred

    # Print a message indicating the start of multilabel evaluation
    print(f"Multilabel evaluation of {title}")

    # Calculate multilabel metrics using the ground truth and predicted labels
    hamming, exact_ratio = calculate_multilabel_metrics(groundTruthY, predictY)

    # Return a list containing the calculated metrics
    return [hamming, exact_ratio]


def calculate_multilabel_metrics(groundTruthY, predictY):
    # Initialize the count of misclassified labels
    missclf_labels = 0

    # Iterate over ground truth and predicted values
    for truth, pred in zip(groundTruthY.values, predictY.values):
        # Check for misclassified labels in each pair of truth and prediction
        miss = (truth != pred)
        missclf_labels += np.sum(miss)

    # Calculate Hamming Loss
    hamming = missclf_labels / (groundTruthY.shape[0] * groundTruthY.shape[1])

    # Initialize the count for instances with exact matches
    exact_ratio = 0

    # Iterate over ground truth and predicted values again
    for truth, pred in zip(groundTruthY.values, predictY.values):
        # Check if all labels match for a given instance
        match = (truth == pred)
        if sum(match) == groundTruthY.shape[1]:
            exact_ratio += 1

    # Calculate Exact Match Ratio
    exact_ratio /= groundTruthY.shape[0]

    # Round the calculated values to 4 decimal places
    hamming = np.round(hamming, 4)
    exact_ratio = np.round(exact_ratio, 4)

    # Create a dictionary with the results
    ans = {
        "Hamming Loss": [hamming],
        "Exact Match Ratio": [exact_ratio]
    }

    # Display the results as a pandas DataFrame
    print(pd.DataFrame(data=ans))

    # Return the calculated metrics
    return hamming, exact_ratio


#### ii. Train a SVM for each of the labels, using Gaussian kernels and one versus all classifiers. Determine the weight of the SVM penalty and the width of the Gaussian Kernel using 10 fold cross validation.1 You are welcome to try to solve the problem with both standardized 2 and raw attributes and report the results.


In [68]:
def paramSearch(classifier, kwargs, trainX, trainY, testX, testY):
    # Build and fit the grid search for the given classifier with specified hyperparameters
    clf = GridSearchCV(estimator=classifier, **kwargs)
    clf.fit(trainX, trainY)
    
    # Print grid scores on the development set
    print("Grid scores on the development set:\n")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    
    # Iterate over mean scores, standard deviations, and parameter settings
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print(f"{round(mean, 3)} (+/-{round(std * 2, 3)}) for {params}")
    
    # Output the best parameter setting
    print("\nThe best parameter setting is:")
    print(clf.best_params_, "\n")
    
    # Evaluate the classifier on the test set for a single-label task
    test_pred = clf.predict(testX)
    print("Classification Report on the Test Set:")
    print(classification_report(testY, test_pred))
    
    # Return the trained classifier with the best parameters
    return clf



#### Gaussian SVC without standardization

In [69]:
# Dictionary to store Gaussian SVC classifiers
gaussianSVC_classifiers = {}

# Hyperparameter tuning parameters
tuned_params = {
    'C': np.logspace(1, 4, 4),      # Range of values for the regularization parameter C
    'gamma': np.logspace(-3, 6, 10)  # Range of values for the kernel coefficient gamma
}

# Stratified K-Folds cross-validator for grid search
splitter = StratifiedKFold(n_splits=10, random_state=5036, shuffle=True)

# Keyword arguments for GridSearchCV
kwargs = {
    'param_grid': tuned_params,     # Parameter grid to search over
    'cv': splitter,                  # Cross-validation strategy
    'scoring': 'f1_weighted',        # Scoring metric for optimization
    'verbose': 1                     # Verbosity level during grid search
}


In [70]:
# Print a message indicating the class being processed
print(f"Class: Family (Gaussian SVC without Standardization)")

# Perform parameter search for Gaussian SVC on the 'Family' class
# and store the trained classifier in the 'gaussianSVC_classifiers' dictionary
gaussianSVC_classifiers['Family'] = paramSearch(
    SVC(kernel='rbf'),        # Gaussian SVC with radial basis function kernel
    kwargs,                   # Hyperparameter tuning parameters and settings
    train_x,                  # Training features
    train_family,             # Ground truth labels for the 'Family' class
    test_x,                   # Test features
    test_family               # Ground truth labels for the 'Family' class in the test set
)


Class: Family (Gaussian SVC without Standardization)
Fitting 10 folds for each of 40 candidates, totalling 400 fits
Grid scores on the development set:

0.869 (+/-0.019) for {'C': 10.0, 'gamma': 0.001}
0.926 (+/-0.016) for {'C': 10.0, 'gamma': 0.01}
0.971 (+/-0.012) for {'C': 10.0, 'gamma': 0.1}
0.991 (+/-0.007) for {'C': 10.0, 'gamma': 1.0}
0.986 (+/-0.01) for {'C': 10.0, 'gamma': 10.0}
0.791 (+/-0.04) for {'C': 10.0, 'gamma': 100.0}
0.492 (+/-0.021) for {'C': 10.0, 'gamma': 1000.0}
0.472 (+/-0.005) for {'C': 10.0, 'gamma': 10000.0}
0.47 (+/-0.002) for {'C': 10.0, 'gamma': 100000.0}
0.47 (+/-0.002) for {'C': 10.0, 'gamma': 1000000.0}
0.925 (+/-0.016) for {'C': 100.0, 'gamma': 0.001}
0.94 (+/-0.02) for {'C': 100.0, 'gamma': 0.01}
0.982 (+/-0.007) for {'C': 100.0, 'gamma': 0.1}
0.992 (+/-0.008) for {'C': 100.0, 'gamma': 1.0}
0.986 (+/-0.01) for {'C': 100.0, 'gamma': 10.0}
0.791 (+/-0.04) for {'C': 100.0, 'gamma': 100.0}
0.492 (+/-0.021) for {'C': 100.0, 'gamma': 1000.0}
0.472 (+/-0.005)

In [71]:
# Print a message indicating the class being processed
print(f"Class: Genus (Gaussian SVC without Standardization)")

# Perform parameter search for Gaussian SVC on the 'Genus' class
# and store the trained classifier in the 'gaussianSVC_classifiers' dictionary
gaussianSVC_classifiers['Genus'] = paramSearch(
    SVC(kernel='rbf'),        # Gaussian SVC with radial basis function kernel
    kwargs,                   # Hyperparameter tuning parameters and settings
    train_x,                  # Training features
    train_genus,              # Ground truth labels for the 'Genus' class
    test_x,                   # Test features
    test_genus                # Ground truth labels for the 'Genus' class in the test set
)


Class: Genus (Gaussian SVC without Standardization)
Fitting 10 folds for each of 40 candidates, totalling 400 fits
Grid scores on the development set:

0.744 (+/-0.026) for {'C': 10.0, 'gamma': 0.001}
0.92 (+/-0.019) for {'C': 10.0, 'gamma': 0.01}
0.973 (+/-0.014) for {'C': 10.0, 'gamma': 0.1}
0.988 (+/-0.009) for {'C': 10.0, 'gamma': 1.0}
0.98 (+/-0.011) for {'C': 10.0, 'gamma': 10.0}
0.733 (+/-0.042) for {'C': 10.0, 'gamma': 100.0}
0.442 (+/-0.014) for {'C': 10.0, 'gamma': 1000.0}
0.425 (+/-0.004) for {'C': 10.0, 'gamma': 10000.0}
0.423 (+/-0.001) for {'C': 10.0, 'gamma': 100000.0}
0.423 (+/-0.001) for {'C': 10.0, 'gamma': 1000000.0}
0.919 (+/-0.017) for {'C': 100.0, 'gamma': 0.001}
0.965 (+/-0.016) for {'C': 100.0, 'gamma': 0.01}
0.985 (+/-0.009) for {'C': 100.0, 'gamma': 0.1}
0.989 (+/-0.007) for {'C': 100.0, 'gamma': 1.0}
0.98 (+/-0.011) for {'C': 100.0, 'gamma': 10.0}
0.733 (+/-0.042) for {'C': 100.0, 'gamma': 100.0}
0.442 (+/-0.014) for {'C': 100.0, 'gamma': 1000.0}
0.425 (+/-0.

In [72]:
# Print a message indicating the class being processed
print(f"Class: Species (Gaussian SVC without Standardization)")

# Perform parameter search for Gaussian SVC on the 'Species' class
# and store the trained classifier in the 'gaussianSVC_classifiers' dictionary
gaussianSVC_classifiers['Species'] = paramSearch(
    SVC(kernel='rbf'),        # Gaussian SVC with radial basis function kernel
    kwargs,                   # Hyperparameter tuning parameters and settings
    train_x,                  # Training features
    train_species,            # Ground truth labels for the 'Species' class
    test_x,                   # Test features
    test_species              # Ground truth labels for the 'Species' class in the test set
)


Class: Species (Gaussian SVC without Standardization)
Fitting 10 folds for each of 40 candidates, totalling 400 fits
Grid scores on the development set:

0.8 (+/-0.015) for {'C': 10.0, 'gamma': 0.001}
0.935 (+/-0.021) for {'C': 10.0, 'gamma': 0.01}
0.974 (+/-0.017) for {'C': 10.0, 'gamma': 0.1}
0.988 (+/-0.011) for {'C': 10.0, 'gamma': 1.0}
0.978 (+/-0.01) for {'C': 10.0, 'gamma': 10.0}
0.656 (+/-0.04) for {'C': 10.0, 'gamma': 100.0}
0.33 (+/-0.015) for {'C': 10.0, 'gamma': 1000.0}
0.314 (+/-0.005) for {'C': 10.0, 'gamma': 10000.0}
0.312 (+/-0.001) for {'C': 10.0, 'gamma': 100000.0}
0.312 (+/-0.001) for {'C': 10.0, 'gamma': 1000000.0}
0.935 (+/-0.021) for {'C': 100.0, 'gamma': 0.001}
0.97 (+/-0.018) for {'C': 100.0, 'gamma': 0.01}
0.985 (+/-0.019) for {'C': 100.0, 'gamma': 0.1}
0.988 (+/-0.011) for {'C': 100.0, 'gamma': 1.0}
0.978 (+/-0.01) for {'C': 100.0, 'gamma': 10.0}
0.656 (+/-0.04) for {'C': 100.0, 'gamma': 100.0}
0.33 (+/-0.015) for {'C': 100.0, 'gamma': 1000.0}
0.314 (+/-0.005)

In [73]:
summary = {}

In [74]:
# Set a title for the evaluation
title = "Gaussian SVC without Standardization"

# Create an entry in the 'summary' dictionary for the specified title
# and store the multilabel evaluation results for the Gaussian SVC classifiers
summary[title] = multilabel_evaluation(
    title,                                 # Evaluation title
    test_x,                                # Test features
    mfcc_test.iloc[:, -4:-1],              # Ground truth labels for multilabel evaluation
    gaussianSVC_classifiers                # Dictionary of Gaussian SVC classifiers
)



Multilabel evaluation of Gaussian SVC without Standardization
   Hamming Loss  Exact Match Ratio
0        0.0096             0.9852


#### Gaussian SVC with Standardization

In [75]:
std_scaler = StandardScaler()
std_train_x = std_scaler.fit_transform(train_x)
std_test_x = std_scaler.fit_transform(test_x)

In [76]:
# Print a message indicating the class being processed
print(f"Class: Family (Gaussian SVC with Standardization)")

# Perform parameter search for Gaussian SVC on the 'Family' class
# using standardized features and store the trained classifier
# in the 'gaussianSVC_classifiers' dictionary
gaussianSVC_classifiers['Family'] = paramSearch(
    SVC(kernel='rbf'),    # Gaussian SVC with radial basis function kernel
    kwargs,               # Hyperparameter tuning parameters and settings
    std_train_x,          # Standardized training features
    train_family,         # Ground truth labels for the 'Family' class
    std_test_x,           # Standardized test features
    test_family           # Ground truth labels for the 'Family' class in the test set
)


Class: Family (Gaussian SVC with Standardization)
Fitting 10 folds for each of 40 candidates, totalling 400 fits
Grid scores on the development set:

0.944 (+/-0.02) for {'C': 10.0, 'gamma': 0.001}
0.987 (+/-0.005) for {'C': 10.0, 'gamma': 0.01}
0.99 (+/-0.007) for {'C': 10.0, 'gamma': 0.1}
0.905 (+/-0.023) for {'C': 10.0, 'gamma': 1.0}
0.575 (+/-0.047) for {'C': 10.0, 'gamma': 10.0}
0.473 (+/-0.004) for {'C': 10.0, 'gamma': 100.0}
0.47 (+/-0.002) for {'C': 10.0, 'gamma': 1000.0}
0.47 (+/-0.002) for {'C': 10.0, 'gamma': 10000.0}
0.47 (+/-0.002) for {'C': 10.0, 'gamma': 100000.0}
0.47 (+/-0.002) for {'C': 10.0, 'gamma': 1000000.0}
0.975 (+/-0.013) for {'C': 100.0, 'gamma': 0.001}
0.989 (+/-0.007) for {'C': 100.0, 'gamma': 0.01}
0.99 (+/-0.007) for {'C': 100.0, 'gamma': 0.1}
0.905 (+/-0.023) for {'C': 100.0, 'gamma': 1.0}
0.575 (+/-0.047) for {'C': 100.0, 'gamma': 10.0}
0.473 (+/-0.004) for {'C': 100.0, 'gamma': 100.0}
0.47 (+/-0.002) for {'C': 100.0, 'gamma': 1000.0}
0.47 (+/-0.002) for

In [77]:
# Print a message indicating the class being processed
print(f"Class: Genus (Gaussian SVC with Standardization)")

# Perform parameter search for Gaussian SVC on the 'Genus' class
# using standardized features and store the trained classifier
# in the 'gaussianSVC_classifiers' dictionary
gaussianSVC_classifiers['Genus'] = paramSearch(
    SVC(kernel='rbf'),    # Gaussian SVC with radial basis function kernel
    kwargs,               # Hyperparameter tuning parameters and settings
    std_train_x,          # Standardized training features
    train_genus,          # Ground truth labels for the 'Genus' class
    std_test_x,           # Standardized test features
    test_genus            # Ground truth labels for the 'Genus' class in the test set
)


Class: Genus (Gaussian SVC with Standardization)
Fitting 10 folds for each of 40 candidates, totalling 400 fits
Grid scores on the development set:

0.958 (+/-0.018) for {'C': 10.0, 'gamma': 0.001}
0.986 (+/-0.008) for {'C': 10.0, 'gamma': 0.01}
0.987 (+/-0.009) for {'C': 10.0, 'gamma': 0.1}
0.857 (+/-0.02) for {'C': 10.0, 'gamma': 1.0}
0.513 (+/-0.018) for {'C': 10.0, 'gamma': 10.0}
0.426 (+/-0.006) for {'C': 10.0, 'gamma': 100.0}
0.423 (+/-0.001) for {'C': 10.0, 'gamma': 1000.0}
0.423 (+/-0.001) for {'C': 10.0, 'gamma': 10000.0}
0.423 (+/-0.001) for {'C': 10.0, 'gamma': 100000.0}
0.423 (+/-0.001) for {'C': 10.0, 'gamma': 1000000.0}
0.977 (+/-0.013) for {'C': 100.0, 'gamma': 0.001}
0.988 (+/-0.008) for {'C': 100.0, 'gamma': 0.01}
0.987 (+/-0.009) for {'C': 100.0, 'gamma': 0.1}
0.857 (+/-0.02) for {'C': 100.0, 'gamma': 1.0}
0.513 (+/-0.018) for {'C': 100.0, 'gamma': 10.0}
0.426 (+/-0.006) for {'C': 100.0, 'gamma': 100.0}
0.423 (+/-0.001) for {'C': 100.0, 'gamma': 1000.0}
0.423 (+/-0.00

In [78]:
# Print a message indicating the class being processed
print(f"Class: Species (Gaussian SVC with Standardization)")

# Perform parameter search for Gaussian SVC on the 'Species' class
# using standardized features and store the trained classifier
# in the 'gaussianSVC_classifiers' dictionary
gaussianSVC_classifiers['Species'] = paramSearch(
    SVC(kernel='rbf'),    # Gaussian SVC with radial basis function kernel
    kwargs,               # Hyperparameter tuning parameters and settings
    std_train_x,          # Standardized training features
    train_species,        # Ground truth labels for the 'Species' class
    std_test_x,           # Standardized test features
    test_species          # Ground truth labels for the 'Species' class in the test set
)


Class: Species (Gaussian SVC with Standardization)
Fitting 10 folds for each of 40 candidates, totalling 400 fits
Grid scores on the development set:

0.967 (+/-0.018) for {'C': 10.0, 'gamma': 0.001}
0.987 (+/-0.013) for {'C': 10.0, 'gamma': 0.01}
0.985 (+/-0.008) for {'C': 10.0, 'gamma': 0.1}
0.835 (+/-0.024) for {'C': 10.0, 'gamma': 1.0}
0.389 (+/-0.026) for {'C': 10.0, 'gamma': 10.0}
0.315 (+/-0.006) for {'C': 10.0, 'gamma': 100.0}
0.312 (+/-0.001) for {'C': 10.0, 'gamma': 1000.0}
0.312 (+/-0.001) for {'C': 10.0, 'gamma': 10000.0}
0.312 (+/-0.001) for {'C': 10.0, 'gamma': 100000.0}
0.312 (+/-0.001) for {'C': 10.0, 'gamma': 1000000.0}
0.98 (+/-0.014) for {'C': 100.0, 'gamma': 0.001}
0.987 (+/-0.012) for {'C': 100.0, 'gamma': 0.01}
0.985 (+/-0.008) for {'C': 100.0, 'gamma': 0.1}
0.835 (+/-0.024) for {'C': 100.0, 'gamma': 1.0}
0.389 (+/-0.026) for {'C': 100.0, 'gamma': 10.0}
0.315 (+/-0.006) for {'C': 100.0, 'gamma': 100.0}
0.312 (+/-0.001) for {'C': 100.0, 'gamma': 1000.0}
0.312 (+/-0

In [79]:
# Set a title for the evaluation
title = "Gaussian SVC Standardization"

# Create an entry in the 'summary' dictionary for the specified title
# and store the multilabel evaluation results for the Gaussian SVC classifiers
summary[title] = multilabel_evaluation(
    title,                                 # Evaluation title
    std_test_x,                            # Standardized test features
    mfcc_test.iloc[:, -4:-1],              # Ground truth labels for multilabel evaluation
    gaussianSVC_classifiers                # Dictionary of Gaussian SVC classifiers
)



Multilabel evaluation of Gaussian SVC Standardization
   Hamming Loss  Exact Match Ratio
0        0.0116             0.9792


#### iii. Repeat 1(b)ii with L1-penalized SVMs.Remember to standardize4 the at- tributes. Determine the weight of the SVM penalty using 10 fold cross validation.

In [80]:
# Dictionary to store L1 SVM classifiers
L1_svm_classifiers = {}

# Hyperparameter tuning parameters for L1 SVM
tuned_params = {'C': np.logspace(1, 5, 10)}  # Range of values for the regularization parameter C

# Stratified K-Folds cross-validator for grid search
splitter = StratifiedKFold(n_splits=10, random_state=5036, shuffle=True)

# Keyword arguments for GridSearchCV
kwargs = {
    'param_grid': tuned_params,     # Parameter grid to search over
    'cv': splitter,                  # Cross-validation strategy
    'scoring': 'f1_weighted',        # Scoring metric for optimization
    'verbose': 1                     # Verbosity level during grid search
}


In [81]:
# Print a message indicating the class being processed
print(f"Class: Family (L1-penalized SVM with Standardization)")

# Perform parameter search for L1-penalized SVM on the 'Family' class
# using standardized features and store the trained classifier
# in the 'L1_svm_classifiers' dictionary
L1_svm_classifiers['Family'] = paramSearch(
    LinearSVC(penalty='l1', dual=False),  # L1-penalized Linear SVM
    kwargs,                               # Hyperparameter tuning parameters and settings
    std_train_x,                          # Standardized training features
    train_family,                         # Ground truth labels for the 'Family' class
    std_test_x,                           # Standardized test features
    test_family                           # Ground truth labels for the 'Family' class in the test set
)


Class: Family (L1-penalized SVM with Standardization)
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Grid scores on the development set:

0.928 (+/-0.026) for {'C': 10.0}
0.928 (+/-0.027) for {'C': 27.825594022071243}
0.928 (+/-0.027) for {'C': 77.4263682681127}
0.928 (+/-0.027) for {'C': 215.44346900318823}
0.928 (+/-0.027) for {'C': 599.4842503189409}
0.928 (+/-0.027) for {'C': 1668.100537200059}
0.928 (+/-0.027) for {'C': 4641.588833612777}
0.928 (+/-0.027) for {'C': 12915.496650148827}
0.928 (+/-0.027) for {'C': 35938.13663804626}
0.928 (+/-0.027) for {'C': 100000.0}

The best parameter setting is:
{'C': 10.0} 

Classification Report on the Test Set:
                 precision    recall  f1-score   support

      Bufonidae       0.00      0.00      0.00        15
  Dendrobatidae       0.88      0.92      0.90       154
        Hylidae       0.94      0.90      0.92       673
Leptodactylidae       0.95      0.98      0.96      1317

       accuracy                   

In [82]:
# Print a message indicating the class being processed
print(f"Class: Genus (L1-penalized SVM with Standardization)")

# Perform parameter search for L1-penalized SVM on the 'Genus' class
# using standardized features and store the trained classifier
# in the 'L1_svm_classifiers' dictionary
L1_svm_classifiers['Genus'] = paramSearch(
    LinearSVC(penalty='l1', dual=False),  # L1-penalized Linear SVM
    kwargs,                               # Hyperparameter tuning parameters and settings
    std_train_x,                          # Standardized training features
    train_genus,                          # Ground truth labels for the 'Genus' class
    std_test_x,                           # Standardized test features
    test_genus                            # Ground truth labels for the 'Genus' class in the test set
)


Class: Genus (L1-penalized SVM with Standardization)
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Grid scores on the development set:

0.948 (+/-0.012) for {'C': 10.0}
0.948 (+/-0.013) for {'C': 27.825594022071243}
0.948 (+/-0.013) for {'C': 77.4263682681127}
0.948 (+/-0.013) for {'C': 215.44346900318823}
0.947 (+/-0.012) for {'C': 599.4842503189409}
0.947 (+/-0.012) for {'C': 1668.100537200059}
0.947 (+/-0.012) for {'C': 4641.588833612777}
0.947 (+/-0.012) for {'C': 12915.496650148827}
0.947 (+/-0.012) for {'C': 35938.13663804626}
0.947 (+/-0.012) for {'C': 100000.0}

The best parameter setting is:
{'C': 10.0} 

Classification Report on the Test Set:
               precision    recall  f1-score   support

    Adenomera       0.96      0.99      0.98      1240
     Ameerega       0.92      0.95      0.94       154
Dendropsophus       0.92      0.67      0.78        91
    Hypsiboas       0.92      0.98      0.95       479
Leptodactylus       0.97      0.90      0.93  

In [83]:
# Print a message indicating the class being processed
print(f"Class: Species (L1-penalized SVM with Standardization)")

# Perform parameter search for L1-penalized SVM on the 'Species' class
# using standardized features and store the trained classifier
# in the 'L1_svm_classifiers' dictionary
L1_svm_classifiers['Species'] = paramSearch(
    LinearSVC(penalty='l1', dual=False),  # L1-penalized Linear SVM
    kwargs,                               # Hyperparameter tuning parameters and settings
    std_train_x,                          # Standardized training features
    train_species,                        # Ground truth labels for the 'Species' class
    std_test_x,                           # Standardized test features
    test_species                          # Ground truth labels for the 'Species' class in the test set
)


Class: Species (L1-penalized SVM with Standardization)
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Grid scores on the development set:

0.958 (+/-0.021) for {'C': 10.0}
0.959 (+/-0.021) for {'C': 27.825594022071243}
0.959 (+/-0.021) for {'C': 77.4263682681127}
0.959 (+/-0.022) for {'C': 215.44346900318823}
0.959 (+/-0.022) for {'C': 599.4842503189409}
0.959 (+/-0.022) for {'C': 1668.100537200059}
0.959 (+/-0.022) for {'C': 4641.588833612777}
0.959 (+/-0.022) for {'C': 12915.496650148827}
0.959 (+/-0.022) for {'C': 35938.13663804626}
0.959 (+/-0.022) for {'C': 100000.0}

The best parameter setting is:
{'C': 215.44346900318823} 

Classification Report on the Test Set:
                        precision    recall  f1-score   support

        AdenomeraAndre       0.88      0.94      0.91       182
AdenomeraHylaedactylus       0.99      1.00      0.99      1058
    Ameeregatrivittata       0.94      0.94      0.94       154
            HylaMinuta       0.92      0.71      

In [84]:
# Set a title for the evaluation
title = "Support Vector Classifier with L1-penalty"

# Create an entry in the 'summary' dictionary for the specified title
# and store the multilabel evaluation results for the L1-penalized SVM classifiers
summary['SVC_L1'] = multilabel_evaluation(
    title,                               # Evaluation title
    std_test_x,                          # Standardized test features
    mfcc_test.iloc[:, -4:-1],            # Ground truth labels for multilabel evaluation
    L1_svm_classifiers                   # Dictionary of L1-penalized SVM classifiers
)



Multilabel evaluation of Support Vector Classifier with L1-penalty
   Hamming Loss  Exact Match Ratio
0         0.052             0.9143


#### iv. Repeat 1(b)iii by using SMOTE or any other method you know to remedy class imbalance. Report your conclusions about the classifiers you trained.

In [85]:
def smoteParamSearch(classifier, settings, trainX, trainY, testX, testY):
    # Create a pipeline with SMOTE oversampling and the specified classifier
    naive_model = Pipeline([
        ('sampling', SMOTE()),          # SMOTE oversampling
        ('classification', classifier)  # Specified classifier
    ])

    # Perform parameter search for the pipeline and return the selected model
    selected_model = paramSearch(
        naive_model,                    # Pipeline with SMOTE and classifier
        settings,                       # Hyperparameter tuning parameters and settings
        trainX,                         # Training features
        trainY,                         # Ground truth labels for training
        testX,                          # Test features
        testY                           # Ground truth labels for testing
    )
    
    return selected_model


In [86]:
# Hyperparameter tuning parameters for SMOTE with Support Vector Classifier (SVC)
tuned_params = {'classification__C': np.logspace(1, 5, 10)}  

# Dictionary to store SMOTE with SVC classifiers
smote_svc_classifiers = {}

# Keyword arguments for GridSearchCV
kwargs = {
    'param_grid': tuned_params,     # Parameter grid to search over
    'cv': splitter,                  # Cross-validation strategy
    'scoring': 'f1_weighted',        # Scoring metric for optimization
    'verbose': 1                     # Verbosity level during grid search
}


In [87]:
# Print a message indicating the class being processed
print(f"Class: Family (L1-penalized and SMOTE with Standardization)")

# Perform parameter search for L1-penalized and SMOTE with SVC on the 'Family' class
# using standardized features and store the trained classifier
# in the 'smote_svc_classifiers' dictionary
smote_svc_classifiers['Family'] = smoteParamSearch(
    LinearSVC(penalty='l1', dual=False),  # L1-penalized Linear SVM
    kwargs,                               # Hyperparameter tuning parameters and settings
    std_train_x,                          # Standardized training features
    train_family,                         # Ground truth labels for the 'Family' class
    std_test_x,                           # Standardized test features
    test_family                           # Ground truth labels for the 'Family' class in the test set
)


Class: Family (L1-penalized and SMOTE with Standardization)
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Grid scores on the development set:

0.92 (+/-0.025) for {'classification__C': 10.0}
0.92 (+/-0.024) for {'classification__C': 27.825594022071243}
0.92 (+/-0.025) for {'classification__C': 77.4263682681127}
0.921 (+/-0.023) for {'classification__C': 215.44346900318823}
0.92 (+/-0.03) for {'classification__C': 599.4842503189409}
0.919 (+/-0.024) for {'classification__C': 1668.100537200059}
0.921 (+/-0.025) for {'classification__C': 4641.588833612777}
0.92 (+/-0.028) for {'classification__C': 12915.496650148827}
0.92 (+/-0.025) for {'classification__C': 35938.13663804626}
0.919 (+/-0.025) for {'classification__C': 100000.0}

The best parameter setting is:
{'classification__C': 215.44346900318823} 

Classification Report on the Test Set:
                 precision    recall  f1-score   support

      Bufonidae       0.24      1.00      0.38        15
  Dendrobatidae  

In [88]:
# Print a message indicating the class being processed
print(f"Class: Genus (L1-penalized and SMOTE with Standardization)")

# Perform parameter search for L1-penalized and SMOTE with SVC on the 'Genus' class
# using standardized features and store the trained classifier
# in the 'smote_svc_classifiers' dictionary
smote_svc_classifiers['Genus'] = smoteParamSearch(
    LinearSVC(penalty='l1', dual=False),  # L1-penalized Linear SVM
    kwargs,                               # Hyperparameter tuning parameters and settings
    std_train_x,                          # Standardized training features
    train_genus,                          # Ground truth labels for the 'Genus' class
    std_test_x,                           # Standardized test features
    test_genus                            # Ground truth labels for the 'Genus' class in the test set
)


Class: Genus (L1-penalized and SMOTE with Standardization)
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Grid scores on the development set:

0.918 (+/-0.022) for {'classification__C': 10.0}
0.918 (+/-0.025) for {'classification__C': 27.825594022071243}
0.919 (+/-0.021) for {'classification__C': 77.4263682681127}
0.917 (+/-0.025) for {'classification__C': 215.44346900318823}
0.917 (+/-0.022) for {'classification__C': 599.4842503189409}
0.919 (+/-0.025) for {'classification__C': 1668.100537200059}
0.918 (+/-0.023) for {'classification__C': 4641.588833612777}
0.919 (+/-0.023) for {'classification__C': 12915.496650148827}
0.918 (+/-0.025) for {'classification__C': 35938.13663804626}
0.918 (+/-0.023) for {'classification__C': 100000.0}

The best parameter setting is:
{'classification__C': 1668.100537200059} 

Classification Report on the Test Set:
               precision    recall  f1-score   support

    Adenomera       0.99      0.91      0.95      1240
     Ameerega   

In [89]:
# Print a message indicating the class being processed
print(f"Class: Species (L1-penalized and SMOTE with Standardization)")

# Perform parameter search for L1-penalized and SMOTE with SVC on the 'Species' class
# using standardized features and store the trained classifier
# in the 'smote_svc_classifiers' dictionary
smote_svc_classifiers['Species'] = smoteParamSearch(
    LinearSVC(penalty='l1', dual=False),  # L1-penalized Linear SVM
    kwargs,                               # Hyperparameter tuning parameters and settings
    std_train_x,                          # Standardized training features
    train_species,                        # Ground truth labels for the 'Species' class
    std_test_x,                           # Standardized test features
    test_species                          # Ground truth labels for the 'Species' class in the test set
)


Class: Species (L1-penalized and SMOTE with Standardization)
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Grid scores on the development set:

0.956 (+/-0.019) for {'classification__C': 10.0}
0.958 (+/-0.016) for {'classification__C': 27.825594022071243}
0.957 (+/-0.018) for {'classification__C': 77.4263682681127}
0.956 (+/-0.013) for {'classification__C': 215.44346900318823}
0.956 (+/-0.017) for {'classification__C': 599.4842503189409}
0.957 (+/-0.015) for {'classification__C': 1668.100537200059}
0.957 (+/-0.018) for {'classification__C': 4641.588833612777}
0.958 (+/-0.016) for {'classification__C': 12915.496650148827}
0.958 (+/-0.018) for {'classification__C': 35938.13663804626}
0.956 (+/-0.015) for {'classification__C': 100000.0}

The best parameter setting is:
{'classification__C': 35938.13663804626} 

Classification Report on the Test Set:
                        precision    recall  f1-score   support

        AdenomeraAndre       0.96      0.94      0.95       

In [90]:
# Set a title for the evaluation
title = 'SVM with L1 penalty and SMOTE'

# Create an entry in the 'summary' dictionary for the specified title
# and store the multilabel evaluation results for the L1-penalized and SMOTE with SVM classifiers
summary['SVC_L1_SMOTE'] = multilabel_evaluation(
    title,                               # Evaluation title
    std_test_x,                          # Standardized test features
    mfcc_test.iloc[:, -4:-1],            # Ground truth labels for multilabel evaluation
    smote_svc_classifiers                # Dictionary of L1-penalized and SMOTE with SVM classifiers
)




Multilabel evaluation of SVM with L1 penalty and SMOTE
   Hamming Loss  Exact Match Ratio
0        0.0716             0.8564


In [91]:
df = pd.DataFrame(data=summary)
df

Unnamed: 0,Gaussian SVC without Standardization,Gaussian SVC Standardization,SVC_L1,SVC_L1_SMOTE
0,0.0096,0.0116,0.052,0.0716
1,0.9852,0.9792,0.9143,0.8564


### 2. K-Means Clustering on a Multi-Class and Multi-Label Data Set

### Monte-Carlo Simulation: Perform the following procedures 50 times, and report the average and standard deviation of the 50 Hamming Distances that you calculate.

### (a) Use k-means clustering on the whole Anuran Calls (MFCCs) Data Set (do not split the data into train and test, as we are not performing supervised learning in this exercise). Choose k ∈ {1, 2, . . . , 50} automatically based on one of the methods provided in the slides (CH or Gap Statistics or scree plots or Silhouettes) or any other method you know.
### (b) In each cluster, determine which family is the majority by reading the true labels. Repeat for genus and species.
### (c) Now for each cluster you have a majority label triplet (family, genus, species). Calculate the average Hamming distance, Hamming score, and Hamming loss5 between the true labels and the labels assigned by clusters.


In [6]:
# Utility function to get the optimal value of K using silhouette score
def findOptimalClusterNumber(num_cluster, X, rand):
    optimalK, max_score = 2, 0
    
    # Iterate over possible K values
    for n in range(2, num_cluster):
        clusterer = KMeans(n_clusters=n, random_state=rand)
        cluster_labels = clusterer.fit_predict(X)
        
        # Calculate silhouette score
        silhouette_avg = silhouette_score(X, cluster_labels)
        
        # Update optimalK if current silhouette score is higher
        if silhouette_avg > max_score:
            optimalK = n
            max_score = silhouette_avg
    
    print(f"\nThe optimal K is: {optimalK}")
    return optimalK


In [7]:
# Utility function to get majority labels of a cluster
def getMajorityLabels(optimalK, cluster_labels, Y):
    cluster_major = pd.DataFrame(columns=Y.columns)
    
    # Iterate over clusters
    for c in range(optimalK):
        idx, = np.where(cluster_labels == c)
        cluster_samples = Y.iloc[idx, :]
        row = []
        
        # Iterate over labels in Y columns
        for label in Y.columns:
            # Get the most frequent label in the cluster
            cur_major = cluster_samples.loc[:, label].value_counts().index[0]
            row.append(cur_major)
        
        cluster_major.loc[c] = row
    
    return cluster_major


In [8]:
# Utility function to calculate and get hamming distance/loss
def calculateHammingMetrics(cluster_major, cluster_labels, Y):
    missclf_labels = 0
    
    # Iterate over clusters
    for c in range(len(cluster_major)):
        idx, = np.where(cluster_labels == c)
        
        # Iterate over labels in the cluster
        for label in Y.loc[idx].values:
            miss = (label != cluster_major.loc[c].values)
            missclf_labels += np.sum(miss)
    
    hamming_dist = missclf_labels / Y.shape[0]
    hamming_loss = missclf_labels / (Y.shape[0] * Y.shape[1])
    
    return hamming_dist, hamming_loss


In [9]:
# Monte Carlo simulation function
def monteCarlo(times, X, Y):
    hamming_dist = []
    hamming_loss = []
    
    # Iterate over Monte Carlo simulations
    for i in range(times):
        optimalK = findOptimalClusterNumber(50, X, i)
        clusterer = KMeans(n_clusters=optimalK, random_state=i)
        cluster_labels = clusterer.fit_predict(X)
        cluster_major = getMajorityLabels(optimalK, cluster_labels, Y)
        cur_dist, cur_loss = calculateHammingMetrics(cluster_major, cluster_labels, Y)
        hamming_dist.append(cur_dist)
        hamming_loss.append(cur_loss)
        print(f"Iteration {i + 1} | Hamming Distance: {round(cur_dist, 4)}, Hamming Loss: {round(cur_loss, 4)}")
    
    return hamming_dist, hamming_loss


In [10]:
iterations = 50
hamming_dist, hamming_loss = monteCarlo(iterations, mfcc_data.iloc[:, :-4], mfcc_data.iloc[:, -4:-1])


The optimal K is: 4
Iteration 1 | Hamming Distance: 0.6673, Hamming Loss: 0.2224

The optimal K is: 4
Iteration 2 | Hamming Distance: 0.6673, Hamming Loss: 0.2224

The optimal K is: 4
Iteration 3 | Hamming Distance: 0.7354, Hamming Loss: 0.2451

The optimal K is: 4
Iteration 4 | Hamming Distance: 0.6673, Hamming Loss: 0.2224

The optimal K is: 4
Iteration 5 | Hamming Distance: 0.6673, Hamming Loss: 0.2224

The optimal K is: 4
Iteration 6 | Hamming Distance: 0.6673, Hamming Loss: 0.2224

The optimal K is: 4
Iteration 7 | Hamming Distance: 0.6673, Hamming Loss: 0.2224

The optimal K is: 4
Iteration 8 | Hamming Distance: 0.6673, Hamming Loss: 0.2224

The optimal K is: 4
Iteration 9 | Hamming Distance: 0.6673, Hamming Loss: 0.2224

The optimal K is: 4
Iteration 10 | Hamming Distance: 0.6673, Hamming Loss: 0.2224

The optimal K is: 4
Iteration 11 | Hamming Distance: 0.6673, Hamming Loss: 0.2224

The optimal K is: 4
Iteration 12 | Hamming Distance: 0.6657, Hamming Loss: 0.2219

The optimal 

In [11]:
def summarize(hamming_distance, hamming_loss):
    summary_hd = {
        "Average Hamming Distance": [round(np.mean(hamming_distance), 4)],
        "Std Devation (Hamming Distance)": [round(np.std(hamming_distance), 4)]
    }
    
    summary_hl = {
        "Average Hamming Loss": [round(np.mean(hamming_loss), 4)],
        "Std Devation (Hamming Loss)": [round(np.std(hamming_loss), 4)]
    }
    
    summary_h_score = {
        "Average Hamming Score": [round(1 - np.mean(hamming_loss), 4)],
        "Std Devation (Hamming Score)": [round(np.std(hamming_loss), 4)]
    }
    
    df_summary_hd = pd.DataFrame(data=summary_hd)
    df_summary_hl = pd.DataFrame(data=summary_hl)
    df_summary_h_score = pd.DataFrame(data=summary_h_score)
    
    return [df_summary_hd, df_summary_hl, df_summary_h_score]

In [12]:
df_hamming_distance, df_hamming_loss, df_hamming_score = summarize(hamming_dist, hamming_loss)
df_hamming_distance

Unnamed: 0,Average Hamming Distance,Std Devation (Hamming Distance)
0,0.6719,0.0168


In [13]:
df_hamming_loss

Unnamed: 0,Average Hamming Loss,Std Devation (Hamming Loss)
0,0.224,0.0056


In [14]:
df_hamming_score

Unnamed: 0,Average Hamming Score,Std Devation (Hamming Score)
0,0.776,0.0056
