In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sn
from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt
import math
import random
import time
import keras
from keras.datasets import cifar10
from keras.utils import to_categorical
from keras import models, layers, optimizers, regularizers
from keras.models import Model, Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten
from skimage.color import rgb2gray
import sklearn
from sklearn import model_selection, preprocessing, metrics
from scipy import stats

2024-04-27 16:41:39.541508: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-27 16:41:39.541623: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-27 16:41:39.672564: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Load and explore dataset

In [3]:
wd = '/kaggle/input/woz-speech/'
#Set training and test folder paths
training_path = wd+'features_train/features_train'
test_path = wd+'features_test/features_test'

#Load labels file
labels = pd.read_csv(wd+'labels.csv')

#Load feature description files, take out column 0 to use as header for training/test sets
features = pd.read_csv(wd+'feature_description.csv', encoding_errors='ignore', header=None, index_col=0)
features = features.index.tolist()
features

['F0semitoneFrom27.5Hz_sma3nz_amean',
 'F0semitoneFrom27.5Hz_sma3nz_stddevNorm',
 'F0semitoneFrom27.5Hz_sma3nz_percentile20.0',
 'F0semitoneFrom27.5Hz_sma3nz_percentile50.0',
 'F0semitoneFrom27.5Hz_sma3nz_percentile80.0',
 'F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2',
 'F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope',
 'F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope',
 'F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope',
 'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope',
 'loudness_sma3_amean',
 'loudness_sma3_stddevNorm',
 'loudness_sma3_percentile20.0',
 'loudness_sma3_percentile50.0',
 'loudness_sma3_percentile80.0',
 'loudness_sma3_pctlrange0-2',
 'loudness_sma3_meanRisingSlope',
 'loudness_sma3_stddevRisingSlope',
 'loudness_sma3_meanFallingSlope',
 'loudness_sma3_stddevFallingSlope',
 'spectralFlux_sma3_amean',
 'spectralFlux_sma3_stddevNorm',
 'mfcc1_sma3_amean',
 'mfcc1_sma3_stddevNorm',
 'mfcc2_sma3_amean',
 'mfcc2_sma3_stddevNorm',
 'mfcc3_sma3_amean',
 'mfcc3_sma3_stddevNorm',
 'mfcc4_

In [4]:
def load_data(folder_path):
    #Init empty dataframe
    res = pd.DataFrame()
    for file in os.listdir(folder_path):
        #for each speaker file
        if file.endswith('.csv'):
            #get participant id from filename, eg filename: 'spk_305.csv'
            participant = float(file.split('_')[1].split('.')[0])
            #find labels for the participant
            label = labels[labels['Participant_ID'] == participant]
            #load participant feature file
            file_path = os.path.join(folder_path, file)
            data_df = pd.read_csv(file_path, header=None, names=features)
            #Add labels and participant id columns
            data_df['participant'] = participant
            data_df['gender'] = label['Gender'].values[0]
            data_df['depression'] = label['Depression'].values[0]
            #combine everything to result
            res = pd.concat([res, data_df])
    return res

In [5]:
#Load training data
training_df = load_data(training_path)
len(training_df)

13626

In [6]:
#Load test data
test_df = load_data(test_path)
len(test_df)

3280

In [7]:
# Check Missing values
missing_values = (training_df.isnull().sum()/len(training_df)) *100
print(f'Missing value percent % for each column, total samples {len(training_df)}')
print(missing_values)

Missing value percent % for each column, total samples 13626
F0semitoneFrom27.5Hz_sma3nz_amean             0.007339
F0semitoneFrom27.5Hz_sma3nz_stddevNorm        0.007339
F0semitoneFrom27.5Hz_sma3nz_percentile20.0    0.007339
F0semitoneFrom27.5Hz_sma3nz_percentile50.0    0.007339
F0semitoneFrom27.5Hz_sma3nz_percentile80.0    0.007339
                                                ...   
StddevUnvoicedSegmentLength                   0.007339
equivalentSoundLevel_dBp                      0.007339
participant                                   0.000000
gender                                        0.000000
depression                                    0.000000
Length: 91, dtype: float64


In [8]:
training_df = training_df.dropna()
len(training_df)

13625

In [9]:
training_df

Unnamed: 0,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope,...,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp,participant,gender,depression
0,37.110800,0.016898,36.521458,37.090477,37.525204,1.003746,24.374560,11.693271,-0.207191,9.438403,...,6.140351,4.629630,0.142000,0.117541,0.070000,0.027386,-39.496513,303.0,0,0
1,33.466145,0.081636,30.579086,32.893642,35.499447,4.920361,12.148806,0.000000,16.623375,0.000000,...,2.142857,2.222222,0.210000,0.072572,0.162500,0.084963,-40.960510,303.0,0,0
2,31.222300,0.035549,30.420216,30.726269,31.939356,1.519140,6.691987,0.000000,23.152794,0.000000,...,2.941176,1.587302,0.230000,0.000000,0.185000,0.065000,-42.518540,303.0,0,0
3,34.281920,0.031365,32.879463,34.713913,35.204903,2.325439,3.409068,0.000000,10.839425,0.000000,...,3.947368,2.816901,0.180000,0.000000,0.100000,0.024495,-44.036240,303.0,0,0
4,34.795260,0.018941,34.109280,34.899593,35.433464,1.324184,16.008287,14.265874,10.287021,6.374829,...,4.000000,2.040816,0.272000,0.109435,0.163333,0.170945,-42.155136,303.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,37.695770,0.082660,36.888977,38.232132,39.715218,2.826240,121.021860,127.670210,90.680580,75.641880,...,4.237289,2.654867,0.236667,0.087305,0.087500,0.039607,-34.249960,338.0,0,1
59,32.937874,0.124955,29.790354,30.834381,38.363396,8.573042,27.697432,0.000000,-57.483753,0.000000,...,4.938271,3.947368,0.140000,0.014142,0.067500,0.022776,-38.060207,338.0,0,1
60,33.065857,0.071610,31.588194,32.720340,35.155930,3.567736,390.305850,500.378700,27.218529,17.402937,...,3.649635,2.230483,0.245000,0.138894,0.180000,0.110905,-35.798412,338.0,0,1
61,38.278595,0.051522,36.595400,37.992966,40.502533,3.907131,120.221110,130.879240,19.591314,15.038571,...,1.992032,1.219512,0.493333,0.240878,0.227500,0.253414,-31.947002,338.0,0,1


There was only one row with null values so I think we're good to drop that row.

Designating subjects as validation vs training subjects

In [71]:
def val_split(df: pd.DataFrame):
    df['participant'] = df['participant'].astype(int)
    n = np.unique(df['participant'])
    n_range = range(min(n), max(n))
    train_subs = random.sample(n_range, math.floor(len(n)*0.75))
    train = df[df['participant'].isin(train_subs)]
    val = df[~df['participant'].isin(train_subs)]
    train_data = pd.concat((train, val), axis = 0)
    return train, val, train_data

In [72]:
train, val, train_data = val_split(training_df)

In [12]:
#Used to split into our features and our outcomes for this task
def x_y_split(df: pd.DataFrame):
    x = df.drop(['participant', 'gender', 'depression'], axis = 1)
    gender = df['gender']
    depression = df['depression']
    return np.array(x), np.array(gender), np.array(depression)

In [13]:
x_train, g_train, d_train = x_y_split(train)
x_val, g_val, d_val = x_y_split(val)
x_test, g_test, d_test = x_y_split(test_df)

In [14]:
from sklearn.metrics import confusion_matrix
# Calculates accuracy
# pass true and predicted labels
# return accuracy score
def calculate_total_accuracy(true_labels, predicted_labels):
    return accuracy_score(true_labels, predicted_labels)

# Calculates accuracy
# pass true and predicted labels
# return balanced accuracy score
def calculate_balanced_accuracy(true_labels, predicted_labels):
    #calculkate confusion matrix
    matrix = confusion_matrix(true_labels, predicted_labels)
    TP = matrix[1, 1]
    TN = matrix[0, 0]
    FP = matrix[0, 1]
    FN = matrix[1, 0]
    #For positive class, how many correct predictions
    accuracy_positive = TP/(TP+FN)
    #For negative class how many 
    accuracy_negative = TN/(TN+FP)
    return 0.5*(accuracy_positive + accuracy_negative)

#Calculates Equality of Opportunity
# pass true and predicted labels for male samples
# pass true and predicted labels for female samples
# return balanced accuracy score
def calculate_EO(true_labels_male, 
                 true_labels_female,
                 predicted_labels_male,
                predicted_labels_female):
    #Calculate True pistive rate for male gender with confusion matrix
    matrix_male = confusion_matrix(true_labels_male, predicted_labels_male)
    TP = matrix_male[1, 1]
    TN = matrix_male[0, 0]
    FN = matrix_male[1, 0]
    TPR_male = TP/(TP+FN)

    #Calculate True pistive rate for female gender with confusion matrix
    matrix_female = confusion_matrix(true_labels_female, predicted_labels_female)
    TP = matrix_female[1, 1]
    TN = matrix_female[0, 0]
    FN = matrix_female[1, 0]
    TPR_female = TP/(TP+FN)
    
    # Calculate EO
    return 1-abs(TPR_male-TPR_female) 

#Function to calculate majority votings
#Pass labels
#Returns mode or which label was predicted most
def majority_voting(df):
    counts = df.value_counts()
    return counts.idxmax()

In [109]:
from sklearn.metrics import accuracy_score
#Function to calculate all metrics
#Pass true labels, predicted labels and a reference(test/val) dataframe
#referece dataframe should have all labels and features
#Returns a dictionary with all the metric calculated
def calculate_metrics(y_true, y_pred, test_data):
    # Initialize metrics
    metrics = {}
    #---------------------------------------------------CALCULATING TOTAL METRICS
    #calculate total accuracy
    metrics["Total accuracy"] = calculate_total_accuracy(y_true, y_pred)
    #calculate total balanced accuracy
    metrics["Total Balanced accuracy"] = calculate_balanced_accuracy(y_true, y_pred)
    #calculate total EO
    #find gender based indices for true labels from data
    male_indices = test_data[test_data['gender']==1].index
    female_indices = test_data[test_data['gender']==0].index
    #separate true labels based on indices
    male_true = y_true.loc[male_indices]
    female_true = y_true.loc[female_indices]
    #Find the corresponding indices for predicted labels from true_labels
    male_true_index_list = list(male_true.index)
    female_true_index_list = list(female_true.index)
    #Get separated predicted labels based on gender
    male_predicted = y_pred[male_true_index_list]
    female_predicted = y_pred[female_true_index_list]
    metrics["Total EO"] = calculate_EO(male_true, female_true, male_predicted, female_predicted)
    #-------------------------------------------------CALCULATING AGGREGATED METRICS FOR EACH PARTICIPANT
    y_pred = np.reshape(y_pred, (len(y_true)))
    y_true = np.reshape(y_true, (len(y_true)))
    participants = np.array(test_data['participant'])
    predictions_df = pd.DataFrame({'participant': test_data['participant'], 'predicted_label': y_pred, 'true_label': y_true})
    aggregated_y_true = predictions_df.groupby('participant')['true_label'].agg(majority_voting)
    aggregated_y_pred = predictions_df.groupby('participant')['predicted_label'].agg(majority_voting)
    #Calculate aggregated accuracy score
    metrics["Aggregated accuracy score"] = calculate_total_accuracy(aggregated_y_true, aggregated_y_pred)
    #Calculate balanced aggregated accuracy
    metrics["Aggregated balanced accuracy score"] = calculate_balanced_accuracy(aggregated_y_true, aggregated_y_pred)
    #Calculate aggregated EOs
    participants = np.array(test_data['participant'])
    male_predictions_df = pd.DataFrame({'participant': participants[male_indices], 'predicted_label': y_pred[male_indices], 'true_label': y_true[male_indices]})
    male_aggregated_y_true = male_predictions_df.groupby('participant')['true_label'].agg(majority_voting)
    male_aggregated_y_pred = male_predictions_df.groupby('participant')['predicted_label'].agg(majority_voting)
    female_predictions_df = pd.DataFrame({'participant': participants[female_indices], 'predicted_label': y_pred[female_indices], 'true_label': y_true[female_indices]})
    female_aggregated_y_true = female_predictions_df.groupby('participant')['true_label'].agg(majority_voting)
    female_aggregated_y_pred = female_predictions_df.groupby('participant')['predicted_label'].agg(majority_voting)
    metrics["Aggregated EO score"] = calculate_EO(male_aggregated_y_true, female_aggregated_y_true, male_aggregated_y_pred, female_aggregated_y_pred)
    
    return metrics

# Feature selection

Running PCA to find the principal components in the data.

In [16]:
from sklearn import decomposition
from sklearn.feature_selection import SelectKBest

In [17]:
class PCA_(decomposition.PCA):
    def __init__(self, x_train, x_val, x_test):
        super().__init__()
        self.x_train = x_train
        self.x_val = x_val
        self.x_test = x_test
    
    def run_pca(self):
        #Find all PCs for the data
        self.fit(self.x_train)
        #Transform the data into the new feature space
        self.x_train = self.transform(self.x_train)
        self.x_val = self.transform(self.x_val)
        self.x_test = self.transform(self.x_test)
    
    def best_PCs(self, Y, k):
        #Find the k best PCs for predicting a given outcome Y (based on F score)
        best = SelectKBest(k=k).fit(self.x_train, Y)
        return best.transform(self.x_train), best.transform(self.x_val), best.transform(self.x_test)
    
    def remove_best_PCs(self, Y, k):
        #Find the k best PCs for predicting a given outcome Y and remove them
        best = SelectKBest(k=k).fit(self.x_train, Y)
        new = self.x_train.drop(best.get_support(indices = True), axis = 1)
        return not_best.transform(self.x_train), not_best.transform(self.x_val), not_best.transform(self.x_test)
        

In [18]:
pca = PCA_(x_train, x_val, x_test)
pca.run_pca()

# Linear Perceptron
Create a linear perceptron from the PCs that best predict gender and depression.

In [19]:
from sklearn import linear_model

In [None]:
def perceptron(x_train, y_train, x_val, y_val):
    percept = linear_model.Perceptron()
    percept.fit(x_train, y_train)
    preds = percept.predict(x_val)
    return calculate_metrics(y_val, preds)


In [21]:
pc_g_stats = {}
for k in range(1, 60):
    
    g_x_train, g_x_val, g_x_test = pca.best_PCs(g_train, k)
    pc_g_stats[k] = perceptron(g_x_train, g_train, g_x_val, g_val)

g_n_pcs_best = max(pc_g_stats, key=pc_g_stats.get)
g_acc_best = pc_g_stats[g_n_pcs_best]

print('For predicting gender with PCs, the best performance was achieved by using the', g_n_pcs_best, 'best PCs. This achieved a balanced accuracy of', g_acc_best)


For predicting gender with PCs, the best performance was achieved by using the 8 best PCs. This achieved a balanced accuracy of 0.9226445397486814


In [22]:
pc_d_stats = {}
for k in range(1, 60):
    d_x_train, d_x_val, d_x_test = pca.best_PCs(d_train, k)
    pc_d_stats[k] = perceptron(d_x_train, d_train, d_x_val, d_val)


d_n_pcs_best = max(pc_d_stats, key=pc_d_stats.get)
d_acc_best = pc_d_stats[d_n_pcs_best]

print('For predicting depression with PCs, the best performance was achieved by using the', d_n_pcs_best, 'best PCs. This achieved a balanced accuracy of', d_acc_best)


For predicting depression with PCs, the best performance was achieved by using the 35 best PCs. This achieved a balanced accuracy of 0.5523297284491314


# Implementing in RJ's decision tree set up

In [36]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold

In [111]:
d_x_train, d_x_val, d_x_test = pca.best_PCs(d_train, 35)
X = np.concatenate((d_x_train, d_x_val))
y = np.concatenate((d_train, d_val))

depths = [3, 5, 7, 9, 15, 30, 50, 70, 90]
# Initialize metrics
metrics = {}


# Perform cross-validation for each tree depth
for depth in depths:
    # Initialize decision tree model
    tree = DecisionTreeClassifier(max_depth=depth, criterion='entropy')
    # Cross validation k fold, 4:1::training:validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    #list to store metrics for each cross validation split
    fold_metrics = []

    # Perform cross-validation and collect metrics
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        #Fit the training set
        tree.fit(X_train, y_train)
        #Predict validation set
        y_pred = tree.predict(X_val)
        #calculate metrics
        val_orig = train_data.iloc[val_index]
        fold_metrics.append(calculate_metrics(pd.DataFrame(y_val), y_pred, train_data.iloc[val_index]))

    #find avg metrics for each depth
    metrics[depth] = {}
    sums = {}
    for metric in fold_metrics:
        for key, value in metric.items():
            sums[key] = sums.get(key, 0) + value
    for key in sums:
        sums[key]/=len(fold_metrics)
    metrics[depth] = sums

#Print metrics for all the hyperparameters (Depth)
for depth in metrics:
    print(f"for depth {depth}")
    print(metrics[depth])
    print()

for depth 3
{'Total accuracy': 0.7272660550458715, 'Total Balanced accuracy': 0.5614024569048714, 'Total EO': 0.9979610394160225, 'Aggregated accuracy score': 0.7333333333333333, 'Aggregated balanced accuracy score': 0.5533168778858435, 'Aggregated EO score': 1.0}

for depth 5
{'Total accuracy': 0.7342385321100917, 'Total Balanced accuracy': 0.5835829130587202, 'Total EO': 0.9879056748008187, 'Aggregated accuracy score': 0.7448275862068965, 'Aggregated balanced accuracy score': 0.5743405540819334, 'Aggregated EO score': 1.0}

for depth 7
{'Total accuracy': 0.7384220183486239, 'Total Balanced accuracy': 0.607200662426129, 'Total EO': 0.9772925653552667, 'Aggregated accuracy score': 0.735632183908046, 'Aggregated balanced accuracy score': 0.5589203261617055, 'Aggregated EO score': 1.0}

for depth 9
{'Total accuracy': 0.7322568807339449, 'Total Balanced accuracy': 0.6236049699102917, 'Total EO': 0.9910917300599653, 'Aggregated accuracy score': 0.7609195402298851, 'Aggregated balanced accu

# Random Forest Classifier

In [112]:
from sklearn.ensemble import RandomForestClassifier

In [115]:
np.sum(y)/len(y)

0.28858715596330275

In [118]:
d_x_train, d_x_val, d_x_test = pca.best_PCs(d_train, 35)
X = np.concatenate((d_x_train, d_x_val))

num_tree = [100, 150, 200, 250, 300, 350, 400]

metrics = {}

# Perform cross-validation for each number of trees
for n in num_tree:
    #each tree will use bootstrapping with at most 1/10th of the samples.
    #the classes will be weighed inversely to their frequency
    forest = RandomForestClassifier(n_estimators = n, criterion='entropy', bootstrap = True, class_weight = 'balanced_subsample', max_samples = 0.1)
    # Cross validation k fold, 4:1::training:validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    #list to store metrics for each cross validation split
    fold_metrics = []

    # Perform cross-validation and collect metrics
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        #Fit the training set
        forest.fit(X_train, y_train)
        #Predict validation set
        y_pred = forest.predict(X_val)
        #calculate metrics
        fold_metrics.append(calculate_metrics(pd.DataFrame(y_val), y_pred, train_data.iloc[val_index]))

    #find avg metrics for each number of trees
    metrics[n] = {}
    sums = {}
    for metric in fold_metrics:
        for key, value in metric.items():
            sums[key] = sums.get(key, 0) + value
    for key in sums:
        sums[key]/=len(fold_metrics)
    metrics[n] = sums

#Print metrics for all the hyperparameters (Number of Trees)
for tree in metrics:
    print(f"for forest size {tree}")
    print(metrics[tree])
    print()

for forest size 100
{'Total accuracy': 0.7437064220183486, 'Total Balanced accuracy': 0.5595207899050914, 'Total EO': 0.9905196439464614, 'Aggregated accuracy score': 0.7126436781609196, 'Aggregated balanced accuracy score': 0.5193314667452599, 'Aggregated EO score': 1.0}

for forest size 150
{'Total accuracy': 0.7417981651376147, 'Total Balanced accuracy': 0.5549545084059008, 'Total EO': 0.9889632195768707, 'Aggregated accuracy score': 0.7126436781609196, 'Aggregated balanced accuracy score': 0.51947391688771, 'Aggregated EO score': 1.0}

for forest size 200
{'Total accuracy': 0.7429724770642202, 'Total Balanced accuracy': 0.5573624861016506, 'Total EO': 0.991779981265348, 'Aggregated accuracy score': 0.710344827586207, 'Aggregated balanced accuracy score': 0.5156277630415562, 'Aggregated EO score': 1.0}

for forest size 250
{'Total accuracy': 0.7430458715596331, 'Total Balanced accuracy': 0.5573055621443925, 'Total EO': 0.9825143097925579, 'Aggregated accuracy score': 0.7126436781609

# Adaboost 

In [120]:
from sklearn.ensemble import AdaBoostClassifier

In [133]:
metrics = {}

In [136]:
d_x_train, d_x_val, d_x_test = pca.best_PCs(d_train, 10)
X = np.concatenate((d_x_train, d_x_val))

num_tree = range(10, 100, 10)

# Perform cross-validation for each number of trees
for n in num_tree:
    ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5, criterion='entropy'), n_estimators=n, algorithm="SAMME")
    # Cross validation k fold, 4:1::training:validation
    kf = KFold(n_splits=5, shuffle=True)
    #list to store metrics for each cross validation split
    fold_metrics = []

    # Perform cross-validation and collect metrics
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        #Fit the training set
        ada.fit(X_train, y_train)
        #Predict validation set
        y_pred = ada.predict(X_val)
        #calculate metrics
        fold_metrics.append(calculate_metrics(pd.DataFrame(y_val), y_pred, train_data.iloc[val_index]))

    #find avg metrics for each number of trees
    metrics[n] = {}
    sums = {}
    for metric in fold_metrics:
        for key, value in metric.items():
            sums[key] = sums.get(key, 0) + value
    for key in sums:
        sums[key]/=len(fold_metrics)
    metrics[n] = sums

#Print metrics for all the hyperparameters (Number of Trees)
for tree in metrics:
    print(f"for adaboosted forest size {tree}")
    print(metrics[tree])
    print()

for adaboosted forest size 100
{'Total accuracy': 0.7208807339449541, 'Total Balanced accuracy': 0.5752416540449491, 'Total EO': 0.9849429811216334, 'Aggregated accuracy score': 0.7195402298850575, 'Aggregated balanced accuracy score': 0.5382108262108262, 'Aggregated EO score': 1.0}

for adaboosted forest size 10
{'Total accuracy': 0.7155963302752293, 'Total Balanced accuracy': 0.5549301383598881, 'Total EO': 0.9659686102587799, 'Aggregated accuracy score': 0.703448275862069, 'Aggregated balanced accuracy score': 0.5150997150997151, 'Aggregated EO score': 1.0}

for adaboosted forest size 20
{'Total accuracy': 0.7144954128440367, 'Total Balanced accuracy': 0.560102797926121, 'Total EO': 0.9800513309526302, 'Aggregated accuracy score': 0.7126436781609196, 'Aggregated balanced accuracy score': 0.5232905982905983, 'Aggregated EO score': 1.0}

for adaboosted forest size 30
{'Total accuracy': 0.7216146788990827, 'Total Balanced accuracy': 0.5627482103081805, 'Total EO': 0.9813671056139089, '

In [138]:
ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5, criterion='entropy'), n_estimators=60, algorithm="SAMME")
# Cross validation k fold, 4:1::training:validation
kf = KFold(n_splits=5, shuffle=True)
#list to store metrics for each cross validation split
fold_metrics = []

# Perform cross-validation and collect metrics
for train_index, val_index in kf.split(X):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    #Fit the training set
    ada.fit(X_train, y_train)
    #Predict validation set
    y_pred = ada.predict(X_val)
    #calculate metrics
    fold_metrics.append(calculate_metrics(pd.DataFrame(y_val), y_pred, train_data.iloc[val_index]))

#find avg metrics for each number of trees
metrics[n] = {}
sums = {}
for metric in fold_metrics:
    for key, value in metric.items():
        sums[key] = sums.get(key, 0) + value
for key in sums:
    sums[key]/=len(fold_metrics)
metrics[n] = sums

ada.predict(d_x_test)
calculate_metrics(pd.DataFrame(y_val), y_pred, train_data.iloc[val_index])

{'Total accuracy': 0.6946788990825689,
 'Total Balanced accuracy': 0.5505334751635883,
 'Total EO': 0.9624269423957658,
 'Aggregated accuracy score': 0.7241379310344828,
 'Aggregated balanced accuracy score': 0.52,
 'Aggregated EO score': 1.0}