# Week 2 - Model Development and Integration

In [35]:
import pandas as pd
import numpy as np
import pathlib as pl

from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



In [36]:
features_3s_path = pl.Path('../../Data/features_3_sec.csv').resolve()

features_3s_df = pd.read_csv(features_3s_path)

# lets see the first 5 rows
features_3s_df.head()

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.0.wav,66149,0.335406,0.091048,0.130405,0.003521,1773.065032,167541.630869,1972.744388,117335.771563,...,39.687145,-3.24128,36.488243,0.722209,38.099152,-5.050335,33.618073,-0.243027,43.771767,blues
1,blues.00000.1.wav,66149,0.343065,0.086147,0.112699,0.00145,1816.693777,90525.690866,2010.051501,65671.875673,...,64.748276,-6.055294,40.677654,0.159015,51.264091,-2.837699,97.03083,5.784063,59.943081,blues
2,blues.00000.2.wav,66149,0.346815,0.092243,0.132003,0.00462,1788.539719,111407.437613,2084.565132,75124.921716,...,67.336563,-1.76861,28.348579,2.378768,45.717648,-1.938424,53.050835,2.517375,33.105122,blues
3,blues.00000.3.wav,66149,0.363639,0.086856,0.132565,0.002448,1655.289045,111952.284517,1960.039988,82913.639269,...,47.739452,-3.841155,28.337118,1.218588,34.770935,-3.580352,50.836224,3.630866,32.023678,blues
4,blues.00000.4.wav,66149,0.335579,0.088129,0.143289,0.001701,1630.656199,79667.267654,1948.503884,60204.020268,...,30.336359,0.664582,45.880913,1.689446,51.363583,-3.392489,26.738789,0.536961,29.146694,blues


In [37]:
# lets label encode the label column
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
features_3s_df['label'] = label_encoder.fit_transform(features_3s_df['label'])


In [40]:
features_3s_df.head()

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.0.wav,66149,0.335406,0.091048,0.130405,0.003521,1773.065032,167541.630869,1972.744388,117335.771563,...,39.687145,-3.24128,36.488243,0.722209,38.099152,-5.050335,33.618073,-0.243027,43.771767,0
1,blues.00000.1.wav,66149,0.343065,0.086147,0.112699,0.00145,1816.693777,90525.690866,2010.051501,65671.875673,...,64.748276,-6.055294,40.677654,0.159015,51.264091,-2.837699,97.03083,5.784063,59.943081,0
2,blues.00000.2.wav,66149,0.346815,0.092243,0.132003,0.00462,1788.539719,111407.437613,2084.565132,75124.921716,...,67.336563,-1.76861,28.348579,2.378768,45.717648,-1.938424,53.050835,2.517375,33.105122,0
3,blues.00000.3.wav,66149,0.363639,0.086856,0.132565,0.002448,1655.289045,111952.284517,1960.039988,82913.639269,...,47.739452,-3.841155,28.337118,1.218588,34.770935,-3.580352,50.836224,3.630866,32.023678,0
4,blues.00000.4.wav,66149,0.335579,0.088129,0.143289,0.001701,1630.656199,79667.267654,1948.503884,60204.020268,...,30.336359,0.664582,45.880913,1.689446,51.363583,-3.392489,26.738789,0.536961,29.146694,0


In [42]:
# lets split the data into train and test

X = features_3s_df.drop(columns=['filename', 'label'])
y = features_3s_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# lets scale the data 
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [46]:
dummy_clf_most_frequent = DummyClassifier(strategy='most_frequent')
dummy_clf_stratified = DummyClassifier(strategy='stratified')
dummy_clf_uniform = DummyClassifier(strategy='uniform')
dummy_clf_constant = DummyClassifier(strategy='constant', constant=0)

# train the model
dummy_clf_most_frequent.fit(X_train, y_train)
dummy_clf_stratified.fit(X_train, y_train)
dummy_clf_uniform.fit(X_train, y_train)
dummy_clf_constant.fit(X_train, y_train)

# evaluate the model
print(f"Dummy Classifier Accuracy (most_frequent): {accuracy_score(y_test, dummy_clf_most_frequent.predict(X_test)):.3f}")
print(f"Dummy Classifier Accuracy (stratified): {accuracy_score(y_test, dummy_clf_stratified.predict(X_test)):.3f}")
print(f"Dummy Classifier Accuracy (uniform): {accuracy_score(y_test, dummy_clf_uniform.predict(X_test)):.3f}")
print(f"Dummy Classifier Accuracy (constant): {accuracy_score(y_test, dummy_clf_constant.predict(X_test)):.3f}")



Dummy Classifier Accuracy (most_frequent): 0.090
Dummy Classifier Accuracy (stratified): 0.117
Dummy Classifier Accuracy (uniform): 0.100
Dummy Classifier Accuracy (constant): 0.104


In [51]:
import optuna
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def objective(trial):
    params = {
        'objective': 'multi:softmax',
        'num_class': 10,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
    }
    
    model = XGBClassifier(**params)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return accuracy_score(y_test, predictions)

# Split data (example assumes X and y are already defined)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Best hyperparameters and score
print("Best Hyperparameters:", study.best_params)
print("Best Score:", study.best_value)

# Train final model with best hyperparameters
best_model = XGBClassifier(**study.best_params)
best_model.fit(X_train, y_train)
print(f"Optimized XGBoost Accuracy: {accuracy_score(y_test, best_model.predict(X_test)):.3f}")

[I 2024-12-26 13:25:31,574] A new study created in memory with name: no-name-6a687c9e-1b7a-47a5-9829-51e981a5f639
[I 2024-12-26 13:25:32,805] Trial 0 finished with value: 0.8513513513513513 and parameters: {'learning_rate': 0.27799682554049115, 'max_depth': 6, 'n_estimators': 62, 'subsample': 0.699815254432087, 'colsample_bytree': 0.5041490410082211, 'reg_alpha': 7.899663831116035, 'reg_lambda': 7.427096011170971}. Best is trial 0 with value: 0.8513513513513513.
[I 2024-12-26 13:25:40,513] Trial 1 finished with value: 0.9084084084084084 and parameters: {'learning_rate': 0.0981447640563195, 'max_depth': 9, 'n_estimators': 288, 'subsample': 0.6318072391509719, 'colsample_bytree': 0.9223090666167915, 'reg_alpha': 1.860422553101192, 'reg_lambda': 1.4999515441771372}. Best is trial 1 with value: 0.9084084084084084.
[I 2024-12-26 13:25:43,067] Trial 2 finished with value: 0.8798798798798799 and parameters: {'learning_rate': 0.22334777568244674, 'max_depth': 8, 'n_estimators': 115, 'subsample

Best Hyperparameters: {'learning_rate': 0.15545419559806609, 'max_depth': 7, 'n_estimators': 290, 'subsample': 0.761809867948227, 'colsample_bytree': 0.8949877938092121, 'reg_alpha': 0.5033922345398396, 'reg_lambda': 3.160044201118017}
Best Score: 0.918918918918919
Optimized XGBoost Accuracy: 0.919


Best Hyperparameters: 
```python
{'learning_rate': 0.15545419559806609, 'max_depth': 7, 'n_estimators': 290, 'subsample': 0.761809867948227, 'colsample_bytree': 0.8949877938092121, 'reg_alpha': 0.5033922345398396, 'reg_lambda': 3.160044201118017}
```

Best Score: 0.918918918918919

Optimized XGBoost Accuracy: 0.919