## 1. Import libraries

In [1]:
import os
import numpy as np
import librosa
import joblib
import pickle
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ParameterGrid, cross_val_score
from model import extract_features_from_path, make_pipe

## 2. Data preprocessing

In [2]:
# Locations of the training set: the label/metadata text file and the
# directory holding the audio files it refers to. Both live under one root.
data_root = './2501ml_data'
train_metadata_path = f'{data_root}/label/train_label.txt'
train_data_path = f'{data_root}/train'

In [3]:
# Load train data
# Each non-blank metadata line must hold exactly five whitespace-separated
# fields. Only the 2nd (audio file name, relative to `train_data_path`) and
# the 5th (label) are used; the other three are ignored.
train_x = []
train_y = []

with open(train_metadata_path, 'r', encoding='utf-8') as f:
    for line_no, line in enumerate(tqdm(f, desc="Loading train data"), start=1):
        # split() with no argument tolerates tabs / repeated spaces and
        # yields [] for blank lines, unlike split(' ').
        fields = line.split()
        if not fields:
            # Skip blank lines (e.g. a trailing empty line at end of file).
            continue
        if len(fields) != 5:
            raise ValueError(
                f"{train_metadata_path}:{line_no}: expected 5 fields, "
                f"got {len(fields)}: {line!r}"
            )
        _, file_name, _, _, label = fields
        wav_path = os.path.join(train_data_path, file_name)
        # NOTE(review): extract_features_from_path comes from the local
        # `model` module; presumably it returns one fixed-length feature
        # vector per file so the list stacks into a 2-D array — confirm.
        features = extract_features_from_path(wav_path)
        train_x.append(features)
        train_y.append(label)

# Convert the accumulated Python lists to arrays for scikit-learn.
train_x = np.array(train_x)
train_y = np.array(train_y)

Loading train data: 4000it [01:58, 33.79it/s]


## 3. Model training & saving

In [4]:
# Hyperparameter search space for the pipeline's 'svc' step:
# 3 values of C x 7 values of gamma = 21 candidate settings.
param_grid = dict(
    svc__C=[0.1, 1, 10],
    svc__gamma=['scale', 'auto', 0.001, 0.01, 0.1, 1, 10],
)

# Expand the space into a concrete list of parameter dicts, one per candidate.
grid = [candidate for candidate in ParameterGrid(param_grid)]

In [5]:
# 3-fold cv grid search
# Every candidate in `grid` is scored by its mean 3-fold cross-validated
# accuracy on the training set; the best-scoring parameter dict is kept.
# Start below any attainable score so that even a mean accuracy of 0.0 can be
# selected and `best_params` is only left as None when nothing was scorable.
best_score  = float('-inf')
best_params = None

for params in tqdm(grid, desc="GridSearch", ncols=80):
    # make_pipe() is called once per candidate (presumably returning a new,
    # unfitted pipeline — confirm in model.py) and configured with `params`.
    pipe = make_pipe().set_params(**params)
    # n_jobs=-2: run the folds in parallel on all CPU cores but one.
    scores = cross_val_score(pipe, train_x, train_y, cv=3, scoring='accuracy', n_jobs=-2)
    mean_score = scores.mean()
    # Strict '>' keeps the earliest candidate on ties; a NaN mean compares
    # False and is therefore never selected.
    if mean_score > best_score:
        best_score, best_params = mean_score, params

if best_params is None:
    # Fail here with a clear message instead of letting a later
    # set_params(**None) raise an opaque TypeError.
    raise RuntimeError(
        "Grid search produced no valid score: the grid is empty or every "
        "cross-validation run returned NaN."
    )

print(f"\nbest hyperparameters: {best_params}, CV Accuracy = {best_score:.4f}")

GridSearch: 100%|███████████████████████████████| 21/21 [00:50<00:00,  2.38s/it]


best hyperparameters: {'svc__C': 10, 'svc__gamma': 0.01}, CV Accuracy = 0.9755





In [6]:
# Final model: build a pipeline, apply the winning hyperparameters from the
# grid search, and fit it on the complete training set (no hold-out).
fresh_pipe = make_pipe()
best_pipe = fresh_pipe.set_params(**best_params)
best_pipe.fit(train_x, train_y)

In [7]:
# Persist the fitted pipeline with pickle as 'model.pkl' inside `save_dir`
# (the directory is created if missing), then echo the path that was written.
save_dir = './'
os.makedirs(save_dir, exist_ok=True)
model_path = os.path.join(save_dir, 'model.pkl')

with open(model_path, 'wb') as model_file:
    pickle.dump(best_pipe, model_file)

print(model_path)

./model.pkl
