## ML Classifier

In [9]:
import os
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
import torchaudio.functional as F
import torchaudio.transforms as T
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay

  '"sox" backend is being deprecated. '


In [2]:
# Load the annotation table and report, for every distinct `gen_pid` value,
# how many rows would be used for training vs. evaluation if that value were
# held out. (`gen_pid` presumably identifies a speaker -- confirm against the
# dataset description.)
df_annotation = pd.read_csv("../dataset/split/annotation.csv", index_col=0)

# Iterate in sorted order: iterating a bare `set` of strings yields a different
# order from one kernel session to the next, which makes the printed report
# non-reproducible. Missing ids are skipped because they cannot form a fold.
for gen_pid in sorted(df_annotation['gen_pid'].dropna().unique()):
    is_held_out = df_annotation['gen_pid'] == gen_pid
    EVAL_item = df_annotation[is_held_out].index
    TRAIN_item = df_annotation[~is_held_out].index
    print(gen_pid, len(TRAIN_item), len(EVAL_item))

F7 426 62
M2 429 59
M4 422 66
F8 418 70
F5 440 48
M1 432 56
F6 421 67
M3 428 60


## Feature Extraction

In [18]:
# Feature-extraction settings used by the cells below.
# NOTE(review): `sr` is assumed to be the sample rate (Hz) of the stored
# arrays -- confirm against the preprocessing step that wrote them.
sr = 16000
n_fft = int(0.025 * sr)       # 0.025 * sr = 400 samples
win_length = int(0.025 * sr)  # same value as n_fft; not referenced in the visible cells
hop_length = int(0.01 * sr)   # 0.01 * sr = 160 samples
n_mels = 96
n_mfcc = 13
# Keyword bundle mirroring the values above; not referenced in the visible cells.
melkwargs = {
    'n_fft': n_fft,
    'n_mels': n_mels,
    'hop_length': hop_length,
}
dirs = "../dataset/wav"
# `os.listdir` returns entries in arbitrary order and includes everything in
# the directory. Keep only the `.npy` arrays that the extraction loop can load
# and sort them so the processing order is reproducible.
fnames = sorted(name for name in os.listdir(dirs) if name.endswith(".npy"))

In [20]:
# Build one fixed-length feature vector per audio file:
#   [mean of each MFCC] + [std of each MFCC] + [mean pitch, std pitch, mean RMS, std RMS]
# The frame-level MFCC / pitch / RMS tracks are also kept in their own dicts.
# (`F`, `T`, `librosa`, `torch`, `tqdm` come from the import cell at the top.)
features = {}
rms_dict = {}
pitch_dict = {}
mfccs_dict = {}
for fname in tqdm(fnames):
    _id = os.path.splitext(fname)[0]
    y = np.load(os.path.join(dirs, fname))
    # Drop the leading singleton axis once and reuse the result everywhere.
    # NOTE(review): assumes each stored array has shape (1, n_samples) -- confirm.
    signal = y.squeeze(0)

    # Audio must be passed by keyword (`y=`); recent librosa releases reject it
    # as a positional argument. The sample rate is the `sr` config value.
    mfcc_emb = librosa.feature.mfcc(
        y=signal,
        sr=sr,
        n_mfcc=n_mfcc,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
    )
    mean_mfcc = list(mfcc_emb.mean(axis=1))  # temporal pooling
    std_mfcc = list(mfcc_emb.std(axis=1))  # temporal pooling

    pitchs = F.detect_pitch_frequency(torch.from_numpy(signal), sr).numpy()
    # Keep only frames whose estimate is below 1000 (same cut-off as before,
    # applied as a boolean mask instead of a Python-level loop).
    pitch = pitchs[pitchs < 1000]
    mean_pitch = pitch.mean()
    std_pitch = pitch.std()

    # NOTE(review): RMS uses librosa's default framing, not n_fft / hop_length.
    rms = librosa.feature.rms(y=signal)
    mean_rms = rms.mean()
    std_rms = rms.std()

    feature = mean_mfcc + std_mfcc + [mean_pitch, std_pitch, mean_rms, std_rms]
    features[_id] = feature
    rms_dict[_id] = rms
    pitch_dict[_id] = pitch
    mfccs_dict[_id] = mfcc_emb

In [21]:
# Train and evaluate one classifier per fold, pooling the held-out predictions
# of every fold so overall metrics can be computed afterwards.
# NOTE(review): `fold_case` is not defined in any cell visible here; it must be
# defined (fold names matching the `{fold}_*.csv` files) before this cell runs.
all_samples = []
all_labels = []
all_preds = []
label_dist = {}
for fold in fold_case:
    df_tr = pd.read_csv(f"../dataset/split/{fold}_train.csv",index_col=0)
    df_va = pd.read_csv(f"../dataset/split/{fold}_valid.csv",index_col=0)
    # The train and valid splits are merged into a single training set.
    df_train = pd.concat([df_tr, df_va])
    df_eval = pd.read_csv(f"../dataset/split/{fold}_eval.csv",index_col=0)
    label_dist[fold] = {"tr":df_train.sum(), "te":df_eval.sum()}

    X_train = np.stack([features[idx] for idx in df_train.index])
    X_test = np.stack([features[idx] for idx in df_eval.index])
    # The label of a row is the name of its largest column. `idxmax(axis=1)`
    # computes this for all rows at once, in row order, and stays correct even
    # if an index value is repeated (a per-row `.loc[idx]` would then return a
    # frame instead of a single row).
    y_train = df_train.idxmax(axis=1).to_numpy(dtype=str)
    y_test = df_eval.idxmax(axis=1).to_numpy(dtype=str)

    classifier = make_pipeline(StandardScaler(),LogisticRegression(random_state=42))
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    # Plain accuracy on this fold's held-out set.
    WA = accuracy_score(y_test, predictions)
    print(WA)

    # Accumulate across folds for the pooled WA / UA computed later.
    all_labels.extend(list(y_test))
    all_preds.extend(list(predictions))
    all_samples.extend(list(df_eval.index))

In [22]:
# Collect the pooled predictions and reference labels in one table,
# indexed by the evaluated sample ids.
results = pd.DataFrame(
    {
        'all_preds': all_preds,
        'all_labels': all_labels,
    },
    index=all_samples,
)

In [23]:
# Pooled metrics over every held-out prediction:
#   WA = accuracy_score, UA = balanced_accuracy_score.
WA, UA = (
    metric(results['all_labels'], results['all_preds'])
    for metric in (accuracy_score, balanced_accuracy_score)
)