In [22]:
import json
import math

import mlflow
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [23]:
# Load the pickled DataFrame that the later cells work from.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
df = pd.read_pickle("experiments/crowd_train_all_data_embedded.pkl")

In [76]:
def get_dataframe(df, min_count, max_count, max_filter=None):
    """Select sources by row count and keep each one's longest recordings.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It must contain ``source_id``, ``duration`` and the
        metadata columns dropped below. The frame is not modified.
    min_count : int
        Keep only sources that have at least this many rows.
    max_count : int
        Maximum number of rows kept per source; the rows with the largest
        ``duration`` win.
    max_filter : int, optional
        When given, keep only sources that have at most this many rows.
        ``None`` (the default) disables the upper bound.

    Returns
    -------
    pandas.DataFrame
        Rows grouped by ascending ``source_id`` and, within a source, ordered
        by descending ``duration``; the metadata columns are removed and any
        row that still contains a missing value is dropped.

    Side effects
    ------------
    Prints the number of distinct sources in the result.
    """
    counts = df.source_id.value_counts()
    keep = counts >= min_count
    # Compare with None explicitly: a plain truthiness test would silently
    # ignore an upper bound of 0.
    if max_filter is not None:
        keep = keep & (counts <= max_filter)
    selected = df[df.source_id.isin(counts[keep].index)]

    # Two stable sorts give "source_id ascending, duration descending" with
    # ties left in their original order; head() then takes the top rows of
    # every source. This avoids groupby.apply on the grouping column, which is
    # deprecated in pandas 2.2 and no longer passes `source_id` in pandas 3.
    longest = (
        selected.sort_values('duration', ascending=False, kind='mergesort')
        .sort_values('source_id', kind='mergesort')
        .groupby('source_id', sort=False)
        .head(max_count)
        .reset_index(drop=True)
        .drop(
            columns=[
                'duration',
                'hash_id',
                'annotator_emo',
                'golden_emo',
                'annotator_id',
                'speaker_text',
                'speaker_emo',
            ]
        )
    )

    # Drop incomplete rows only after the metadata columns are gone, so gaps
    # in those columns do not discard otherwise usable rows.
    result = longest.dropna()

    print(result.source_id.nunique())

    return result


# Main set: sources with at least 200 rows, keeping the 10 rows with the
# largest duration for each of them.
df_t = get_dataframe(df, min_count=200, max_count=10)

# Disjoint set: sources that occur exactly once in the data.
not_in_df_t = get_dataframe(df, min_count=0, max_count=1, max_filter=1)

102
13


In [25]:
# Stack the per-row feature vectors into one 2-D matrix (one row per sample).
X = np.vstack(df_t.audio_feature.to_numpy()).reshape(len(df_t), -1)

# Integer class labels derived from the source ids.
le = LabelEncoder()
y = le.fit_transform(df_t.source_id)

# 60 columns starting at offset 36 + 384; per the original note these are the
# MFCC features.
X_mfccs = X[:, slice(36 + 384, 36 + 384 + 60)]  # mfcc only

X.shape, X_mfccs.shape, y.shape

((1020, 498), (1020, 60), (1020,))

In [92]:
# Feature matrix for the sources that are not part of df_t (one row per sample).
X_not = np.vstack(not_in_df_t.audio_feature.to_numpy()).reshape(
    not_in_df_t.shape[0], -1
)
# Fit the dedicated encoder created for this set. Calling `le.fit_transform`
# here would re-fit `le` and overwrite the class mapping learned from df_t.
le_not = LabelEncoder()
y_not = le_not.fit_transform(not_in_df_t.source_id)
# Same column window as used for X_mfccs: 60 columns starting at 36 + 384.
X_mfccs_not = X_not[:, 36 + 384 : 36 + 384 + 60]  # mfcc only

X_not.shape, X_mfccs_not.shape, y_not.shape

((13, 498), (13, 60), (13,))

In [79]:
def _coerce_param(value):
    """Convert an MLflow parameter string to ``int`` or ``float`` when numeric.

    MLflow returns logged parameters as text. Values that are not finite
    numbers (for example ``"rbf"``, ``"None"`` or ``"nan"``) are returned
    unchanged, as are non-string values.
    """
    if not isinstance(value, str):
        return value
    try:
        return int(value)
    except ValueError:
        pass
    try:
        number = float(value)
    except ValueError:
        return value
    # "nan" / "inf" parse as floats but are kept as text, like any other
    # non-numeric setting.
    return number if math.isfinite(number) else value


def get_model_and_params(
    experiment_name: str,
    model_name: str,
    tracking_uri: str = "http://127.0.0.1:5000",
):
    """Load a logged scikit-learn model and its parameters from MLflow.

    Parameters
    ----------
    experiment_name : str
        Name of the MLflow experiment to search.
    model_name : str
        Run name (``tags.mlflow.runName``) identifying the run to load.
    tracking_uri : str, optional
        MLflow tracking server to query. Defaults to the local server the
        notebook has always used.

    Returns
    -------
    tuple
        ``(loaded_model, params)`` where ``params`` maps parameter names
        (without the ``params.`` prefix) to values, with numeric strings
        converted to ``int`` / ``float``.

    Raises
    ------
    ValueError
        If the experiment does not exist, if the run name does not match
        exactly one run, or if the run has no logged-model history.
    """
    # Point the client at the server *before* any lookup; otherwise the first
    # call in a fresh kernel queries the default store instead.
    mlflow.set_tracking_uri(tracking_uri)

    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        raise ValueError(f"MLflow experiment not found: {experiment_name!r}")

    runs_df = mlflow.search_runs(
        [experiment.experiment_id], order_by=['metrics.f1_weighted']
    )
    svc_df = runs_df[runs_df['tags.mlflow.runName'] == model_name]
    if len(svc_df) != 1:
        raise ValueError(
            f"expected exactly one run named {model_name!r} in experiment "
            f"{experiment_name!r}, found {len(svc_df)}"
        )
    run_row = svc_df.iloc[0]

    history_tag = run_row['tags.mlflow.log-model.history']
    if not isinstance(history_tag, str):
        raise ValueError(f"run {model_name!r} has no logged model history")
    # The tag is a JSON list of logged-model records; parse it as JSON rather
    # than eval()-ing text that came from the tracking server.
    model_dict = json.loads(history_tag)[0]
    logged_model = '/'.join(
        ['runs:', model_dict['run_id'], model_dict['artifact_path']]
    )
    loaded_model = mlflow.sklearn.load_model(logged_model)

    # Keep only the parameters that were actually logged for this run.
    prefix = 'params.'
    param_cols = [col for col in svc_df.columns if col.startswith(prefix)]
    logged = run_row[param_cols].dropna()
    params = {
        col[len(prefix):]: _coerce_param(value) for col, value in logged.items()
    }

    return loaded_model, params

## SVM, 102 класса, MFCC-признаки

In [80]:
# Identify the tracked run to reload: experiment name plus run name.
experiment, model_name = "only mfcc 102 classes", "svc gsc"
loaded_model, params = get_model_and_params(
    experiment_name=experiment, model_name=model_name
)


Downloading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

2024/05/09 23:47:39 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


In [81]:
# Stratified 50/50 split of the MFCC features; the fixed seed makes it repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_mfccs,
    y,
    stratify=y,
    test_size=0.5,
    random_state=42,
)

# Standardise both halves with statistics estimated on the training half only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [82]:
# Weighted F1 of the reloaded model on the scaled validation half.
preds_val = loaded_model.predict(X_val_scaled)
metrics.f1_score(y_true=y_val, y_pred=preds_val, average="weighted")


0.8556857086268851

In [108]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC

# Hyper-parameters for a fresh SVC.
# NOTE(review): this rebinds `params`, replacing the dict returned earlier by
# get_model_and_params - confirm that is intended.
params = dict(kernel="rbf", decision_function_shape="ovr", C=5)
model = SVC(**params)

# Wrap the SVC in 5-fold isotonic calibration so it can output class
# probabilities, then score the validation half.
calibrated = CalibratedClassifierCV(model, method="isotonic", cv=5).fit(
    X_train_scaled, y_train
)
calibrated_probs = calibrated.predict_proba(X_val_scaled)


In [109]:
# Weighted F1 of the calibrated classifier on the scaled validation half.
# This rebinds `preds_val` to the calibrated model's predictions.
preds_val = calibrated.predict(X_val_scaled)
metrics.f1_score(y_true=y_val, y_pred=preds_val, average="weighted")


0.854234599822835

In [110]:
# Mean of the highest calibrated class probability over the validation
# samples whose prediction matches the true label.
calibrated_probs.max(axis=1)[preds_val == y_val].mean()

0.7432785292376581

In [111]:
# Mean of the highest calibrated class probability over the validation
# samples whose prediction differs from the true label.
calibrated_probs.max(axis=1)[preds_val != y_val].mean()

0.5586797794527484