In [1]:
import mlflow
import pandas as pd
import numpy as np
import json

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate
from sklearn import metrics
from sklearn.pipeline import Pipeline

import catboost

In [2]:
# Load the pickled DataFrame that every experiment below is built from.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
df = pd.read_pickle('experiments/crowd_train_all_data_embedded.pkl')

In [3]:
def get_dataframe(df, min_count, max_count):
    """Keep the longest recordings of every sufficiently frequent source.

    Rows are kept only for ``source_id`` values that occur at least
    ``min_count`` times. For each remaining ``source_id`` the ``max_count``
    rows with the largest ``duration`` are selected. The metadata columns are
    then dropped and, afterwards, every row with a missing value is removed,
    so a source may end up with fewer than ``max_count`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``source_id``, ``duration`` and the metadata columns
        listed in the drop list below. The frame is not modified.
    min_count : int
        Minimum number of rows a ``source_id`` needs in ``df`` to be kept.
    max_count : int
        Maximum number of rows kept per ``source_id``.

    Returns
    -------
    pandas.DataFrame
        Selected rows ordered by ``source_id`` and, within a source, by
        descending ``duration``. The index is reset before the final
        ``dropna``, so it can contain gaps.

    Notes
    -----
    Prints the number of distinct ``source_id`` values in the result.
    """
    counts = df.source_id.value_counts()
    frequent_ids = counts.index[counts >= min_count]
    selected = df[df.source_id.isin(frequent_ids)]

    # Two stable sorts give "by source_id, then longest first" while keeping the
    # original order of ties. Unlike groupby().apply(), this does not rely on the
    # grouping column being passed to (and returned from) the applied function.
    ranked = (
        selected
        .sort_values('duration', ascending=False, kind='mergesort')
        .sort_values('source_id', kind='mergesort')
    )

    df_temp = (
        ranked
        .groupby('source_id', sort=False)
        .head(max_count)
        .reset_index(drop=True)
        .drop(columns=['duration', 'hash_id', 'annotator_emo', 'golden_emo',
                       'annotator_id', 'speaker_text', 'speaker_emo'])
        .dropna()
    )

    print(df_temp.source_id.nunique())

    return df_temp

In [4]:
# Sources with at least 200 rows: the 10 longest recordings per source feed the
# hold-out check, the 7 longest feed cross-validation.
df_t, df_cross_val = (get_dataframe(df, 200, per_source) for per_source in (10, 7))

102
102


In [5]:
# Stack the per-row feature vectors into one 2-D matrix (one row per recording).
X = np.vstack(df_t.audio_feature.to_numpy()).reshape(df_t.shape[0], -1)
le = LabelEncoder()
y = le.fit_transform(df_t.source_id)
X_mfccs = X[:, 36 + 384: 36 + 384 + 60]  # mfcc only

X_cv = np.vstack(df_cross_val.audio_feature.to_numpy()).reshape(df_cross_val.shape[0], -1)
# Fit the dedicated CV encoder so that `le` keeps the classes learned from df_t
# and `le_cv` is actually usable (e.g. for inverse_transform).
le_cv = LabelEncoder()
y_cv = le_cv.fit_transform(df_cross_val.source_id)
X_mfccs_cv = X_cv[:, 36 + 384: 36 + 384 + 60]  # mfcc only
X_cv.shape, X_mfccs_cv.shape, y_cv.shape

((714, 498), (714, 60), (714,))

In [6]:
def _coerce_param(raw):
    """Turn an MLflow parameter string back into a Python value where unambiguous.

    Handles ``'True'``/``'False'``/``'None'`` and integers or floats, including
    negative values and scientific notation. Anything else is returned as is.
    """
    if not isinstance(raw, str):
        return raw
    text = raw.strip()
    literals = {'True': True, 'False': False, 'None': None}
    if text in literals:
        return literals[text]
    # Require a digit so that words float() would accept ('nan', 'inf') stay strings.
    if not any(ch.isdigit() for ch in text):
        return raw
    for cast in (int, float):
        try:
            return cast(text)
        except ValueError:
            continue
    return raw


def get_model_and_params(experiment_name: str, model_name: str):
    """Load a logged model and its hyper-parameters from the MLflow server.

    Parameters
    ----------
    experiment_name : str
        Name of the MLflow experiment to search.
    model_name : str
        Value of the run's ``mlflow.runName`` tag.

    Returns
    -------
    tuple
        ``(loaded_model, params)``: the model loaded through ``mlflow.pyfunc``
        and a dict of the run's logged parameters. Keys are the last
        dot-separated part of each ``params.*`` column name; values are
        converted from strings by ``_coerce_param``.

    Raises
    ------
    ValueError
        If the experiment does not exist or if the number of runs named
        ``model_name`` is not exactly one.
    """
    # Point the client at the tracking server before any lookup; otherwise the
    # first call in a fresh kernel would query the default tracking store.
    mlflow.set_tracking_uri("http://127.0.0.1:5000")

    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        raise ValueError(f'MLflow experiment {experiment_name!r} not found')
    experiment_id = dict(experiment)['experiment_id']

    runs = mlflow.search_runs([experiment_id], order_by=['metrics.f1_weighted'])
    run_rows = runs[runs['tags.mlflow.runName'] == model_name]
    if len(run_rows) != 1:
        raise ValueError(
            f'expected exactly one run named {model_name!r} in experiment '
            f'{experiment_name!r}, found {len(run_rows)}'
        )
    run = run_rows.iloc[0]

    # The tag holds a JSON list of logged-model records; parse it as JSON
    # instead of evaluating it as Python source.
    history = json.loads(run['tags.mlflow.log-model.history'])
    model_dict = dict(history[0])
    logged_model = f"runs:/{model_dict['run_id']}/{model_dict['artifact_path']}"
    loaded_model = mlflow.pyfunc.load_model(logged_model)

    param_columns = [col for col in run_rows.columns if col.startswith('params.')]
    raw_params = run[param_columns].dropna()  # parameters this run did not log
    params = {name.split('.')[-1]: _coerce_param(value)
              for name, value in raw_params.items()}

    return loaded_model, params

## Классификация, 102 класса, mfcc фичи. Лучшая модель - svm

In [7]:
# Fetch the logged 'svc gsc' run of the MFCC-only, 102-class experiment.
experiment, model_name = 'only mfcc 102 classes', 'svc gsc'
loaded_model, params = get_model_and_params(experiment_name=experiment, model_name=model_name)

Downloading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

2024/05/09 17:10:28 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


Проверка того, что модель загрузилась верно.

In [8]:
# Stratified 50/50 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X_mfccs, y, stratify=y, test_size=0.5, random_state=42
)

# Standardise both halves with statistics estimated on the training half only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [9]:
# Weighted F1 of the loaded model on the scaled validation half.
metrics.f1_score(y_val, loaded_model.predict(X_val_scaled), average='weighted')

0.8556857086268851

кросс-валидация

In [10]:
from sklearn.svm import SVC

In [11]:
# Rebuild an SVC from the logged hyper-parameters and put it behind a scaler in
# one pipeline, so every CV fold is standardised on its own training part.
scaler = StandardScaler()
model_svc = SVC(**params)
pipeline = Pipeline(steps=[('scaler', scaler), ('svc', model_svc)])

scoring = {name: name for name in ('accuracy', 'f1_weighted', 'f1_micro', 'f1_macro')}
scores = cross_validate(pipeline, X_mfccs_cv, y_cv, scoring=scoring, cv=4)
scores

{'fit_time': array([0.08416033, 0.10100126, 0.07999778, 0.09599757]),
 'score_time': array([0.03799868, 0.05900025, 0.03500175, 0.04999971]),
 'test_accuracy': array([0.8603352 , 0.86592179, 0.88764045, 0.84831461]),
 'test_f1_weighted': array([0.84581006, 0.85512104, 0.88202247, 0.83801498]),
 'test_f1_micro': array([0.8603352 , 0.86592179, 0.88764045, 0.84831461]),
 'test_f1_macro': array([0.8503268 , 0.84836601, 0.8872549 , 0.82352941])}

In [12]:
# Average every test metric over the CV folds.
metrcs = ['accuracy', 'f1_weighted', 'f1_micro', 'f1_macro']
final_scores = {name: np.mean(scores[f'test_{name}']) for name in metrcs}
final_scores

{'accuracy': 0.8655530098549997,
 'f1_weighted': 0.8552421379699957,
 'f1_micro': 0.8655530098549997,
 'f1_macro': 0.8523692810457517}

In [13]:
# Persist the fold-averaged metrics as JSON.
with open(f'cross_validation_scores/{model_name}.json', 'w') as f:
    f.write(json.dumps(final_scores))

## Классификация, 354 класса, mfcc фичи. Лучшая модель - svm

In [14]:
# Sources with at least 100 rows: the 10 longest recordings per source feed the
# hold-out check, the 7 longest feed cross-validation.
df_t, df_cross_val = (get_dataframe(df, 100, per_source) for per_source in (10, 7))

354
354


In [15]:
# Stack the per-row feature vectors into one 2-D matrix (one row per recording).
X = np.vstack(df_t.audio_feature.to_numpy()).reshape(df_t.shape[0], -1)
le = LabelEncoder()
y = le.fit_transform(df_t.source_id)
X_mfccs = X[:, 36 + 384: 36 + 384 + 60]  # mfcc only

X_cv = np.vstack(df_cross_val.audio_feature.to_numpy()).reshape(df_cross_val.shape[0], -1)
# Fit the dedicated CV encoder so that `le` keeps the classes learned from df_t
# and `le_cv` is actually usable (e.g. for inverse_transform).
le_cv = LabelEncoder()
y_cv = le_cv.fit_transform(df_cross_val.source_id)
X_mfccs_cv = X_cv[:, 36 + 384: 36 + 384 + 60]  # mfcc only
X_cv.shape, X_mfccs_cv.shape, y_cv.shape

((2478, 498), (2478, 60), (2478,))

In [16]:
# Fetch the logged 'svc gsc' run of the MFCC-only, 354-class experiment.
experiment, model_name = 'only mfcc 354 classes', 'svc gsc'
loaded_model, params = get_model_and_params(experiment_name=experiment, model_name=model_name)

Downloading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

2024/05/09 17:10:36 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


Проверка загрузки

In [17]:
# Stratified 50/50 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X_mfccs, y, stratify=y, test_size=0.5, random_state=42
)

# Standardise both halves with statistics estimated on the training half only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Weighted F1 of the loaded model on the scaled validation half.
metrics.f1_score(y_true=y_val, y_pred=loaded_model.predict(X_val_scaled), average='weighted')

0.7385117628761696

кросс-валидация

In [18]:
# Rebuild an SVC from the logged hyper-parameters and put it behind a scaler in
# one pipeline, so every CV fold is standardised on its own training part.
scaler = StandardScaler()
model_svc = SVC(**params)
pipeline = Pipeline(steps=[('scaler', scaler), ('svc', model_svc)])

scoring = {name: name for name in ('accuracy', 'f1_weighted', 'f1_micro', 'f1_macro')}
scores = cross_validate(pipeline, X_mfccs_cv, y_cv, scoring=scoring, cv=4)
scores

{'fit_time': array([0.95486879, 0.87509155, 0.89400721, 0.89097023]),
 'score_time': array([1.0241034 , 0.9579289 , 0.97378707, 0.99095702]),
 'test_accuracy': array([0.76129032, 0.75483871, 0.76090468, 0.73990307]),
 'test_f1_weighted': array([0.74445469, 0.73672811, 0.74029541, 0.72124779]),
 'test_f1_micro': array([0.76129032, 0.75483871, 0.76090468, 0.73990307]),
 'test_f1_macro': array([0.74081248, 0.73473231, 0.73833737, 0.71942427])}

In [19]:
# Average every test metric over the CV folds.
metrcs = ['accuracy', 'f1_weighted', 'f1_micro', 'f1_macro']
final_scores = {name: np.mean(scores[f'test_{name}']) for name in metrcs}
final_scores

{'accuracy': 0.7542341966751784,
 'f1_weighted': 0.7356814978323394,
 'f1_micro': 0.7542341966751784,
 'f1_macro': 0.7333266074791499}

In [20]:
# Persist the fold-averaged metrics as JSON.
with open(f'cross_validation_scores/{model_name}_354.json', 'w') as f:
    f.write(json.dumps(final_scores))

Гипотеза: чем больше тренировочных данных, тем выше метрика.

In [21]:
# Same sources as before (at least 100 rows) but twice as many recordings per
# source: 20 for the hold-out frame, 14 for cross-validation.
df_t, df_cross_val = (get_dataframe(df, 100, per_source) for per_source in (20, 14))

354
354


In [22]:
# Stack the per-row feature vectors into one 2-D matrix (one row per recording).
X = np.vstack(df_t.audio_feature.to_numpy()).reshape(df_t.shape[0], -1)
le = LabelEncoder()
y = le.fit_transform(df_t.source_id)
X_mfccs = X[:, 36 + 384: 36 + 384 + 60]  # mfcc only

X_cv = np.vstack(df_cross_val.audio_feature.to_numpy()).reshape(df_cross_val.shape[0], -1)
# Fit the dedicated CV encoder so that `le` keeps the classes learned from df_t
# and `le_cv` is actually usable (e.g. for inverse_transform).
le_cv = LabelEncoder()
y_cv = le_cv.fit_transform(df_cross_val.source_id)
X_mfccs_cv = X_cv[:, 36 + 384: 36 + 384 + 60]  # mfcc only
X_cv.shape, X_mfccs_cv.shape, y_cv.shape

((4956, 498), (4956, 60), (4956,))

In [23]:
# Fetch the logged 'svc gsc' run of the MFCC-only, 354-class experiment.
experiment, model_name = 'only mfcc 354 classes', 'svc gsc'
loaded_model, params = get_model_and_params(experiment_name=experiment, model_name=model_name)

Downloading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

2024/05/09 17:10:49 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


In [24]:
# Rebuild an SVC from the logged hyper-parameters and put it behind a scaler in
# one pipeline, so every CV fold is standardised on its own training part.
scaler = StandardScaler()
model_svc = SVC(**params)
pipeline = Pipeline(steps=[('scaler', scaler), ('svc', model_svc)])

scoring = {name: name for name in ('accuracy', 'f1_weighted', 'f1_micro', 'f1_macro')}
scores = cross_validate(pipeline, X_mfccs_cv, y_cv, scoring=scoring, cv=4)
scores

{'fit_time': array([2.38389134, 2.30820179, 2.24736381, 2.29371357]),
 'score_time': array([2.89615369, 2.90902781, 2.89018297, 3.2234652 ]),
 'test_accuracy': array([0.82001614, 0.84665052, 0.8393866 , 0.83131558]),
 'test_f1_weighted': array([0.81441218, 0.84446034, 0.83316422, 0.82510823]),
 'test_f1_micro': array([0.82001614, 0.84665052, 0.8393866 , 0.83131558]),
 'test_f1_macro': array([0.81541788, 0.84249456, 0.83281712, 0.82330142])}

In [25]:
# Average every test metric over the CV folds.
metrcs = ['accuracy', 'f1_weighted', 'f1_micro', 'f1_macro']
final_scores = {name: np.mean(scores[f'test_{name}']) for name in metrcs}
final_scores

{'accuracy': 0.8343422114608554,
 'f1_weighted': 0.8292862426337003,
 'f1_micro': 0.8343422114608554,
 'f1_macro': 0.8285077446094394}

In [26]:
# Persist the fold-averaged metrics as JSON.
with open(f'cross_validation_scores/{model_name}_354_more_data.json', 'w') as f:
    f.write(json.dumps(final_scores))

## Классификация, 102 класса, все доступные фичи. Лучшая модель - catboost tuned

In [11]:
# Rebuild the 102-class data for this section. On a top-to-bottom run the
# variables X / y / X_cv / y_cv would otherwise still hold the 354-class,
# 20-per-class arrays prepared for the previous experiment.
df_t = get_dataframe(df, 200, 10)
df_cross_val = get_dataframe(df, 200, 7)

X = np.vstack(df_t.audio_feature.to_numpy()).reshape(df_t.shape[0], -1)
le = LabelEncoder()
y = le.fit_transform(df_t.source_id)

X_cv = np.vstack(df_cross_val.audio_feature.to_numpy()).reshape(df_cross_val.shape[0], -1)
le_cv = LabelEncoder()
y_cv = le_cv.fit_transform(df_cross_val.source_id)

# Fetch the logged 'catboost tuned' run of the all-features, 102-class experiment.
experiment = '102 classes, 5 per class'
model_name = 'catboost tuned'
loaded_model, params = get_model_and_params(experiment, model_name)

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

In [13]:
# Stratified 50/50 hold-out split with a fixed seed; features are used unscaled.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, test_size=0.5, random_state=42
)

# Weighted F1 of the loaded model on the validation half.
metrics.f1_score(y_true=y_val, y_pred=loaded_model.predict(X_val), average='weighted')

0.9065314315314316

In [None]:
# Override the logged device setting, then rebuild CatBoost from the logged
# hyper-parameters. NOTE(review): '0' is presumably a GPU id — confirm against
# the task_type stored in the run's params.
params.update(devices='0')
model_cb = catboost.CatBoostClassifier(verbose=10, **params)
pipeline = Pipeline(steps=[('catboost', model_cb)])

scoring = {name: name for name in ('accuracy', 'f1_weighted', 'f1_micro', 'f1_macro')}
scores = cross_validate(pipeline, X_cv, y_cv, scoring=scoring, cv=4)
scores

In [21]:
# Average every test metric over the CV folds.
metrcs = ['accuracy', 'f1_weighted', 'f1_micro', 'f1_macro']
final_scores = {name: np.mean(scores[f'test_{name}']) for name in metrcs}
final_scores

{'accuracy': 0.9117522440524763,
 'f1_weighted': 0.8989719056287382,
 'f1_micro': 0.9117522440524763,
 'f1_macro': 0.9014005602240897}

In [22]:
# Persist the fold-averaged metrics as JSON.
with open(f'cross_validation_scores/{model_name}_102.json', 'w') as f:
    f.write(json.dumps(final_scores))

## Классификация, 354 класса, все доступные фичи. Лучшая модель - catboost tuned

In [7]:
# Sources with at least 100 rows: the 10 longest recordings per source feed the
# hold-out check, the 7 longest feed cross-validation.
df_t, df_cross_val = (get_dataframe(df, 100, per_source) for per_source in (10, 7))

354
354


In [8]:
# Stack the per-row feature vectors into one 2-D matrix (one row per recording).
X = np.vstack(df_t.audio_feature.to_numpy()).reshape(df_t.shape[0], -1)
le = LabelEncoder()
y = le.fit_transform(df_t.source_id)

X_cv = np.vstack(df_cross_val.audio_feature.to_numpy()).reshape(df_cross_val.shape[0], -1)
# Fit the dedicated CV encoder so that `le` keeps the classes learned from df_t
# and `le_cv` is actually usable (e.g. for inverse_transform).
le_cv = LabelEncoder()
y_cv = le_cv.fit_transform(df_cross_val.source_id)
X_cv.shape, y_cv.shape

((2478, 498), (2478,))

In [9]:
# Fetch the logged 'catboost tuned' run of the all-features, 354-class experiment.
experiment, model_name = '354 classes, 5 per class', 'catboost tuned'
loaded_model, params = get_model_and_params(experiment_name=experiment, model_name=model_name)

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

In [10]:
# Stratified 50/50 hold-out split with a fixed seed; features are used unscaled.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, test_size=0.5, random_state=42
)

# Weighted F1 of the loaded model on the validation half.
metrics.f1_score(y_true=y_val, y_pred=loaded_model.predict(X_val), average='weighted')

0.8702095142773109

In [11]:
# Override the logged device setting, then rebuild CatBoost from the logged
# hyper-parameters. NOTE(review): '0' is presumably a GPU id — confirm against
# the task_type stored in the run's params.
params.update(devices='0')
model_cb = catboost.CatBoostClassifier(verbose=10, **params)
pipeline = Pipeline(steps=[('catboost', model_cb)])

scoring = {name: name for name in ('accuracy', 'f1_weighted', 'f1_micro', 'f1_macro')}
scores = cross_validate(pipeline, X_cv, y_cv, scoring=scoring, cv=4)
scores

Learning rate set to 0.073649
0:	learn: 5.8689856	total: 93.1ms	remaining: 1m 32s
10:	learn: 5.7389519	total: 1.6s	remaining: 2m 24s
20:	learn: 5.7292819	total: 2.51s	remaining: 1m 57s
30:	learn: 5.7054912	total: 3.5s	remaining: 1m 49s
40:	learn: 5.5789402	total: 4.99s	remaining: 1m 56s
50:	learn: 5.4824308	total: 6.88s	remaining: 2m 8s
60:	learn: 5.4556111	total: 9.65s	remaining: 2m 28s
70:	learn: 5.4497086	total: 12s	remaining: 2m 37s
80:	learn: 5.4367332	total: 14.4s	remaining: 2m 43s
90:	learn: 5.4058905	total: 17.3s	remaining: 2m 52s
100:	learn: 5.3470770	total: 20.2s	remaining: 3m
110:	learn: 5.0344230	total: 26.6s	remaining: 3m 32s
120:	learn: 4.7117664	total: 33.8s	remaining: 4m 5s
130:	learn: 4.4195624	total: 41.1s	remaining: 4m 32s
140:	learn: 4.1708548	total: 48.4s	remaining: 4m 55s
150:	learn: 3.9541113	total: 55.7s	remaining: 5m 13s
160:	learn: 3.7112894	total: 1m 2s	remaining: 5m 28s
170:	learn: 3.5210962	total: 1m 10s	remaining: 5m 40s
180:	learn: 3.3086190	total: 1m 17s

{'fit_time': array([634.52272248, 583.82567358, 324.01509213, 265.35006046]),
 'score_time': array([0.15224195, 0.11741281, 0.11350965, 0.11302757]),
 'test_accuracy': array([0.86129032, 0.87903226, 0.86591276, 0.86752827]),
 'test_f1_weighted': array([0.84544547, 0.8677957 , 0.84891145, 0.85368875]),
 'test_f1_micro': array([0.86129032, 0.87903226, 0.86591276, 0.86752827]),
 'test_f1_macro': array([0.84285714, 0.86704331, 0.84869518, 0.85320151])}

In [12]:
# Average every test metric over the CV folds.
metrcs = ['accuracy', 'f1_weighted', 'f1_micro', 'f1_macro']
final_scores = {name: np.mean(scores[f'test_{name}']) for name in metrcs}
final_scores

{'accuracy': 0.868440903642712,
 'f1_weighted': 0.8539603418625255,
 'f1_micro': 0.868440903642712,
 'f1_macro': 0.8529492870594566}

In [13]:
# Persist the fold-averaged metrics as JSON.
with open(f'cross_validation_scores/{model_name}_354.json', 'w') as f:
    f.write(json.dumps(final_scores))