In [1]:
import catboost
import numpy as np
import optuna
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [7]:
df = pd.read_pickle('crowd_train_all_data_embedded.pkl')


def get_dataframe(df, min_count, max_count):
    df_temp = df.copy()

    freq = df_temp.source_id.value_counts()
    freq = freq[min_count <= freq]
    df_temp = df_temp[df.source_id.isin(freq.index)]

    df_temp = (
        df_temp.groupby('source_id')
        .apply(lambda x: x.nlargest(max_count, 'duration'))
        .reset_index(drop=True)
        .drop(
            columns=[
                'duration',
                'hash_id',
                'annotator_emo',
                'golden_emo',
                'annotator_id',
                'speaker_text',
                'speaker_emo',
            ]
        )
    )

    df_temp.dropna(inplace=True)

    print(df_temp.source_id.nunique())

    return df_temp


#df_t = get_dataframe(df, 200, 10)
df_t = get_dataframe(df, 100, 10)

354


In [8]:
X = np.vstack(df_t.audio_feature.to_numpy()).reshape(df_t.shape[0], -1)
le = LabelEncoder()
y = le.fit_transform(df_t.source_id)

# X = X[:, 36 + 384: 36 + 384 + 60] #mfcc only

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42, stratify=y
)
X_train.shape, X_test.shape, len(y_train[y_train == 4]), len(y_test[y_test == 4])

((1770, 498), (1770, 498), 5, 5)

In [34]:
import optuna
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

def objective(trial, X_train, y_train, X_val, y_val):
    params = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 2, 256),
        'max_depth': trial.suggest_int('max_depth', 2, 16),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2),
        'n_estimators': 1_000,
        'early_stopping_round': 10,
        'device': 'gpu',
    }

    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

    preds = model.predict(X_val)
    accuracy = f1_score(y_val, preds, average='weighted')
    return accuracy

def run_optimization(X_train, y_train, X_val, y_val):
    results = []

    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val), n_trials=150)

    best_params = study.best_params
    results.append(('best_params', best_params))

    return results

In [35]:
best_params = run_optimization(X_train, y_train, X_test, y_test)

[I 2024-05-05 16:09:32,068] A new study created in memory with name: no-name-2682aa40-8483-4157-9263-24c7c058c4da
[I 2024-05-05 16:12:50,846] Trial 0 finished with value: 0.7755262439085969 and parameters: {'num_leaves': 244, 'min_data_in_leaf': 72, 'max_depth': 13, 'lambda_l1': 1.0143511303878232e-07, 'lambda_l2': 4.52661433707551, 'feature_fraction': 0.8775654397072306, 'bagging_fraction': 0.8059415386016378, 'bagging_freq': 3, 'min_child_samples': 75, 'learning_rate': 0.11401396667236176}. Best is trial 0 with value: 0.7755262439085969.
[I 2024-05-05 16:14:48,445] Trial 1 finished with value: 0.8245665010370892 and parameters: {'num_leaves': 140, 'min_data_in_leaf': 200, 'max_depth': 15, 'lambda_l1': 2.419139151240461e-08, 'lambda_l2': 0.7753932753464238, 'feature_fraction': 0.733935813646829, 'bagging_fraction': 0.8882973688603936, 'bagging_freq': 2, 'min_child_samples': 99, 'learning_rate': 0.1691946730261765}. Best is trial 1 with value: 0.8245665010370892.
[I 2024-05-05 16:16:21

In [38]:
best_params[0][1]

{'num_leaves': 9,
 'min_data_in_leaf': 151,
 'max_depth': 3,
 'lambda_l1': 0.05156122427614393,
 'lambda_l2': 0.917067067731611,
 'feature_fraction': 0.5345089078095386,
 'bagging_fraction': 0.6022498606851232,
 'bagging_freq': 2,
 'min_child_samples': 30,
 'learning_rate': 0.011302934154635014}

In [43]:
best_params[0][1].update({'objective': 'multiclass',
'metric': 'multi_logloss',
'verbosity': -1,
'boosting_type': 'gbdt',
'n_estimators': 1_000,
'early_stopping_round': 10,
'device': 'gpu'})

In [None]:
[I 2024-05-05 17:21:25,727] Trial 79 finished with value: 0.874839158662688 and parameters: {'num_leaves': 9, 'min_data_in_leaf': 151, 'max_depth': 3, 'lambda_l1': 0.05156122427614393, 'lambda_l2': 0.917067067731611, 'feature_fraction': 0.5345089078095386, 'bagging_fraction': 0.6022498606851232, 'bagging_freq': 2, 'min_child_samples': 30, 'learning_rate': 0.011302934154635014}. Best is trial 79 with value: 0.874839158662688.

In [93]:
best_params[0][1]

{'num_leaves': 9,
 'min_data_in_leaf': 151,
 'max_depth': 3,
 'lambda_l1': 0.05156122427614393,
 'lambda_l2': 0.917067067731611,
 'feature_fraction': 0.5345089078095386,
 'bagging_fraction': 0.6022498606851232,
 'bagging_freq': 2,
 'min_child_samples': 30,
 'learning_rate': 0.011302934154635014,
 'objective': 'multiclass',
 'metric': 'multi_logloss',
 'verbosity': -1,
 'boosting_type': 'gbdt',
 'n_estimators': 1000,
 'device': 'gpu'}

In [94]:
import mlflow
from sklearn.preprocessing import StandardScaler

from sklearn import metrics

mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.set_experiment(f"{len(set(y))} classes, 5 per class")

run_name = "lightgbm normalized optuna"

with mlflow.start_run(run_name=run_name) as run:
    model_params = best_params[0][1]
    model_lgb = lgb.LGBMClassifier(**model_params)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model_lgb.fit(X_train_scaled, y_train)
    predicts = model_lgb.predict(X_test_scaled)

    mlflow.log_metric("train f1_weighted", metrics.f1_score(y_train, model_lgb.predict(X_train_scaled), average="weighted"))
    mlflow.log_metric("f1_weighted", metrics.f1_score(y_test, predicts, average="weighted"))
    mlflow.log_metric("f1_micro", metrics.f1_score(y_test, predicts, average="micro"))
    mlflow.log_metric("accuracy", metrics.accuracy_score(y_test, predicts))
    
    mlflow.log_params(model_params)

KeyboardInterrupt: 

In [10]:
model_params = {
        "task_type": 'GPU',
        "devices": '0',
        "iterations": 1_000,
        'depth': 4,
        'random_strength': 5,
        "bagging_temperature": 1,
    }
model_cb = catboost.CatBoostClassifier(verbose=10, **model_params)
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)
# X_train_scaled = X_train
# X_test_scaled = X_test

model_cb.fit(X_train, y_train)
predicts = model_cb.predict(X_test)

print(metrics.f1_score(y_train, model_cb.predict(X_train), average="weighted"))
print()
print(metrics.f1_score(y_test, predicts, average="weighted"))
print(metrics.f1_score(y_test, predicts, average="micro"))
print(metrics.accuracy_score(y_test, predicts))

Learning rate set to 0.072924
0:	learn: 5.8692962	total: 72.6ms	remaining: 1m 12s
10:	learn: 5.8365444	total: 853ms	remaining: 1m 16s
20:	learn: 5.8084691	total: 1.54s	remaining: 1m 11s
30:	learn: 5.7859927	total: 2.2s	remaining: 1m 8s
40:	learn: 5.7748466	total: 2.88s	remaining: 1m 7s
50:	learn: 5.7369582	total: 3.65s	remaining: 1m 7s
60:	learn: 5.6514974	total: 4.5s	remaining: 1m 9s
70:	learn: 5.5763082	total: 5.36s	remaining: 1m 10s
80:	learn: 5.4816086	total: 6.95s	remaining: 1m 18s
90:	learn: 5.4751683	total: 8.6s	remaining: 1m 25s
100:	learn: 5.3675075	total: 10.9s	remaining: 1m 37s
110:	learn: 5.2386366	total: 13.3s	remaining: 1m 46s
120:	learn: 4.9981324	total: 16.5s	remaining: 2m
130:	learn: 4.7477517	total: 20.4s	remaining: 2m 15s
140:	learn: 4.4939867	total: 24s	remaining: 2m 26s
150:	learn: 4.2778867	total: 28.1s	remaining: 2m 38s
160:	learn: 4.0535405	total: 31.9s	remaining: 2m 46s
170:	learn: 3.8146369	total: 35.7s	remaining: 2m 53s
180:	learn: 3.6300332	total: 39.3s	rema

NameError: name 'metrics' is not defined

In [12]:

print(metrics.f1_score(y_train, model_cb.predict(X_train), average="weighted"))
print()
print(metrics.f1_score(y_test, predicts, average="weighted"))
print(metrics.f1_score(y_test, predicts, average="micro"))
print(metrics.accuracy_score(y_test, predicts))

0.9994293214632197

0.7445141173954732
0.7536723163841808
0.7536723163841808


In [9]:
import mlflow
from sklearn.preprocessing import StandardScaler

from sklearn import metrics

mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.set_experiment(f"{len(set(y))} classes, 5 per class")

run_name = "catboost tuned"

with mlflow.start_run(run_name=run_name) as run:
    model_params = {
        "task_type": 'GPU',
        "devices": '0',
        "iterations": 1_000,
        'depth': 4,
        'random_strength': 5,
        "bagging_temperature": 1,
    }
    model_cb = catboost.CatBoostClassifier(verbose=10, **model_params)
    
    # scaler = StandardScaler()
    # X_train_scaled = scaler.fit_transform(X_train)
    # X_test_scaled = scaler.transform(X_test)
    X_train_scaled = X_train
    X_test_scaled = X_test

    model_cb.fit(X_train_scaled, y_train)
    predicts = model_cb.predict(X_test_scaled)

    mlflow.log_metric("train f1_weighted", metrics.f1_score(y_train, model_cb.predict(X_train_scaled), average="weighted"))
    mlflow.log_metric("f1_weighted", metrics.f1_score(y_test, predicts, average="weighted"))
    mlflow.log_metric("f1_micro", metrics.f1_score(y_test, predicts, average="micro"))
    mlflow.log_metric("accuracy", metrics.accuracy_score(y_test, predicts))
    
    mlflow.log_params(model_params)

    mlflow.catboost.log_model(model_cb, artifact_path=f"mlflow/{run_name}/model")

Learning rate set to 0.072924
0:	learn: 5.8155174	total: 811ms	remaining: 13m 30s
10:	learn: 5.7896010	total: 3.43s	remaining: 5m 8s
20:	learn: 5.7608713	total: 5.78s	remaining: 4m 29s
30:	learn: 5.7281002	total: 8.2s	remaining: 4m 16s
40:	learn: 5.6589595	total: 11s	remaining: 4m 16s
50:	learn: 5.5896926	total: 14.2s	remaining: 4m 24s
60:	learn: 5.5491079	total: 16.8s	remaining: 4m 18s
70:	learn: 5.5451503	total: 18.8s	remaining: 4m 5s
80:	learn: 5.5062980	total: 21.2s	remaining: 4m
90:	learn: 5.4662236	total: 23.7s	remaining: 3m 56s
100:	learn: 5.3617320	total: 27.2s	remaining: 4m 1s
110:	learn: 5.1641442	total: 31.4s	remaining: 4m 11s
120:	learn: 4.8283512	total: 38.1s	remaining: 4m 36s
130:	learn: 4.4873061	total: 44.8s	remaining: 4m 57s
140:	learn: 4.2118341	total: 51.4s	remaining: 5m 13s
150:	learn: 3.9836346	total: 58.1s	remaining: 5m 26s
160:	learn: 3.7377839	total: 1m 4s	remaining: 5m 38s
170:	learn: 3.5278287	total: 1m 11s	remaining: 5m 47s
180:	learn: 3.3421743	total: 1m 18s

In [None]:
model_params = {
        "task_type": 'GPU',
        "devices": '0',
        "iterations": 1_000,
        'depth': 4,
        'random_strength': 5,
    }