In [1]:
import torch
import pandas as pd
import numpy as np
import os
import optuna
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, recall_score
from sklearn import svm

In [2]:
def f1(y_true, y_pred):
    """Return the micro-averaged F1 score as a percentage rounded to 3 decimals."""
    micro_f1 = f1_score(y_true, y_pred, average='micro')
    return round(micro_f1 * 100, 3)


def uar(y_true, y_pred):
    """Return the unweighted average recall (macro recall) as a percentage rounded to 3 decimals."""
    macro_recall = recall_score(y_true, y_pred, average='macro')
    return round(macro_recall * 100, 3)

def combine(y_true, y_pred):
    """Return the weighted blend 0.66 * f1 + 0.34 * uar, rounded to 3 decimals.

    Both inputs to the blend are the already-rounded percentage scores
    produced by ``f1`` and ``uar``.
    """
    weighted_f1 = 0.66 * f1(y_true, y_pred)
    weighted_uar = 0.34 * uar(y_true, y_pred)
    return round(weighted_f1 + weighted_uar, 3)

def eval_metric(targets, predicts, partition_name):
    """Compute, print and return the f1 / uar / combined scores for one partition.

    Args:
        targets: Ground-truth labels.
        predicts: Predicted labels.
        partition_name: Label used in the printed header.

    Returns:
        dict with the keys ``'f1'``, ``'uar'`` and ``'combine'``.
    """
    results = {
        'f1': f1(targets, predicts),
        'uar': uar(targets, predicts),
        'combine': combine(targets, predicts),
    }
    print(f'Results in {partition_name}:')
    print("  - f1: ", results['f1'])
    print("  - uar: ", results['uar'])
    print("  - combined:", results['combine'])
    # Previously the dict was built and then dropped; hand it back so callers
    # can log or compare the scores instead of re-computing them.
    return results

In [3]:
def dataset(partition, feature_set):
    """Load the train and devel CSVs of one feature set for one partition.

    Files are read from ``./data_csv/<partition>/<feature_set>_train.csv`` and
    ``./data_csv/<partition>/<feature_set>_devel.csv``.

    Returns:
        Tuple ``(train, valid)`` of DataFrames.
    """
    frames = []
    for suffix in ('_train.csv', '_devel.csv'):
        csv_path = os.path.join('./data_csv/', partition, feature_set + suffix)
        frames.append(pd.read_csv(csv_path))
    train, valid = frames
    return train, valid

In [4]:
import lightgbm as lgb
# FYI: Objective functions can take additional arguments
# (https://optuna.readthedocs.io/en/stable/faq.html#objective-func-additional-args).
def lightgbm(trial):
    """Optuna objective: train LightGBM with sampled hyperparameters and score it.

    Reads the module-level ``train``, ``valid`` and ``last_columns`` that the
    calling loop assigns before ``study.optimize`` runs; they are not passed
    in as arguments.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The ``combine`` score of the rounded predictions on ``valid``.
    """
    # Label-based column slice, inclusive at both ends.
    # NOTE(review): 'segment_id' itself is part of the feature slice - confirm
    # that it is meant to be used as a model input.
    feature_cols = slice('segment_id', f'{last_columns}')
    X_train = train.loc[:, feature_cols]
    y_train = train['class_id']
    X_valid = valid.loc[:, feature_cols]
    y_valid = valid['class_id']
    train_set = lgb.Dataset(X_train, label=y_train)

    # Fixed settings.
    # NOTE(review): a binary objective followed by np.rint can only yield the
    # labels 0 and 1 - confirm that class_id is binary in these CSVs.
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
    }
    # Search space; the suggest calls stay in their original order.
    param["lambda_l1"] = trial.suggest_loguniform("lambda_l1", 1e-8, 10.0)
    param["lambda_l2"] = trial.suggest_loguniform("lambda_l2", 1e-8, 10.0)
    param["num_leaves"] = trial.suggest_int("num_leaves", 2, 256)
    param["feature_fraction"] = trial.suggest_uniform("feature_fraction", 0.4, 1.0)
    param["bagging_fraction"] = trial.suggest_uniform("bagging_fraction", 0.4, 1.0)
    param["bagging_freq"] = trial.suggest_int("bagging_freq", 1, 7)
    param["min_child_samples"] = trial.suggest_int("min_child_samples", 5, 100)

    booster = lgb.train(param, train_set)
    # Round the raw model outputs to the nearest integer to get hard labels.
    hard_labels = np.rint(booster.predict(X_valid))
    return combine(y_valid, hard_labels)


In [5]:
# Feature sets to tune, mapped to the column count used to derive the name of
# the last feature column (num_columns - 1).
maps = {
         'vggface': 512,
        }

for partition in ['arousal','valence']:
    for feature_set, num_columns in maps.items():
        # train, valid and last_columns are read as globals by the `lightgbm`
        # objective, so they must be assigned before study.optimize is called.
        train, valid = dataset(partition, feature_set)
        last_columns = num_columns - 1
        study = optuna.create_study(direction='maximize')
        study.optimize(lightgbm, n_trials=1000)

        trial = study.best_trial
        # The reported value is whatever the objective returns (the `combine` score).
        print('Accuracy: {}'.format(trial.value))
        print('Parameter: {}'.format(trial.params))

        # The 'patition' spelling is kept on purpose: result files written by
        # earlier runs already use it, and renaming would split the column.
        df = pd.DataFrame([[partition, feature_set, trial.value, trial.params]], columns=['patition', 'feature_set', 'value', 'params'])

        # Build the output path once; exist_ok avoids the check-then-create race.
        os.makedirs('./optimize', exist_ok=True)
        result_path = './optimize/optimize_{}.csv'.format(feature_set)
        if os.path.exists(result_path):
            df2 = pd.read_csv(result_path)
            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # equivalent that works on both old and new pandas versions.
            df2 = pd.concat([df2, df], ignore_index=True)
            df2.to_csv(result_path, index=False)
        else:
            df.to_csv(result_path, index=False)

[I 2020-07-15 10:40:46,794] Finished trial#0 with value: 39.216 with parameters: {'lambda_l1': 9.411364326571016, 'lambda_l2': 0.0001302234865937086, 'num_leaves': 15, 'feature_fraction': 0.6901804811998362, 'bagging_fraction': 0.9247905090826531, 'bagging_freq': 6, 'min_child_samples': 57}. Best is trial#0 with value: 39.216.
[I 2020-07-15 10:40:48,271] Finished trial#1 with value: 38.785 with parameters: {'lambda_l1': 0.003616210692393305, 'lambda_l2': 0.009710984839007638, 'num_leaves': 125, 'feature_fraction': 0.7795187076726453, 'bagging_fraction': 0.5843855314700953, 'bagging_freq': 1, 'min_child_samples': 51}. Best is trial#0 with value: 39.216.
[I 2020-07-15 10:40:48,815] Finished trial#2 with value: 39.972 with parameters: {'lambda_l1': 0.10360987161072097, 'lambda_l2': 0.3861073492589465, 'num_leaves': 11, 'feature_fraction': 0.5121455623198519, 'bagging_fraction': 0.7236754791798101, 'bagging_freq': 1, 'min_child_samples': 97}. Best is trial#2 with value: 39.972.
[I 2020-07-

Accuracy: 41.615
Parameter: {'lambda_l1': 2.2676215368566144e-08, 'lambda_l2': 4.003917144936102e-07, 'num_leaves': 10, 'feature_fraction': 0.8908135768273152, 'bagging_fraction': 0.4985789881349387, 'bagging_freq': 6, 'min_child_samples': 91}


[I 2020-07-15 10:57:58,159] Finished trial#0 with value: 35.357 with parameters: {'lambda_l1': 0.006241956026990927, 'lambda_l2': 3.318313323386931e-05, 'num_leaves': 20, 'feature_fraction': 0.5242909851273738, 'bagging_fraction': 0.773682283354681, 'bagging_freq': 6, 'min_child_samples': 55}. Best is trial#0 with value: 35.357.
[I 2020-07-15 10:57:59,727] Finished trial#1 with value: 35.125 with parameters: {'lambda_l1': 0.0002661687257382321, 'lambda_l2': 2.2730808775686943e-05, 'num_leaves': 182, 'feature_fraction': 0.7097434286680475, 'bagging_fraction': 0.5057905210508359, 'bagging_freq': 7, 'min_child_samples': 57}. Best is trial#0 with value: 35.357.
[I 2020-07-15 10:58:02,438] Finished trial#2 with value: 35.603 with parameters: {'lambda_l1': 0.17076428354098064, 'lambda_l2': 1.4640374442943096, 'num_leaves': 32, 'feature_fraction': 0.8791311535927158, 'bagging_fraction': 0.7850517637246364, 'bagging_freq': 7, 'min_child_samples': 39}. Best is trial#2 with value: 35.603.
[I 202

Accuracy: 38.102
Parameter: {'lambda_l1': 0.37422203407607046, 'lambda_l2': 0.004151487540770132, 'num_leaves': 96, 'feature_fraction': 0.6983154806051196, 'bagging_fraction': 0.9433270255267205, 'bagging_freq': 7, 'min_child_samples': 50}
