In [3]:
from sklearn.calibration import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from scipy import stats
import pandas as pd
import numpy as np

from common import load_data, pareto_rank, baseline_results

# Multilabel Classification



In [56]:
# Data Loading
random_state = 1234
test_size = 0.40
pareto_cutoff = 0.1
rank_by_domination_count = False

# performances = ["fps", "cpu"]
performances = ["kbs", "fps"]
# performances = []

(
    perf_matrix_initial,
    input_features,
    config_features,
    all_performances,
    input_preprocessor,
    config_preprocessor,
) = load_data(system="x264", data_dir="../data", input_properties_type="tabular")

if len(performances) == 0:
    performances = all_performances

# Normalization is needed for the Pareto cutoff
# We can normalize before splitting, because
# we normalize per input and we also split per input.
# There is no data leakage.
normalized_metrics = (
    perf_matrix_initial[["inputname"] + performances]
    .groupby("inputname", as_index=False)
    .transform(lambda x: (x - x.min()) / (x.max() - x.min()))
)
cutoff_mask = (normalized_metrics <= pareto_cutoff).all(axis=1)

nmdf = (
    perf_matrix_initial[["inputname"] + performances]
    .groupby("inputname", as_index=True)
    .transform(lambda x: (x - x.min()) / (x.max() - x.min()))
)
perf_matrix = pd.merge(
    perf_matrix_initial, nmdf, suffixes=("_raw", None), left_index=True, right_index=True
)
perf_matrix["feasible"] = cutoff_mask

all_perf_raw = [f"{p}_raw" for p in performances]
all_perf_norm = [f"{p}" for p in performances]

icm_all = (
    perf_matrix[["inputname", "configurationID"] + performances]
    .sort_values(["inputname", "configurationID"])
    .set_index(["inputname", "configurationID"])
)
icm_ranked_measures = icm_all.groupby(
    "inputname"
).transform(  # Go from measured values to ranks within each input group
    lambda x: stats.rankdata(x, method="min")
)
icm_all["ranks"] = icm_all.groupby("inputname", group_keys=False).apply(
    lambda x: pareto_rank(
        x, cutoff=pareto_cutoff, rank_by_domination_count=rank_by_domination_count
    )
)

In [57]:
# Split data and preprocess further
train_inp, test_inp = train_test_split(
    perf_matrix["inputname"].unique(),
    test_size=test_size,
    random_state=random_state,
)
train_perf = perf_matrix[perf_matrix.inputname.isin(train_inp)]
test_perf = perf_matrix[perf_matrix.inputname.isin(test_inp)]


icm = (
    train_perf[["inputname", "configurationID"] + performances]
    .sort_values(["inputname", "configurationID"])
    .set_index(["inputname", "configurationID"])
)
# icm = train_perf[train_perf.configurationID != 71]
icm_ranked_measures = icm.groupby(
    "inputname"
).transform(  # Go from measured values to ranks within each input group
    lambda x: stats.rankdata(x, method="min")
)
icm["ranks"] = icm.groupby("inputname", group_keys=False).apply(
    lambda x: pareto_rank(
        x, cutoff=pareto_cutoff, rank_by_domination_count=rank_by_domination_count
    )
)

# Calculate the Pareto ranks for the test data
icm_test = (
    test_perf[["inputname", "configurationID"] + performances]
    .sort_values(["inputname", "configurationID"])
    .set_index(["inputname", "configurationID"])
)
icm_test["ranks"] = icm_test.groupby("inputname", group_keys=False).apply(
    lambda x: pareto_rank(
        x, cutoff=pareto_cutoff, rank_by_domination_count=rank_by_domination_count
    )
)

# Full dataset of input features + config features that are in the first rank
dataset = icm[icm.ranks <= 1].join(config_features).join(input_features).reset_index()

In [67]:
dataset

Unnamed: 0,inputname,configurationID,kbs,fps,ranks,cabac,ref,subme,mixed_ref,me_range,...,scenecut,weightb,resolution,WIDTH,HEIGHT,SPATIAL_COMPLEXITY,TEMPORAL_COMPLEXITY,CHUNK_COMPLEXITY_VARIATION,COLOR_COMPLEXITY,category
0,Animation_1080P-01b3,13,0.069224,0.018952,1,0,1,0,0,16,...,0.0,,1080,1920,1080,0.098,0.004,0.017,0.005,Animation
1,Animation_1080P-01b3,14,0.070137,0.000000,1,0,1,0,1,16,...,0.0,,1080,1920,1080,0.098,0.004,0.017,0.005,Animation
2,Animation_1080P-01b3,23,0.068592,0.060504,1,0,1,1,0,16,...,0.0,,1080,1920,1080,0.098,0.004,0.017,0.005,Animation
3,Animation_1080P-01b3,88,0.056092,0.074948,1,1,2,4,0,16,...,0.0,,1080,1920,1080,0.098,0.004,0.017,0.005,Animation
4,Animation_1080P-0c4f,1,0.995015,0.039098,1,0,1,0,0,16,...,0.0,,1080,1920,1080,1.714,0.136,9.048,0.000,Animation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248729,Vlog_720P-6d56,197,0.141973,0.988177,1,1,16,7,1,24,...,40.0,1.0,720,1280,720,1.074,0.428,11.794,0.415,Vlog
248730,Vlog_720P-6d56,198,0.074811,0.978303,1,1,16,11,1,24,...,40.0,1.0,720,1280,720,1.074,0.428,11.794,0.415,Vlog
248731,Vlog_720P-6d56,199,0.088497,0.993914,1,1,16,11,1,24,...,40.0,1.0,720,1280,720,1.074,0.428,11.794,0.415,Vlog
248732,Vlog_720P-6d56,200,0.088497,0.992678,1,1,16,11,1,24,...,40.0,1.0,720,1280,720,1.074,0.428,11.794,0.415,Vlog


In [68]:
# Baseline results
baseline_results(icm, icm_ranked_measures, icm_test, dataset, config_features, verbose=True)

Average rank of the overall best configuration: 1.05+-0.28
Average rank of the most common configuration: 1.05+-0.28
Average rank of the best configuration for all metrics: 1.06+-0.33
Average rank of random configuration: 1.07+-0.38


{'overall': [1.0485436893203883, 0.2782183721138724],
 'metric': [1.062135922330097, 0.3286324999305114],
 'common': [1.0485436893203883, 0.2782183721138724],
 'random': [1.0689320388349515, 0.3766667822752681]}

In [79]:
## We make a multi-class classification problem
# Each input is annotated with the rank-1 classes

enc = LabelEncoder()
enc.fit(dataset["configurationID"].tolist())

grouped_df = (
    dataset.groupby("inputname")["configurationID"].apply(enc.transform).reset_index()
)
mlb = MultiLabelBinarizer()
# Fit and transform the 'Values' column
binary_matrix = mlb.fit_transform(grouped_df["configurationID"])

# Create a new DataFrame with the binary matrix
binary_df = pd.DataFrame(
    binary_matrix, columns=mlb.classes_, index=grouped_df["inputname"]
)

X = input_preprocessor.fit_transform(
    input_features[
        input_features.index.get_level_values("inputname").isin(train_inp)
    ].sort_index()
)
y = binary_df.values

X.shape, y.shape

((1287, 21), (1287, 201))

In [92]:
from sklearn.tree import DecisionTreeClassifier


model = MultiOutputClassifier(DecisionTreeClassifier(max_depth=21))
model.fit(X, y)
model.score(X, y)

1.0

In [96]:
from sklearn import tree

# Extract decision paths from all trees and mine for common prefixes
# 0/1: positive/negative class?
print(tree.export_text(model.estimators_[0], feature_names=input_preprocessor.get_feature_names_out()))

|--- cat__category_HDR <= 0.50
|   |--- remainder__SPATIAL_COMPLEXITY <= 0.16
|   |   |--- remainder__TEMPORAL_COMPLEXITY <= 0.05
|   |   |   |--- class: 0
|   |   |--- remainder__TEMPORAL_COMPLEXITY >  0.05
|   |   |   |--- cat__category_VerticalVideo <= 0.50
|   |   |   |   |--- class: 1
|   |   |   |--- cat__category_VerticalVideo >  0.50
|   |   |   |   |--- class: 0
|   |--- remainder__SPATIAL_COMPLEXITY >  0.16
|   |   |--- remainder__SPATIAL_COMPLEXITY <= 0.29
|   |   |   |--- remainder__SPATIAL_COMPLEXITY <= 0.29
|   |   |   |   |--- remainder__CHUNK_COMPLEXITY_VARIATION <= 9.96
|   |   |   |   |   |--- cat__category_Lecture <= 0.50
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- cat__category_Lecture >  0.50
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |--- remainder__CHUNK_COMPLEXITY_VARIATION >  9.96
|   |   |   |   |   |--- class: 0
|   |   |   |--- remainder__SPATIAL_COMPLEXITY >  0.29
|   |   |   |   |--- class: 0
|   |   |--- remainder__SPATIAL_CO

In [86]:
from sklearn import tree
import graphviz

dot_data = tree.export_graphviz(model, out_file=None, 
                                feature_names=input_preprocessor.get_feature_names_out(),  
                                # class_names=iris.target_names,  
                                filled=True, rounded=True,  
                                special_characters=True)
graph = graphviz.Source(dot_data)  
#this will create an iris.pdf file with the rule path
graph.render("iris")

'iris.pdf'

In [60]:
from xgboost import XGBClassifier

# Create and train the model
model = XGBClassifier(n_estimators=100, random_state=random_state, objective='binary:logistic')
model.fit(X, y)

X_test = input_preprocessor.transform(input_features.query("inputname.isin(@test_inp)"))
y_pred_proba = model.predict_proba(X_test)

# Select the label with highest probability for each sample
pred_cfg_test = enc.inverse_transform(np.argmax(y_pred_proba, axis=1)).astype(int)

inp_pred_map = pd.DataFrame(
    zip(test_inp, pred_cfg_test), columns=["inputname", "configurationID"]
)
print(
    "Test rank",
    icm_test.merge(inp_pred_map, on=["inputname", "configurationID"])["ranks"].mean(),
)

Test rank 1.0524271844660193


In [34]:
from sklearn.neural_network import MLPClassifier
for _ in range(20):
    clf2 = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=500) #, random_state=random_state)
    clf2.fit(X, y)
    # X_test = input_preprocessor.transform(input_features.query("inputname.isin(@test_inp)"))
    y_pred_proba = clf2.predict_proba(X_test)

    # Select the label with highest probability for each sample
    pred_cfg_test = enc.inverse_transform(np.argmax(y_pred_proba, axis=1)).astype(int)

    inp_pred_map = pd.DataFrame(
        zip(test_inp, pred_cfg_test), columns=["inputname", "configurationID"]
    )
    print(
        "Test rank",
        icm_test.merge(inp_pred_map, on=["inputname", "configurationID"])["ranks"].mean(),
    )

Test rank 5.72621359223301
Test rank 5.446601941747573
Test rank 4.733980582524272
Test rank 5.067961165048544
Test rank 5.287378640776699
Test rank 5.467961165048544
Test rank 4.846601941747573
Test rank 4.90873786407767
Test rank 5.087378640776699
Test rank 3.732038834951456
Test rank 5.3359223300970875
Test rank 5.52621359223301
Test rank 4.49126213592233
Test rank 4.293203883495146
Test rank 4.918446601941747
Test rank 4.153398058252427
Test rank 5.332038834951456
Test rank 5.8116504854368936
Test rank 5.1825242718446605
Test rank 4.475728155339806


In [None]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(
    n_estimators=100,
    random_state=random_state,
    loss_function='MultiLogloss'
)
model.fit(X, y)

X_test = input_preprocessor.transform(input_features.query("inputname.isin(@test_inp)"))
y_pred_proba = model.predict_proba(X_test)

# Select the label with highest probability for each sample
pred_cfg_test = enc.inverse_transform(np.argmax(y_pred_proba, axis=1)).astype(int)

inp_pred_map = pd.DataFrame(
    zip(test_inp, pred_cfg_test), columns=["inputname", "configurationID"]
)
print(
    "Test rank",
    icm_test.merge(inp_pred_map, on=["inputname", "configurationID"])["ranks"].mean(),
)