In [2]:
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import plotnine as p9
from scipy import stats
from common import (
    load_data,
    pareto_rank,
    baseline_results,
    DecisionTreeClassifierWithMultipleLabels,
    DecisionTreeClassifierWithMultipleLabelsPandas
)
import os
import json

from sklearn.model_selection import train_test_split
from sklearn.calibration import LabelEncoder
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [40]:
# Data Loading
random_state = 1234
test_size = 0.40
pareto_cutoff = 0.5
rank_by_domination_count = True

# performances = ["fps", "cpu"]
# performances = ["kbs", "fps"]
performances = []

(
    perf_matrix_initial,
    input_features,
    config_features,
    all_performances,
    input_preprocessor,
    config_preprocessor,
) = load_data(system="x264", data_dir="../data", input_properties_type="tabular")

if len(performances) == 0:
    performances = all_performances

# Normalization is needed for the Pareto cutoff
# We can normalize before splitting, because
# we normalize per input and we also split per input.
# There is no data leakage.
normalized_metrics = (
    perf_matrix_initial[["inputname"] + performances]
    .groupby("inputname", as_index=False)
    .transform(lambda x: (x - x.min()) / (x.max() - x.min()))
)
cutoff_mask = (normalized_metrics <= pareto_cutoff).all(axis=1)

nmdf = (
    perf_matrix_initial[["inputname"] + performances]
    .groupby("inputname", as_index=True)
    .transform(lambda x: (x - x.min()) / (x.max() - x.min()))
)
perf_matrix = pd.merge(
    perf_matrix_initial, nmdf, suffixes=("_raw", None), left_index=True, right_index=True
)
perf_matrix["feasible"] = cutoff_mask

all_perf_raw = [f"{p}_raw" for p in performances]
all_perf_norm = [f"{p}" for p in performances]

icm_all = (
    perf_matrix[["inputname", "configurationID"] + performances]
    .sort_values(["inputname", "configurationID"])
    .set_index(["inputname", "configurationID"])
)
icm_ranked_measures = icm_all.groupby(
    "inputname"
).transform(  # Go from measured values to ranks within each input group
    lambda x: stats.rankdata(x, method="min")
)
icm_all["ranks"] = icm_all.groupby("inputname", group_keys=False).apply(
    lambda x: pareto_rank(
        x, cutoff=pareto_cutoff, rank_by_domination_count=rank_by_domination_count
    )
)

In [43]:
# Split data and preprocess further
train_inp, test_inp = train_test_split(
    perf_matrix["inputname"].unique(),
    test_size=test_size,
    random_state=random_state,
)
train_perf = perf_matrix[perf_matrix.inputname.isin(train_inp)].copy()
test_perf = perf_matrix[perf_matrix.inputname.isin(test_inp)]

dropped_measurements = train_perf.sample(frac=0.9, random_state=1337)
train_perf.drop(dropped_measurements.index, inplace=True)

icm = (
    train_perf[["inputname", "configurationID"] + performances]
    .sort_values(["inputname", "configurationID"])
    .set_index(["inputname", "configurationID"])
)
icm_ranked_measures = icm.groupby(
    "inputname"
).transform(  # Go from measured values to ranks within each input group
    lambda x: stats.rankdata(x, method="min")
)
icm["ranks"] = icm.groupby("inputname", group_keys=False).apply(
    lambda x: pareto_rank(
        x, cutoff=pareto_cutoff, rank_by_domination_count=rank_by_domination_count
    )
)

# Calculate the Pareto ranks for the test data
icm_test = (
    test_perf[["inputname", "configurationID"] + performances]
    .sort_values(["inputname", "configurationID"])
    .set_index(["inputname", "configurationID"])
)
icm_test["ranks"] = icm_test.groupby("inputname", group_keys=False).apply(
    lambda x: pareto_rank(
        x, cutoff=pareto_cutoff, rank_by_domination_count=rank_by_domination_count
    )
)

# Full dataset of input features + config features that are in the first rank
dataset = icm[icm.ranks <= 1].join(config_features).join(input_features).reset_index()

In [44]:
# Baseline results
baseline_results(icm, icm_ranked_measures, icm_test, dataset, config_features, verbose=True)

Average rank of the overall best configuration: 2.23+-2.25
Average rank of the most common configuration: 2.23+-2.25
Average rank of the best configuration for all metrics: 4.92+-1.82
Average rank of random configuration: 3.83+-2.40


{'overall': [2.2349514563106796, 2.246731683222051],
 'metric': [4.918446601941747, 1.8151243358010498],
 'common': [2.2349514563106796, 2.246731683222051],
 'random': [3.825631067961165, 2.404171000865784]}

In [45]:
## We make a multi-class classification problem
# Each input is annotated with the rank-1 classes

enc = LabelEncoder()
enc.fit(dataset["configurationID"].tolist())

grouped_df = dataset.groupby("inputname")["configurationID"].apply(enc.transform).reset_index()
mlb = MultiLabelBinarizer()
# Fit and transform the 'Values' column
binary_matrix = mlb.fit_transform(grouped_df['configurationID'])

# Create a new DataFrame with the binary matrix
binary_df = pd.DataFrame(binary_matrix, columns=mlb.classes_, index=grouped_df['inputname'])

X = input_preprocessor.fit_transform(
    input_features[input_features.index.get_level_values("inputname").isin(train_inp)].sort_index()
)
y = binary_df.values

clf = DecisionTreeClassifierWithMultipleLabels(max_depth=X.shape[1])
clf.fit(X, y)
clf.score(X, y)

X_test = input_preprocessor.transform(
    input_features.query("inputname.isin(@test_inp)").sort_index()
)
pred_cfg_test = enc.inverse_transform(clf.predict(X_test)).astype(int)

inp_pred_map = pd.DataFrame(
    zip(test_inp, pred_cfg_test), columns=["inputname", "configurationID"]
)
print(
    "Test rank",
    icm_test.merge(inp_pred_map, on=["inputname", "configurationID"])["ranks"].mean(),
)

Test rank 3.258252427184466


In [46]:
Xpd = input_features[input_features.index.get_level_values("inputname").isin(train_inp)].sort_index()
Xpd_test = input_features[input_features.index.get_level_values("inputname").isin(test_inp)].sort_index()

clf = DecisionTreeClassifierWithMultipleLabelsPandas(max_depth=Xpd.shape[1])
clf.fit(Xpd, y)
clf.score(Xpd, y)

pred_cfg_test = enc.inverse_transform(clf.predict(Xpd_test)).astype(int)

inp_pred_map = pd.DataFrame(
    zip(test_inp, pred_cfg_test), columns=["inputname", "configurationID"]
)
print(
    "Test rank",
    icm_test.merge(inp_pred_map, on=["inputname", "configurationID"])["ranks"].mean(),
)

Test rank 3.153398058252427


In [69]:
clf.unique_leaf_values()

41

In [55]:
X_test = input_preprocessor.transform(
    input_features.query("inputname.isin(@test_inp)").sort_index()
)
pred_cfg_test = enc.inverse_transform(clf.predict(X_test)).astype(int)

inp_pred_map = pd.DataFrame(
    zip(test_inp, pred_cfg_test), columns=["inputname", "configurationID"]
)
print(
    "Test rank",
    icm_test.merge(inp_pred_map, on=["inputname", "configurationID"])["ranks"].mean(),
)

AttributeError: 'numpy.ndarray' object has no attribute 'iterrows'

In [42]:
input_features[
        input_features.index.get_level_values("inputname").isin(train_inp)
    ].sort_index().dtypes

resolution                      int64
WIDTH                           int64
HEIGHT                          int64
SPATIAL_COMPLEXITY            float64
TEMPORAL_COMPLEXITY           float64
CHUNK_COMPLEXITY_VARIATION    float64
COLOR_COMPLEXITY              float64
category                       object
dtype: object

In [39]:
enc = LabelEncoder()
enc.fit(dataset["configurationID"].tolist())

grouped_df = (
    dataset.groupby("inputname")["configurationID"].apply(enc.transform).reset_index()
)
mlb = MultiLabelBinarizer()
# Fit and transform the 'Values' column
binary_matrix = mlb.fit_transform(grouped_df["configurationID"])

# Create a new DataFrame with the binary matrix
binary_df = pd.DataFrame(
    binary_matrix, columns=mlb.classes_, index=grouped_df["inputname"]
)

X = input_preprocessor.fit_transform(
    input_features[
        input_features.index.get_level_values("inputname").isin(train_inp)
    ].sort_index()
)
y = binary_df.values

train_idx, val_idx = train_test_split(
    np.arange(X.shape[0]), test_size=0.2, random_state=random_state
)
X_train = X[train_idx]
X_val = X[val_idx]
y_train = y[train_idx]
y_val = y[val_idx]
inputnames_train = train_inp[train_idx]
inputnames_val = train_inp[val_idx]

best_val_rank = 100_000
best_depth = 0

for i in range(1, X.shape[1] + 1):
    clf = DecisionTreeClassifierWithMultipleLabels(max_depth=i)
    # clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    val_score = clf.score(X_val, y_val)
    train_score = clf.score(X_train, y_train)

    # Validation test
    pred_cfg_lbl = clf.predict(X_val)
    pred_cfg = enc.inverse_transform(pred_cfg_lbl).astype(int)
    inp_pred_map = pd.DataFrame(
        zip(inputnames_val, pred_cfg), columns=["inputname", "configurationID"]
    )
    val_rank = icm.merge(inp_pred_map, on=["inputname", "configurationID"])[
        "ranks"
    ].mean()

    print(
        f"Depth={i}, Train score={train_score:.2f}, Val score={val_score:.2f}, Val rank={val_rank:.2f}"
    )

    if val_rank < best_val_rank:
        best_val_rank = val_rank
        best_depth = i

print(f"Best depth {best_depth}")
clf = DecisionTreeClassifierWithMultipleLabels(max_depth=best_depth)

# Test on whole training set
clf.fit(X, y)
pred_cfg = enc.inverse_transform(clf.predict(X)).astype(int)

print("Scores", clf.score(X, y))
inp_pred_map = pd.DataFrame(
    zip(inputnames_train, pred_cfg[train_idx]), columns=["inputname", "configurationID"]
)
train_rank = icm.merge(inp_pred_map, on=["inputname", "configurationID"])[
    "ranks"
].mean()
print("Train rank", train_rank)

# Validation test
inp_pred_map = pd.DataFrame(
    zip(inputnames_val, pred_cfg[val_idx]), columns=["inputname", "configurationID"]
)
val_rank = icm.merge(inp_pred_map, on=["inputname", "configurationID"])["ranks"].mean()
print("Val rank", val_rank)

# Test set
X_test = input_preprocessor.transform(input_features.query("inputname.isin(@test_inp)"))
pred_cfg_test = enc.inverse_transform(clf.predict(X_test)).astype(int)

inp_pred_map = pd.DataFrame(
    zip(test_inp, pred_cfg_test), columns=["inputname", "configurationID"]
)
print(
    "Test rank",
    icm_test.merge(inp_pred_map, on=["inputname", "configurationID"])["ranks"].mean(),
)
print("")

Depth=1, Train score=0.52, Val score=0.42, Val rank=3.44
Depth=2, Train score=0.54, Val score=0.52, Val rank=3.77
Depth=3, Train score=0.55, Val score=0.46, Val rank=3.64
Depth=4, Train score=0.61, Val score=0.48, Val rank=3.90
Depth=5, Train score=0.66, Val score=0.52, Val rank=3.74
Depth=6, Train score=0.74, Val score=0.50, Val rank=3.90
Depth=7, Train score=0.86, Val score=0.45, Val rank=3.71
Depth=8, Train score=0.96, Val score=0.42, Val rank=3.76
Depth=9, Train score=1.00, Val score=0.41, Val rank=3.78
Depth=10, Train score=1.00, Val score=0.41, Val rank=3.78
Depth=11, Train score=1.00, Val score=0.41, Val rank=3.78
Depth=12, Train score=1.00, Val score=0.41, Val rank=3.78
Depth=13, Train score=1.00, Val score=0.41, Val rank=3.78
Depth=14, Train score=1.00, Val score=0.41, Val rank=3.78
Depth=15, Train score=1.00, Val score=0.41, Val rank=3.78
Depth=16, Train score=1.00, Val score=0.41, Val rank=3.78
Depth=17, Train score=1.00, Val score=0.41, Val rank=3.78
Depth=18, Train score=1