# Rank Prediction

Experiment to predict the Pareto rank of each (input, configuration) pair.

Currently implemented as a classification problem, but it could also be framed as a learning-to-rank problem.

In [6]:
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import plotnine as p9
from scipy import stats
from common import (
    load_data,
    pareto_rank,
    baseline_results,
    DecisionTreeClassifierWithMultipleLabels,
    DecisionTreeClassifierWithMultipleLabelsPandas,
)
import os
import json

from sklearn.model_selection import train_test_split
from sklearn.calibration import LabelEncoder
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
# Data loading and experiment settings
random_state = 1234
test_size = 0.40
pareto_cutoff = 0.4
rank_by_domination_count = False

# Metrics used for ranking; leave the list empty to use every metric
# returned by load_data.
# performances = ["fps", "cpu"]
performances = ["kbs", "fps"]
# performances = []

(
    perf_matrix_initial,
    input_features,
    config_features,
    all_performances,
    input_preprocessor,
    config_preprocessor,
) = load_data(system="x264", data_dir="../data", input_properties_type="tabular")

if not performances:
    performances = all_performances


def _min_max_scale(column):
    """Min-max scale one metric column using that column's own extremes."""
    lowest = column.min()
    highest = column.max()
    return (column - lowest) / (highest - lowest)


# The Pareto cutoff is expressed on normalized metrics.
# Normalizing before the train/test split does not leak data: scaling is done
# per input, and the split is also done per input.
_metric_frame = perf_matrix_initial[["inputname"] + all_performances]

normalized_metrics = _metric_frame.groupby("inputname", as_index=False).transform(
    _min_max_scale
)
# A row is feasible when every normalized metric is at or below the cutoff.
cutoff_mask = (normalized_metrics <= pareto_cutoff).all(axis=1)

nmdf = _metric_frame.groupby("inputname", as_index=True).transform(_min_max_scale)

# Keep raw and normalized metrics side by side: raw columns get the "_raw"
# suffix, normalized columns keep the plain metric name.
perf_matrix = perf_matrix_initial.merge(
    nmdf, left_index=True, right_index=True, suffixes=("_raw", None)
)
perf_matrix["feasible"] = cutoff_mask

all_perf_raw = [f"{metric}_raw" for metric in all_performances]
all_perf_norm = [f"{metric}" for metric in all_performances]

# Input x configuration measurement matrix over all metrics
_icm_keys = ["inputname", "configurationID"]
icm_all = (
    perf_matrix[_icm_keys + all_performances]
    .sort_values(_icm_keys)
    .set_index(_icm_keys)
)

# Go from measured values to ranks within each input group
icm_ranked_measures = icm_all.groupby("inputname").transform(
    lambda column: stats.rankdata(column, method="min")
)


def _rank_one_input(group):
    """Pareto-rank the configurations measured on a single input."""
    return pareto_rank(
        group,
        cutoff=pareto_cutoff,
        rank_by_domination_count=rank_by_domination_count,
    )


icm_all["ranks"] = icm_all.groupby("inputname", group_keys=False).apply(
    _rank_one_input
)

In [3]:
# Split by input name, then build the ranked measurement matrices


def _indexed_measurements(perf):
    """Return the selected metrics of `perf`, sorted and indexed by (input, config)."""
    keys = ["inputname", "configurationID"]
    return perf[keys + performances].sort_values(keys).set_index(keys)


def _pareto_ranks(measurements):
    """Compute the Pareto rank of every row, separately for each input."""
    return measurements.groupby("inputname", group_keys=False).apply(
        lambda group: pareto_rank(
            group,
            cutoff=pareto_cutoff,
            rank_by_domination_count=rank_by_domination_count,
        )
    )


# The split is over unique input names, so all rows of one input land on the
# same side.
train_inp, test_inp = train_test_split(
    perf_matrix["inputname"].unique(),
    test_size=test_size,
    random_state=random_state,
)
train_perf = perf_matrix[perf_matrix["inputname"].isin(train_inp)]
test_perf = perf_matrix[perf_matrix["inputname"].isin(test_inp)]

icm = _indexed_measurements(train_perf)
# icm = train_perf[train_perf.configurationID != 71]

# Go from measured values to ranks within each input group
# (computed before the "ranks" column is added to icm).
icm_ranked_measures = icm.groupby("inputname").transform(
    lambda column: stats.rankdata(column, method="min")
)
icm["ranks"] = _pareto_ranks(icm)

# Calculate the Pareto ranks for the test data
icm_test = _indexed_measurements(test_perf)
icm_test["ranks"] = _pareto_ranks(icm_test)

# Full dataset of input features + config features that are in the first rank
first_rank = icm.loc[icm["ranks"] <= 1]
dataset = first_rank.join(config_features).join(input_features).reset_index()

In [4]:
# Build the model inputs: preprocessed configuration and input features joined
# onto the training measurements, with the Pareto rank as the target.

# Save the original index so the transformed array can be re-indexed
original_index = input_features.index

# Preprocess the input features
# NOTE(review): the preprocessors are fitted on every row of the feature
# tables, not only on the training inputs -- confirm this is intended.
preprocessed_features = input_preprocessor.fit_transform(input_features)

# Convert to DataFrame and restore the original index
preprocessed_input = pd.DataFrame(preprocessed_features, index=original_index)

original_index = config_features.index

# Preprocess the configuration features
preprocessed_features = config_preprocessor.fit_transform(config_features)

# Convert to DataFrame and restore the original index
preprocessed_config = pd.DataFrame(preprocessed_features, index=original_index)

# Column labels shared by the two preprocessed frames are disambiguated with
# the "_cfg" / "_inp" suffixes during the second join.
dataset2 = (
    icm.join(preprocessed_config)
    .join(preprocessed_input, lsuffix="_cfg", rsuffix="_inp")
    .reset_index()
)

# Everything except the identifiers, the target and the measured performance
# metrics is a feature. The metric names come from `performances` (not a
# hard-coded list) so that changing the selected metrics cannot leak raw
# measurements into X.
non_feature_cols = {"inputname", "configurationID", "ranks", *performances}
feature_col = [c for c in dataset2.columns if c not in non_feature_cols]
X = dataset2[feature_col].to_numpy()
# Shift the ranks down by one to obtain the class labels.
y = dataset2["ranks"].to_numpy() - 1

In [None]:
# Random forest baseline, scored with 5-fold cross-validation.
# Seeded with the notebook-wide random_state so the scores are reproducible,
# consistent with the seeded train/test split.
clf = RandomForestClassifier(random_state=random_state)
cross_val_score(clf, X, y, cv=5)

In [1]:
# Gradient-boosted trees, scored with 5-fold cross-validation.
# Requires the import cell to have been run first (XGBClassifier is imported
# there). Seeded with the notebook-wide random_state for consistency with the
# other models.
clf = XGBClassifier(random_state=random_state)
cross_val_score(clf, X, y, cv=5)

NameError: name 'XGBClassifier' is not defined

In [None]:
from sklearn.neural_network import MLPClassifier

# Two hidden layers of 100 units each. Seeded with the notebook-wide
# random_state so the weight initialisation, and therefore the score, is
# reproducible.
clf = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    max_iter=200,
    verbose=True,
    random_state=random_state,
)
# cross_val_score(clf, X, y, cv=5)
clf.fit(X, y)
# Accuracy on the same data the model was fitted on, i.e. training accuracy.
clf.score(X, y)

In [None]:
import os

# Select the Keras backend. This is set before `import keras` below because
# Keras picks up KERAS_BACKEND when it is imported.
os.environ["KERAS_BACKEND"] = "tensorflow"

import tensorflow as tf
import pandas as pd
import keras
from keras.utils import FeatureSpace

# Hold out a fixed-seed 20% sample of the rows for validation; the remaining
# rows form the training frame.
val_dataframe = dataset.sample(frac=0.2, random_state=1337)
train_dataframe = dataset.drop(index=val_dataframe.index)

n_train_rows, n_val_rows = len(train_dataframe), len(val_dataframe)
print(
    "Using %d samples for training and %d for validation"
    % (n_train_rows, n_val_rows)
)

def dataframe_to_dataset(dataframe):
    """Turn a frame with a ``ranks`` column into a shuffled ``tf.data.Dataset``.

    The ``ranks`` column becomes the labels; all other columns are passed as a
    dict keyed by column name. The caller's frame is not modified.
    """
    targets = dataframe["ranks"]
    feature_columns = dict(dataframe.drop(columns="ranks"))
    slices = tf.data.Dataset.from_tensor_slices((feature_columns, targets))
    # The shuffle buffer is as large as the frame itself.
    return slices.shuffle(buffer_size=len(dataframe))

# Convert the training and validation frames, in that order.
train_ds, val_ds = map(dataframe_to_dataset, (train_dataframe, val_dataframe))