In [1]:
import pandas as pd
import numpy as np
import plotnine as p9
from scipy import stats
from common import load_data, pareto_rank
import os
import json

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
random_state = 1234
test_size = 0.2
pareto_cutoff = 0.4
(
    perf_matrix,
    input_features,
    config_features,
    all_performances,
    input_preprocessor,
    config_preprocessor,
) = load_data(system="x264", data_dir="../data", input_properties_type="tabular")

# Normalization is needed for the Pareto cutoff
# We can normalize before splitting, because
# we normalize per input and we also split per input.
# There is no data leakage.
normalized_metrics = (
    perf_matrix[["inputname"] + all_performances]
    .groupby("inputname", as_index=False)
    .transform(lambda x: (x - x.min()) / (x.max() - x.min()))
)
cutoff_mask = (normalized_metrics <= pareto_cutoff).all(axis=1)

nmdf = perf_matrix[["inputname"] + all_performances].groupby("inputname", as_index=True).transform(lambda x: (x - x.min()) / (x.max() - x.min()))
perf_matrix = pd.merge(perf_matrix, nmdf, suffixes=("_raw", None), left_index=True, right_index=True)

all_perf_raw = [f"{p}_raw" for p in all_performances]
all_perf_norm = [f"{p}" for p in all_performances]

train_inp, test_inp = train_test_split(
    perf_matrix["inputname"].unique(),
    test_size=test_size,
    random_state=random_state,
)
train_perf = perf_matrix[perf_matrix.inputname.isin(train_inp)]
test_perf = perf_matrix[perf_matrix.inputname.isin(test_inp)]

all_performances

['size', 'etime', 'cpu', 'fps', 'kbs']

In [3]:
performances = ["fps", "cpu"]
performances = ["size", "etime"]
icm = (
    train_perf[["inputname", "configurationID"] + performances]
    .sort_values(["inputname", "configurationID"])
    .set_index(["inputname", "configurationID"])
)
icm_ranked_measures = icm.groupby(
    "inputname"
).transform(  # Go from measured values to ranks within each input group
    lambda x: stats.rankdata(x, method="min")
)

# TODO Adjust pareto_rank with cutoff
# 1. Second version that ranks dominated configurations by their "dominators"
# 2. 
icm["ranks"] = icm.groupby("inputname", group_keys=False).apply(pareto_rank)

# Calculate the Pareto ranks for the test data
icm_test = (
    test_perf[["inputname", "configurationID"] + performances]
    .sort_values(["inputname", "configurationID"])
    .set_index(["inputname", "configurationID"])
)
icm_test["ranks"] = icm_test.groupby("inputname", group_keys=False).apply(pareto_rank)

icm

Unnamed: 0_level_0,Unnamed: 1_level_0,size,etime,ranks
inputname,configurationID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Animation_1080P-01b3,1,0.070142,0.003932,3
Animation_1080P-01b3,2,0.029658,0.066841,4
Animation_1080P-01b3,3,0.001764,0.090433,3
Animation_1080P-01b3,4,0.008462,0.098296,8
Animation_1080P-01b3,5,0.006305,0.094364,7
...,...,...,...,...
Vlog_720P-6d56,197,0.141975,0.723088,14
Vlog_720P-6d56,198,0.074812,0.583593,9
Vlog_720P-6d56,199,0.088498,0.836276,14
Vlog_720P-6d56,200,0.088498,0.809277,11


In [4]:
subdf = icm[icm.ranks <= 1]
subdf

Unnamed: 0_level_0,Unnamed: 1_level_0,size,etime,ranks
inputname,configurationID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Animation_1080P-01b3,12,0.067026,0.001311,1
Animation_1080P-01b3,14,0.070142,0.000000,1
Animation_1080P-01b3,19,0.058674,0.011796,1
Animation_1080P-01b3,58,0.003181,0.039318,1
Animation_1080P-01b3,68,0.000770,0.070773,1
...,...,...,...,...
Vlog_720P-6d56,50,0.223844,0.010038,1
Vlog_720P-6d56,64,0.079784,0.021115,1
Vlog_720P-6d56,71,0.003678,0.027345,1
Vlog_720P-6d56,72,0.000000,0.033922,1


In [24]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin

class CustomDecisionTreeClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree_ = None

    def fit(self, X, y):
        self.tree_ = self._build_tree(X, y, depth=0)
        return self

    def _build_tree(self, X, y, depth, X_indices=None):
        if len(y) == 0:
            return None
        
        # Compute label probabilities for the current node
        label_counts = np.bincount(y)
        
        # Check stopping conditions
        if depth == self.max_depth or len(y) < self.min_samples_split:
            return {'type': 'leaf', 'class': np.argmax(label_counts)}

        # Find the best split
        best_split = None
        best_impurity = 1 #impurity
        # for feature in range(X.shape[1]):
        for feature in X.columns:
            thresholds = X[feature].unique()
            for threshold in thresholds:
                left_mask = X[feature] <= threshold
                right_mask = ~left_mask
                left_y, right_y = y[left_mask], y[right_mask]
                
                if len(left_y) == 0 or len(right_y) == 0:
                    continue
                
                # Compute weighted impurity for the split
                # left_label_counts = np.bincount(left_y, minlength=len(label_counts))
                # right_label_counts = np.bincount(right_y, minlength=len(label_counts))

                # left_label_probs = left_label_counts / len(left_y)
                # right_label_probs = right_label_counts / len(right_y)
                
                # left_impurity = -np.sum(left_label_probs * np.log(left_label_probs + 1e-9))
                # right_impurity = -np.sum(right_label_probs * np.log(right_label_probs + 1e-9))
                
                # weighted_impurity = (len(left_y) * left_impurity + len(right_y) * right_impurity) / len(y)


                # Es soll jede Seite unique y values haben
                # Uberlappende sind okay, solange es unique gibt

                # TODO Die Impurity innerhalb eines leafs ist die Anzahl configs die nicht in der Pareto front sind für alle abgedeckten inputs
                # Dann brauche ich auch keine spezielle impurity im parent node mehr, s.u.

                # impurity = 0, jede Seite exakt 1 unique
                # weighted_impurity = len(np.intersect1d(left_y, right_y)) / len(y)
                left_imp = np.setdiff1d(left_y, right_y)
                right_imp = np.setdiff1d(right_y, left_y)
                weighted_impurity = (len(left_imp) + len(right_imp)) / len(y)
                
                if weighted_impurity < best_impurity:
                    best_impurity = weighted_impurity
                    best_split = {
                        'feature': feature,
                        'threshold': threshold,
                        'left_mask': left_mask,
                        'right_mask': right_mask,
                        'left_y': left_y,
                        'right_y': right_y,
                    }
        
        if best_split is None:
            return {'type': 'leaf', 'class': np.argmax(label_counts)}
        
        # Recursively build the left and right subtrees
        left_subtree = self._build_tree(X[best_split['left_mask']], best_split['left_y'], depth + 1)
        right_subtree = self._build_tree(X[best_split['right_mask']], best_split['right_y'], depth + 1)
        
        return {
            'type': 'node',
            'feature': best_split['feature'],
            'threshold': best_split['threshold'],
            'left': left_subtree,
            'right': right_subtree,
        }

    def predict(self, X):
        return np.array([self._predict_instance(x) for x in X.iterrows()])

    def _predict_instance(self, x):
        node = self.tree_
        print(x)
        while node['type'] != 'leaf':
            print(node['feature'])
            if x[node['feature']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node['class']


data = subdf.join(input_features).reset_index().set_index("inputname").sort_index()
input_labels = data.configurationID
enc = LabelEncoder()
y = enc.fit_transform(input_labels)

X = input_preprocessor.fit_transform(
    data[input_features.columns].query("inputname.isin(@input_labels.index)").sort_index()
)
X = data[input_features.columns].query("inputname.isin(@input_labels.index)").sort_index()
clf = CustomDecisionTreeClassifier(max_depth=3)
clf.fit(X, y)

In [35]:
clf.predict(X)

TypeError: string indices must be integers, not 'str'

In [19]:
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split



InvalidIndexError: (slice(None, None, None), 'resolution')

In [22]:
X["resolution"].unique()

array([1080, 2160,  360,  480,  720])

In [6]:


input_labels = subdf.set_index("inputname").configurationID.sort_index()
enc = LabelEncoder()
y = enc.fit_transform(input_labels)

X = input_preprocessor.fit_transform(
    subdf.set_index("inputname")[input_features.columns].query("inputname.isin(@input_labels.index)").sort_index()
)

train_idx, val_idx = train_test_split(
    np.arange(X.shape[0]), test_size=0.2, random_state=random_state
)
X_train = X[train_idx]
X_val = X[val_idx]
y_train = y[train_idx]
y_val = y[val_idx]
inputnames_val = input_labels.index[val_idx]

# X_train = X
# y_train = y
# X_val = X
# y_val = y
# inputnames_val = input_labels.index

best_val_rank = 100_000
best_depth = 0

for i in range(1, X.shape[1]):
    print(i)
    clf = CustomDecisionTreeClassifier(max_depth=i)
    # clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    val_score = clf.score(X_val, y_val)
    print("Scores", clf.score(X_train, y_train), val_score)

    # Validation test
    pred_cfg_lbl = clf.predict(X_val)
    pred_cfg = enc.inverse_transform(pred_cfg_lbl).astype(int)
    inp_pred_map = pd.DataFrame(
        zip(inputnames_val, pred_cfg), columns=["inputname", "configurationID"]
    )
    val_rank = icm.merge(inp_pred_map, on=["inputname", "configurationID"])[
        "ranks"
    ].mean()
    print("Val rank", val_rank)

    if val_rank < best_val_rank:
        best_val_rank = val_score
        best_depth = i

print(f"Best depth {best_depth}")
clf = CustomDecisionTreeClassifier(max_depth=best_depth)
# clf = RandomForestClassifier()
clf.fit(X, y)
pred_cfg_lbl = clf.predict(X)
pred_cfg = enc.inverse_transform(pred_cfg_lbl).astype(int)

print("Scores", clf.score(X, y))

# Validation test
inp_pred_map = pd.DataFrame(
    zip(inputnames_val, pred_cfg), columns=["inputname", "configurationID"]
)
val_rank = icm.merge(inp_pred_map, on=["inputname", "configurationID"])["ranks"].mean()
print("Val rank", val_rank)

# Test set
X_test = input_preprocessor.transform(input_features.query("inputname.isin(@test_inp)"))
pred_cfg = enc.inverse_transform(clf.predict(X_test)).astype(int)

inp_pred_map = pd.DataFrame(
    zip(test_inp, pred_cfg), columns=["inputname", "configurationID"]
)
print(
    "Test rank",
    icm_test.merge(inp_pred_map, on=["inputname", "configurationID"])["ranks"].mean(),
)
print("")

KeyError: "None of ['inputname'] are in the columns"