In [2]:
import pandas as pd
import numpy as np
import plotnine as p9
from scipy import stats
from common import load_data, pareto_rank
import os
import json

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from mlxtend.frequent_patterns import fpgrowth, fpmax
import pysubgroup as ps

In [3]:
# --- Experiment configuration ---------------------------------------------
random_state = 1234  # seed for the train/test split over inputs
test_size = 0.2  # fraction of inputs held out as test inputs
pareto_cutoff = 0.4  # threshold applied to every normalized metric in `cutoff_mask`

(
    perf_matrix,
    input_features,
    config_features,
    all_performances,
    input_preprocessor,
    config_preprocessor,
) = load_data(system="x264", data_dir="../data", input_properties_type="tabular")

# --- Per-input min-max normalization --------------------------------------
# Normalization is needed for the Pareto cutoff.
# We can normalize before splitting because we normalize per input and we
# also split per input, so no statistics leak from test inputs into train.
# NOTE(review): if a metric is constant across all configurations of an input,
# max == min and the expression below evaluates 0/0 = NaN for that input.
normalized_metrics = (
    perf_matrix[["inputname"] + all_performances]
    .groupby("inputname", as_index=False)
    .transform(lambda x: (x - x.min()) / (x.max() - x.min()))
)
# True for rows whose normalized metrics are all at or below the cutoff.
cutoff_mask = (normalized_metrics <= pareto_cutoff).all(axis=1)

# The normalized frame was previously recomputed with an identical expression;
# reuse the result instead of running the groupby/transform a second time.
nmdf = normalized_metrics

# Keep both versions side by side: raw values get the "_raw" suffix, the
# normalized values keep the original metric names.
perf_matrix = pd.merge(
    perf_matrix,
    nmdf,
    suffixes=("_raw", None),
    left_index=True,
    right_index=True,
)

all_perf_raw = [f"{p}_raw" for p in all_performances]
all_perf_norm = list(all_performances)

# --- Train/test split over inputs (not over individual rows) --------------
train_inp, test_inp = train_test_split(
    perf_matrix["inputname"].unique(),
    test_size=test_size,
    random_state=random_state,
)
train_perf = perf_matrix[perf_matrix.inputname.isin(train_inp)]
test_perf = perf_matrix[perf_matrix.inputname.isin(test_inp)]

all_performances

['size', 'etime', 'cpu', 'fps', 'kbs']

In [4]:
# Performance metrics used as objectives for the Pareto ranking.
# An alternative pair tried earlier was ["fps", "cpu"].
# NOTE(review): confirm the optimization direction `pareto_rank` assumes before
# switching to a metric such as fps.
performances = ["size", "etime"]


def _input_config_matrix(perf_df, measures):
    """Select `measures` from `perf_df`, indexed and sorted by (inputname, configurationID)."""
    key = ["inputname", "configurationID"]
    return perf_df[key + measures].sort_values(key).set_index(key)


def _pareto_ranks_per_input(matrix):
    """Apply `pareto_rank` to each input's group of rows and return the combined result."""
    return matrix.groupby("inputname", group_keys=False).apply(pareto_rank)


# Input-configuration matrix for the training inputs.
icm = _input_config_matrix(train_perf, performances)

# Go from measured values to ranks within each input group.
# Computed before the "ranks" column is added so only the measures are ranked.
icm_ranked_measures = icm.groupby("inputname").transform(
    lambda x: stats.rankdata(x, method="min")
)

# TODO Adjust pareto_rank with cutoff:
#   add a second version that ranks dominated configurations by their "dominators".
icm["ranks"] = _pareto_ranks_per_input(icm)

# Calculate the Pareto ranks for the test data
icm_test = _input_config_matrix(test_perf, performances)
icm_test["ranks"] = _pareto_ranks_per_input(icm_test)

icm

Unnamed: 0_level_0,Unnamed: 1_level_0,size,etime,ranks
inputname,configurationID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Animation_1080P-01b3,1,0.070142,0.003932,3
Animation_1080P-01b3,2,0.029658,0.066841,4
Animation_1080P-01b3,3,0.001764,0.090433,3
Animation_1080P-01b3,4,0.008462,0.098296,8
Animation_1080P-01b3,5,0.006305,0.094364,7
...,...,...,...,...
Vlog_720P-6d56,197,0.141975,0.723088,14
Vlog_720P-6d56,198,0.074812,0.583593,9
Vlog_720P-6d56,199,0.088498,0.836276,14
Vlog_720P-6d56,200,0.088498,0.809277,11


In [5]:
# Keep only the training rows whose Pareto rank is at most 1.
has_top_rank = icm["ranks"] <= 1
subdf = icm.loc[has_top_rank]
subdf

Unnamed: 0_level_0,Unnamed: 1_level_0,size,etime,ranks
inputname,configurationID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Animation_1080P-01b3,12,0.067026,0.001311,1
Animation_1080P-01b3,14,0.070142,0.000000,1
Animation_1080P-01b3,19,0.058674,0.011796,1
Animation_1080P-01b3,58,0.003181,0.039318,1
Animation_1080P-01b3,68,0.000770,0.070773,1
...,...,...,...,...
Vlog_720P-6d56,50,0.223844,0.010038,1
Vlog_720P-6d56,64,0.079784,0.021115,1
Vlog_720P-6d56,71,0.003678,0.027345,1
Vlog_720P-6d56,72,0.000000,0.033922,1


In [47]:
# Attach the input and configuration features to every row of `icm`,
# flatten the index, and label rows with rank <= 1 as the positive class.
subdf = (
    icm.join(input_features)
    .join(config_features)
    .reset_index()
    .assign(Target=lambda frame: frame["ranks"] <= 1)
)

# Subgroup-discovery table: all feature columns plus the boolean target.
feature_columns = [*input_features.columns, *config_features.columns]
dataset = subdf[feature_columns + ["Target"]]
dataset

Unnamed: 0,resolution,WIDTH,HEIGHT,SPATIAL_COMPLEXITY,TEMPORAL_COMPLEXITY,CHUNK_COMPLEXITY_VARIATION,COLOR_COMPLEXITY,category,cabac,ref,...,me,direct,deblock,b_adapt,b_pyramid,open_gop,rc_lookahead,scenecut,weightb,Target
0,1080,1920,1080,0.098,0.004,0.017,0.005,Animation,0,1,...,dia,,0:0:0,,,,,0.0,,False
1,1080,1920,1080,0.098,0.004,0.017,0.005,Animation,1,1,...,dia,auto,1:0:0,1.0,2.0,0.0,,40.0,1.0,False
2,1080,1920,1080,0.098,0.004,0.017,0.005,Animation,1,1,...,hex,auto,1:0:0,1.0,2.0,0.0,10.0,40.0,1.0,False
3,1080,1920,1080,0.098,0.004,0.017,0.005,Animation,1,2,...,hex,auto,1:0:0,1.0,2.0,0.0,20.0,40.0,1.0,False
4,1080,1920,1080,0.098,0.004,0.017,0.005,Animation,1,2,...,hex,auto,1:0:0,1.0,2.0,0.0,30.0,40.0,1.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206824,720,1280,720,1.074,0.428,11.794,0.415,Vlog,1,16,...,tesa,spatial,1:0:0,2.0,2.0,0.0,60.0,40.0,1.0,False
206825,720,1280,720,1.074,0.428,11.794,0.415,Vlog,1,16,...,hex,spatial,1:0:0,2.0,2.0,0.0,60.0,40.0,1.0,False
206826,720,1280,720,1.074,0.428,11.794,0.415,Vlog,1,16,...,tesa,spatial,1:0:0,2.0,2.0,0.0,60.0,40.0,1.0,False
206827,720,1280,720,1.074,0.428,11.794,0.415,Vlog,1,16,...,tesa,spatial,1:0:0,2.0,2.0,0.0,60.0,40.0,1.0,False


In [55]:
# Subgroup discovery over input + configuration features:
# search for conjunctions of selectors describing rows where Target is True.
target = ps.BinaryTarget("Target", True)

# Candidate selectors for every column except the ignored ones,
# with nbins=10 passed for the numeric columns.
ignored_columns = ["Target", "configurationID"]
searchspace = ps.create_selectors(dataset, nbins=10, ignore=ignored_columns)

# Keep the 100 best subgroups with up to 10 selectors each, scored by ps.WRAccQF.
quality_function = ps.WRAccQF()
task = ps.SubgroupDiscoveryTask(
    dataset,
    target,
    searchspace,
    result_set_size=100,
    depth=10,
    qf=quality_function,
)

search = ps.DFS()
result = search.execute(task)
rdf = result.to_dataframe()

In [59]:
# Display the subgroup table without re-sorting it.
# Optional: chain .sort_values("target_share_sg", ascending=False) to order by target share.
rdf

Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.023247,chroma_qp_offset==0 AND fast_pskip==1 AND trel...,77175,206829,8622,10221,129654,0.373134,0.626866,0.843557,0.156443,0.111720,0.012333,0.049418,2.260734
1,0.023247,chroma_qp_offset==0 AND fast_pskip==1 AND qpma...,77175,206829,8622,10221,129654,0.373134,0.626866,0.843557,0.156443,0.111720,0.012333,0.049418,2.260734
2,0.022678,fast_pskip==1 AND trellis==0,85407,206829,8911,10221,121422,0.412935,0.587065,0.871833,0.128167,0.104336,0.010789,0.049418,2.111305
3,0.022678,fast_pskip==1 AND qpmax==69 AND trellis==0,85407,206829,8911,10221,121422,0.412935,0.587065,0.871833,0.128167,0.104336,0.010789,0.049418,2.111305
4,0.022128,chroma_qp_offset==0 AND trellis==0,87465,206829,8899,10221,119364,0.422886,0.577114,0.870658,0.129342,0.101744,0.011075,0.049418,2.058851
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.017565,chroma_qp_offset==0 AND deblock=='1:0:0' AND q...,61740,206829,6684,10221,145089,0.298507,0.701493,0.653948,0.346052,0.108260,0.024378,0.049418,2.190725
96,0.017522,chroma_qp_offset==0 AND fast_pskip==1 AND mbtr...,41160,206829,5658,10221,165669,0.199005,0.800995,0.553566,0.446434,0.137464,0.027543,0.049418,2.781670
97,0.017522,chroma_qp_offset==0 AND fast_pskip==1 AND mbtr...,41160,206829,5658,10221,165669,0.199005,0.800995,0.553566,0.446434,0.137464,0.027543,0.049418,2.781670
98,0.017353,deblock=='1:0:0' AND fast_pskip==1 AND mixed_r...,48363,206829,5979,10221,158466,0.233831,0.766169,0.584972,0.415028,0.123628,0.026769,0.049418,2.501689


In [28]:
# Per-configuration subgroup discovery on the rank <= 1 rows:
# for each configuration that appears among those rows, look for subgroups of
# *input* features in which that configuration is over-represented.
#
# The rank <= 1 rows are derived from `icm` directly instead of relying on
# `subdf`: an earlier cell rebinds `subdf` to an already feature-joined frame
# with a flat index, in which case `subdf.join(input_features)` fails on the
# overlapping columns when the notebook is run top to bottom.
top_ranked = icm[icm["ranks"] <= 1]
input_feature_columns = list(input_features.columns)
dataset = (
    top_ranked.join(input_features)
    .reset_index()[input_feature_columns + ["configurationID"]]
    .copy()  # own the data so the in-loop "Target" assignment is unambiguous
)

# Loop-invariant pieces: the target definition and the selector search space do
# not depend on which configuration is currently labelled as positive
# ("Target" and "configurationID" are excluded from the selectors).
target = ps.BinaryTarget("Target", True)
searchspace = ps.create_selectors(dataset, ignore=["Target", "configurationID"])

all_results = []

for c in dataset.configurationID.unique():
    # Positive class: rows that belong to configuration `c`.
    dataset["Target"] = dataset["configurationID"] == c

    task = ps.SubgroupDiscoveryTask(
        dataset,
        target,
        searchspace,
        result_set_size=20,
        depth=10,
        qf=ps.SimpleBinomialQF(),
    )
    result = ps.DFS().execute(task)
    rdf = result.to_dataframe()
    rdf["target"] = c  # remember which configuration these subgroups describe
    all_results.append(rdf)

In [29]:
# Stack the per-configuration result tables into one frame.
# Row labels of the individual tables are kept as-is (no ignore_index).
rdf = pd.concat(objs=all_results, axis=0)
rdf

Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift,target
0,0.006354,COLOR_COMPLEXITY: [0.09:0.18[ AND TEMPORAL_COM...,2,10221,1,468,10219,0.000196,0.999804,0.002137,0.997863,0.500000,0.045699,0.045788,10.919872,12
1,0.006354,COLOR_COMPLEXITY: [0.09:0.18[ AND SPATIAL_COMP...,2,10221,1,468,10219,0.000196,0.999804,0.002137,0.997863,0.500000,0.045699,0.045788,10.919872,12
2,0.006354,COLOR_COMPLEXITY: [0.09:0.18[ AND SPATIAL_COMP...,2,10221,1,468,10219,0.000196,0.999804,0.002137,0.997863,0.500000,0.045699,0.045788,10.919872,12
3,0.006354,COLOR_COMPLEXITY: [0.09:0.18[ AND HEIGHT>=1080...,2,10221,1,468,10219,0.000196,0.999804,0.002137,0.997863,0.500000,0.045699,0.045788,10.919872,12
4,0.006354,COLOR_COMPLEXITY: [0.09:0.18[ AND HEIGHT>=1080...,2,10221,1,468,10219,0.000196,0.999804,0.002137,0.997863,0.500000,0.045699,0.045788,10.919872,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,0.002740,CHUNK_COMPLEXITY_VARIATION<0.67 AND COLOR_COMP...,13,10221,1,1,10208,0.001272,0.998728,1.000000,0.000000,0.076923,0.000000,0.000098,786.230769,128
16,0.002740,CHUNK_COMPLEXITY_VARIATION<0.67 AND COLOR_COMP...,13,10221,1,1,10208,0.001272,0.998728,1.000000,0.000000,0.076923,0.000000,0.000098,786.230769,128
17,0.002740,CHUNK_COMPLEXITY_VARIATION<0.67 AND COLOR_COMP...,13,10221,1,1,10208,0.001272,0.998728,1.000000,0.000000,0.076923,0.000000,0.000098,786.230769,128
18,0.002740,CHUNK_COMPLEXITY_VARIATION<0.67 AND COLOR_COMP...,13,10221,1,1,10208,0.001272,0.998728,1.000000,0.000000,0.076923,0.000000,0.000098,786.230769,128


In [37]:
# Show the rows of `dataset` covered by the first subgroup in `result`.
# NOTE(review): `result` here is whatever the most recently executed search left
# behind (the last loop iteration if the per-configuration cell ran last).
first_entry = result.to_descriptions()[0]
first_subgroup = first_entry[1]  # entry layout: element 1 is the subgroup description
dataset[first_subgroup.covers(dataset)]

Unnamed: 0,resolution,WIDTH,HEIGHT,SPATIAL_COMPLEXITY,TEMPORAL_COMPLEXITY,CHUNK_COMPLEXITY_VARIATION,COLOR_COMPLEXITY,category,configurationID,Target
9041,480,720,1080,0.293,0.03,0.568,0.122,VerticalVideo,15,False
9042,480,720,1080,0.293,0.03,0.568,0.122,VerticalVideo,25,False
9043,480,720,1080,0.293,0.03,0.568,0.122,VerticalVideo,26,False
9044,480,720,1080,0.293,0.03,0.568,0.122,VerticalVideo,45,False
9045,480,720,1080,0.293,0.03,0.568,0.122,VerticalVideo,46,False
9046,480,720,1080,0.293,0.03,0.568,0.122,VerticalVideo,47,False
9047,480,720,1080,0.293,0.03,0.568,0.122,VerticalVideo,62,False
9048,480,720,1080,0.293,0.03,0.568,0.122,VerticalVideo,80,False
9049,480,720,1080,0.293,0.03,0.568,0.122,VerticalVideo,81,False
9050,480,720,1080,0.293,0.03,0.568,0.122,VerticalVideo,84,False


In [38]:
# Display the description (element 1) of the first entry in `result`.
result.to_descriptions()[0][1]

(SPATIAL_COMPLEXITY<0.628 and WIDTH: [640:850[ and category=='VerticalVideo' and resolution==480)

In [15]:
# Reference example on pysubgroup's bundled Titanic data.
# `ps` is already imported in the notebook's import cell; only the dataset
# loader is imported here.
# NOTE(review): this cell rebinds `target`, `searchspace`, `task` and `result`.
from pysubgroup.datasets import get_titanic_data

# Load the example dataset
data = get_titanic_data()

target = ps.BinaryTarget("Survived", True)
searchspace = ps.create_selectors(data, ignore=["Survived"])
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchspace,
    result_set_size=5,
    depth=2,
    qf=ps.WRAccQF(),
)
result = ps.DFS().execute(task)

# Render the result table; without this final expression the cell produces no
# output, although a result table is shown beneath it.
result.to_dataframe()


Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.13215,Sex=='female',56,156,40,54,100,0.358974,0.641026,0.740741,0.259259,0.714286,0.14,0.346154,2.063492
1,0.101331,Parch==0 AND Sex=='female',41,156,30,54,115,0.262821,0.737179,0.555556,0.444444,0.731707,0.208696,0.346154,2.113821
2,0.079142,Sex=='female' AND SibSp: [0:1[,25,156,21,54,131,0.160256,0.839744,0.388889,0.611111,0.84,0.251908,0.346154,2.426667
3,0.077663,Cabin.isnull() AND Sex=='female',43,156,27,54,113,0.275641,0.724359,0.5,0.5,0.627907,0.238938,0.346154,1.813953
4,0.071746,Embarked=='S' AND Sex=='female',37,156,24,54,119,0.237179,0.762821,0.444444,0.555556,0.648649,0.252101,0.346154,1.873874
