In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
# To add interactions in linear regressions models
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, top_k_accuracy_score, accuracy_score
from sklearn.linear_model import ElasticNet

In [2]:
predDimension = "kbs"
v_names_train = np.loadtxt("train_names.csv", dtype=str)
v_names_test = np.loadtxt("test_names.csv", dtype=str)

video_features = ["WIDTH", "HEIGHT", "SPATIAL_COMPLEXITY", "TEMPORAL_COMPLEXITY", "COLOR_COMPLEXITY", "ORIG_SIZE", "ORIG_KBS", "ORIG_DURATION"]
config_features = ["cabac", "ref", "deblock", "analyse", "me", "subme", "mixed_ref", "me_range", "trellis", 
                "8x8dct", "fast_pskip", "chroma_qp_offset", "bframes", "b_pyramid", 
                "b_adapt", "direct", "weightb", "open_gop", "weightp", "scenecut", "rc_lookahead", 
                "mbtree", "qpmax", "aq-mode"]
config_features_categorical = ['analyse', 'me', 'direct', 'deblock', 'b_pyramid', 'b_adapt', 'weightb', 'open_gop', 'scenecut', 'rc_lookahead']

In [3]:
df = pd.read_csv("all_features.csv")

for c in config_features: #_categorical:
    df[c], _ = pd.factorize(df[c])
    
df.head()

Unnamed: 0,configurationID,cabac,ref,deblock,analyse,me,subme,mixed_ref,me_range,trellis,...,etime,FILENAME,WIDTH,HEIGHT,SPATIAL_COMPLEXITY,TEMPORAL_COMPLEXITY,COLOR_COMPLEXITY,ORIG_SIZE,ORIG_DURATION,ORIG_KBS
0,1,0,0,0,0,0,0,0,0,0,...,2.14,Animation_1080P-01b3,1920,1080,0.098,0.004,0.005,1866272605,20.02,745763.278
1,101,1,1,1,1,1,1,1,0,1,...,3.4,Animation_1080P-01b3,1920,1080,0.098,0.004,0.005,1866272605,20.02,745763.278
2,102,1,1,1,1,1,1,1,0,1,...,2.71,Animation_1080P-01b3,1920,1080,0.098,0.004,0.005,1866272605,20.02,745763.278
3,103,1,1,0,2,2,1,1,0,1,...,2.78,Animation_1080P-01b3,1920,1080,0.098,0.004,0.005,1866272605,20.02,745763.278
4,104,1,2,1,1,1,1,1,1,1,...,2.74,Animation_1080P-01b3,1920,1080,0.098,0.004,0.005,1866272605,20.02,745763.278


In [4]:
# we separate the list of videos into a training (i.e. offline) set and a test set (i.e. online)
train_ind, test_ind = train_test_split([k for k in range(len(v_names_train))], test_size = 0.25)
# training set indexes
# train_index = [v[:-4] for v in v_names_train]
train_index = [v_names_train[k][:-4] for k in train_ind]
# test set indexes
test_index = [v_names_train[k][:-4] for k in test_ind]
print(len(train_index), len(test_index))

train_df = df[df.FILENAME.isin(train_index)].reset_index()
val_df = df[df.FILENAME.isin(test_index)].reset_index()

# X_train = train_df[video_features + config_features]
# y_train = np.array(train_df[predDimension]).reshape(-1, 1)
# X_val = val_df[video_features + config_features]
# y_val = np.array(val_df[predDimension]).reshape(-1, 1)

787 263


In [5]:
X_train = []
y_train = []
for video_name in train_df.FILENAME.unique():
    idx = train_df[train_df.FILENAME == video_name][predDimension].idxmin()
    row = train_df.iloc[idx]
    X_train.append(row[video_features].to_numpy())
    y_train.append(row['configurationID'])

X_train = np.array(X_train)
y_train = np.array(y_train).reshape(-1, 1)

X_val = []
y_val = []
for video_name in sorted(val_df.FILENAME.unique()):
    idx = val_df[val_df.FILENAME == video_name][predDimension].idxmin()
    row = val_df.iloc[idx]
    X_val.append(row[video_features].to_numpy())
    y_val.append(row['configurationID'])

X_val = np.array(X_val)
y_val = np.array(y_val).reshape(-1, 1)

In [6]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((730, 8), (730, 1), (241, 8), (241, 1))

In [7]:
def performance_ratio(df, pred_classes):
    ratios = []
    for video_name, pred_cl in zip(sorted(df.FILENAME.unique()), pred_classes):
        idx = df[df.FILENAME == video_name][predDimension].idxmin()
        best_perf = df.iloc[idx][predDimension]
        pred_perf = df[(df.FILENAME == video_name) & (df.configurationID == pred_cl)][predDimension]
        ratios.append(pred_perf/best_perf)
        
    return np.mean(ratios)

In [8]:
clf = RandomForestClassifier(max_depth=None, min_samples_leaf=2, n_estimators=100, n_jobs=1)
clf.fit(X_train, y_train.ravel())
clf.score(X_val, y_val)
y_pred_val = clf.predict(X_val)
performance_ratio(val_df, y_pred_val)

1.1222257210075532

In [9]:
val_df.shape

(48441, 44)

In [19]:
# Reformulate into a classification problem: how much will be the kbs be reduced?

X_train = train_df[video_features + config_features].to_numpy()
# y_train = np.array(train_df[predDimension] / train_df["ORIG_KBS"]).reshape(-1, 1)
# y_train = np.array(train_df[predDimension]).reshape(-1, 1)
X_val = val_df[video_features + config_features].to_numpy()
# y_val = np.array(val_df[predDimension] / val_df["ORIG_KBS"]).reshape(-1, 1)
# y_val = np.array(val_df[predDimension]).reshape(-1, 1)

classes = 10 ** np.linspace(np.log10(0.001), np.log10(1.0), 50)
y_train = ((train_df[predDimension] / train_df["ORIG_KBS"]).to_numpy()[:, np.newaxis] < classes.reshape(1, -1)).argmax(axis=1).reshape(-1, 1)
y_val = ((val_df[predDimension] / val_df["ORIG_KBS"]).to_numpy()[:, np.newaxis] < classes.reshape(1, -1)).argmax(axis=1).reshape(-1, 1)
true_ratio = (val_df[predDimension]).to_numpy()

In [None]:
def performance_ratio_clf(df, clf):
    ratios = []
    for video_name, pred_cl in zip(sorted(df.FILENAME.unique()), pred_classes):
        idx = df[df.FILENAME == video_name][predDimension].idxmin()
        best_perf = df.iloc[idx][predDimension]
        pred_perf = df[(df.FILENAME == video_name) & (df.configurationID == pred_cl)][predDimension]
        ratios.append(pred_perf/best_perf)
        
    return np.mean(ratios)

In [14]:
clf = RandomForestClassifier(max_depth=None, min_samples_leaf=2, n_estimators=100, n_jobs=1)
clf.fit(X_train, y_train.ravel())
clf.score(X_val, y_val)
y_pred_val = clf.predict(X_val)
# accuracy(y_val, y_pred_val)
# performance_ratio(val_df, y_pred_val)

In [21]:
np.mean(classes[y_pred_val]*val_df["ORIG_KBS"] / true_ratio)

1.3039912747343665

In [12]:
# clf = RandomForestClassifier(max_depth=None, min_samples_leaf=2, n_estimators=100, n_jobs=1)
# clf.fit(X_train, y_train)
# print(clf.classes_)
# print(clf.n_classes_)
# print(clf.n_outputs_)
# y_val_pred = clf.predict(X_val)
# accuracy_score(y_val, y_val_pred)