In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
# To add interactions in linear regressions models
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
import os
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
# This is all_features.csv
vf = pd.read_csv("deepvar_video_features.csv", usecols=["FILENAME", "WIDTH", "HEIGHT", "SPATIAL_COMPLEXITY", "TEMPORAL_COMPLEXITY", "CHUNK_COMPLEXITY_VARIATION", "COLOR_COMPLEXITY"])
vf.head()

Unnamed: 0,FILENAME,WIDTH,HEIGHT,SPATIAL_COMPLEXITY,TEMPORAL_COMPLEXITY,CHUNK_COMPLEXITY_VARIATION,COLOR_COMPLEXITY
0,Animation_1080P-01b3,1920,1080,0.098,0.004,0.017,0.005
1,Animation_1080P-05f8,1920,1080,1.229,0.454,15.962,0.794
2,Animation_1080P-0c4f,1920,1080,1.714,0.136,9.048,0.0
3,Animation_1080P-0cdf,1920,1080,1.33,0.451,3.677,0.181
4,Animation_1080P-18f5,1920,1080,2.452,0.137,3.918,0.329


In [3]:
vf.describe()

Unnamed: 0,WIDTH,HEIGHT,SPATIAL_COMPLEXITY,TEMPORAL_COMPLEXITY,CHUNK_COMPLEXITY_VARIATION,COLOR_COMPLEXITY
count,1397.0,1397.0,1397.0,1397.0,1397.0,1397.0
mean,1526.392269,900.219041,1.354391,0.353651,4.15589,0.284471
std,1027.919721,540.881707,0.851814,0.226049,4.63931,0.230898
min,202.0,320.0,0.008,0.001,0.002,0.0
25%,640.0,480.0,0.713,0.173,0.767,0.113
50%,1280.0,720.0,1.164,0.321,2.409,0.208
75%,1920.0,1080.0,1.86,0.516,6.091,0.413
max,5120.0,2560.0,4.675,1.286,40.49,2.233


In [4]:
def elapsedtime_to_sec(el):
    tab = el.split(":")
    return float(tab[0])*60+float(tab[1])

# the data folder, see the markdown there for additional explanations
res_dir = "res_ugc/"

# the list of videos names, e.g. Animation_360P-3e40
# we sort the list so we keep the same ids between two launches
v_names = sorted(os.listdir(res_dir)) 

# the list of measurements
listVideo = []

# we add each dataset in the list, converting the time to the right format
# third line asserts that the measures are complete
for v in v_names:
    data = pd.read_table(res_dir+v, delimiter = ',')
    data['etime'] = [*map(elapsedtime_to_sec, data['elapsedtime'])]
    assert data.shape == (201,34), v
    data["FILENAME"] = os.path.splitext(v)[0]
    listVideo.append(data)


In [5]:
print(" We consider ", len(listVideo), " videos")
listVideo[0].head()

 We consider  1397  videos


Unnamed: 0,configurationID,cabac,ref,deblock,analyse,me,subme,mixed_ref,me_range,trellis,8x8dct,fast_pskip,chroma_qp_offset,bframes,b_pyramid,b_adapt,direct,weightb,open_gop,weightp,scenecut,rc_lookahead,mbtree,qpmax,aq-mode,size,usertime,systemtime,elapsedtime,cpu,frames,fps,kbs,etime,FILENAME
0,1,0,1,0:0:0,0:0,dia,0,0,16,0,0,1,0,0,,,,,,0,0,,0,69,0,403085,7.94,1.36,0:02.14,434,600,375.22,161.07,2.14,Animation_1080P-01b3
1,101,1,2,1:0:0,0x3:0x113,hex,6,1,16,1,1,1,0,8,2.0,1.0,spatial,1.0,0.0,1,0,60.0,1,69,1,234157,23.17,1.79,0:03.40,734,600,217.07,93.57,3.4,Animation_1080P-01b3
2,102,1,2,1:0:0,0x3:0x113,hex,6,1,16,1,1,1,0,8,2.0,1.0,auto,1.0,0.0,1,0,10.0,1,69,0,159836,18.42,1.66,0:02.71,739,600,293.42,63.87,2.71,Animation_1080P-01b3
3,103,1,2,0:0:0,0x3:0x3,umh,6,1,16,1,1,1,0,8,2.0,1.0,spatial,1.0,0.0,1,0,10.0,1,69,0,163586,22.39,1.53,0:02.78,858,600,276.79,65.37,2.78,Animation_1080P-01b3
4,104,1,16,1:0:0,0x3:0x113,hex,6,1,24,1,1,1,-2,3,2.0,1.0,auto,1.0,0.0,1,40,30.0,0,69,1,218392,17.4,1.76,0:02.74,699,600,287.79,87.27,2.74,Animation_1080P-01b3


In [6]:
predDimension = "kbs"
v_names_train = np.loadtxt("train_names.csv", dtype= str)

# save names of test inputs
# np.savetxt("../../data/test_names.csv", v_names_test, fmt='%s')
v_names_test = np.loadtxt("test_names.csv", dtype= str)

In [7]:
listVideoTest = [listVideo[i] for i in range(len(listVideo)) if v_names[i] in v_names_test]
assert len(listVideoTest) == len(v_names_test)

best_perfs = [np.min(vid[predDimension]) for vid in listVideoTest]
best_configs = [np.argmin(vid[predDimension]) for vid in listVideoTest]

In [8]:
listFeatures = ["cabac", "ref", "deblock", "analyse", "me", "subme", "mixed_ref", "me_range", "trellis", 
                "8x8dct", "fast_pskip", "chroma_qp_offset", "bframes", "b_pyramid", 
                "b_adapt", "direct", "weightb", "open_gop", "weightp", "scenecut", "rc_lookahead", 
                "mbtree", "qpmax", "aq-mode"]
categorial = ['analyse', 'me', 'direct']
val_config = listVideo[0][listFeatures].replace(to_replace ="None",value='0')
val_config['deblock'] =[int(val[0]) for val in val_config['deblock']]

for col in val_config.columns:
    if col not in categorial:
        arr_col = np.array(val_config[col],int)
        arr_col = (arr_col-np.mean(arr_col))/(np.std(arr_col)+1e-5)
        val_config[col] = arr_col
    else:
        if col not in [predDimension,'ranking']:
            val_config[col] = [np.where(k==val_config[col].unique())[0][0] for k in val_config[col]]
            arr_col = np.array(val_config[col],int)
            arr_col = (arr_col-np.mean(arr_col))/(np.std(arr_col)+1e-5)
            val_config[col] = arr_col

In [9]:
# we load the file (in itself an aggregation of datasets)
# the file is available in the data folder, then ugc_meta
# each line is a video, and the columns are the different metrics
# provided by Wang et. al.
meta = pd.read_csv("deepvar_video_features.csv").set_index('FILENAME')
# category is a high-level characterization of the content of the video
# for an example, Sports for a sports video
# you can see more details about different categories 
# and metrics per category in the resources/categories.csv file
# I also recommand to read the Youtube UGC paper to understand why we consider these categories
meta['category']=[str(meta.index[i]).split('_')[0] for i in range(meta.shape[0])]
# a lot of NA, not a big feature importance, seems complicated to compute -> remove NOISE DMOS
del meta['NOISE_DMOS']
# fill NA with zeros
meta = meta.fillna(0)
# create a numeric variable (quanti) to compute the category
# one video has one and only one category (1 to 1 in sql, so we can join the tables)
# again, to do it properly, we should use dummies
# but then we cannot compare directly the importances of the metrics to categories 
cat_tab = pd.Series(meta['category'].values).unique()
meta['video_category'] = [np.where(cat_tab==meta['category'][i])[0][0] for i in range(len(meta['category']))]
# delete the old columns (quali)
del meta['category']
# we normalize the variables, since height mean is about 1000, and complexity about 2
# different scales do not behave correctly with learning algorithms
for col in meta.columns:#[:len(meta.columns)-1]:
    inter = np.array(meta[col],float)
    meta[col] = (inter-np.mean(inter))/np.std(inter)
# left join performance groups to the dataset of metrics
# fake groups
groups = np.random.randint(0,3, len(listVideo))
perf = pd.DataFrame({'FILENAME': np.array([v_names[k][:-4] for k in range(len(v_names))]),
              'perf_group' : np.array([k for k in groups])}).set_index('FILENAME')
meta_perf = perf.join(meta)
# print the results
meta_perf

Unnamed: 0_level_0,perf_group,SLEEQ_DMOS,BANDING_DMOS,WIDTH,HEIGHT,SPATIAL_COMPLEXITY,TEMPORAL_COMPLEXITY,CHUNK_COMPLEXITY_VARIATION,COLOR_COMPLEXITY,video_category
FILENAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Animation_1080P-01b3,2,-0.678859,4.653015,0.383054,0.332504,-1.475487,-1.547345,-0.892454,-1.210798,-1.618194
Animation_1080P-05f8,1,0.844509,0.741729,0.383054,0.332504,-0.147257,0.444086,2.545710,2.207516,-1.618194
Animation_1080P-0c4f,1,-0.655778,-0.377464,0.383054,0.332504,0.422320,-0.963192,1.054868,-1.232460,-1.618194
Animation_1080P-0cdf,1,-0.294170,-0.059377,0.383054,0.332504,-0.028644,0.430810,-0.103261,-0.448284,-1.618194
Animation_1080P-18f5,0,-0.478821,-0.377464,0.383054,0.332504,1.289017,-0.958767,-0.051295,0.192920,-1.618194
...,...,...,...,...,...,...,...,...,...,...
Vlog_720P-561e,0,-0.678859,-0.377464,-0.239786,-0.333314,0.978979,-1.414583,-0.652893,0.457201,1.494379
Vlog_720P-5d08,0,-0.678859,-0.377464,-0.773092,-0.333314,3.257287,-0.303807,-0.437698,-0.158009,1.494379
Vlog_720P-60f8,0,0.444433,0.623920,-0.239786,-0.333314,0.234418,-0.042708,-0.364385,-0.149344,1.494379
Vlog_720P-6410,1,-0.455739,3.769441,-0.239786,-0.333314,-0.770856,2.121314,1.971065,-0.240326,1.494379


In [10]:
# we separate the list of videos into a training (i.e. offline) set and a test set (i.e. online)
train_ind, test_ind = train_test_split([k for k in range(len(v_names_train))], test_size = 0.25)
# training set indexes
# train_index = [v[:-4] for v in v_names_train]
train_index = [v_names_train[k][:-4] for k in train_ind]
# test set indexes
test_index = [v_names_train[k][:-4] for k in test_ind]
print(len(train_index), len(test_index))

787 263


In [11]:
# we add the input properties
name_col = list(meta_perf.columns)[1:]

# we add the x264 configuration options
for vcc in val_config.columns:
    name_col.append(vcc)

# Then, X (i.e the predicting variables) =  input properties + software configuration options

# we add the variable to predict, i.e. y the bitrate performance distribution
name_col.append("bitrate")

# X length, the number of predicting variables
nb_col = len(name_col)
# the number of configurations
nb_config = 201

# generate the datasets = (X,y)
def gen_dataset(inputs_names):
    # inputs : names of videos
    # output : aggregation of multiple (X,y) for all the videos in the list of names provided in input 
    
    # the final dataset
    res = pd.DataFrame(np.zeros(nb_config*len(inputs_names)*nb_col).reshape(nb_config*len(inputs_names), nb_col))
    res.columns = name_col
    
    # we add the data video per video
    # LINES 6-10 in Algorithm 1
    for i in range(len(inputs_names)):
        # first, we retrieve the name of the video
        video_name = inputs_names[i]
        index_video = np.where(np.array([v[:-4] for v in v_names], str)==video_name)[0][0]
        # we compute the performance, here
        bitrates = listVideo[index_video][predDimension]
        # get the input properties of the video
        video_prop = np.array(meta_perf.loc[video_name][1:], float)
        # compute the avrage value and the standard deviation for the bitrate
        # as we said in the paper, it does not change the order of variable
        # which is a good property
        moy = np.mean(bitrates)
        std = np.std(bitrates)
        # for each configuration, we add the values of the input properties and the configuration options (=X)
        # and the normalized values of bitrates (=y)
        for config_id in range(nb_config):
            val = list(tuple(video_prop) + tuple(val_config.loc[config_id]))
            val.append((bitrates[config_id]-moy)/std)
            res.loc[i*nb_config+config_id] = val
    return res

# training dataset
training_data = gen_dataset(train_index)

# dimensions of the different sets = a proxy to the measurement cost 
print("Training size : ", training_data.shape[0])

# OFFLINE - Training data
X_train = np.array(training_data.drop(["bitrate"],axis=1), float)
y_train = np.array(training_data["bitrate"], float)

Training size :  158187


In [12]:
# test dataset
test_data = gen_dataset(test_index)

X_test = np.array(test_data.drop(["bitrate"],axis=1), float)
y_test = np.array(test_data["bitrate"], float)

print("Test size : ", test_data.shape[0])

Test size :  52863


In [13]:
# # Parameters from the grid search in the paper
# rgr = RandomForestRegressor(max_depth=None, max_features=15, min_samples_leaf=2, n_estimators=100, n_jobs=4)
# rgr.fit(X_train, y_train)
# # rgr.score(X_test, y_test)
# y_pred_train = rgr.predict(X_train)
# y_pred_test = rgr.predict(X_test)
# print("train error: ", mean_squared_error(y_train, y_pred_train))
# print("train error: ", mean_squared_error(y_test, y_pred_test))

In [14]:
# rgr.score(X_test, y_test)

In [32]:
# Inspired from https://modal-python.readthedocs.io/en/latest/content/examples/active_regression.html
# Performance improves, but not stellar. Could be related to the GP kernel.
from modAL.models import ActiveLearner, CommitteeRegressor
from modAL.disagreement import max_std_sampling
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, RBF

kernel = RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)) \
         + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))

n_init = 100

train_idx_al = np.random.choice(range(X_train.shape[0]), size=n_initial, replace=False)
X_train_al = X_train[train_idx_al]
y_train_al = y_train[train_idx_al]

# creating a reduced copy of the data with the known instances removed
X_pool = np.delete(X_train, train_idx_al, axis=0)
y_pool = np.delete(y_train, train_idx_al)

# initializing the learner
optimizer = ActiveLearner(
#     estimator=RandomForestRegressor(max_depth=None, max_features=15, min_samples_leaf=2, n_estimators=100, n_jobs=1),
    estimator=GaussianProcessRegressor(kernel),
    query_strategy=max_std_sampling,
    X_training=X_train_al, y_training=y_train_al
)

y_pred_train = optimizer.predict(X_train)
y_pred_test = optimizer.predict(X_test)
acc_train = mean_squared_error(y_train, y_pred_train)
acc_test =  mean_squared_error(y_test, y_pred_test)
print(f"{n_init} -> {acc_train:.2f} / {acc_test:.2f}")

for n_query in range(n_init, len(X_train)):
    query_idx, query_inst = optimizer.query(X_pool)
    optimizer.teach(X_pool[query_idx], y_pool[query_idx])
    
    if n_query % 20 == 0:
        y_pred_train = optimizer.predict(X_train)
        y_pred_test = optimizer.predict(X_test)
        acc_train = mean_squared_error(y_train, y_pred_train)
        acc_test =  mean_squared_error(y_test, y_pred_test)
        print(f"{n_query} ({n_query/len(X_train):.2f}%) -> {acc_train:.2f} / {acc_test:.2f}")
        
y_pred_train = optimizer.predict(X_train)
y_pred_test = optimizer.predict(X_test)
acc_train = mean_squared_error(y_train, y_pred_train)
acc_test =  mean_squared_error(y_test, y_pred_test)
print(f"{acc_train:.2f} / {acc_test:.2f}")

100 -> 0.87 / 0.87
100 (0.00%) -> 0.97 / 0.99
120 (0.00%) -> 0.80 / 0.79
140 (0.00%) -> 0.58 / 0.58
160 (0.00%) -> 0.56 / 0.56
180 (0.00%) -> 0.99 / 0.99
200 (0.00%) -> 0.48 / 0.48
220 (0.00%) -> 0.47 / 0.47
240 (0.00%) -> 0.44 / 0.45
260 (0.00%) -> 0.41 / 0.42
280 (0.00%) -> 0.43 / 0.44
300 (0.00%) -> 0.42 / 0.42
320 (0.00%) -> 0.42 / 0.42




340 (0.00%) -> 0.41 / 0.42




360 (0.00%) -> 0.40 / 0.41




380 (0.00%) -> 0.39 / 0.40




400 (0.00%) -> 0.38 / 0.39




420 (0.00%) -> 0.37 / 0.38




440 (0.00%) -> 0.36 / 0.37




460 (0.00%) -> 0.35 / 0.37




480 (0.00%) -> 0.35 / 0.36




500 (0.00%) -> 0.34 / 0.36




520 (0.00%) -> 0.35 / 0.37




540 (0.00%) -> 0.36 / 0.37




560 (0.00%) -> 0.36 / 0.37




580 (0.00%) -> 0.36 / 0.37




600 (0.00%) -> 0.35 / 0.36




620 (0.00%) -> 0.34 / 0.36




640 (0.00%) -> 0.34 / 0.35




660 (0.00%) -> 0.34 / 0.36




680 (0.00%) -> 0.42 / 0.43




700 (0.00%) -> 0.41 / 0.42




720 (0.00%) -> 0.41 / 0.42




740 (0.00%) -> 0.42 / 0.43




760 (0.00%) -> 0.42 / 0.43




780 (0.00%) -> 0.42 / 0.43




800 (0.01%) -> 0.41 / 0.43




820 (0.01%) -> 0.41 / 0.43




840 (0.01%) -> 0.41 / 0.43




860 (0.01%) -> 0.41 / 0.43




880 (0.01%) -> 0.42 / 0.43


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


900 (0.01%) -> 0.42 / 0.44




920 (0.01%) -> 0.43 / 0.44




940 (0.01%) -> 0.42 / 0.44




960 (0.01%) -> 0.43 / 0.45




980 (0.01%) -> 0.43 / 0.44




1000 (0.01%) -> 0.43 / 0.44




1020 (0.01%) -> 0.43 / 0.44




1040 (0.01%) -> 0.43 / 0.45




1060 (0.01%) -> 0.42 / 0.44




1080 (0.01%) -> 0.41 / 0.44




1100 (0.01%) -> 0.41 / 0.43




1120 (0.01%) -> 0.40 / 0.43




1140 (0.01%) -> 0.40 / 0.43




1160 (0.01%) -> 0.40 / 0.43




1180 (0.01%) -> 0.39 / 0.41




1200 (0.01%) -> 0.39 / 0.41




1220 (0.01%) -> 0.39 / 0.41




1240 (0.01%) -> 0.39 / 0.41




1260 (0.01%) -> 0.39 / 0.41




1280 (0.01%) -> 0.38 / 0.41




1300 (0.01%) -> 0.38 / 0.41




1320 (0.01%) -> 0.39 / 0.41




1340 (0.01%) -> 0.38 / 0.41




1360 (0.01%) -> 0.38 / 0.41




1380 (0.01%) -> 0.38 / 0.40




1400 (0.01%) -> 0.38 / 0.40




1420 (0.01%) -> 0.38 / 0.40




1440 (0.01%) -> 0.38 / 0.40




1460 (0.01%) -> 0.37 / 0.39




1480 (0.01%) -> 0.37 / 0.39




1500 (0.01%) -> 0.37 / 0.39




1520 (0.01%) -> 0.37 / 0.39




1540 (0.01%) -> 0.37 / 0.39




1560 (0.01%) -> 0.37 / 0.39




1580 (0.01%) -> 0.37 / 0.39




1600 (0.01%) -> 0.37 / 0.39




1620 (0.01%) -> 0.37 / 0.39




1640 (0.01%) -> 0.37 / 0.39




KeyboardInterrupt: 

In [29]:
# Inspired from https://modal-python.readthedocs.io/en/latest/content/examples/bayesian_optimization.html
# Doesn't work out-of-the-box due to memory limitations
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from modAL.models import BayesianOptimizer
from modAL.acquisition import optimizer_EI, max_EI

optimizer = BayesianOptimizer(
    estimator=regressor,
#     X_training=X_train[:100, :], y_training=y_train[:100],
    query_strategy=max_EI
)

y_pred_train = optimizer.predict(X_train)
y_pred_test = optimizer.predict(X_test)
acc_train = mean_squared_error(y_train, y_pred_train)
acc_test =  mean_squared_error(y_test, y_pred_test)
print(f"100 -> {acc_train:.2f} / {acc_test:.2f}")

for n_query in range(1, len(X_train)):
    query_idx, query_inst = optimizer.query(X_train)
    optimizer.teach(X_train[query_idx], y_train[query_idx])
    
    if n_query % 20 == 0:
        y_pred_train = optimizer.predict(X_train)
        y_pred_test = optimizer.predict(X_test)
        acc_train = mean_squared_error(y_train, y_pred_train)
        acc_test =  mean_squared_error(y_test, y_pred_test)
        print(f"{100+n_query} -> {acc_train:.2f} / {acc_test:.2f}")
        
y_pred_train = optimizer.predict(X_train)
y_pred_test = optimizer.predict(X_test)
acc_train = mean_squared_error(y_train, y_pred_train)
acc_test =  mean_squared_error(y_test, y_pred_test)
print(f"{acc_train:.2f} / {acc_test:.2f}")

MemoryError: Unable to allocate 186. GiB for an array with shape (158187, 158187) and data type float64