In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
# To add interaction terms in linear regression models
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error

In [2]:
# Directory that is listed to find the per-video measurement files.
data_directory = "res_ugc/"
# CSV file from which the per-video features are read.
video_feature_file = "deepvar_video_features.csv"

In [3]:
# Load only the needed columns of the video feature table.
# CHUNK_COMPLEXITY_VARIATION is currently not part of the selection.
vf = pd.read_csv(
    video_feature_file,
    usecols=[
        "FILENAME",
        "WIDTH",
        "HEIGHT",
        "SPATIAL_COMPLEXITY",
        "TEMPORAL_COMPLEXITY",
        "COLOR_COMPLEXITY",
    ],
)
vf.head()

Unnamed: 0,FILENAME,WIDTH,HEIGHT,SPATIAL_COMPLEXITY,TEMPORAL_COMPLEXITY,COLOR_COMPLEXITY
0,Animation_1080P-01b3,1920,1080,0.098,0.004,0.005
1,Animation_1080P-05f8,1920,1080,1.229,0.454,0.794
2,Animation_1080P-0c4f,1920,1080,1.714,0.136,0.0
3,Animation_1080P-0cdf,1920,1080,1.33,0.451,0.181
4,Animation_1080P-18f5,1920,1080,2.452,0.137,0.329


In [4]:
# Read the size/bitrate table, derive ORIG_KBS as ORIG_BITRATE / 1000 and
# drop the raw ORIG_BITRATE column, all in one chain (no in-place mutation).
vf_extra = (
    pd.read_csv("size_bitrate.csv")
    .assign(ORIG_KBS=lambda frame: frame["ORIG_BITRATE"] / 1000)
    .drop(columns=["ORIG_BITRATE"])
)
vf_extra.head()

Unnamed: 0,FILENAME,ORIG_SIZE,ORIG_KBS
0,LiveMusic_360P-5281,207392038,82956.815
1,LiveMusic_360P-54d0,207392038,82956.815
2,LiveMusic_360P-1d94,172826740,69130.696
3,LiveMusic_360P-6266,207046320,82873.912
4,LiveMusic_360P-265c,155292675,62220.773


In [5]:
# Attach the size/bitrate columns to the video features; the inner join keeps
# only rows whose FILENAME is present in both tables.
vf = vf.merge(vf_extra, how="inner", on="FILENAME")

In [6]:
def elapsedtime_to_sec(el):
    """Convert a colon-separated elapsed-time string into seconds.

    Parameters
    ----------
    el : str
        Duration such as ``"0:02.14"`` (minutes:seconds). A leading hours
        field (``"1:02:03.50"``) and a bare seconds value (``"7.25"``) are
        accepted as well.

    Returns
    -------
    float
        The duration expressed in seconds.

    Raises
    ------
    ValueError
        If one of the colon-separated fields is not a valid number.
    """
    seconds = 0.0
    # Fold the fields left to right in base 60 so that every extra leading
    # field (minutes, then hours) is weighted correctly instead of only the
    # first two fields being read.
    for field in el.split(":"):
        seconds = seconds * 60 + float(field)
    return seconds

# the data folder, see the markdown there for additional explanations
res_dir = data_directory

# the entries of the data folder, one file per video
# (e.g. Animation_360P-3e40 plus the file extension);
# sorted so that the ids stay the same between two launches
v_names = sorted(os.listdir(res_dir))

# the list of measurements, one DataFrame per video
listVideo = []

for v in v_names:
    # os.path.join works whether or not res_dir ends with a path separator
    data = pd.read_csv(os.path.join(res_dir, v))
    # elapsed time converted from its textual form to seconds
    data['etime'] = data['elapsedtime'].map(elapsedtime_to_sec)
    # the measures must be complete: 201 rows and 34 columns (etime included)
    assert data.shape == (201, 34), v
    # tag every row with the video name, i.e. the file name without extension
    data["FILENAME"] = os.path.splitext(v)[0]
    listVideo.append(data)

In [7]:
# Presumably the measurement column used as prediction target ("kbs" is one
# of the measured columns) -- TODO confirm against the modelling cells.
predDimension = "kbs"

# Names listed in the train / test split files, loaded as arrays of strings.
v_names_train = np.loadtxt("train_names.csv", dtype=str)
v_names_test = np.loadtxt("test_names.csv", dtype=str)

In [8]:
# Stack the per-video measurement tables row-wise into a single DataFrame.
# Every table keeps its own row labels, so labels repeat from one video to
# the next. The video features are not joined in this step.
resdf = pd.concat(listVideo, axis=0)
resdf

Unnamed: 0,configurationID,cabac,ref,deblock,analyse,me,subme,mixed_ref,me_range,trellis,...,size,usertime,systemtime,elapsedtime,cpu,frames,fps,kbs,etime,FILENAME
0,1,0,1,0:0:0,0:0,dia,0,0,16,0,...,403085,7.94,1.36,0:02.14,434,600,375.22,161.07,2.14,Animation_1080P-01b3
1,101,1,2,1:0:0,0x3:0x113,hex,6,1,16,1,...,234157,23.17,1.79,0:03.40,734,600,217.07,93.57,3.40,Animation_1080P-01b3
2,102,1,2,1:0:0,0x3:0x113,hex,6,1,16,1,...,159836,18.42,1.66,0:02.71,739,600,293.42,63.87,2.71,Animation_1080P-01b3
3,103,1,2,0:0:0,0x3:0x3,umh,6,1,16,1,...,163586,22.39,1.53,0:02.78,858,600,276.79,65.37,2.78,Animation_1080P-01b3
4,104,1,16,1:0:0,0x3:0x113,hex,6,1,24,1,...,218392,17.40,1.76,0:02.74,699,600,287.79,87.27,2.74,Animation_1080P-01b3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,97,1,2,1:0:0,0x3:0x113,hex,4,1,16,0,...,9480468,21.65,1.47,0:03.28,704,599,207.00,3794.72,3.28,Vlog_720P-6d56
197,98,1,2,0:0:0,0x3:0x113,hex,4,0,24,0,...,9658211,18.87,1.28,0:02.52,799,599,279.83,3865.87,2.52,Vlog_720P-6d56
198,99,1,5,1:0:0,0x3:0x113,hex,6,1,16,1,...,11813580,30.67,1.15,0:03.39,939,599,199.49,4728.59,3.39,Vlog_720P-6d56
199,100,0,3,0:0:0,0x113:0x113,hex,6,1,16,1,...,14293894,28.61,1.14,0:02.90,1022,599,237.45,5721.38,2.90,Vlog_720P-6d56


In [9]:
# Columns describing the input video.
video_features = [
    "WIDTH", "HEIGHT",
    "SPATIAL_COMPLEXITY", "TEMPORAL_COMPLEXITY", "COLOR_COMPLEXITY",
    "ORIG_SIZE", "ORIG_KBS",
]

# Columns describing the configuration used for a measurement.
config_features = [
    "cabac", "ref", "deblock", "analyse", "me", "subme",
    "mixed_ref", "me_range", "trellis", "8x8dct", "fast_pskip",
    "chroma_qp_offset", "bframes", "b_pyramid", "b_adapt", "direct",
    "weightb", "open_gop", "weightp", "scenecut", "rc_lookahead",
    "mbtree", "qpmax", "aq-mode",
]

# Subset of the configuration columns that is handled as categorical.
config_features_categorical = [
    'analyse', 'me', 'direct', 'deblock', 'b_pyramid',
    'b_adapt', 'weightb', 'open_gop', 'scenecut', 'rc_lookahead',
]

In [10]:
# Combine every measurement row with the features of its video, matched on
# FILENAME; the inner join drops rows without a counterpart on the other side.
df = resdf.merge(vf, on="FILENAME", how="inner")
df.head()

Unnamed: 0,configurationID,cabac,ref,deblock,analyse,me,subme,mixed_ref,me_range,trellis,...,kbs,etime,FILENAME,WIDTH,HEIGHT,SPATIAL_COMPLEXITY,TEMPORAL_COMPLEXITY,COLOR_COMPLEXITY,ORIG_SIZE,ORIG_KBS
0,1,0,1,0:0:0,0:0,dia,0,0,16,0,...,161.07,2.14,Animation_1080P-01b3,1920,1080,0.098,0.004,0.005,1866272605,745763.278
1,101,1,2,1:0:0,0x3:0x113,hex,6,1,16,1,...,93.57,3.4,Animation_1080P-01b3,1920,1080,0.098,0.004,0.005,1866272605,745763.278
2,102,1,2,1:0:0,0x3:0x113,hex,6,1,16,1,...,63.87,2.71,Animation_1080P-01b3,1920,1080,0.098,0.004,0.005,1866272605,745763.278
3,103,1,2,0:0:0,0x3:0x3,umh,6,1,16,1,...,65.37,2.78,Animation_1080P-01b3,1920,1080,0.098,0.004,0.005,1866272605,745763.278
4,104,1,16,1:0:0,0x3:0x113,hex,6,1,24,1,...,87.27,2.74,Animation_1080P-01b3,1920,1080,0.098,0.004,0.005,1866272605,745763.278


In [11]:
# Persist the merged table to disk, leaving out the row index.
df.to_csv(path_or_buf="all_features.csv", index=False)