In [42]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
# To add interactions in linear regressions models
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import ElasticNet

In [30]:
predDimension = "kbs"
v_names_train = np.loadtxt("train_names.csv", dtype=str)
v_names_test = np.loadtxt("test_names.csv", dtype=str)

video_features = ["WIDTH", "HEIGHT", "SPATIAL_COMPLEXITY", "TEMPORAL_COMPLEXITY", "COLOR_COMPLEXITY", "ORIG_SIZE", "ORIG_KBS"]  # "ORIG_DURATION", 
config_features = ["cabac", "ref", "deblock", "analyse", "me", "subme", "mixed_ref", "me_range", "trellis", 
                "8x8dct", "fast_pskip", "chroma_qp_offset", "bframes", "b_pyramid", 
                "b_adapt", "direct", "weightb", "open_gop", "weightp", "scenecut", "rc_lookahead", 
                "mbtree", "qpmax", "aq-mode"]
config_features_categorical = ['analyse', 'me', 'direct', 'deblock', 'b_pyramid', 'b_adapt', 'weightb', 'open_gop', 'scenecut', 'rc_lookahead']

In [31]:
df = pd.read_csv("all_features.csv")

In [32]:
for c in config_features_categorical:
    df[c], _ = pd.factorize(df[c])

In [33]:
df.head()

Unnamed: 0,configurationID,cabac,ref,deblock,analyse,me,subme,mixed_ref,me_range,trellis,...,etime,FILENAME,WIDTH,HEIGHT,SPATIAL_COMPLEXITY,TEMPORAL_COMPLEXITY,COLOR_COMPLEXITY,ORIG_SIZE,ORIG_DURATION,ORIG_KBS
0,1,0,1,0,0,0,0,0,16,0,...,2.14,Animation_1080P-01b3,1920,1080,0.098,0.004,0.005,1866272605,20.02,745763.278
1,101,1,2,1,1,1,6,1,16,1,...,3.4,Animation_1080P-01b3,1920,1080,0.098,0.004,0.005,1866272605,20.02,745763.278
2,102,1,2,1,1,1,6,1,16,1,...,2.71,Animation_1080P-01b3,1920,1080,0.098,0.004,0.005,1866272605,20.02,745763.278
3,103,1,2,0,2,2,6,1,16,1,...,2.78,Animation_1080P-01b3,1920,1080,0.098,0.004,0.005,1866272605,20.02,745763.278
4,104,1,16,1,1,1,6,1,24,1,...,2.74,Animation_1080P-01b3,1920,1080,0.098,0.004,0.005,1866272605,20.02,745763.278


In [34]:
df[["size", "kbs", "ORIG_SIZE", "ORIG_KBS"]].describe()

Unnamed: 0,size,kbs,ORIG_SIZE,ORIG_KBS
count,257883.0,257883.0,257883.0,257883.0
mean,23834290.0,9606.272538,2097679000.0,847210.7
std,39644790.0,15918.283128,4087176000.0,1640564.0
min,29831.0,11.93,2765721.0,7248.513
25%,3154450.0,1279.815,276512000.0,110494.3
50%,8685712.0,3512.14,828089500.0,331457.3
75%,26255750.0,10582.225,1863162000.0,745763.3
max,663872600.0,265726.63,39780010000.0,15909370.0


In [20]:
# we separate the list of videos into a training (i.e. offline) set and a test set (i.e. online)
train_ind, test_ind = train_test_split([k for k in range(len(v_names_train))], test_size = 0.25, random_state=0)
# training set indexes
# train_index = [v[:-4] for v in v_names_train]
train_index = [v_names_train[k][:-4] for k in train_ind]
# test set indexes
test_index = [v_names_train[k][:-4] for k in test_ind]
print(len(train_index), len(test_index))

train_df = df[df.FILENAME.isin(train_index)]
val_df = df[df.FILENAME.isin(test_index)]

X_train = train_df[video_features + config_features]
y_train = np.array(train_df[predDimension] / train_df["ORIG_KBS"]).reshape(-1, 1)
# y_train = np.array(train_df[predDimension]).reshape(-1, 1)
X_val = val_df[video_features + config_features]
y_val = np.array(val_df[predDimension] / val_df["ORIG_KBS"]).reshape(-1, 1)
# y_val = np.array(val_df[predDimension]).reshape(-1, 1)

787 263


In [21]:
y_val[:10], val_df["ORIG_KBS"].head()

(array([[13.98273117],
        [ 5.07171851],
        [ 4.45910709],
        [ 4.45991083],
        [ 6.24080809],
        [ 6.28374393],
        [ 5.77682446],
        [ 6.01923275],
        [ 4.96609086],
        [ 4.71976031]]),
 2412    622.090913
 2413    622.090913
 2414    622.090913
 2415    622.090913
 2416    622.090913
 Name: ORIG_KBS, dtype: float64)

In [22]:
obj_scaler = StandardScaler()
y_train = obj_scaler.fit_transform(y_train)
y_val = obj_scaler.transform(y_val)

In [23]:
def print_val_error(y_val, y_pred_val, scaler):
    mse = mean_squared_error(y_val, y_pred_val)
    mae_rescaled = mean_absolute_error(scaler.inverse_transform(y_val), scaler.inverse_transform(y_pred_val))
    mae_kbs = mean_absolute_error(scaler.inverse_transform(y_val).ravel()*val_df["ORIG_KBS"].to_numpy(), scaler.inverse_transform(y_pred_val).ravel()*val_df["ORIG_KBS"].to_numpy())
    print("val error", mse, mae_rescaled, mae_kbs)

In [24]:
# import xgboost as xgb
# xgb_model = xgb.XGBRegressor(n_jobs=4, n_estimators=5000)
# xgb_model.fit(X_train, y_train)
# y_pred_train = xgb_model.predict(X_train)
# y_pred_test = xgb_model.predict(X_val)
# print("train error: ", mean_squared_error(y_train, y_pred_train))
# print_val_error(y_val, y_pred_test, obj_scaler)
# # print(xgb_model.feature_importances_)

In [25]:
# Parameters modified from the grid search result in the paper
rgr = RandomForestRegressor(max_depth=None, max_features=20, min_samples_leaf=2, n_estimators=100, n_jobs=6)
#rgr = RandomForestRegressor(n_jobs=4)
rgr.fit(X_train, y_train.ravel())
# rgr.score(X_test, y_test)
y_pred_train = rgr.predict(X_train)
y_pred_test = rgr.predict(X_val)
print("train error: ", mean_squared_error(y_train, y_pred_train))
print_val_error(y_val, y_pred_test, obj_scaler)

train error:  0.002311513757837138
val error 0.2887155911826448 4.717866326808415 3912.434262174496


In [None]:
from sklearn.model_selection import GridSearchCV
LA_rf = RandomForestRegressor()

grid_search_larf = GridSearchCV(estimator = LA_rf,
                                param_grid = {'n_estimators': [100, 200, 500, 1000, 1500, 2000, 2500, 3000, 3500],
                                              # we didn't include 1 for min_samples_leaf to avoid overfitting
                                         'min_samples_leaf' : [2, 5, 10, 15, 20, 30, 50],
                                         'max_depth' : [3, 5, 10, 15, 20, 25, None],
                                         'max_features' : [5, 15, 25, 30, 50]},
                                scoring = 'neg_mean_squared_error',
                                verbose = True,
                                n_jobs = 6)

grid_search_larf.fit(X_train, y_train.ravel())

Fitting 5 folds for each of 2205 candidates, totalling 11025 fits




In [44]:
lr = ElasticNet()
lr.fit(X_train, y_train.ravel())
y_pred_test = lr.predict(X_val)
print_val_error(y_val, y_pred_test, obj_scaler)

val error 0.9103662779766275 9.712649666894537 8010.343232593032


  model = cd_fast.enet_coordinate_descent(


In [45]:
import lightgbm as lgb
gbm = lgb.LGBMRegressor(num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=20)
gbm.fit(X_train, y_train.ravel(),
        eval_set=[(X_val, y_val.ravel())],
        eval_metric='l1',
        early_stopping_rounds=5)

y_pred_test = gbm.predict(X_val, num_iteration=gbm.best_iteration_)
# eval
rmse_test = mean_squared_error(y_val, y_pred_test) #** 0.5
print(f'The RMSE of prediction is: {rmse_test}')
print_val_error(y_val, y_pred_test, obj_scaler)

[1]	valid_0's l1: 0.658165	valid_0's l2: 0.880951
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l1: 0.640347	valid_0's l2: 0.835228
[3]	valid_0's l1: 0.624555	valid_0's l2: 0.796592
[4]	valid_0's l1: 0.608567	valid_0's l2: 0.76035
[5]	valid_0's l1: 0.595	valid_0's l2: 0.730793
[6]	valid_0's l1: 0.580761	valid_0's l2: 0.700823
[7]	valid_0's l1: 0.566864	valid_0's l2: 0.670034
[8]	valid_0's l1: 0.555147	valid_0's l2: 0.647366
[9]	valid_0's l1: 0.543801	valid_0's l2: 0.623256
[10]	valid_0's l1: 0.532304	valid_0's l2: 0.600353
[11]	valid_0's l1: 0.522018	valid_0's l2: 0.57887
[12]	valid_0's l1: 0.51106	valid_0's l2: 0.558669
[13]	valid_0's l1: 0.503008	valid_0's l2: 0.54187
[14]	valid_0's l1: 0.49512	valid_0's l2: 0.524729
[15]	valid_0's l1: 0.485811	valid_0's l2: 0.507133
[16]	valid_0's l1: 0.477511	valid_0's l2: 0.492129
[17]	valid_0's l1: 0.469899	valid_0's l2: 0.478052
[18]	valid_0's l1: 0.462371	valid_0's l2: 0.464526
[19]	valid_0's l1: 0.455785	valid_0's l