## IMPORTS

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as sp

from sklearn.svm import SVR
from sklearn.metrics import classification_report, accuracy_score, r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor

## DATA PREPARATION

In [2]:
df_train = pd.read_csv("/Users/Giacomo/Desktop/DM2/PRJ/df_train_processed.csv")
df_test = pd.read_csv("/Users/Giacomo/Desktop/DM2/PRJ/df_test_processed.csv")

In [3]:
print(df_train.shape, df_test.shape)

(1828, 383) (624, 383)


In [4]:
df_train.head()

Unnamed: 0,vocal_channel,emotion,emotional_intensity,statement,repetition,actor,sex,frame_count,sum,mean,...,stft_std_w4,stft_min_w4,stft_q01_w4,stft_q05_w4,stft_q25_w4,stft_q50_w4,stft_q75_w4,stft_q95_w4,stft_kur_w4,stft_skew_w4
0,speech,neutral,normal,Kids are talking by the door,1st,1,M,158558,0.145081,9.15e-07,...,0.155455,0.0,0.0,0.566462,0.709962,0.799141,0.896606,1.0,6.74219,-1.706215
1,speech,neutral,normal,Kids are talking by the door,2nd,1,M,160160,0.114319,7.13779e-07,...,0.163183,0.270133,0.368623,0.472736,0.623183,0.744908,0.874713,1.0,-0.70042,-0.201495
2,speech,neutral,normal,Dogs are sitting by the door,1st,1,M,156956,0.149963,9.554485e-07,...,0.190634,0.0,0.0,0.417919,0.643636,0.774253,0.899156,1.0,1.688986,-1.024773
3,speech,neutral,normal,Dogs are sitting by the door,2nd,1,M,152152,0.139618,9.176213e-07,...,0.182551,0.205616,0.30628,0.399641,0.60691,0.755213,0.886474,1.0,-0.594111,-0.412871
4,speech,calm,normal,Kids are talking by the door,1st,1,M,169769,0.137665,8.108948e-07,...,0.177069,0.177847,0.248765,0.428202,0.634815,0.759914,0.878014,1.0,0.126535,-0.620782


### Standardization

In [5]:
df_train_num = df_train.select_dtypes(include="number").drop("actor", axis=1)
df_train_cat = df_train.select_dtypes(include=['object'])

scaler = MinMaxScaler()
df_train_num_scaled = pd.DataFrame(scaler.fit_transform(df_train_num), columns=df_train_num.columns)
df_train = pd.concat([df_train_num_scaled, df_train_cat], axis=1)

In [6]:
df_train.head()

Unnamed: 0,frame_count,sum,mean,std,min,max,q01,q05,q25,q50,...,stft_q75_w4,stft_q95_w4,stft_kur_w4,stft_skew_w4,vocal_channel,emotion,emotional_intensity,statement,repetition,sex
0,0.10784,0.385871,0.435971,0.022102,0.967474,0.036119,0.979319,0.978418,0.998047,0.25,...,0.907952,1.0,0.95606,0.048224,speech,neutral,normal,Kids are talking by the door,1st,M
1,0.117646,0.385792,0.435878,0.023967,0.963262,0.043729,0.977355,0.977549,0.998047,0.25,...,0.885585,1.0,0.107604,0.360443,speech,neutral,normal,Kids are talking by the door,2nd,M
2,0.098033,0.385883,0.435989,0.029249,0.968888,0.054101,0.972723,0.972712,1.0,0.25,...,0.910557,1.0,0.379996,0.189619,speech,neutral,normal,Dogs are sitting by the door,1st,M
3,0.068626,0.385857,0.435972,0.027946,0.969257,0.058336,0.974555,0.975069,0.998047,0.25,...,0.8976,1.0,0.119724,0.316584,speech,neutral,normal,Dogs are sitting by the door,2nd,M
4,0.176466,0.385852,0.435923,0.015209,0.979741,0.021665,0.985816,0.984619,0.998047,0.25,...,0.888957,1.0,0.201877,0.273444,speech,calm,normal,Kids are talking by the door,1st,M


In [7]:
df_test_num = df_test.select_dtypes(include="number").drop("actor", axis=1)
df_test_cat = df_test.select_dtypes(include=['object'])

scaler = MinMaxScaler()
df_test_num_scaled = pd.DataFrame(scaler.fit_transform(df_test_num), columns=df_test_num.columns)
df_test = pd.concat([df_test_num_scaled, df_test_cat], axis=1)

In [8]:
df_test.head()

Unnamed: 0,frame_count,sum,mean,std,min,max,q01,q05,q25,q50,...,stft_q75_w4,stft_q95_w4,stft_kur_w4,stft_skew_w4,vocal_channel,emotion,emotional_intensity,statement,repetition,sex
0,0.090907,0.380904,0.391299,0.021524,0.974882,0.025626,0.977625,0.977107,0.991848,1.0,...,0.904373,1.0,0.296687,0.323521,speech,neutral,normal,Kids are talking by the door,1st,M
1,0.050499,0.363973,0.373194,0.020201,0.973283,0.027224,0.98043,0.97924,0.98913,1.0,...,0.845924,1.0,0.260992,0.317085,speech,neutral,normal,Kids are talking by the door,2nd,M
2,0.080803,0.329705,0.335466,0.019156,0.982322,0.016224,0.980679,0.977107,0.994565,1.0,...,0.878457,1.0,0.484496,0.224814,speech,neutral,normal,Dogs are sitting by the door,1st,M
3,0.0707,0.354421,0.362496,0.024803,0.964305,0.027623,0.975693,0.974547,0.991848,1.0,...,0.899216,1.0,0.177111,0.380501,speech,neutral,normal,Dogs are sitting by the door,2nd,M
4,0.181814,0.3655,0.373951,0.007708,0.99173,0.006238,0.990714,0.9909,0.994565,1.0,...,0.902562,1.0,0.238178,0.329504,speech,calm,normal,Kids are talking by the door,1st,M


### One-hot encoding

In [9]:
df_train = pd.get_dummies(df_train)
df_test = pd.get_dummies(df_test)

In [10]:
print(df_train.shape, df_test.shape)

(1828, 394) (624, 394)


In [11]:
df_train.head()

Unnamed: 0,frame_count,sum,mean,std,min,max,q01,q05,q25,q50,...,emotion_sad,emotion_surprised,emotional_intensity_normal,emotional_intensity_strong,statement_Dogs are sitting by the door,statement_Kids are talking by the door,repetition_1st,repetition_2nd,sex_F,sex_M
0,0.10784,0.385871,0.435971,0.022102,0.967474,0.036119,0.979319,0.978418,0.998047,0.25,...,0,0,1,0,0,1,1,0,0,1
1,0.117646,0.385792,0.435878,0.023967,0.963262,0.043729,0.977355,0.977549,0.998047,0.25,...,0,0,1,0,0,1,0,1,0,1
2,0.098033,0.385883,0.435989,0.029249,0.968888,0.054101,0.972723,0.972712,1.0,0.25,...,0,0,1,0,1,0,1,0,0,1
3,0.068626,0.385857,0.435972,0.027946,0.969257,0.058336,0.974555,0.975069,0.998047,0.25,...,0,0,1,0,1,0,0,1,0,1
4,0.176466,0.385852,0.435923,0.015209,0.979741,0.021665,0.985816,0.984619,0.998047,0.25,...,0,0,1,0,0,1,1,0,0,1


In [12]:
df_test.head()

Unnamed: 0,frame_count,sum,mean,std,min,max,q01,q05,q25,q50,...,emotion_sad,emotion_surprised,emotional_intensity_normal,emotional_intensity_strong,statement_Dogs are sitting by the door,statement_Kids are talking by the door,repetition_1st,repetition_2nd,sex_F,sex_M
0,0.090907,0.380904,0.391299,0.021524,0.974882,0.025626,0.977625,0.977107,0.991848,1.0,...,0,0,1,0,0,1,1,0,0,1
1,0.050499,0.363973,0.373194,0.020201,0.973283,0.027224,0.98043,0.97924,0.98913,1.0,...,0,0,1,0,0,1,0,1,0,1
2,0.080803,0.329705,0.335466,0.019156,0.982322,0.016224,0.980679,0.977107,0.994565,1.0,...,0,0,1,0,1,0,1,0,0,1
3,0.0707,0.354421,0.362496,0.024803,0.964305,0.027623,0.975693,0.974547,0.991848,1.0,...,0,0,1,0,1,0,0,1,0,1
4,0.181814,0.3655,0.373951,0.007708,0.99173,0.006238,0.990714,0.9909,0.994565,1.0,...,0,0,1,0,0,1,1,0,0,1


### Train-Test Split

In [29]:
target = 'sc_min'
X_train = df_train.drop(target, axis=1).to_numpy()
y_train = df_train[target].to_numpy()
X_test = df_test.drop(target, axis=1).to_numpy()
y_test = df_test[target].to_numpy()

In [30]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1828, 393) (1828,)
(624, 393) (624,)


# OLR (baseline)

In [34]:
def evaluate(y_true, y_pred):
    scores = {}
    scores["rho"] = sp.spearmanr(y_true, y_pred)
    scores["r_squared"] = r2_score(y_true, y_pred)
    scores["mse"] = mean_squared_error(y_true, y_pred)
    scores["mae"] = mean_absolute_error(y_true, y_pred)
    return scores

In [35]:
reg = LinearRegression()
reg.fit(X_train, y_train)

In [36]:
y_pred = reg.predict(X_test)
evaluate(y_test, y_pred)

{'rho': SpearmanrResult(correlation=0.18515506968038178, pvalue=3.220725772504329e-06),
 'r_squared': -107887.07684989767,
 'mse': 5714.6129197822265,
 'mae': 69.46159273481157}

# DT (baseline)

In [37]:
param_grid = {"min_samples_split": sp.loguniform(1e-2, 1e0), 
              "min_samples_leaf": sp.uniform(0.001, 0.2),
              "max_depth": sp.randint(2, 200),
              "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"]
             }
grid = RandomizedSearchCV(
    DecisionTreeRegressor(),
    param_distributions=param_grid,
    cv=KFold(10, shuffle=True, random_state=42),
    n_jobs=-1,
    n_iter=50,
    scoring="r2"
)

grid.fit(X_train, y_train)
print(grid.best_estimator_)
print(grid.best_score_)

DecisionTreeRegressor(criterion='friedman_mse', max_depth=171,
                      min_samples_leaf=0.022615367495907025,
                      min_samples_split=0.012124681634452997)
0.9749938165163268


In [38]:
reg = grid.best_estimator_
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
evaluate(y_test, y_pred)

{'rho': SpearmanrResult(correlation=0.9606428244096117, pvalue=0.0),
 'r_squared': 0.9400141107574729,
 'mse': 0.0031773310608446195,
 'mae': 0.03046475317854541}

# SVR

- For the gamma value we use the one pre-computed by Sklearn (1 / (n_features * X.var()))
- C in the range between 1 and 100.
- eps between 1e-3 and 1.

References at:
http://adrem.uantwerpen.be/bibrem/pubs/IJCNN2007.pdf

In [43]:
# first search
param_grid = {"kernel": ["linear", "poly", "rbf", "sigmoid"],
             "C": [1e0, 1e1, 1e2],
             "epsilon": [1e-4, 1e-3, 1e-2, 1e-1, 1e0]
             } # size of the hyper-parameter space: 4*3*4 = 48

search = GridSearchCV(SVR(max_iter=100000), 
                      param_grid=param_grid, 
                      cv=KFold(5, shuffle=True, random_state=42), 
                      n_jobs=-1,
                      scoring="r2"
                      )
search.fit(X_train, y_train)

In [44]:
print(search.best_estimator_)
print(search.best_score_)

SVR(C=10.0, epsilon=0.001, max_iter=100000)
0.8872683694634453


In [45]:
reg = search.best_estimator_.fit(X_train, y_train)
y_pred = reg.predict(X_test)
evaluate(y_test, y_pred)

{'rho': SpearmanrResult(correlation=0.8853395900430321, pvalue=4.800934798671229e-209),
 'r_squared': 0.8352443727534564,
 'mse': 0.008726771887683489,
 'mae': 0.06516318317493343}

# RF

In [48]:
param_grid = {"min_samples_split": sp.loguniform(1e-2, 1e0), 
              "min_samples_leaf": sp.uniform(0.001, 0.2),
              "max_depth": sp.randint(2, 200),
              "criterion": ["squared_error", "absolute_error", "friedman_mse", "poisson"],
              "max_features":['sqrt', 'log2', None]
             }

search = RandomizedSearchCV(RandomForestRegressor(random_state=42, n_estimators=100),
                           param_distributions=param_grid,
                           cv=KFold(3, shuffle=True, random_state=42),
                           n_jobs=-1,
                           n_iter=30,
                           scoring="r2")
search.fit(X_train, y_train)
print(search.best_params_)
print(search.best_score_)

{'criterion': 'poisson', 'max_depth': 16, 'max_features': None, 'min_samples_leaf': 0.017949437795279088, 'min_samples_split': 0.025373598699519797}
0.9774113392175682


In [49]:
reg = search.best_estimator_.fit(X_train, y_train)
y_pred = reg.predict(X_test)
evaluate(y_test, y_pred)

{'rho': SpearmanrResult(correlation=0.9668891623200789, pvalue=0.0),
 'r_squared': 0.9495861267883079,
 'mse': 0.0026703207583597976,
 'mae': 0.027644730463065332}

# Bagging Regressor (DT)

In [50]:
param_grid = {"base_estimator__min_samples_split": sp.loguniform(1e-2, 1e0), 
              "base_estimator__min_samples_leaf": sp.uniform(0.001, 0.2),
              "base_estimator__max_depth": sp.randint(2, 200),
              "base_estimator__criterion": ["squared_error", "absolute_error", "friedman_mse", "poisson"],
              "max_samples": [0.5, 0.6, 0.7, 0.8]
             }

ensemble = BaggingRegressor(DecisionTreeRegressor(), random_state=42, n_estimators=100)


search = RandomizedSearchCV(ensemble,
                           param_distributions=param_grid,
                           cv=KFold(3, shuffle=True, random_state=42),
                           n_jobs=-1,
                           n_iter=30,
                           scoring="r2")
search.fit(X_train, y_train)
print(search.best_params_)
print(search.best_score_)

{'base_estimator__criterion': 'friedman_mse', 'base_estimator__max_depth': 59, 'base_estimator__min_samples_leaf': 0.01600050761493821, 'base_estimator__min_samples_split': 0.014309454748004083, 'max_samples': 0.7}
0.9770946442542541


In [51]:
reg = search.best_estimator_.fit(X_train, y_train)
y_pred = reg.predict(X_test)
evaluate(y_test, y_pred)

{'rho': SpearmanrResult(correlation=0.9682495085432563, pvalue=0.0),
 'r_squared': 0.9467541743725194,
 'mse': 0.002820323542133477,
 'mae': 0.027969265397087367}

# AdaBoost Regressor

In [53]:
param_grid = {"base_estimator__criterion": ["squared_error", "absolute_error", "friedman_mse", "poisson"],
              "learning_rate": [1e-4, 1e-3, 1e-2, 1e-1, 1e0],
              "loss": ["linear", "square", "exponential"]
             } # size of hyper-parameter space: 60

ensemble = AdaBoostRegressor(DecisionTreeRegressor(max_depth=1), random_state=42, n_estimators=50)


search = GridSearchCV(ensemble,
                      param_grid=param_grid,
                      cv=KFold(3, shuffle=True, random_state=42),
                      n_jobs=-1,
                      scoring="r2")
search.fit(X_train, y_train)
print(search.best_params_)
print(search.best_score_)

{'base_estimator__criterion': 'squared_error', 'learning_rate': 0.1, 'loss': 'square'}
0.8558780799711428


In [54]:
reg = search.best_estimator_.fit(X_train, y_train)
y_pred = reg.predict(X_test)
evaluate(y_test, y_pred)

{'rho': SpearmanrResult(correlation=0.9468416761298017, pvalue=1.4554849427733397e-308),
 'r_squared': 0.8650411759516256,
 'mse': 0.007148495571187748,
 'mae': 0.06539355833170574}