In [1]:
import os
from tqdm import tqdm
import pickle
import numpy as np
import pandas as pd

from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

In [2]:
# Input locations. Despite the `_dir` suffix, both names hold *file* paths:
# the pickle is read by `load_pickle` and the CSV by `pd.read_csv`.
socialGNN_encoding_dir = 'Data/encodings/GazeDataset_Jun1523_traintest5Jun23_2_0_SocialGNN_V_5_5_12_12_6_2_20_1.0_1.5_0.001_0.01.pkl'
target_ratings_dir = 'Data/behavioral_ratings.csv'

# Rating columns (CSV column names) that are regressed against the encodings.
ratings_of_interest = [
    'expanse',
    'agent distance',
    'facingness',
    'transitivity',
    'joint action',
    'communication',
    'valence',
    'arousal',
]

In [3]:
def load_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``.

    The file is opened in binary mode and is closed again before the
    function returns.

    Note: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    with open(path, mode='rb') as handle:
        return pickle.load(handle)

In [4]:
# Read the behavioural ratings and keep only the video identifier plus the
# rating columns named in `ratings_of_interest`.
ratings_df = pd.read_csv(target_ratings_dir).loc[:, ['video_name', *ratings_of_interest]]

# Order the rows by video name; the original row index is kept (not reset).
ratings_df_sorted = ratings_df.sort_values('video_name')
ratings_df_sorted

Unnamed: 0,video_name,expanse,agent distance,facingness,transitivity,joint action,communication,valence,arousal
0,-YwZOeyAQC8_15.mp4,0.225,0.000000,0.045455,0.895833,0.700000,0.550000,0.650000,0.340909
1,1AIVH5cEWrI_35.mp4,0.250,0.075000,1.000000,0.104167,0.825000,0.900000,0.675000,0.340909
2,2-8-0-2-5-4-8-8-5028025488_7.mp4,0.275,0.045455,0.050000,1.000000,0.575000,0.500000,0.675000,0.075000
3,3PJaYWt0cws_5.mp4,0.625,0.725000,0.340909,1.000000,0.022727,0.150000,0.600000,0.400000
4,7-5-1-0-3-6-9-8-3775103698_9.mp4,0.275,0.275000,0.636364,0.925000,0.750000,0.825000,0.722222,0.125000
...,...,...,...,...,...,...,...,...,...
245,yt-ylWmBeCU2LE_101.mp4,0.225,0.250000,0.975000,0.925000,0.525000,0.840909,0.800000,0.425000
246,yt-zGPMQDq1VdU_39.mp4,0.125,0.150000,0.100000,0.825000,0.600000,0.750000,0.650000,0.575000
247,yt-zcz5nb7m-Y4_55.mp4,0.050,0.050000,1.000000,0.975000,0.825000,0.750000,0.850000,0.400000
248,yt_R-62JchDNyM7U_143.mp4,0.500,0.575000,0.275000,1.000000,0.590909,0.175000,0.425000,0.250000


In [5]:
# Load the pickled model encodings and stack them row-wise into one feature matrix.
# NOTE(review): the regression below pairs row i of X with row i of
# `ratings_df_sorted`; presumably the pickle holds one encoding per video in
# sorted video_name order — TODO confirm, nothing here checks the alignment.
model_features = load_pickle(socialGNN_encoding_dir)
X = np.vstack(model_features)

In [6]:
# Fit an independent Ridge model per rating and record its held-out error.
# NOTE(review): assumes the rows of X are in the same order as
# ratings_df_sorted — confirm against how the encodings were generated.
results = {}
for rating in ratings_of_interest:
    # Regression target: this rating's column, in the row order of ratings_df_sorted.
    y = ratings_df_sorted[rating].values

    # 80/20 hold-out split; the fixed seed makes the partition reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fixed regularisation strength (alpha is not tuned in this cell).
    ridge = Ridge(alpha=1.0).fit(X_train, y_train)
    y_pred = ridge.predict(X_test)

    # Score on the held-out rows and keep the value for later inspection.
    mse = results[rating] = mean_squared_error(y_test, y_pred)
    print(f'Mean Squared Error for {rating}: {mse}')

Mean Squared Error for expanse: 0.03493019681466938
Mean Squared Error for agent distance: 0.056789482399220735
Mean Squared Error for facingness: 0.16332902292921653
Mean Squared Error for transitivity: 0.11890391422149012
Mean Squared Error for joint action: 0.04596095695910531
Mean Squared Error for communication: 0.06717665527329952
Mean Squared Error for valence: 0.035691062292219855
Mean Squared Error for arousal: 0.03854679239238862


In [None]:
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
import gc

def ridge_regression_benchmarking(model, data, target_features, layers,
                                  n_splits=4, random_seed=0, alphas=None):
    """Benchmark each layer's representation with cross-validated ridge regression.

    For every layer returned by ``extract_layer_representations`` the
    flattened activations are used to predict ``data['target']``:

    1. a shuffled K-fold over the *training* rows gives a mean validation score;
    2. a final ``RidgeCV`` is fit on all training rows and scored on the test rows.

    Parameters
    ----------
    model : object
        Passed unchanged to ``extract_layer_representations``.
    data : dict
        Must provide ``'input'`` (passed to the extractor), ``'train_idx'`` and
        ``'test_idx'`` (index arrays into the samples) and ``'target'``
        (array indexable by those index arrays).
    target_features : object
        Currently unused; kept so existing callers keep working.
    layers : object
        Passed unchanged to ``extract_layer_representations``.
    n_splits : int, default 4
        Number of K-fold splits applied to the training rows.
    random_seed : int, default 0
        Seed for the shuffled K-fold.
    alphas : sequence of float, optional
        Candidate regularisation strengths for ``RidgeCV``. Defaults to
        ``10**-5 ... 10**1`` in decade steps.

    Returns
    -------
    pandas.DataFrame
        One row per layer with columns ``layer``, ``train_score`` (mean of the
        K-fold *validation* scores computed on held-out training rows) and
        ``test_score`` (score of the final model on the test rows). Scores are
        whatever ``RidgeCV.score`` returns.

    Notes
    -----
    ``extract_layer_representations`` is not defined in this block; it must be
    available in the enclosing namespace and is expected to return a mapping
    of layer name -> array whose first axis indexes samples.
    """
    if alphas is None:
        alphas = [10.**power for power in np.arange(-5, 2)]

    # Extract layer representations (helper defined outside this block).
    layer_representations = extract_layer_representations(model, data['input'], layers)

    # Shuffled K-fold used for the inner validation estimate.
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed)

    # The train/test partition and the targets do not depend on the layer,
    # so slice them once instead of once per layer.
    train_idx = data['train_idx']
    test_idx = data['test_idx']
    y_train = data['target'][train_idx]
    y_test = data['target'][test_idx]

    results = []
    for layer_name, X in layer_representations.items():
        X = X.reshape(X.shape[0], -1)  # Flatten everything but the sample axis
        X_train = X[train_idx]
        X_test = X[test_idx]

        # Cross-validate within the training rows. The fold indices are
        # positions into X_train / y_train and use their own names, so they
        # never overwrite the outer train_idx / test_idx.
        val_scores = []
        for fold_train_idx, fold_val_idx in cv.split(X_train):
            fold_model = RidgeCV(alphas=alphas)
            fold_model.fit(X_train[fold_train_idx], y_train[fold_train_idx])
            val_scores.append(
                fold_model.score(X_train[fold_val_idx], y_train[fold_val_idx])
            )

        # Fit on the full training set and evaluate on the test set.
        ridge = RidgeCV(alphas=alphas)
        ridge.fit(X_train, y_train)
        test_score = ridge.score(X_test, y_test)

        # 'train_score' is kept as the column name for backward compatibility;
        # it holds the mean K-fold validation score.
        results.append({
            'layer': layer_name,
            'train_score': np.mean(val_scores),
            'test_score': test_score
        })

        # Memory cleanup between layers.
        gc.collect()

    return pd.DataFrame(results)


In [None]:
# TEMPLATE — fill in before running. Every `...` below is a placeholder
# (the literal Ellipsis object), not real data.
# The keys are the ones read by `ridge_regression_benchmarking`.
data = {
    'input': np.array(...),  # Your input data as a numpy array
    'train_idx': np.array(...),  # Indices for training data
    'test_idx': np.array(...),  # Indices for test data
    'target': np.array(...)  # Target features/labels
}

# Define the layers you're interested in (replace the example names and the trailing `...`)
layers = ['layer1', 'layer2', ...]


In [None]:
# Run the layer-wise ridge benchmark.
# NOTE(review): `my_model` is not defined anywhere in the visible notebook and
# must be created before this cell runs. This also rebinds `results`, which
# held the per-rating MSE dict from the earlier regression cell.
results = ridge_regression_benchmarking(my_model, data, target_features=['feature1', 'feature2'], layers=layers)

# Leave the DataFrame as the cell's last expression so the notebook renders it
# as a rich table instead of plain printed text.
results
