In [5]:
import os
import datetime
import pickle
import warnings

import numpy as np
import pandas as pd

from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.base import BaseEstimator, RegressorMixin

from scipy.optimize import curve_fit
from scipy.stats import pearsonr, spearmanr, kendalltau

# Suppress warnings globally.
# NOTE(review): with no category/module argument this filter ignores every
# warning for the rest of the kernel session, including any raised by the
# libraries imported above (presumably solver-convergence and deprecation
# warnings too) -- consider narrowing it once the notebook is stable.
warnings.filterwarnings('ignore')

In [9]:
class SVRWithVarianceThreshold(BaseEstimator, RegressorMixin):
    """Multi-output SVR with integrated power scaling and PCA preprocessing.

    ``fit`` runs three steps: a ``PowerTransformer`` on the raw features, a
    PCA truncated to the smallest number of components whose cumulative
    explained-variance ratio reaches ``variance_threshold``, and one ``SVR``
    per target column wrapped in ``MultiOutputRegressor``.

    Parameters
    ----------
    variance_threshold : float, default=0.95
        Cumulative explained-variance ratio used to pick the number of PCA
        components. If the threshold is never reached (e.g. a value of 1 that
        floating-point rounding keeps just out of reach), all components are
        kept.
    kernel, C, gamma, epsilon, max_iter
        Forwarded unchanged to ``sklearn.svm.SVR``.

    Attributes set by ``fit``
    -------------------------
    scaler_ : fitted PowerTransformer
    pca_ : fitted PCA with the selected number of components
    pca_signs_ : per-column multipliers (+1/-1) applied to the projected data
    model_ : fitted MultiOutputRegressor
    """

    def __init__(self, variance_threshold=0.95, kernel='rbf', C=1,
                 gamma='scale', epsilon=0.1, max_iter=-1):
        self.variance_threshold = variance_threshold
        self.kernel = kernel
        self.C = C
        self.gamma = gamma
        self.epsilon = epsilon
        self.max_iter = max_iter

    def fit(self, X, y):
        """Fit the scaler, the PCA projection and the per-target SVRs.

        Parameters
        ----------
        X : array-like, one row per sample
        y : array-like, one column per target (passed to MultiOutputRegressor)

        Returns
        -------
        self
        """
        # Apply power scaling before the variance analysis.
        self.scaler_ = PowerTransformer()
        X_scaled = self.scaler_.fit_transform(X)

        # A full PCA is fitted first only to read the explained-variance curve.
        probe_pca = PCA()
        probe_pca.fit(X_scaled)
        cumulative_variance = np.cumsum(probe_pca.explained_variance_ratio_)
        reached = cumulative_variance >= self.variance_threshold
        if reached.any():
            # argmax on a boolean array gives the first True position.
            n_components = int(np.argmax(reached)) + 1
        else:
            # argmax would return 0 here and silently keep a single component;
            # an unreachable threshold means "keep everything" instead.
            n_components = int(len(cumulative_variance))

        self.pca_ = PCA(n_components=n_components)
        X_pca = self.pca_.fit_transform(X_scaled)

        # Store the sign multipliers so predict() applies exactly the same
        # ones. They are read from the first n_components entries of the
        # first principal axis, as in the original implementation.
        signs = np.sign(self.pca_.components_[0, :n_components])
        # np.sign returns 0 for an exactly-zero loading, which would wipe out
        # the whole projected column; treat that case as +1.
        signs[signs == 0] = 1.0
        self.pca_signs_ = signs
        X_pca = X_pca * self.pca_signs_

        # Create and fit the SVR model (one SVR per output column).
        base_svr = SVR(kernel=self.kernel, C=self.C,
                       gamma=self.gamma, epsilon=self.epsilon,
                       max_iter=self.max_iter)
        self.model_ = MultiOutputRegressor(base_svr)
        self.model_.fit(X_pca, y)
        return self

    def predict(self, X):
        """Predict all targets for ``X`` using the transforms learned in ``fit``."""
        X_scaled = self.scaler_.transform(X)
        # Non in-place multiply: never mutates an array owned by the PCA step.
        X_pca = self.pca_.transform(X_scaled) * self.pca_signs_
        return self.model_.predict(X_pca)

In [10]:
class MultiOutputSVRWithPCA:
    """
    Multi-output Support Vector Regression pipeline with PCA preprocessing
    and hyperparameter tuning using GridSearchCV.

    Typical use is ``run_complete_pipeline``, which loads two CSV files,
    splits them into train/test sets, tunes ``SVRWithVarianceThreshold`` on
    the training part, prints test metrics and optionally pickles the model.
    """
    def __init__(self):
        """Initialize the class with empty attributes."""
        self.features_df = None
        self.labels_df = None
        self.X = None
        self.y = None
        self.X_train = None  # train/test split attributes, set by split_data()
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.model = None
        self.best_params = None
        self.label_columns = ['TSV', 'B', 'SR', 'S', 'U', 'O']
        self.logistic_model = "4pl"

    def load_data(self, features_file, labels_file):
        """Load features and labels from CSV files and join them on 'videoname'.

        Sets ``features_df``, ``labels_df``, ``X`` (every feature column except
        'videoname') and ``y`` (the columns listed in ``label_columns``).

        Raises
        ------
        KeyError
            If a label column is missing from the merged data.
        ValueError
            If the merge produces no rows.
        """
        print("Loading data...")
        # Load features file
        self.features_df = pd.read_csv(features_file)
        print(f"Features shape: {self.features_df.shape}")
        # Load labels file
        self.labels_df = pd.read_csv(labels_file)
        print(f"Labels shape: {self.labels_df.shape}")
        # Merge dataframes on videoname
        merged_df = pd.merge(self.features_df, self.labels_df, on='videoname')
        print(f"Merged data shape: {merged_df.shape}")

        # Fail early, with a readable message, instead of deep inside pandas
        # or much later in train_test_split.
        missing_labels = [col for col in self.label_columns if col not in merged_df.columns]
        if missing_labels:
            raise KeyError(f"Label columns missing from merged data: {missing_labels}")
        if len(merged_df) == 0:
            raise ValueError("No rows left after merging features and labels on 'videoname'.")

        # Extract features (all columns except videoname)
        feature_columns = [col for col in self.features_df.columns if col != 'videoname']
        self.X = merged_df[feature_columns].values
        # Extract labels
        self.y = merged_df[self.label_columns].values
        print(f"Final features shape: {self.X.shape}")
        print(f"Final labels shape: {self.y.shape}")
        print("Data loading completed successfully!")

    def split_data(self, test_size=0.2, random_state=42):
        """Split ``X``/``y`` into train and test sets.

        Raises
        ------
        ValueError
            If no data has been loaded yet.
        """
        if self.X is None or self.y is None:
            raise ValueError("Data not loaded yet. Run load_data() first.")

        print(f"\nSplitting data into train/test sets (test_size={test_size})...")
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=random_state
        )
        print(f"Training data shape: X={self.X_train.shape}, y={self.y_train.shape}")
        print(f"Test data shape: X={self.X_test.shape}, y={self.y_test.shape}")

    # No preprocess_features method: PCA is handled inside SVRWithVarianceThreshold.

    def _logistic_4pl(self, x, A, D, C, B):
        """4-parameter logistic function: D + (A - D) / (1 + (x / C) ** B)."""
        return D + (A - D) / (1 + (x / C) ** B)

    def _logistic_5pl(self, x, A, D, C, B, G):
        """5-parameter logistic function: D + (A - D) / (1 + (x / C) ** B) ** G."""
        return D + (A - D) / ((1 + (x / C) ** B) ** G)

    def _logistic_fit_and_map(self, y_pred: np.ndarray, y_true: np.ndarray, model: str = None):
        """
        Fit a 4PL/5PL logistic function mapping predictions onto the true
        scores and return the mapped predictions and metrics.

        ``model`` defaults to ``self.logistic_model``; "4pl" selects the
        4-parameter curve and any other value selects the 5-parameter one.

        Returns
        -------
        tuple
            (mapped predictions, fitted parameters, PLCC, SRCC, RMSE), the
            last three computed between the mapped predictions and y_true.
        """
        model = (model or self.logistic_model).lower()
        x = np.asarray(y_pred).ravel()
        y = np.asarray(y_true).ravel()
        # Initial guess: asymptotes at the score extremes, midpoint at the
        # median prediction, unit slope (and unit asymmetry for 5PL).
        if model == "4pl":
            func = self._logistic_4pl
            beta0 = [float(np.max(y)), float(np.min(y)), float(np.median(x)), 1.0]
        else:
            func = self._logistic_5pl
            beta0 = [float(np.max(y)), float(np.min(y)), float(np.median(x)), 1.0, 1.0]
        popt, _ = curve_fit(func, x, y, p0=beta0, maxfev=20000)
        z = func(x, *popt)
        plcc_fitted, _ = pearsonr(z, y)
        spearman_fitted, _ = spearmanr(z, y)
        rmse_fitted = float(np.sqrt(np.mean((z - y) ** 2)))
        return z, popt, plcc_fitted, spearman_fitted, rmse_fitted

    def _calculate_metrics(self, y_true, y_pred):
        """Calculate PLCC, SRCC, KRCC and RMSE for one output column.

        Pairs containing NaN are dropped first. PLCC is computed after the
        logistic mapping when the fit succeeds, otherwise on the raw
        predictions; the other metrics always use the raw predictions.

        Returns
        -------
        dict
            Keys 'PLCC', 'SRCC', 'KRCC', 'RMSE', 'logistic_params'. All
            metrics are NaN (and 'logistic_params' is None) when fewer than
            two valid pairs remain, because no correlation is defined then.
        """
        # Accept lists / integer arrays as well as float ndarrays.
        y_true = np.asarray(y_true, dtype=float).ravel()
        y_pred = np.asarray(y_pred, dtype=float).ravel()

        # Remove NaN values
        mask = ~(np.isnan(y_true) | np.isnan(y_pred))
        y_true_clean = y_true[mask]
        y_pred_clean = y_pred[mask]

        # With a single pair the fallback pearsonr call below would raise
        # from inside the except handler, so bail out before reaching it.
        if len(y_true_clean) < 2:
            return {'PLCC': np.nan, 'SRCC': np.nan, 'KRCC': np.nan, 'RMSE': np.nan, 'logistic_params': None}

        # PLCC with fitted predictions (4PL/5PL)
        logistic_params = None
        try:
            _, params, plcc_fitted, _, _ = self._logistic_fit_and_map(
                y_pred_clean, y_true_clean, model=self.logistic_model
            )
            # A fit that diverged to NaN/inf is a failed fit, not a result.
            if not np.isfinite(plcc_fitted):
                raise ValueError("fitted PLCC is not finite")
            plcc = plcc_fitted
            logistic_params = params
        except Exception as e:
            print(f"        Warning: Logistic fitting failed ({e}), using original predictions for PLCC")
            plcc, _ = pearsonr(y_true_clean, y_pred_clean)

        # Other metrics with original predictions
        srcc, _ = spearmanr(y_true_clean, y_pred_clean)
        krcc, _ = kendalltau(y_true_clean, y_pred_clean)
        # Same RMSE expression as _logistic_fit_and_map uses.
        rmse = float(np.sqrt(np.mean((y_true_clean - y_pred_clean) ** 2)))

        return {
            'PLCC': plcc,
            'SRCC': srcc,
            'KRCC': krcc,
            'RMSE': rmse,
            'logistic_params': logistic_params
        }

    def tune_hyperparameters(self, cv=5):
        """Perform hyperparameter tuning using GridSearchCV on training data only.

        Stores the winning parameters in ``best_params`` and the refitted
        estimator in ``model``.

        Raises
        ------
        ValueError
            If the data has not been split yet, or if ``cv`` is an integer
            smaller than 2 (k-fold cross-validation needs at least 2 folds).
        """
        if self.X_train is None:
            raise ValueError("Data not split yet. Run split_data() first.")

        # Reject an impossible fold count before any work is started.
        if isinstance(cv, (int, np.integer)) and not isinstance(cv, bool) and cv < 2:
            raise ValueError(f"cv must be at least 2 for k-fold cross-validation, got {cv}.")

        print(f"\nPerforming hyperparameter tuning with {cv}-fold cross-validation...")

        # Define parameter grid (wider search spaces kept as comments)
        param_grid = {
            # 'variance_threshold': [0.8, 0.85, 0.9, 0.95, 0.975, 0.99, 1],
            'variance_threshold': [0.99],
            # 'kernel': ['linear', 'rbf'],
            'kernel': ['rbf'],
            # 'C': [0.1, 1, 10, 100],
            'C': [10],
            # 'gamma': ['scale', 'auto'],
            'gamma': ['scale'],
            # 'max_iter': [100, 500, -1]
            'max_iter': [500]
        }

        # Product over every key, so the count stays right if keys are added.
        n_combinations = 1
        for candidate_values in param_grid.values():
            n_combinations *= len(candidate_values)
        print(f"Parameter grid size: {n_combinations} combinations")

        # Perform grid search ONLY on training data
        grid_search = GridSearchCV(
            estimator=SVRWithVarianceThreshold(),
            param_grid=param_grid,
            cv=cv,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=1
        )
        print("Starting grid search on training data...")
        grid_search.fit(self.X_train, self.y_train)  # Only training data

        # Store best parameters and model
        self.best_params = grid_search.best_params_
        self.model = grid_search.best_estimator_
        print("Hyperparameter tuning completed!")

        self.print_best_parameters()

    def print_best_parameters(self):
        """Print the best hyperparameters found by GridSearchCV."""
        print("\n" + "="*50)
        print("BEST HYPERPARAMETERS FOUND:")
        print("="*50)
        if self.best_params:
            for param, value in self.best_params.items():
                print(f"{param}: {value}")
        else:
            print("No hyperparameters found. Run tune_hyperparameters() first.")
        print("="*50)

    def evaluate_model(self, dataset="test"):
        """Evaluate the trained model on test data (default) or training data.

        Parameters
        ----------
        dataset : {"test", "train"}, default="test"
            Which split to score.

        Returns
        -------
        dict or None
            Maps each label name to its metrics dict (see
            ``_calculate_metrics``). None if no model has been trained.

        Raises
        ------
        ValueError
            If the requested split is unavailable or ``dataset`` is unknown.
        """
        if self.model is None:
            print("No trained model found. Run tune_hyperparameters() first.")
            return None

        dataset_key = str(dataset).lower()
        if dataset_key == "test":
            if self.X_test is None:
                raise ValueError("Test data not available. Run split_data() first.")
            X_eval, y_eval = self.X_test, self.y_test
        elif dataset_key == "train":
            if self.X_train is None:
                raise ValueError("Training data not available. Run split_data() first.")
            X_eval, y_eval = self.X_train, self.y_train
        else:
            raise ValueError(f"dataset must be 'test' or 'train', got {dataset!r}.")
        dataset_name = dataset_key.upper()

        print("\n" + "="*60)
        print(f"MODEL PERFORMANCE ON {dataset_name} DATA:")
        print("="*60)

        # Make predictions
        y_pred = np.asarray(self.model.predict(X_eval))
        y_eval = np.asarray(y_eval)

        # Calculate metrics for each output
        print(f"{'Label':<8} {'PLCC':<8} {'SRCC':<8} {'KRCC':<8} {'RMSE':<8}")
        print("-" * 50)

        results = {}
        for i, label in enumerate(self.label_columns):
            metrics = self._calculate_metrics(y_eval[:, i], y_pred[:, i])
            results[label] = metrics

            print(f"{label:<8} {metrics['PLCC']:<8.4f} {metrics['SRCC']:<8.4f} "
                  f"{metrics['KRCC']:<8.4f} {metrics['RMSE']:<8.4f}")

        print("-" * 50)

        # Overall metrics across all outputs, skipping NaN entries per metric.
        metric_names = ('PLCC', 'SRCC', 'KRCC', 'RMSE')
        valid_values = {
            name: [m[name] for m in results.values() if not np.isnan(m[name])]
            for name in metric_names
        }
        if valid_values['PLCC']:
            # An empty list averages to NaN explicitly instead of via np.mean([]).
            means = [
                float(np.mean(valid_values[name])) if valid_values[name] else float('nan')
                for name in metric_names
            ]
            print(f"Mean     {means[0]:<8.4f} {means[1]:<8.4f} "
                  f"{means[2]:<8.4f} {means[3]:<8.4f}")

        return results

    def save_model(self, filepath=None):
        """Save the complete trained model pipeline to a pickle file.

        ``filepath`` defaults to "svr_pca_model.pkl" in the working directory;
        missing parent directories are created.

        Raises
        ------
        ValueError
            If no model has been trained or loaded.
        """
        if self.model is None:
            raise ValueError("Model not trained yet. Run the complete pipeline first.")

        if filepath is None:
            filepath = "svr_pca_model.pkl"

        # dirname is '' for a bare filename; fall back to the current directory.
        os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)

        has_pca = hasattr(self.model, 'pca_')
        model_package = {
            'model': self.model,
            'best_params': self.best_params,
            'label_columns': self.label_columns,
            'logistic_model': self.logistic_model,
            'pca_n_components': self.model.pca_.n_components_ if has_pca else None,
            'explained_variance_ratio': self.model.pca_.explained_variance_ratio_ if has_pca else None,
            'save_timestamp': datetime.datetime.now().isoformat()
        }

        try:
            with open(filepath, 'wb') as f:
                pickle.dump(model_package, f)

            print("\nModel saved successfully!")
            print(f"Filepath: {os.path.abspath(filepath)}")
            print(f"File size: {os.path.getsize(filepath) / (1024*1024):.2f} MB")

        except Exception as e:
            print(f"Error saving model: {str(e)}")
            raise

    def load_model(self, filepath):
        """Load a previously saved model from a pickle file.

        Restores ``model``, ``best_params``, ``label_columns`` and
        ``logistic_model`` from a package written by ``save_model``. Errors
        are reported and re-raised.
        """
        try:
            # SECURITY: unpickling executes arbitrary code from the file.
            # Only load model files that come from a trusted source.
            with open(filepath, 'rb') as f:
                model_package = pickle.load(f)

            self.model = model_package['model']
            self.best_params = model_package['best_params']
            self.label_columns = model_package['label_columns']
            self.logistic_model = model_package.get('logistic_model', '4pl')

            print("\nModel loaded successfully!")
            print(f"Filepath: {os.path.abspath(filepath)}")

            if model_package.get('pca_n_components'):
                print(f"PCA components: {model_package['pca_n_components']}")
            print(f"Logistic model: {self.logistic_model}")
            print(f"Saved on: {model_package.get('save_timestamp', 'Unknown')}")

            if self.best_params:
                print("\nLoaded hyperparameters:")
                for param, value in self.best_params.items():
                    print(f"  {param}: {value}")

        except FileNotFoundError:
            print(f"Error: Model file not found at {filepath}")
            raise
        except Exception as e:
            print(f"Error loading model: {str(e)}")
            raise

    def run_complete_pipeline(
        self,
        features_file=None,
        labels_file=None,
        cv=5,
        test_size=0.2,
        save_model_path=None,
        logistic_model="4pl"
    ):
        """Run load -> split -> tune -> evaluate (-> save) in one call.

        Parameters
        ----------
        features_file, labels_file : path to the CSV inputs (both required)
        cv : cross-validation setting passed to tune_hyperparameters
        test_size : fraction held out by split_data (random_state is 42)
        save_model_path : if given, the fitted model is pickled there
        logistic_model : "4pl" or "5pl", used for the PLCC mapping

        Raises
        ------
        ValueError
            If either input file is not provided.
        """
        if features_file is None or labels_file is None:
            raise ValueError("Both features_file and labels_file must be provided.")

        print("Running SVR with PCA pipeline...")
        print(f"Features: {features_file}")
        print(f"Labels:   {labels_file}")
        print(f"Logistic model: {logistic_model}")
        print("=" * 60)

        self.logistic_model = logistic_model
        self.load_data(features_file, labels_file)
        self.split_data(test_size=test_size, random_state=42)  # Split data first
        self.tune_hyperparameters(cv)  # Train only on training data

        # Evaluate on test data for realistic performance estimate
        self.evaluate_model()

        # Save model after training
        if save_model_path:
            self.save_model(save_model_path)

        print("\nPipeline completed successfully!")

    def predict(self, new_features):
        """Make predictions on new data with the trained or loaded model.

        Raises
        ------
        ValueError
            If no model is available.
        """
        if self.model is None:
            raise ValueError("Model not trained yet. Run the complete pipeline first or load a saved model.")
        return self.model.predict(new_features)

In [11]:
if __name__ == "__main__":
    svr_model = MultiOutputSVRWithPCA()
    # Paths are built with os.path.join so the cell runs on any OS, not only
    # where "\" is the path separator. They are relative to the working
    # directory of the kernel.
    # cv must be >= 2: k-fold cross-validation cannot run with a single fold.
    svr_model.run_complete_pipeline(
        features_file=os.path.join("dataset", "cleaned-svd-features.csv"),
        labels_file=os.path.join("dataset", "cleaned-mos.csv"),
        save_model_path=os.path.join("trained_models", "svr_pca_model.pkl"),
        test_size=0.2,
        cv=5,
    )
    print("Pipeline execution finished.")

Running SVR with PCA pipeline...
Features: .\dataset\cleaned-svd-features.csv
Labels:   .\dataset\cleaned-mos.csv
Logistic model: 4pl
Loading data...
Features shape: (1000, 1153)
Labels shape: (1000, 7)
Merged data shape: (1000, 1159)
Final features shape: (1000, 1152)
Final labels shape: (1000, 6)
Data loading completed successfully!

Splitting data into train/test sets (test_size=0.2)...
Training data shape: X=(800, 1152), y=(800, 6)
Test data shape: X=(200, 1152), y=(200, 6)

Performing hyperparameter tuning with 5-fold cross-validation...
Parameter grid size: 3 combinations
Starting grid search on training data...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Hyperparameter tuning completed!

BEST HYPERPARAMETERS FOUND:
C: 10
gamma: scale
kernel: rbf
max_iter: 500
variance_threshold: 0.99

MODEL PERFORMANCE ON TEST DATA:
Label    PLCC     SRCC     KRCC     RMSE    
--------------------------------------------------
TSV      0.8773   0.8527   0.6684   0.3186  
B       