In [1]:
import os
import datetime
import pickle
import warnings

import numpy as np
import pandas as pd

from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.base import BaseEstimator, RegressorMixin

from scipy.optimize import curve_fit
from scipy.stats import pearsonr, spearmanr, kendalltau

# Suppress warnings globally
warnings.filterwarnings('ignore')

In [2]:
class SVRWithVarianceThreshold(BaseEstimator, RegressorMixin):
    """Multi-output SVR preceded by a power transform and a PCA projection.

    The number of PCA components is the smallest count whose cumulative
    explained-variance ratio reaches ``variance_threshold``.

    Parameters
    ----------
    variance_threshold : float
        Cumulative explained-variance ratio the retained components must reach.
    kernel, C, gamma, epsilon, max_iter
        Forwarded unchanged to ``sklearn.svm.SVR``.
    """

    def __init__(self, variance_threshold=0.95, kernel='rbf', C=1,
                 gamma='scale', epsilon=0.1, max_iter=-1):
        self.variance_threshold = variance_threshold
        self.kernel = kernel
        self.C = C
        self.gamma = gamma
        self.epsilon = epsilon
        self.max_iter = max_iter

    def fit(self, X, y):
        """Fit the scaler, the PCA projection and one SVR per output column.

        Returns ``self`` so calls can be chained.
        """
        # Power-transform the raw features before PCA.
        self.scaler_ = PowerTransformer()
        X_scaled = self.scaler_.fit_transform(X)

        # A full PCA is fitted first only to read the explained-variance curve.
        pca = PCA()
        pca.fit(X_scaled)
        cumsum_variance = np.cumsum(pca.explained_variance_ratio_)
        reached = cumsum_variance >= self.variance_threshold
        if reached.any():
            n_components = int(np.argmax(reached)) + 1
        else:
            # The threshold is never reached (e.g. 1.0 with floating-point
            # rounding). np.argmax over an all-False mask returns 0, which
            # would silently keep a single component; keep all of them instead.
            n_components = len(cumsum_variance)
        self.pca_ = PCA(n_components=n_components)
        X_pca = self.pca_.fit_transform(X_scaled)

        # One +/-1 multiplier per retained component, taken from the sign of the
        # first `n_components` loadings of the first principal axis. The same
        # vector is re-applied in predict() so both paths see identical inputs.
        signs = np.sign(self.pca_.components_[0, :n_components])
        # np.sign yields 0 for an exactly-zero loading, which would wipe out a
        # whole component; treat that case as "no flip".
        signs[signs == 0] = 1.0
        self.pca_signs_ = signs
        X_pca *= self.pca_signs_

        # Create and fit the SVR model (one independent SVR per target column).
        base_svr = SVR(kernel=self.kernel, C=self.C,
                       gamma=self.gamma, epsilon=self.epsilon,
                       max_iter=self.max_iter)
        self.model_ = MultiOutputRegressor(base_svr)
        self.model_.fit(X_pca, y)
        return self

    def predict(self, X):
        """Apply the fitted scaler, PCA and sign vector, then predict with the SVRs."""
        X_scaled = self.scaler_.transform(X)
        X_pca = self.pca_.transform(X_scaled)
        X_pca *= self.pca_signs_  # same multipliers that were used in fit()
        return self.model_.predict(X_pca)

In [3]:
class SecondStageSVRTrainer:
    """
    Train, evaluate and persist a second-stage multi-output SVR.

    The second stage takes the predictions of a pre-trained first-stage model
    and stacks them with extra features (MeanNIQE, MeanSSIM) before fitting
    one SVR per label column.
    """

    def __init__(self, random_state=42):
        """Initialize the second-stage trainer.

        Parameters
        ----------
        random_state : int
            Seed used by split_data() and run_complete_pipeline() when no
            explicit seed is passed to them.
        """
        self.first_stage_model = None
        self.model = None  # Second-stage model
        self.scaler = StandardScaler()
        self.best_params = None
        self.random_state = random_state
        self.label_columns = ['TSV', 'B', 'SR', 'S', 'U', 'O']
        self.logistic_model = "4pl"

        # Data storage
        self.features_df = None
        self.labels_df = None
        self.extra_features_df = None
        # Filled by load_data(); initialised here so split_data() can detect
        # "not loaded yet" instead of failing with AttributeError.
        self.extra_features_data = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.extra_train = None
        self.extra_test = None

    def load_first_stage_model(self, filepath):
        """Load the pre-trained first-stage model from a pickle file.

        The pickle is expected to hold a mapping with a 'model' entry.
        Any error is reported on stdout and re-raised.
        """
        try:
            # NOTE(security): pickle.load executes arbitrary code from the file;
            # only load model files that come from a trusted source.
            with open(filepath, 'rb') as f:
                model_package = pickle.load(f)

            self.first_stage_model = model_package['model']
            print(f"\nFirst-stage model loaded successfully from: {filepath}")

        except FileNotFoundError:
            print(f"Error: First-stage model file not found at {filepath}")
            raise
        except Exception as e:
            print(f"Error loading first-stage model: {str(e)}")
            raise

    def load_data(self, features_file, labels_file, extra_features_file):
        """Load features, labels and extra features from CSV files.

        The three tables are inner-joined on the 'videoname' column.

        Raises
        ------
        ValueError
            If the join leaves no rows at all.
        """
        print("Loading data for second-stage training...")

        # Load all files
        self.features_df = pd.read_csv(features_file)
        self.labels_df = pd.read_csv(labels_file)
        self.extra_features_df = pd.read_csv(extra_features_file)

        print(f"Features shape: {self.features_df.shape}")
        print(f"Labels shape: {self.labels_df.shape}")
        print(f"Extra features shape: {self.extra_features_df.shape}")

        # Merge dataframes on videoname (inner join: unmatched rows are dropped)
        merged_df = pd.merge(self.features_df, self.labels_df, on='videoname')
        merged_all = pd.merge(merged_df, self.extra_features_df, on='videoname')
        print(f"Merged data shape: {merged_all.shape}")

        if merged_all.empty:
            raise ValueError(
                "No rows left after merging on 'videoname'; "
                "check that the three files share video names."
            )
        # An inner join can silently drop unmatched rows or multiply duplicated
        # keys; make that visible instead of training on unexpected data.
        smallest_input = min(len(self.features_df), len(self.labels_df),
                             len(self.extra_features_df))
        if len(merged_all) != smallest_input:
            print(f"Warning: merge produced {len(merged_all)} rows but the smallest "
                  f"input has {smallest_input}; unmatched or duplicated 'videoname' "
                  "values are present.")

        # Extract features (all columns except videoname from features file)
        feature_columns = [col for col in self.features_df.columns if col != 'videoname']
        self.X = merged_all[feature_columns].values

        # Extract labels
        self.y = merged_all[self.label_columns].values

        # Store extra features separately
        self.extra_features_data = merged_all[['MeanNIQE', 'MeanSSIM']].reset_index(drop=True)

        print(f"Final features shape: {self.X.shape}")
        print(f"Final labels shape: {self.y.shape}")
        print(f"Extra features shape: {self.extra_features_data.shape}")
        print("Data loading completed successfully!")

    def split_data(self, test_size=0.2, random_state=None):
        """Split features, labels and extra features into train and test sets.

        ``random_state=None`` falls back to the seed given to the constructor.

        Raises
        ------
        ValueError
            If load_data() has not been run yet.
        """
        if self.X is None or self.y is None or getattr(self, 'extra_features_data', None) is None:
            raise ValueError("Data not loaded. Run load_data() first.")
        if random_state is None:
            random_state = self.random_state

        print(f"\nSplitting data into train/test sets (test_size={test_size}, random_state={random_state})...")

        self.X_train, self.X_test, self.y_train, self.y_test, self.extra_train, self.extra_test = train_test_split(
            self.X, self.y, self.extra_features_data,
            test_size=test_size, random_state=random_state
        )

        print(f"Training data shape: X={self.X_train.shape}, y={self.y_train.shape}")
        print(f"Test data shape: X={self.X_test.shape}, y={self.y_test.shape}")
        print(f"Extra features train/test shapes: {self.extra_train.shape}, {self.extra_test.shape}")

    def prepare_second_stage_features(self, X, extra_features_data, feature_choice='niqe_ssim'):
        """Stack first-stage predictions with the selected extra features.

        Parameters
        ----------
        X : array-like
            Input passed to the first-stage model's predict().
        extra_features_data : DataFrame-like
            Must provide the columns selected by ``feature_choice``.
        feature_choice : {'niqe', 'ssim', 'niqe_ssim'}

        Returns
        -------
        numpy.ndarray
            First-stage predictions followed column-wise by the extra features.
        """
        if self.first_stage_model is None:
            raise ValueError("First-stage model not loaded. Run load_first_stage_model() first.")

        # Select extra features based on choice
        if feature_choice == 'niqe':
            extra_cols = ['MeanNIQE']
        elif feature_choice == 'ssim':
            extra_cols = ['MeanSSIM']
        elif feature_choice == 'niqe_ssim':
            extra_cols = ['MeanNIQE', 'MeanSSIM']
        else:
            raise ValueError("feature_choice must be one of ['niqe', 'ssim', 'niqe_ssim']")

        extra_selected = extra_features_data[extra_cols].values
        first_stage_pred = self.first_stage_model.predict(X)

        return np.hstack((first_stage_pred, extra_selected))

    def tune_hyperparameters(self, feature_choice='niqe_ssim', cv=5):
        """Grid-search the second-stage SVR on the training split.

        Fits ``self.scaler`` on the stacked training features, then stores the
        best parameters in ``self.best_params`` and the refitted best estimator
        in ``self.model``.
        """
        if self.X_train is None:
            raise ValueError("Data not split yet. Run split_data() first.")
        if self.first_stage_model is None:
            raise ValueError("First-stage model not loaded. Run load_first_stage_model() first.")

        print(f"\nTuning hyperparameters for second-stage SVR using features: {feature_choice}")
        print(f"Cross-validation folds: {cv}")

        # Prepare training features for second stage
        combined_train_features = self.prepare_second_stage_features(
            self.X_train, self.extra_train, feature_choice
        )

        # Scale the combined features
        combined_train_scaled = self.scaler.fit_transform(combined_train_features)

        print(f"Combined training features shape: {combined_train_scaled.shape}")

        # Define parameter grid for second-stage SVR
        param_grid = {
            # 'estimator__kernel': ['rbf', 'linear'],
            'estimator__kernel': ['rbf'],
            # 'estimator__C': [0.1, 1, 10, 100],
            'estimator__C': [1],
            # 'estimator__gamma': ['scale', 'auto'],
            'estimator__gamma': ['scale'],
            # 'estimator__epsilon': [0.01, 0.1],
            'estimator__epsilon': [0.1],
            # 'estimator__max_iter': [100, 500, 1000, -1]
            'estimator__max_iter': [100, 500, -1]
        }

        print(f"Parameter combinations to test: {np.prod([len(values) for values in param_grid.values()])}")

        # Create base SVR and MultiOutputRegressor
        base_svr = SVR()
        multi_output_svr = MultiOutputRegressor(base_svr)

        # Perform grid search
        grid_search = GridSearchCV(
            estimator=multi_output_svr,
            param_grid=param_grid,
            cv=cv,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=1
        )

        print("Starting grid search for second-stage SVR...")
        grid_search.fit(combined_train_scaled, self.y_train)

        # Store best parameters and model
        self.best_params = grid_search.best_params_
        self.model = grid_search.best_estimator_

        print("Second-stage hyperparameter tuning completed!")
        self.print_best_parameters()

    def print_best_parameters(self):
        """Print the best hyperparameters found by GridSearchCV."""
        print("\n" + "="*50)
        print("BEST SECOND-STAGE HYPERPARAMETERS:")
        print("="*50)
        if self.best_params:
            for param, value in self.best_params.items():
                print(f"{param}: {value}")
        else:
            print("No hyperparameters found. Run tune_hyperparameters() first.")
        print("="*50)

    def _logistic_4pl(self, x, A, D, C, B):
        """4-parameter logistic function: D + (A - D) / (1 + (x / C) ** B)."""
        return D + (A - D) / (1 + (x / C) ** B)

    def _logistic_5pl(self, x, A, D, C, B, G):
        """5-parameter logistic function: D + (A - D) / (1 + (x / C) ** B) ** G."""
        return D + (A - D) / ((1 + (x / C) ** B) ** G)

    def _logistic_fit_and_map(self, y_pred: np.ndarray, y_true: np.ndarray, model: str = None):
        """Fit a 4PL/5PL logistic from predictions to targets.

        Parameters
        ----------
        y_pred, y_true : array-like
            Flattened before fitting.
        model : {'4pl', '5pl'}, optional
            Case-insensitive; defaults to ``self.logistic_model``.

        Returns
        -------
        tuple
            (mapped predictions, fitted parameters, PLCC, SRCC, RMSE), the three
            metrics being computed between the mapped predictions and y_true.

        Raises
        ------
        ValueError
            If ``model`` is neither '4pl' nor '5pl'.
        """
        model = (model or self.logistic_model).lower()
        x = np.asarray(y_pred).ravel()
        y = np.asarray(y_true).ravel()
        # Initial guess: asymptotes from the target range, midpoint from the
        # median prediction, unit slope (and unit asymmetry for 5PL).
        beta0 = [float(np.max(y)), float(np.min(y)), float(np.median(x)), 1.0]
        if model == "4pl":
            func = self._logistic_4pl
        elif model == "5pl":
            func = self._logistic_5pl
            beta0.append(1.0)
        else:
            # Previously any other name silently selected the 5PL curve.
            raise ValueError(f"Unknown logistic model '{model}'; expected '4pl' or '5pl'.")
        popt, _ = curve_fit(func, x, y, p0=beta0, maxfev=20000)
        z = func(x, *popt)
        plcc_fitted, _ = pearsonr(z, y)
        spearman_fitted, _ = spearmanr(z, y)
        rmse_fitted = float(np.sqrt(np.mean((z - y) ** 2)))
        return z, popt, plcc_fitted, spearman_fitted, rmse_fitted

    def _calculate_metrics(self, y_true, y_pred):
        """Compute PLCC, SRCC, KRCC and RMSE for one output.

        Pairs where either value is NaN are dropped first. PLCC is computed on
        logistic-mapped predictions when the fit succeeds, otherwise on the raw
        predictions; the other metrics always use the raw predictions.
        With fewer than two valid pairs every metric is NaN.
        """
        # Remove NaN values
        mask = ~(np.isnan(y_true) | np.isnan(y_pred))
        y_true_clean = y_true[mask]
        y_pred_clean = y_pred[mask]

        # pearsonr needs at least two points, so the fallback below would raise
        # on a single sample; report "undefined" instead of crashing.
        if len(y_true_clean) < 2:
            return {'PLCC': np.nan, 'SRCC': np.nan, 'KRCC': np.nan, 'RMSE': np.nan}

        # PLCC with fitted predictions (4PL/5PL)
        try:
            _, _, plcc, _, _ = self._logistic_fit_and_map(
                y_pred_clean, y_true_clean, model=self.logistic_model
            )
            # A degenerate fit can return NaN without raising; treat it as a failure.
            if not np.isfinite(plcc):
                raise ValueError("fitted PLCC is not finite")
        except Exception as e:
            print(f"        Warning: Logistic fitting failed ({e}), using original predictions for PLCC")
            plcc, _ = pearsonr(y_true_clean, y_pred_clean)

        # Other metrics with original predictions
        srcc, _ = spearmanr(y_true_clean, y_pred_clean)
        krcc, _ = kendalltau(y_true_clean, y_pred_clean)
        rmse = np.sqrt(mean_squared_error(y_true_clean, y_pred_clean))

        return {'PLCC': plcc, 'SRCC': srcc, 'KRCC': krcc, 'RMSE': rmse}

    def evaluate_model(self, feature_choice='niqe_ssim'):
        """Evaluate the second-stage model on the test split.

        Prints one row of metrics per label plus a mean row.

        Returns
        -------
        dict
            ``{label: {'PLCC', 'SRCC', 'KRCC', 'RMSE'}}`` for every label, plus a
            'Mean' entry when at least one PLCC value is not NaN.
        """
        if self.model is None:
            raise ValueError("Second-stage model not trained. Run tune_hyperparameters() first.")
        if self.X_test is None:
            raise ValueError("Test data not available. Run split_data() first.")

        print("\n" + "="*60)
        print("SECOND-STAGE MODEL PERFORMANCE ON TEST DATA:")
        print(f"Using features: {feature_choice}")
        print("="*60)

        # Prepare test features for second stage (scaler was fitted on training data)
        combined_test_features = self.prepare_second_stage_features(
            self.X_test, self.extra_test, feature_choice
        )
        combined_test_scaled = self.scaler.transform(combined_test_features)

        # Make predictions
        y_pred = self.model.predict(combined_test_scaled)

        # Calculate metrics for each output
        print(f"{'Label':<8} {'PLCC':<8} {'SRCC':<8} {'KRCC':<8} {'RMSE':<8}")
        print("-" * 50)

        metric_names = ('PLCC', 'SRCC', 'KRCC', 'RMSE')
        results = {}
        for i, label in enumerate(self.label_columns):
            metrics = self._calculate_metrics(self.y_test[:, i], y_pred[:, i])
            results[label] = metrics

            print(f"{label:<8} {metrics['PLCC']:<8.4f} {metrics['SRCC']:<8.4f} "
                  f"{metrics['KRCC']:<8.4f} {metrics['RMSE']:<8.4f}")

        print("-" * 50)

        # Overall metrics across all outputs, ignoring NaN entries per metric
        valid = {
            name: [m[name] for m in results.values() if not np.isnan(m[name])]
            for name in metric_names
        }
        if valid['PLCC']:
            means = {name: np.mean(valid[name]) for name in metric_names}
            print(f"Mean     {means['PLCC']:<8.4f} {means['SRCC']:<8.4f} "
                  f"{means['KRCC']:<8.4f} {means['RMSE']:<8.4f}")
            results['Mean'] = means

        return results

    def save_model(self, filepath):
        """Save the second-stage model, its scaler and metadata to a pickle file."""
        if self.model is None:
            raise ValueError("No second-stage model to save. Train the model first.")

        # A bare file name has an empty dirname; fall back to the current directory.
        os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)

        model_package = {
            'model': self.model,
            'scaler': self.scaler,
            'best_params': self.best_params,
            'label_columns': self.label_columns,
            'logistic_model': self.logistic_model,
            'random_state': self.random_state,
            'save_timestamp': datetime.datetime.now().isoformat()
        }

        try:
            with open(filepath, 'wb') as f:
                pickle.dump(model_package, f)

            print("\nSecond-stage model saved successfully!")
            print(f"Filepath: {os.path.abspath(filepath)}")
            print(f"File size: {os.path.getsize(filepath) / (1024*1024):.2f} MB")

        except Exception as e:
            print(f"Error saving second-stage model: {str(e)}")
            raise

    def load_model(self, filepath):
        """Load a previously saved second-stage model from a pickle file.

        'model', 'scaler' and 'best_params' are required entries; the remaining
        metadata falls back to the constructor defaults when absent.
        """
        try:
            # NOTE(security): pickle.load executes arbitrary code from the file;
            # only load model files that come from a trusted source.
            with open(filepath, 'rb') as f:
                model_package = pickle.load(f)

            self.model = model_package['model']
            self.scaler = model_package['scaler']
            self.best_params = model_package['best_params']
            self.label_columns = model_package.get('label_columns', ['TSV', 'B', 'SR', 'S', 'U', 'O'])
            self.logistic_model = model_package.get('logistic_model', '4pl')
            self.random_state = model_package.get('random_state', 42)

            print("\nSecond-stage model loaded successfully!")
            print(f"Filepath: {os.path.abspath(filepath)}")
            print(f"Random state: {self.random_state}")
            print(f"Saved on: {model_package.get('save_timestamp', 'Unknown')}")

        except FileNotFoundError:
            print(f"Error: Second-stage model file not found at {filepath}")
            raise
        except Exception as e:
            print(f"Error loading second-stage model: {str(e)}")
            raise

    def run_complete_pipeline(
        self,
        first_stage_model_path,
        features_file,
        labels_file,
        extra_features_file,
        feature_choice='niqe_ssim',
        cv=5,
        test_size=0.2,
        save_model_path=None,
        random_state=None
    ):
        """Run the complete second-stage training pipeline.

        Loads the first-stage model and the data, splits, tunes, evaluates and
        optionally saves the model.

        ``random_state=None`` uses the seed given to the constructor. It used to
        default to 42, which silently overrode a seed set at construction time;
        for a trainer built with the default seed the behaviour is unchanged.
        """
        if random_state is None:
            random_state = self.random_state

        print("Running Second-Stage SVR Training Pipeline...")
        print(f"First-stage model: {first_stage_model_path}")
        print(f"Features: {features_file}")
        print(f"Labels: {labels_file}")
        print(f"Extra features: {extra_features_file}")
        print(f"Feature choice: {feature_choice}")
        print(f"Random state: {random_state}")
        print("=" * 60)

        self.random_state = random_state

        # Load first-stage model
        self.load_first_stage_model(first_stage_model_path)

        # Load and split data
        self.load_data(features_file, labels_file, extra_features_file)
        self.split_data(test_size=test_size, random_state=random_state)

        # Train second-stage model
        self.tune_hyperparameters(feature_choice=feature_choice, cv=cv)

        # Evaluate second-stage model
        self.evaluate_model(feature_choice=feature_choice)

        # Save model if path provided
        if save_model_path:
            self.save_model(save_model_path)

        print("\nSecond-stage pipeline completed successfully!")

    def predict(self, new_features, extra_features_data, feature_choice='niqe_ssim'):
        """Predict with the trained second-stage model.

        ``feature_choice`` must match the choice used when the model was tuned,
        because ``self.scaler`` and the model were fitted on that column layout.
        """
        if self.model is None:
            raise ValueError("Second-stage model not trained. Run the pipeline first or load a saved model.")
        if self.first_stage_model is None:
            raise ValueError("First-stage model not loaded. Run load_first_stage_model() first.")

        # Prepare features for second stage
        combined_features = self.prepare_second_stage_features(
            new_features, extra_features_data, feature_choice
        )
        combined_scaled = self.scaler.transform(combined_features)

        return self.model.predict(combined_scaled)

In [6]:
if __name__ == "__main__":
    # Example usage
    # Paths are built with os.path.join so the example also runs on non-Windows
    # systems; on Windows they resolve to the same ".\trained_models\..." and
    # ".\dataset\..." locations as before.
    model_dir = os.path.join(".", "trained_models")
    data_dir = os.path.join(".", "dataset")

    second_stage_trainer = SecondStageSVRTrainer(random_state=42)

    # Run complete pipeline
    second_stage_trainer.run_complete_pipeline(
        first_stage_model_path=os.path.join(model_dir, "svr_pca_model.pkl"),
        features_file=os.path.join(data_dir, "cleaned-svd-features.csv"),
        labels_file=os.path.join(data_dir, "cleaned-mos.csv"),
        extra_features_file=os.path.join(data_dir, "cleaned-extra-features.csv"),
        feature_choice='niqe_ssim',  # Options: 'niqe', 'ssim', 'niqe_ssim'
        cv=5,
        test_size=0.2,
        save_model_path=os.path.join(model_dir, "second_stage_svr_model.pkl"),
        random_state=42
    )

    print("Second-stage training completed!")

Running Second-Stage SVR Training Pipeline...
First-stage model: .\trained_models\svr_pca_model.pkl
Features: .\dataset\cleaned-svd-features.csv
Labels: .\dataset\cleaned-mos.csv
Extra features: .\dataset\cleaned-extra-features.csv
Feature choice: ssim
Random state: 42

First-stage model loaded successfully from: .\trained_models\svr_pca_model.pkl
Loading data for second-stage training...
Features shape: (1000, 1153)
Labels shape: (1000, 7)
Extra features shape: (1000, 3)
Merged data shape: (1000, 1161)
Final features shape: (1000, 1152)
Final labels shape: (1000, 6)
Extra features shape: (1000, 2)
Data loading completed successfully!

Splitting data into train/test sets (test_size=0.2, random_state=42)...
Training data shape: X=(800, 1152), y=(800, 6)
Test data shape: X=(200, 1152), y=(200, 6)
Extra features train/test shapes: (800, 2), (200, 2)

Tuning hyperparameters for second-stage SVR using features: ssim
Cross-validation folds: 5
Combined training features shape: (800, 7)
Parame

In [1]:
import os
import datetime
import pickle
import warnings
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.base import BaseEstimator, RegressorMixin
from scipy.optimize import curve_fit
from scipy.stats import pearsonr, spearmanr, kendalltau

# Suppress warnings globally
warnings.filterwarnings('ignore')

class SVRWithVarianceThreshold(BaseEstimator, RegressorMixin):
    """Multi-output SVR preceded by a power transform and a PCA projection.

    The number of PCA components is the smallest count whose cumulative
    explained-variance ratio reaches ``variance_threshold``.

    Parameters
    ----------
    variance_threshold : float
        Cumulative explained-variance ratio the retained components must reach.
    kernel, C, gamma, epsilon, max_iter
        Forwarded unchanged to ``sklearn.svm.SVR``.
    """

    def __init__(self, variance_threshold=0.95, kernel='rbf', C=1,
                 gamma='scale', epsilon=0.1, max_iter=-1):
        self.variance_threshold = variance_threshold
        self.kernel = kernel
        self.C = C
        self.gamma = gamma
        self.epsilon = epsilon
        self.max_iter = max_iter

    def fit(self, X, y):
        """Fit the scaler, the PCA projection and one SVR per output column.

        Returns ``self`` so calls can be chained.
        """
        # Power-transform the raw features before PCA.
        self.scaler_ = PowerTransformer()
        X_scaled = self.scaler_.fit_transform(X)

        # A full PCA is fitted first only to read the explained-variance curve.
        pca = PCA()
        pca.fit(X_scaled)
        cumsum_variance = np.cumsum(pca.explained_variance_ratio_)
        reached = cumsum_variance >= self.variance_threshold
        if reached.any():
            n_components = int(np.argmax(reached)) + 1
        else:
            # The threshold is never reached (e.g. 1.0 with floating-point
            # rounding). np.argmax over an all-False mask returns 0, which
            # would silently keep a single component; keep all of them instead.
            n_components = len(cumsum_variance)
        self.pca_ = PCA(n_components=n_components)
        X_pca = self.pca_.fit_transform(X_scaled)

        # One +/-1 multiplier per retained component, taken from the sign of the
        # first `n_components` loadings of the first principal axis. The same
        # vector is re-applied in predict() so both paths see identical inputs.
        signs = np.sign(self.pca_.components_[0, :n_components])
        # np.sign yields 0 for an exactly-zero loading, which would wipe out a
        # whole component; treat that case as "no flip".
        signs[signs == 0] = 1.0
        self.pca_signs_ = signs
        X_pca *= self.pca_signs_

        # Create and fit the SVR model (one independent SVR per target column).
        base_svr = SVR(kernel=self.kernel, C=self.C,
                       gamma=self.gamma, epsilon=self.epsilon,
                       max_iter=self.max_iter)
        self.model_ = MultiOutputRegressor(base_svr)
        self.model_.fit(X_pca, y)
        return self

    def predict(self, X):
        """Apply the fitted scaler, PCA and sign vector, then predict with the SVRs."""
        X_scaled = self.scaler_.transform(X)
        X_pca = self.pca_.transform(X_scaled)
        X_pca *= self.pca_signs_  # same multipliers that were used in fit()
        return self.model_.predict(X_pca)

class SecondStageSVRTrainer:
    """
    A class for training second-stage SVR using predictions from a pre-trained first-stage model
    combined with extra features (MeanNIQE, MeanSSIM).
    """
    def __init__(self, random_state=42):
        """Initialize the second-stage trainer.

        Parameters
        ----------
        random_state : int
            Seed stored on the instance; split_data() uses it when called
            without an explicit seed.
        """
        self.first_stage_model = None
        self.model = None  # Second-stage model
        self.scaler = StandardScaler()  # scaler for the stacked second-stage features
        self.extra_scaler = StandardScaler()  # independent scaler for extra features
        self.best_params = None
        self.random_state = random_state
        self.label_columns = ['TSV', 'B', 'SR', 'S', 'U', 'O']
        self.logistic_model = "4pl"

        # Data storage
        self.features_df = None
        self.labels_df = None
        self.extra_features_df = None
        # Filled by load_data() and read by split_data(); it was previously never
        # initialised, so calling split_data() first failed with AttributeError.
        self.extra_features_data = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.extra_train = None
        self.extra_test = None

    def load_first_stage_model(self, filepath):
        """Load the pre-trained first-stage model from pickle file."""
        try:
            with open(filepath, 'rb') as f:
                model_package = pickle.load(f)
            
            self.first_stage_model = model_package['model']
            print(f"\nFirst-stage model loaded successfully from: {filepath}")
                
        except FileNotFoundError:
            print(f"Error: First-stage model file not found at {filepath}")
            raise
        except Exception as e:
            print(f"Error loading first-stage model: {str(e)}")
            raise

    def load_data(self, features_file, labels_file, extra_features_file):
        """Load features, labels, and extra features from CSV files."""
        print("Loading data for second-stage training...")
        
        # Load all files
        self.features_df = pd.read_csv(features_file)
        self.labels_df = pd.read_csv(labels_file)
        self.extra_features_df = pd.read_csv(extra_features_file)
        
        print(f"Features shape: {self.features_df.shape}")
        print(f"Labels shape: {self.labels_df.shape}")
        print(f"Extra features shape: {self.extra_features_df.shape}")
        
        # Merge dataframes on videoname
        merged_df = pd.merge(self.features_df, self.labels_df, on='videoname')
        merged_all = pd.merge(merged_df, self.extra_features_df, on='videoname')
        print(f"Merged data shape: {merged_all.shape}")
        
        # Extract features (all columns except videoname from features file)
        feature_columns = [col for col in self.features_df.columns if col != 'videoname']
        self.X = merged_all[feature_columns].values
        
        # Extract labels
        self.y = merged_all[self.label_columns].values
        
        # Store extra features separately
        self.extra_features_data = merged_all[['MeanNIQE', 'MeanSSIM']].reset_index(drop=True)
        
        print(f"Final features shape: {self.X.shape}")
        print(f"Final labels shape: {self.y.shape}")
        print(f"Extra features shape: {self.extra_features_data.shape}")
        print("Data loading completed successfully!")

    def split_data(self, test_size=0.2, random_state=None):
        """Split data into train and test sets."""
        if random_state is None:
            random_state = self.random_state
            
        print(f"\nSplitting data into train/test sets (test_size={test_size}, random_state={random_state})...")
        
        self.X_train, self.X_test, self.y_train, self.y_test, self.extra_train, self.extra_test = train_test_split(
            self.X, self.y, self.extra_features_data, 
            test_size=test_size, random_state=random_state
        )
        
        # NEW: Fit the extra features scaler on training data only
        self.extra_scaler.fit(self.extra_train)
        
        print(f"Training data shape: X={self.X_train.shape}, y={self.y_train.shape}")
        print(f"Test data shape: X={self.X_test.shape}, y={self.y_test.shape}")
        print(f"Extra features train/test shapes: {self.extra_train.shape}, {self.extra_test.shape}")

    def prepare_second_stage_features(self, X, extra_features_data, feature_choice='niqe_ssim'):
        """Prepare features for second-stage training by combining first-stage predictions with extra features."""
        if self.first_stage_model is None:
            raise ValueError("First-stage model not loaded. Run load_first_stage_model() first.")
        
        # Select extra features based on choice
        if feature_choice == 'niqe':
            extra_cols = ['MeanNIQE']
        elif feature_choice == 'ssim':
            extra_cols = ['MeanSSIM']
        elif feature_choice == 'niqe_ssim':
            extra_cols = ['MeanNIQE', 'MeanSSIM']
        else:
            raise ValueError("feature_choice must be one of ['niqe', 'ssim', 'niqe_ssim']")
        
        # Get selected extra features
        extra_selected = extra_features_data[extra_cols].values
        
        # NEW: Apply independent scaling to extra features
        extra_scaled = self.extra_scaler.transform(extra_selected)
        
        # Get first-stage predictions
        first_stage_pred = self.first_stage_model.predict(X)
        
        # Combine first-stage predictions with scaled extra features
        combined_features = np.hstack((first_stage_pred, extra_scaled))
        
        return combined_features

    def tune_hyperparameters(self, feature_choice='niqe_ssim', cv=5):
        """Perform hyperparameter tuning for second-stage SVR."""
        if self.X_train is None:
            raise ValueError("Data not split yet. Run split_data() first.")
        if self.first_stage_model is None:
            raise ValueError("First-stage model not loaded. Run load_first_stage_model() first.")
            
        print(f"\nTuning hyperparameters for second-stage SVR using features: {feature_choice}")
        print(f"Cross-validation folds: {cv}")
        
        # Prepare training features for second stage
        combined_train_features = self.prepare_second_stage_features(
            self.X_train, self.extra_train, feature_choice
        )
        
        # Scale the combined features
        combined_train_scaled = self.scaler.fit_transform(combined_train_features)
        
        print(f"Combined training features shape: {combined_train_scaled.shape}")
        
        # Define parameter grid for second-stage SVR
        param_grid = {
            # 'estimator__kernel': ['rbf', 'linear'],
            'estimator__kernel': ['rbf'],
            # 'estimator__C': [0.1, 1, 10, 100],
            'estimator__C': [1],
            # 'estimator__gamma': ['scale', 'auto'],
            'estimator__gamma': ['scale'],
            # 'estimator__epsilon': [0.01, 0.1],
            'estimator__epsilon': [0.1],
            # 'estimator__max_iter': [100, 500, 1000, -1]
            'estimator__max_iter': [100, 500, -1]
        }
        
        print(f"Parameter combinations to test: {np.prod([len(values) for values in param_grid.values()])}")
        
        # Create base SVR and MultiOutputRegressor
        base_svr = SVR()
        multi_output_svr = MultiOutputRegressor(base_svr)
        
        # Perform grid search
        grid_search = GridSearchCV(
            estimator=multi_output_svr,
            param_grid=param_grid,
            cv=cv,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=1
        )
        
        print("Starting grid search for second-stage SVR...")
        grid_search.fit(combined_train_scaled, self.y_train)
        
        # Store best parameters and model
        self.best_params = grid_search.best_params_
        self.model = grid_search.best_estimator_
        
        print("Second-stage hyperparameter tuning completed!")
        self.print_best_parameters()

    def print_best_parameters(self):
        """Print the best hyperparameters found by GridSearchCV."""
        print("\n" + "="*50)
        print("BEST SECOND-STAGE HYPERPARAMETERS:")
        print("="*50)
        if self.best_params:
            for param, value in self.best_params.items():
                print(f"{param}: {value}")
        else:
            print("No hyperparameters found. Run tune_hyperparameters() first.")
        print("="*50)

    def _logistic_4pl(self, x, A, D, C, B):
        """4-parameter logistic function."""
        return D + (A - D) / (1 + (x / C) ** B)
    
    def _logistic_5pl(self, x, A, D, C, B, G):
        """5-parameter logistic function."""
        return D + (A - D) / ((1 + (x / C) ** B) ** G)
        
    def _logistic_fit_and_map(self, y_pred: np.ndarray, y_true: np.ndarray, model: str = None):
        """Fit 4PL/5PL logistic function and return mapped predictions and metrics."""
        model = (model or self.logistic_model).lower()
        x = np.asarray(y_pred).ravel()
        y = np.asarray(y_true).ravel()
        if model == "4pl":
            func = self._logistic_4pl
            beta0 = [float(np.max(y)), float(np.min(y)), float(np.median(x)), 1.0]
        else:
            func = self._logistic_5pl
            beta0 = [float(np.max(y)), float(np.min(y)), float(np.median(x)), 1.0, 1.0]
        popt, _ = curve_fit(func, x, y, p0=beta0, maxfev=20000)
        z = func(x, *popt)
        plcc_fitted, _ = pearsonr(z, y)
        spearman_fitted, _ = spearmanr(z, y)
        rmse_fitted = float(np.sqrt(np.mean((z - y) ** 2)))
        return z, popt, plcc_fitted, spearman_fitted, rmse_fitted
        
    def _calculate_metrics(self, y_true, y_pred):
        """Calculate performance metrics."""
        # Remove NaN values
        mask = ~(np.isnan(y_true) | np.isnan(y_pred))
        y_true_clean = y_true[mask]
        y_pred_clean = y_pred[mask]
        
        if len(y_true_clean) == 0:
            return {'PLCC': np.nan, 'SRCC': np.nan, 'KRCC': np.nan, 'RMSE': np.nan}
        
        # PLCC with fitted predictions (4PL/5PL)
        try:
            _, params, plcc_fitted, _, _ = self._logistic_fit_and_map(
                y_pred_clean, y_true_clean, model=self.logistic_model
            )
            plcc = plcc_fitted
        except Exception as e:
            print(f"        Warning: Logistic fitting failed ({e}), using original predictions for PLCC")
            plcc, _ = pearsonr(y_true_clean, y_pred_clean)
        
        # Other metrics with original predictions
        srcc, _ = spearmanr(y_true_clean, y_pred_clean)
        krcc, _ = kendalltau(y_true_clean, y_pred_clean)
        rmse = np.sqrt(mean_squared_error(y_true_clean, y_pred_clean))
        
        return {'PLCC': plcc, 'SRCC': srcc, 'KRCC': krcc, 'RMSE': rmse}

    def evaluate_model(self, feature_choice='niqe_ssim'):
        """Evaluate the second-stage model on test data."""
        if self.model is None:
            raise ValueError("Second-stage model not trained. Run tune_hyperparameters() first.")
        if self.X_test is None:
            raise ValueError("Test data not available. Run split_data() first.")
        print(f"\n" + "="*60)
        print(f"SECOND-STAGE MODEL PERFORMANCE ON TEST DATA:")
        print(f"Using features: {feature_choice}")
        print("="*60)
        
        # Prepare test features for second stage
        combined_test_features = self.prepare_second_stage_features(
            self.X_test, self.extra_test, feature_choice
        )
        combined_test_scaled = self.scaler.transform(combined_test_features)
        
        # Make predictions
        y_pred = self.model.predict(combined_test_scaled)
        
        # Calculate metrics for each output
        print(f"{'Label':<8} {'PLCC':<8} {'SRCC':<8} {'KRCC':<8} {'RMSE':<8}")
        print("-" * 50)
        
        overall_plcc = []
        overall_srcc = []
        overall_krcc = []
        overall_rmse = []
        
        for i, label in enumerate(self.label_columns):
            metrics = self._calculate_metrics(self.y_test[:, i], y_pred[:, i])
            
            print(f"{label:<8} {metrics['PLCC']:<8.4f} {metrics['SRCC']:<8.4f} "
                  f"{metrics['KRCC']:<8.4f} {metrics['RMSE']:<8.4f}")
            
            # Store for overall calculations
            if not np.isnan(metrics['PLCC']): overall_plcc.append(metrics['PLCC'])
            if not np.isnan(metrics['SRCC']): overall_srcc.append(metrics['SRCC'])
            if not np.isnan(metrics['KRCC']): overall_krcc.append(metrics['KRCC'])
            if not np.isnan(metrics['RMSE']): overall_rmse.append(metrics['RMSE'])
        
        print("-" * 50)
        
        # Overall metrics across all outputs
        if overall_plcc:
            print(f"Mean     {np.mean(overall_plcc):<8.4f} {np.mean(overall_srcc):<8.4f} "
                  f"{np.mean(overall_krcc):<8.4f} {np.mean(overall_rmse):<8.4f}")

    def save_model(self, filepath):
        """Save the second-stage model to a pickle file."""
        if self.model is None:
            raise ValueError("No second-stage model to save. Train the model first.")
            
        os.makedirs(os.path.dirname(filepath) if os.path.dirname(filepath) else '.', exist_ok=True)
        
        model_package = {
            'model': self.model,
            'scaler': self.scaler,
            'extra_scaler': self.extra_scaler,  # NEW: Save the extra features scaler
            'best_params': self.best_params,
            'label_columns': self.label_columns,
            'logistic_model': self.logistic_model,
            'random_state': self.random_state,
            'save_timestamp': datetime.datetime.now().isoformat()
        }
        
        try:
            with open(filepath, 'wb') as f:
                pickle.dump(model_package, f)
            
            print(f"\nSecond-stage model saved successfully!")
            print(f"Filepath: {os.path.abspath(filepath)}")
            print(f"File size: {os.path.getsize(filepath) / (1024*1024):.2f} MB")
            
        except Exception as e:
            print(f"Error saving second-stage model: {str(e)}")
            raise

    def load_model(self, filepath):
        """Load a previously saved second-stage model from pickle file."""
        try:
            with open(filepath, 'rb') as f:
                model_package = pickle.load(f)
            
            self.model = model_package['model']
            self.scaler = model_package['scaler']
            self.extra_scaler = model_package.get('extra_scaler', StandardScaler())  # NEW: Load extra scaler
            self.best_params = model_package['best_params']
            self.label_columns = model_package.get('label_columns', ['TSV', 'B', 'SR', 'S', 'U', 'O'])
            self.logistic_model = model_package.get('logistic_model', '4pl')
            self.random_state = model_package.get('random_state', 42)
            
            print(f"\nSecond-stage model loaded successfully!")
            print(f"Filepath: {os.path.abspath(filepath)}")
            print(f"Random state: {self.random_state}")
            print(f"Saved on: {model_package.get('save_timestamp', 'Unknown')}")
            
        except FileNotFoundError:
            print(f"Error: Second-stage model file not found at {filepath}")
            raise
        except Exception as e:
            print(f"Error loading second-stage model: {str(e)}")
            raise

    def run_complete_pipeline(
        self,
        first_stage_model_path,
        features_file,
        labels_file,
        extra_features_file,
        feature_choice='niqe_ssim',
        cv=5,
        test_size=0.2,
        save_model_path=None,
        random_state=42
    ):
        """Run the complete second-stage training pipeline."""
        print("Running Second-Stage SVR Training Pipeline...")
        print(f"First-stage model: {first_stage_model_path}")
        print(f"Features: {features_file}")
        print(f"Labels: {labels_file}")
        print(f"Extra features: {extra_features_file}")
        print(f"Feature choice: {feature_choice}")
        print(f"Random state: {random_state}")
        print("=" * 60)
        
        self.random_state = random_state
        
        # Load first-stage model
        self.load_first_stage_model(first_stage_model_path)
        
        # Load and split data
        self.load_data(features_file, labels_file, extra_features_file)
        self.split_data(test_size=test_size, random_state=random_state)
        
        # Train second-stage model
        self.tune_hyperparameters(feature_choice=feature_choice, cv=cv)
        
        # Evaluate second-stage model
        self.evaluate_model(feature_choice=feature_choice)
        
        # Save model if path provided
        if save_model_path:
            self.save_model(save_model_path)
        
        print("\nSecond-stage pipeline completed successfully!")

    def predict(self, new_features, extra_features_data, feature_choice='niqe_ssim'):
        """Make predictions using the trained second-stage model."""
        if self.model is None:
            raise ValueError("Second-stage model not trained. Run the pipeline first or load a saved model.")
        if self.first_stage_model is None:
            raise ValueError("First-stage model not loaded. Run load_first_stage_model() first.")
        
        # Prepare features for second stage
        combined_features = self.prepare_second_stage_features(
            new_features, extra_features_data, feature_choice
        )
        combined_scaled = self.scaler.transform(combined_features)
        
        return self.model.predict(combined_scaled)

if __name__ == "__main__":
    # Example usage: train, evaluate and save the second-stage model.
    #
    # Paths are assembled with os.path.join rather than hard-coded ".\\..."
    # strings, which only resolve on Windows. On Windows the joined paths are
    # the same strings as before; elsewhere they use the native separator.
    model_dir = os.path.join(".", "trained_models")
    data_dir = os.path.join(".", "dataset")

    second_stage_trainer = SecondStageSVRTrainer(random_state=42)

    # Run complete pipeline
    second_stage_trainer.run_complete_pipeline(
        first_stage_model_path=os.path.join(model_dir, "svr_pca_model.pkl"),
        features_file=os.path.join(data_dir, "cleaned-svd-features.csv"),
        labels_file=os.path.join(data_dir, "cleaned-mos.csv"),
        extra_features_file=os.path.join(data_dir, "cleaned-extra-features.csv"),
        feature_choice='niqe_ssim',  # Options: 'niqe', 'ssim', 'niqe_ssim'
        cv=5,
        test_size=0.2,
        save_model_path=os.path.join(model_dir, "second_stage_svr_model.pkl"),
        random_state=42
    )

    print("Second-stage training completed!")


Running Second-Stage SVR Training Pipeline...
First-stage model: .\trained_models\svr_pca_model.pkl
Features: .\dataset\cleaned-svd-features.csv
Labels: .\dataset\cleaned-mos.csv
Extra features: .\dataset\cleaned-extra-features.csv
Feature choice: niqe_ssim
Random state: 42

First-stage model loaded successfully from: .\trained_models\svr_pca_model.pkl
Loading data for second-stage training...
Features shape: (1000, 1153)
Labels shape: (1000, 7)
Extra features shape: (1000, 3)
Merged data shape: (1000, 1161)
Final features shape: (1000, 1152)
Final labels shape: (1000, 6)
Extra features shape: (1000, 2)
Data loading completed successfully!

Splitting data into train/test sets (test_size=0.2, random_state=42)...
Training data shape: X=(800, 1152), y=(800, 6)
Test data shape: X=(200, 1152), y=(200, 6)
Extra features train/test shapes: (800, 2), (200, 2)

Tuning hyperparameters for second-stage SVR using features: niqe_ssim
Cross-validation folds: 5
Combined training features shape: (800,