In [1]:
# Import necessary libraries
!pip install xgboost
import pandas as pd
import numpy as np
import os
import pickle
import warnings

# Data / numerical
import numpy as np
import pandas as pd

# SciPy
from scipy.optimize import curve_fit
from scipy.stats import pearsonr, spearmanr, kendalltau

# Scikit-learn
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor,
    ExtraTreesRegressor,
)
from sklearn.svm import SVR

# Gradient boosting libs
import xgboost as xgb
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)



In [15]:
# ============================================================
#  Top-level GNN module
# ============================================================
class _GNNRegressor(nn.Module):
    """Stack of graph convolutions followed by a linear read-out.

    ``model_type == "gcn"`` selects ``GCNConv`` layers; any other value selects
    ``SAGEConv``. The first convolution maps ``in_dim -> h_dim`` and the
    remaining ``num_layers - 1`` convolutions map ``h_dim -> h_dim``. ReLU and
    dropout are applied after every convolution except the last one, and the
    linear head maps ``h_dim -> out_dim``.
    """

    def __init__(self, in_dim, h_dim, out_dim, num_layers, dropout, model_type):
        super().__init__()
        conv_cls = GCNConv if model_type == "gcn" else SAGEConv
        # Input width of each convolution: in_dim first, h_dim afterwards.
        input_widths = [in_dim] + [h_dim] * (num_layers - 1)
        self.convs = nn.ModuleList([conv_cls(width, h_dim) for width in input_widths])
        self.dropout = nn.Dropout(dropout)
        self.head = nn.Linear(h_dim, out_dim)

    def forward(self, x, edge_index):
        """Run the convolutions over the graph and return the head output with
        its last dimension squeezed."""
        last_idx = len(self.convs) - 1
        hidden = x
        for idx, layer in enumerate(self.convs):
            hidden = layer(hidden, edge_index)
            if idx == last_idx:
                # No activation / dropout between the final conv and the head.
                continue
            hidden = self.dropout(F.relu(hidden))
        return self.head(hidden).squeeze(-1)


# ============================================================
#  sklearn-compatible GNN regressor (pickle-friendly)
# ============================================================
class CustomCllassWhichWehavedefined(BaseEstimator, RegressorMixin):
    """
    A lightweight, scikit-learn compatible GNN regressor wrapper.

    ``fit`` standardizes the features, links every sample to its ``k`` nearest
    neighbours (cosine or euclidean distance on the scaled features) and trains
    a ``_GNNRegressor`` full-batch with AdamW and a SmoothL1 loss.
    ``predict`` builds a fresh kNN graph over the rows it receives, so the
    prediction for a row depends on the other rows in the same batch.

    - Single-target regression (the trainer fits per-label).
    - Works with RandomizedSearchCV.
    - Pickle-friendly: the torch module is replaced by a CPU copy of its
      ``state_dict`` when pickling and rebuilt when unpickling.
    """

    def __init__(
        self,
        model_type: str = "sage",       # "sage" or "gcn"
        k: int = 10,                    # kNN neighbors
        hidden_dim: int = 128,
        num_layers: int = 2,
        dropout: float = 0.1,
        lr: float = 0.001,
        weight_decay: float = 1e-4,
        epochs: int = 80,
        distance_metric: str = "cosine",  # "cosine" or "euclidean"
        random_state: int = 42,
        device: str = "auto",
    ):
        self.model_type = model_type
        self.k = k
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.lr = lr
        self.weight_decay = weight_decay
        self.epochs = epochs
        self.distance_metric = distance_metric
        self.random_state = random_state
        self.device = device

        # runtime-initialized
        self._scaler = None
        self._model = None
        self._torch_device = None
        self._input_dim = None

    # --------- Helpers ---------
    def _get_device(self):
        """Resolve ``self.device`` to a ``torch.device``.

        "cpu" always yields the CPU; every other value ("cuda", "auto", ...)
        yields CUDA when it is available and the CPU otherwise.
        """
        if self.device == "cpu":
            return torch.device("cpu")
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def _set_seed(self):
        """Seed torch and the global NumPy RNG with ``random_state``."""
        torch.manual_seed(self.random_state)
        np.random.seed(self.random_state)

    def _build_knn_graph(self, X: np.ndarray, k: int) -> torch.Tensor:
        """Build a symmetric kNN ``edge_index`` (shape ``(2, n_edges)``) with
        exactly one self-loop per node.

        Raises:
            ValueError: if ``distance_metric`` is neither "cosine" nor "euclidean".
        """
        if self.distance_metric == "cosine":
            dist = 1 - cosine_similarity(X)
        elif self.distance_metric == "euclidean":
            dist = euclidean_distances(X)
        else:
            raise ValueError(f"Unsupported metric: {self.distance_metric}")
        # A node is never its own neighbour; self-loops are added once below.
        np.fill_diagonal(dist, np.inf)

        n_nodes = X.shape[0]
        # A node has at most n_nodes - 1 real neighbours. Without this clamp a
        # small batch (n_nodes <= k) would pick the node itself (distance inf)
        # as a neighbour, and add_self_loops would then duplicate that edge.
        k_eff = max(0, min(int(k), n_nodes - 1))

        nbrs = np.argsort(dist, axis=1)[:, :k_eff]           # (n_nodes, k_eff)
        src = np.repeat(np.arange(n_nodes), k_eff)
        dst = nbrs.reshape(-1)
        # Add both directions so the graph is undirected.
        pairs = np.hstack([np.vstack([src, dst]), np.vstack([dst, src])])

        edge_index = torch.as_tensor(pairs, dtype=torch.long).contiguous()
        if edge_index.numel() > 0:
            edge_index = torch.unique(edge_index, dim=1)
        edge_index, _ = add_self_loops(edge_index, num_nodes=n_nodes)
        return edge_index

    def _make_model(self, input_dim: int) -> nn.Module:
        """Instantiate the torch module from the current hyper-parameters.

        Any ``model_type`` other than "sage"/"gcn" falls back to "sage".
        """
        return _GNNRegressor(
            in_dim=input_dim,
            h_dim=self.hidden_dim,
            out_dim=1,  # single-target regression
            num_layers=self.num_layers,
            dropout=self.dropout,
            model_type=self.model_type if self.model_type in ("sage", "gcn") else "sage",
        )

    # --------- API: fit/predict ---------
    def fit(self, X: np.ndarray, y: np.ndarray):
        """
        Train the GNN on a kNN graph built from the (scaled) training rows.

        X: (n_samples, n_features)
        y: (n_samples,) single target regression; any array-like is accepted
           and flattened to 1-D.

        Returns:
            self

        Raises:
            ValueError: if ``X`` and ``y`` have different numbers of samples.
        """
        self._set_seed()
        self._torch_device = self._get_device()

        # scale features
        self._scaler = StandardScaler()
        Xs = self._scaler.fit_transform(X).astype(np.float32)
        self._input_dim = Xs.shape[1]

        # Flatten the target: an (n, 1) target would silently broadcast against
        # the (n,) model output inside the loss and train on an (n, n) matrix.
        y_arr = np.asarray(y, dtype=np.float32).ravel()
        if y_arr.shape[0] != Xs.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {Xs.shape[0]} != {y_arr.shape[0]}"
            )

        # build graph on training data
        edge_index = self._build_knn_graph(Xs, self.k)

        # tensors
        x_tensor = torch.tensor(Xs, dtype=torch.float32, device=self._torch_device)
        y_tensor = torch.tensor(y_arr, dtype=torch.float32, device=self._torch_device)
        edge_index = edge_index.to(self._torch_device)

        # model
        self._model = self._make_model(input_dim=self._input_dim).to(self._torch_device)
        optimizer = torch.optim.AdamW(self._model.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        criterion = nn.SmoothL1Loss()

        # Full-batch training: one optimizer step per epoch over the whole graph.
        self._model.train()
        for _ in range(int(self.epochs)):
            optimizer.zero_grad()
            out = self._model(x_tensor, edge_index)
            loss = criterion(out, y_tensor)
            loss.backward()
            optimizer.step()

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict targets for ``X`` using a kNN graph built over ``X`` itself.

        Raises:
            RuntimeError: if the estimator has not been fitted.
        """
        if self._model is None or self._scaler is None or self._torch_device is None:
            raise RuntimeError("Model is not fitted yet.")

        Xs = self._scaler.transform(X).astype(np.float32)
        # graph for the current batch (inductive over batch)
        edge_index = self._build_knn_graph(Xs, self.k).to(self._torch_device)

        self._model.eval()
        with torch.no_grad():
            x_tensor = torch.tensor(Xs, dtype=torch.float32, device=self._torch_device)
            preds = self._model(x_tensor, edge_index).detach().cpu().numpy()
        return preds

    # --------- Pickle support ---------
    def __getstate__(self):
        """Return the instance dict with the torch module replaced by a CPU
        copy of its ``state_dict`` under ``_model_state_dict``."""
        state = self.__dict__.copy()
        if self._model is not None:
            # Store weights on the CPU so a model trained on CUDA can be
            # unpickled on a machine without a GPU.
            state["_model_state_dict"] = {
                name: tensor.detach().cpu()
                for name, tensor in self._model.state_dict().items()
            }
            state["_model"] = None
        return state

    def __setstate__(self, state):
        """Restore attributes, re-resolve the device for the current machine and
        rebuild the torch module from the saved ``state_dict`` if present."""
        state = dict(state)  # do not mutate the caller's mapping
        sd = state.pop("_model_state_dict", None)
        self.__dict__.update(state)
        # restore device
        self._torch_device = self._get_device()
        # rebuild torch model from saved state_dict if available
        if sd is not None and self._input_dim is not None:
            self._model = self._make_model(self._input_dim).to(self._torch_device)
            self._model.load_state_dict(sd)
            self._model.eval()


# ============================================================
#  CSVRegressionModelTrainer
#  (the GNN wrapper above is defined but not registered in _initialize_models)
#  + Integrated 4PL/5PL logistic fit for PLCC
# ============================================================
class CSVRegressionModelTrainer:
    
    def __init__(self, csv_file_path, labels_csv_path, models_save_path):
        self.csv_file_path = csv_file_path
        self.labels_csv_path = labels_csv_path
        self.models_save_path = models_save_path
        
        # Define feature ranges based on your dataset structure
        self.level1_features = {
            'U1': [f'U1_{i}' for i in range(1, 257)], 
            'S1': [f'S1_{i}' for i in range(1, 257)], 
            'V1': [f'V1_{i}' for i in range(1, 257)] 
        }
        self.level2_features = {
            'U2': [f'U2_{i}' for i in range(1, 129)], 
            'S2': [f'S2_{i}' for i in range(1, 129)], 
            'V2': [f'V2_{i}' for i in range(1, 129)]
        }
        self.all_level1_features = []
        for feature_group in self.level1_features.values():
            self.all_level1_features.extend(feature_group)
        self.all_level2_features = []
        for feature_group in self.level2_features.values():
            self.all_level2_features.extend(feature_group)
        self.all_features = self.all_level1_features + self.all_level2_features
        self.labels = ['TSV', 'B', 'SR', 'S', 'U', 'O']
        self.feature_combinations = self._generate_feature_combinations()
        self.models = self._initialize_models()
        self._create_model_directories()
        self.data_df, self.labels_df, self.merged_df = self._load_csv_data()
        self.results = []

    # ---------- Logistic functions + fitter (inside class) ----------
    @staticmethod
    def _logistic_4pl(x, beta1, beta2, beta3, beta4):
        # Four-parameter logistic: beta1=top, beta2=bottom (note: matches user's provided signature)
        return beta2 + (beta1 - beta2) / (1 + np.exp(-(x - beta3) / abs(beta4)))

    @staticmethod
    def _logistic_5pl(x, beta1, beta2, beta3, beta4, beta5):
        # Five-parameter logistic with asymmetry beta5
        return beta2 + (beta1 - beta2) / ((1 + np.exp(-(x - beta3) / abs(beta4))) ** beta5)

    def _logistic_fit_and_map(self, y_pred: np.ndarray, y_true: np.ndarray, model: str = None):
        """
        Fit 4PL/5PL logistic function (no plotting) and return:
        - mapped predictions (z)
        - fitted params (popt)
        - Pearson corr of mapped vs true (plcc_fitted)
        - Spearman corr of mapped vs true (spearman_fitted)
        - RMSE of mapped vs true (rmse_fitted)
        """
        model = (model or self.logistic_model).lower()
        x = np.asarray(y_pred).ravel()
        y = np.asarray(y_true).ravel()

        if model == "4pl":
            func = self._logistic_4pl
            beta0 = [float(np.max(y)), float(np.min(y)), float(np.median(x)), 1.0]
        else:
            func = self._logistic_5pl
            beta0 = [float(np.max(y)), float(np.min(y)), float(np.median(x)), 1.0, 1.0]

        popt, _ = curve_fit(func, x, y, p0=beta0, maxfev=20000)
        z = func(x, *popt)

        plcc_fitted, _ = pearsonr(z, y)
        spearman_fitted, _ = spearmanr(z, y)
        rmse_fitted = float(np.sqrt(np.mean((z - y) ** 2)))

        return z, popt, plcc_fitted, spearman_fitted, rmse_fitted
    # ---------------------------------------------------------------

    def _generate_feature_combinations(self):
        combinations_dict = {}
        level1_component_names = ['U1', 'S1', 'V1']
        for r in range(1, len(level1_component_names) + 1):
            for combo in combinations(level1_component_names, r):
                features_list = []
                for component in combo:
                    features_list.extend(self.level1_features[component])
                combo_name = f"level1_{'_'.join(combo)}"
                combinations_dict[combo_name] = features_list
        level2_component_names = ['U2', 'S2', 'V2']
        for r in range(1, len(level2_component_names) + 1):
            for combo in combinations(level2_component_names, r):
                features_list = []
                for component in combo:
                    features_list.extend(self.level2_features[component])
                combo_name = f"level2_{'_'.join(combo)}"
                combinations_dict[combo_name] = features_list
        level1_level2_combinations = [
            (['U1'], ['U2']),
            (['S1'], ['S2']),
            (['V1'], ['V2']),
            (['U1', 'S1'], ['U2', 'S2']),
            (['U1', 'V1'], ['U2', 'V2']),
            (['S1', 'V1'], ['S2', 'V2']),
            (['U1', 'S1', 'V1'], ['U2', 'S2', 'V2'])
        ]
        for level1_combo, level2_combo in level1_level2_combinations:
            features_list = []
            for component in level1_combo:
                features_list.extend(self.level1_features[component])
            for component in level2_combo:
                features_list.extend(self.level2_features[component])
            combo_name = f"combined_{'_'.join(level1_combo + level2_combo)}"
            combinations_dict[combo_name] = features_list
        print(f"Generated {len(combinations_dict)} feature combinations")
        level1_count = sum(1 for name in combinations_dict.keys() if name.startswith('level1_'))
        level2_count = sum(1 for name in combinations_dict.keys() if name.startswith('level2_'))
        combined_count = sum(1 for name in combinations_dict.keys() if name.startswith('combined_'))
        print(f"Level 1 combinations: {level1_count}")
        print(f"Level 2 combinations: {level2_count}")
        print(f"Combined L1+L2 combinations: {combined_count}")
        return combinations_dict
    
    def _initialize_models(self):
        models = {
            'mlp_regressor': {
                'model': MLPRegressor,
                'params': {
                    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50), (100, 100), (100, 50, 25)],
                    'activation': ['relu', 'tanh'],
                    'solver': ['adam', 'lbfgs'],
                    'alpha': [0.0001, 0.001, 0.01, 0.1],
                    'learning_rate': ['constant', 'adaptive'],
                    'max_iter': [1000, 2000]
                }
            },
            'ridge_regressor': {
                'model': Ridge,
                'params': {
                    'alpha': [0.01, 0.1, 1, 10, 100, 1000],
                    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sag', 'saga'],
                    'max_iter': [1000, 2000, 3000]
                }
            },
            'decision_tree_regressor': {
                'model': DecisionTreeRegressor,
                'params': {
                    'max_depth': [None, 5, 10, 15, 20, 25],
                    'min_samples_split': [2, 5, 10, 20],
                    'min_samples_leaf': [1, 2, 4, 8],
                    'max_features': ['auto', 'sqrt', 'log2', None],
                    'criterion': ['squared_error', 'friedman_mse', 'absolute_error']
                }
            },
            'random_forest_regressor': {
                'model': RandomForestRegressor,
                'params': {
                    'n_estimators': [50, 100, 200, 300],
                    'max_depth': [None, 10, 20, 30],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4],
                    'max_features': ['auto', 'sqrt', 'log2']
                }
            },
            'extra_trees_regressor': {
                'model': ExtraTreesRegressor,
                'params': {
                    'n_estimators': [50, 100, 200, 300],
                    'max_depth': [None, 10, 20, 30],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4],
                    'max_features': ['auto', 'sqrt', 'log2']
                }
            },
            'gradient_boosting_regressor': {
                'model': GradientBoostingRegressor,
                'params': {
                    'n_estimators': [50, 100, 200],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'max_depth': [3, 5, 7],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4],
                    'subsample': [0.8, 0.9, 1.0]
                }
            },
            'adaboost_regressor': {
                'model': AdaBoostRegressor,
                'params': {
                    'n_estimators': [50, 100, 200, 300],
                    'learning_rate': [0.01, 0.1, 0.5, 1.0, 2.0],
                    'loss': ['linear', 'square', 'exponential']
                }
            },
            'svr_regressor': {
                'model': SVR,
                'params': {
                    'kernel': ['linear', 'rbf'],
                    'C': [0.1, 1, 10, 100],
                    'gamma': ['scale', 'auto'],
                    'epsilon': [0.01, 0.1]
                }
            },
            'xgboost_regressor': {
                'model': xgb.XGBRegressor,
                'params': {
                    'n_estimators': [50, 100, 200],
                    'max_depth': [3, 5, 7, 9],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'subsample': [0.8, 0.9, 1.0],
                    'colsample_bytree': [0.8, 0.9, 1.0],
                    'reg_alpha': [0, 0.1, 1],
                    'reg_lambda': [1, 1.5, 2]
                }
            },
            'catboost_regressor': {
                'model': CatBoostRegressor,
                'params': {
                    'iterations': [50, 100, 200],
                    'depth': [4, 6, 8, 10],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'l2_leaf_reg': [1, 3, 5, 7, 9],
                    'bootstrap_type': ['Bayesian', 'Bernoulli', 'MVS']
                }
            }
            # -------------------------------------------------------------------
        }
        return models
    
    def _create_model_directories(self):
        model_names = list(self.models.keys())
        for model_name in model_names:
            model_dir = os.path.join(self.models_save_path, model_name)
            os.makedirs(model_dir, exist_ok=True)
    
    def _load_csv_data(self):
        try:
            data_df = pd.read_csv(self.csv_file_path)
            print(f"Features CSV loaded successfully. Shape: {data_df.shape}")
            labels_df = pd.read_csv(self.labels_csv_path)
            print(f"Labels CSV loaded successfully. Shape: {labels_df.shape}")
            merged_df = pd.merge(data_df, labels_df, on='videoname', how='inner')
            print(f"Merged dataset shape: {merged_df.shape}")
            return data_df, labels_df, merged_df
        except Exception as e:
            print(f"Error loading CSV files: {e}")
            return None, None, None
    
    def _model_exists(self, model_name, feature_combo_name, label_name):
        model_dir = os.path.join(self.models_save_path, model_name)
        filename = f"{feature_combo_name}_{label_name}.pkl"
        filepath = os.path.join(model_dir, filename)
        return os.path.exists(filepath)
    
    def _load_existing_model(self, model_name, feature_combo_name, label_name):
        model_dir = os.path.join(self.models_save_path, model_name)
        filename = f"{feature_combo_name}_{label_name}.pkl"
        filepath = os.path.join(model_dir, filename)
        try:
            with open(filepath, 'rb') as f:
                model = pickle.load(f)
            print(f"      Loaded existing model: {filename}")
            return model
        except Exception as e:
            print(f" Error loading existing model {filename}: {e}")
            return None
    
    def _calculate_metrics(self, y_true, y_pred):
        """Calculate performance metrics"""
        # Remove NaN values
        mask = ~(np.isnan(y_true) | np.isnan(y_pred))
        y_true_clean = y_true[mask]
        y_pred_clean = y_pred[mask]
        if len(y_true_clean) == 0:
            return {'PLCC': np.nan, 'SRCC': np.nan, 'KRCC': np.nan, 'RMSE': np.nan}
        
        # Calculate correlation metrics
        plcc, _ = pearsonr(y_true_clean, y_pred_clean)
        srcc, _ = spearmanr(y_true_clean, y_pred_clean)
        krcc, _ = kendalltau(y_true_clean, y_pred_clean)
        rmse = np.sqrt(mean_squared_error(y_true_clean, y_pred_clean))
        return {
            'PLCC': plcc,
            'SRCC': srcc,
            'KRCC': krcc,
            'RMSE': rmse,
            'logistic_params': logistic_params
        }
    
    def _train_model(self, model_name, X_train, y_train, X_test, y_test):
        model_config = self.models[model_name]
        model_class = model_config['model']
        param_grid = model_config['params']
        print(f" Training {model_name}...")
        
        if model_name == 'mlp_regressor':
            model = model_class(random_state=42, early_stopping=True, validation_fraction=0.1)

        elif model_name == 'xgboost_regressor':
            # --- MODIFIED FOR CPU ---
            # The original GPU-specific line is commented out below to prevent errors on Mac.
            # model = model_class(random_state=42, objective='reg:squarederror', tree_method="gpu_hist", predictor="gpu_predictor")
            
            # This line uses the default CPU trainer.
            model = model_class(random_state=42, objective='reg:squarederror')

        elif model_name == 'catboost_regressor':
            # --- MODIFIED FOR CPU ---
            # The original GPU-specific line is commented out below to prevent errors on Mac.
            # model = model_class(random_state=42, verbose=False, task_type="GPU")

            # This line uses the default CPU trainer.
            model = model_class(random_state=42, verbose=False)
            
        elif model_name == 'svr_regressor':
            model = model_class()
        else:
            model = model_class(random_state=42)
        
        # Perform hyperparameter search
        print(f"      Performing hyperparameter search...")
        n_iter = 15 if model_name in ['xgboost_regressor', 'catboost_regressor', 'mlp_regressor', 'svr_regressor'] else 20
        
        search_cv = RandomizedSearchCV(
            model, param_grid, n_iter=n_iter, cv=3, 
            scoring='neg_mean_squared_error', random_state=42, n_jobs=-1
        )
        
        search_cv.fit(X_train, y_train)
        best_model = search_cv.best_estimator_
        
        # Evaluate model
        print(f"      Evaluating model performance...")
        y_pred = best_model.predict(X_test)
        metrics = self._calculate_metrics(y_test, y_pred)
        
        return best_model, metrics
    
    def _save_model(self, model, model_name, feature_combo_name, label_name):
        """Save trained model to disk"""
        model_dir = os.path.join(self.models_save_path, model_name)
        os.makedirs(model_dir, exist_ok=True)
        filename = f"{feature_combo_name}_{label_name}.pkl"
        filepath = os.path.join(model_dir, filename)
        try:
            with open(filepath, 'wb') as f:
                pickle.dump(model, f)
            print(f"      Model saved: {filename}")
            return True
        except Exception as e:
            print(f" Error saving model {filename}: {e}")
            return False
    
    def _save_intermediate_results(self):
        if self.results:
            results_df = pd.DataFrame(self.results)
            timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
            filename = f"intermediate_results_{timestamp}.csv"
            filepath = os.path.join(self.models_save_path, filename)
            try:
                results_df.to_csv(filepath, index=False)
                print(f" Intermediate results saved: {filename}")
            except Exception as e:
                print(f" Error saving intermediate results: {e}")
    
    def train_all_models(self, test_size=0.2, random_state=42, force_retrain=False):
        if self.merged_df is None:
            print("No merged data available. Cannot proceed with training.")
            return
        print("=" * 100)
        print("STARTING COMPREHENSIVE MODEL TRAINING (with Logistic Regression Fitting for PLCC)")
        print("=" * 100)
        level1_count = sum(1 for name in self.feature_combinations.keys() if name.startswith('level1_'))
        level2_count = sum(1 for name in self.feature_combinations.keys() if name.startswith('level2_'))
        combined_count = sum(1 for name in self.feature_combinations.keys() if name.startswith('combined_'))
        print(f"Feature combinations breakdown:")
        print(f" - Level 1: {level1_count}")
        print(f" - Level 2: {level2_count}")
        print(f" - Combined: {combined_count}")
        print(f" - Total: {len(self.feature_combinations)}")
        print(f"\nTotal models to train: {len(self.feature_combinations)} × {len(self.labels)} × {len(self.models)} = "
              f"{len(self.feature_combinations) * len(self.labels) * len(self.models)} models\n")
        total_models = 0
        successful_models = 0
        skipped_models = 0
        for combo_idx, (feature_combo_name, feature_list) in enumerate(self.feature_combinations.items(), 1):
            print(f"Processing feature combination {combo_idx}/{len(self.feature_combinations)}: {feature_combo_name}")
            print(f" Features: {len(feature_list)} features")
            print("-" * 80)
            missing_features = [f for f in feature_list if f not in self.merged_df.columns]
            if missing_features:
                print(f" Missing features: {len(missing_features)} features not found. Skipping this combination.\n")
                if len(missing_features) <= 10:
                    print(f" Examples: {missing_features[:10]}")
                continue
            X = self.merged_df[feature_list].values
            for label in self.labels:
                print(f"\n Target label: {label}")
                if label not in self.merged_df.columns:
                    print(f" Label {label} not found in merged dataset")
                    continue
                y = self.merged_df[label].values
                if len(X) != len(y):
                    print(f" Dimension mismatch: Features={len(X)}, Labels={len(y)}")
                    continue
                mask = ~(np.isnan(y) | np.isnan(X).any(axis=1))
                X_clean = X[mask]
                y_clean = y[mask]
                if len(X_clean) == 0:
                    print(f" No valid samples after cleaning")
                    continue
                X_train, X_test, y_train, y_test = train_test_split(
                    X_clean, y_clean, test_size=test_size, random_state=random_state
                )
                print(f" Training samples: {len(X_train)}, Test samples: {len(X_test)}")
                for model_name in self.models.keys():
                    total_models += 1
                    print(f"\n Model: {model_name}")
                    if not force_retrain and self._model_exists(model_name, feature_combo_name, label):
                        print(f" Model already exists, loading...")
                        try:
                            existing_model = self._load_existing_model(model_name, feature_combo_name, label)
                            if existing_model is not None:
                                # Evaluate existing model
                                y_pred = existing_model.predict(X_test)
                                metrics = self._calculate_metrics(y_test, y_pred)
                                result = {
                                    'feature_combination': feature_combo_name,
                                    'features': ', '.join(feature_list[:5]) + '...' if len(feature_list) > 5 else ', '.join(feature_list),
                                    'feature_count': len(feature_list),
                                    'label': label,
                                    'model': model_name,
                                    'train_samples': len(X_train),
                                    'test_samples': len(X_test),
                                    'PLCC': metrics['PLCC'],
                                    'SRCC': metrics['SRCC'],
                                    'KRCC': metrics['KRCC'],
                                    'RMSE': metrics['RMSE']
                                }
                                self.results.append(result)
                                print(f" Performance - PLCC: {metrics['PLCC']:.4f}, "
                                      f"SRCC: {metrics['SRCC']:.4f}, KRCC: {metrics['KRCC']:.4f}, "
                                      f"RMSE: {metrics['RMSE']:.4f}")
                                successful_models += 1
                                skipped_models += 1
                                continue
                        except Exception as e:
                            print(f" Error with existing model: {e}, will retrain")
                    try:
                        model_data, metrics = self._train_model(
                            model_name, X_train, y_train, X_test, y_test
                        )
                        
                        # Save model
                        print(f"      Saving model...")
                        self._save_model(model, model_name, feature_combo_name, label)
                        
                        # Store results
                        result = {
                            'feature_combination': feature_combo_name,
                            'features': ', '.join(feature_list[:5]) + '...' if len(feature_list) > 5 else ', '.join(feature_list),
                            'feature_count': len(feature_list),
                            'label': label,
                            'model': model_name,
                            'train_samples': len(X_train),
                            'test_samples': len(X_test),
                            'PLCC': metrics['PLCC'],
                            'SRCC': metrics['SRCC'],
                            'KRCC': metrics['KRCC'],
                            'RMSE': metrics['RMSE']
                        }
                        self.results.append(result)
                        successful_models += 1
                        print(f" Performance - PLCC: {metrics['PLCC']:.4f}, "
                              f"SRCC: {metrics['SRCC']:.4f}, KRCC: {metrics['KRCC']:.4f}, "
                              f"RMSE: {metrics['RMSE']:.4f}")
                        if len(self.results) % 50 == 0:
                            self._save_intermediate_results()
                    except Exception as e:
                        print(f" Training failed: {e}")
                        print(f"\n{'='*80}")
                        print(f"Completed feature combination: {feature_combo_name}")
                        print(f"{'='*80}\n")
        print("=" * 100)
        print("TRAINING COMPLETED!")
        print("=" * 100)
        print(f"Successfully trained/loaded: {successful_models}/{total_models} models")
        print(f"Skipped (already existed): {skipped_models}/{total_models} models")
        print(f"Newly trained: {successful_models - skipped_models}/{total_models} models")
        print("=" * 100)
        self.save_results()
    
    def save_results(self, filename="csv_training_results.csv"):
        """Persist the accumulated training results as a CSV and print a summary.

        Args:
            filename: Name of the CSV file, created inside ``self.models_save_path``.

        When ``self.results`` is empty nothing is written and a notice is
        printed instead. The closing "completed" message is printed in both cases.
        """
        if not self.results:
            print("No results to save.")
        else:
            results_table = pd.DataFrame(self.results)
            destination = os.path.join(self.models_save_path, filename)
            results_table.to_csv(destination, index=False)
            print(f"\nResults saved to: {destination}")
            # Reuse the frame that was just written for the console report.
            self.display_results_summary(results_table)
        print("\nTraining process completed successfully!")
    
    def display_results_summary(self, results_df):
        """Print a console report of the training results.

        The report has four sections: the best row per metric, the mean of
        every metric per model, the ten feature combinations with the highest
        mean PLCC, and the best row (by PLCC) for each label in ``self.labels``.

        Args:
            results_df: DataFrame with the columns 'model',
                'feature_combination', 'label', 'PLCC', 'SRCC', 'KRCC' and 'RMSE'.
        """
        banner = "=" * 100
        rule = "-" * 60
        metric_names = ['PLCC', 'SRCC', 'KRCC', 'RMSE']

        print("\n" + banner)
        print("TRAINING RESULTS SUMMARY")
        print(banner)

        # --- Section 1: single best row for every metric -------------------
        print("\nBEST PERFORMING MODELS BY METRIC:")
        print(rule)
        for name in metric_names:
            # RMSE is an error measure, so the smallest value wins; the
            # correlation metrics are maximised.
            lower_wins = name == 'RMSE'
            scores = results_df[name]
            winner = results_df.loc[scores.idxmin() if lower_wins else scores.idxmax()]
            if lower_wins:
                print(f"\n Best {name} (Lower is better):")
            else:
                print(f"\n Best {name} (Higher is better):")
            print(f" Model: {winner['model']}")
            print(f" Features: {winner['feature_combination']}")
            print(f" Label: {winner['label']}")
            print(f" Score: {winner[name]:.4f}")

        # --- Section 2: per-model averages ---------------------------------
        print("\n\nAVERAGE PERFORMANCE BY MODEL:")
        print(rule)
        per_model_mean = results_df.groupby('model')[metric_names].mean()
        for model_key, averages in per_model_mean.iterrows():
            print(f"\n {model_key.upper().replace('_', ' ')}:")
            for name in metric_names:
                print(f" {name}: {averages[name]:.4f}")

        # --- Section 3: feature combinations ranked by mean PLCC ------------
        print("\n\nTOP 10 FEATURE COMBINATIONS (by PLCC):")
        print(rule)
        mean_plcc = results_df.groupby(['feature_combination'])['PLCC'].mean()
        leaders = mean_plcc.sort_values(ascending=False).head(10)
        position = 0
        for combo, plcc in leaders.items():
            position += 1
            print(f"  #{position} {combo}: {plcc:.4f}")

        # --- Section 4: best row per label (selected by PLCC) ---------------
        print("\n\nBEST PERFORMANCE BY LABEL:")
        print(rule)
        for label in self.labels:
            rows_for_label = results_df[results_df['label'] == label]
            if rows_for_label.empty:
                # Labels without any recorded result are omitted from the report.
                continue
            top_row = rows_for_label.loc[rows_for_label['PLCC'].idxmax()]
            print(f"\n {label}:")
            print(f" Best Model: {top_row['model']}")
            print(f" Features: {top_row['feature_combination']}")
            print(f" PLCC: {top_row['PLCC']:.4f}")
            print(f" SRCC: {top_row['SRCC']:.4f}")
            print(f" KRCC: {top_row['KRCC']:.4f}")
            print(f" RMSE: {top_row['RMSE']:.4f}")
        print("\n" + banner)
    
    def load_model_for_prediction(self, model_name, feature_combo_name, label_name):
        """Return the stored model for one (model, feature combination, label) triple.

        Public wrapper that delegates to ``self._load_existing_model`` and
        hands back its result unchanged.
        """
        stored_model = self._load_existing_model(model_name, feature_combo_name, label_name)
        return stored_model
    
    def predict_with_model(self, model_name, feature_combo_name, label_name, X_new,
                           apply_logistic_fit=True, logistic_params=None,
                           logistic_model_kind=None):
        """Make predictions using a trained model, optionally logistic-mapped.

        The original body referenced ``apply_logistic_fit``, ``logistic_params``
        and ``logistic_model_kind`` without defining them, so any call that
        found a model raised ``NameError``. They are now keyword parameters
        whose defaults return the raw predictions, which keeps existing
        four-argument calls working.

        Args:
            model_name: Name of the model to load.
            feature_combo_name: Feature combination the model was trained on.
            label_name: Target label the model was trained for.
            X_new: Feature matrix passed to the model's ``predict``.
            apply_logistic_fit: Set to False to skip the logistic mapping even
                when ``logistic_params`` is supplied.
            logistic_params: Sequence of curve parameters unpacked into the
                logistic function after the predictions; ``None`` (default)
                disables the mapping.
            logistic_model_kind: ``"5pl"`` (case-insensitive) selects
                ``self._logistic_5pl``; any other value selects
                ``self._logistic_4pl``. Falls back to ``self.logistic_model``
                when omitted.

        Returns:
            The (optionally mapped) predictions, or ``None`` when the model
            could not be loaded.
        """
        model = self.load_model_for_prediction(model_name, feature_combo_name, label_name)
        if model is None:
            print(f"Model not found: {model_name}_{feature_combo_name}_{label_name}")
            return None

        y_pred = model.predict(X_new)

        # The mapping needs both the opt-in flag and fitted parameters.
        if not apply_logistic_fit or logistic_params is None:
            return y_pred

        try:
            kind = logistic_model_kind or self.logistic_model
            func = self._logistic_5pl if kind.lower() == "5pl" else self._logistic_4pl
            return func(np.asarray(y_pred).ravel(), *logistic_params)
        except Exception as e:
            # Best effort: a failed mapping must not discard valid predictions.
            print(f"Warning: Could not apply logistic fitting ({e}), returning original predictions")
            return y_pred
    
    def get_model_summary(self):
        """Build a table of the pickled models found on disk.

        For every model name in ``self.models`` the sub-directory
        ``<models_save_path>/<model_name>`` is scanned for ``.pkl`` files. Each
        file name (with ``.pkl`` removed) is split at its last underscore: the
        left part is reported as the feature combination and the right part as
        the label. Names without an underscore are ignored.

        NOTE(review): a label that itself contains an underscore would be
        split in the wrong place - confirm against the naming used when saving.

        Returns:
            DataFrame with the columns 'model', 'feature_combination', 'label'
            and 'file_path'; it has no rows or columns when nothing is found.
        """
        records = []
        for model_name in self.models.keys():
            model_dir = os.path.join(self.models_save_path, model_name)
            if not os.path.exists(model_dir):
                continue
            for entry in os.listdir(model_dir):
                if not entry.endswith('.pkl'):
                    continue
                stem = entry.replace('.pkl', '')
                # Same result as split('_') followed by re-joining all but the last piece.
                feature_combo, underscore, label = stem.rpartition('_')
                if not underscore:
                    continue
                records.append({
                    'model': model_name,
                    'feature_combination': feature_combo,
                    'label': label,
                    'file_path': os.path.join(model_dir, entry)
                })
        return pd.DataFrame(records)
    
    def export_best_models_summary(self, top_n=10, filename="best_models_summary.csv"):
        """Export the top-ranked results per label and metric to a CSV file.

        For each label in ``self.labels`` and each of PLCC, SRCC, KRCC and
        RMSE, the ``top_n`` best rows are selected (smallest values for RMSE,
        largest for the others) and written with their rank.

        Args:
            top_n: Number of rows kept per (label, metric) pair.
            filename: CSV file name, created inside ``self.models_save_path``.

        Returns:
            The exported DataFrame, or ``None`` when ``self.results`` is empty.
        """
        if not self.results:
            print("No results available to export.")
            return None

        all_results = pd.DataFrame(self.results)
        metric_names = ['PLCC', 'SRCC', 'KRCC', 'RMSE']
        ranked_rows = []

        for label in self.labels:
            per_label = all_results[all_results['label'] == label]
            if per_label.empty:
                continue
            for metric in metric_names:
                # RMSE is the only metric where lower is better.
                select = per_label.nsmallest if metric == 'RMSE' else per_label.nlargest
                leaders = select(top_n, metric)
                rank = 0
                for _, entry in leaders.iterrows():
                    rank += 1
                    ranked_rows.append({
                        'label': label,
                        'metric': metric,
                        'rank': rank,
                        'model': entry['model'],
                        'feature_combination': entry['feature_combination'],
                        'score': entry[metric],
                        'plcc': entry['PLCC'],
                        'srcc': entry['SRCC'],
                        'krcc': entry['KRCC'],
                        'rmse': entry['RMSE'],
                        'feature_count': entry['feature_count']
                    })

        best_models_df = pd.DataFrame(ranked_rows)
        destination = os.path.join(self.models_save_path, filename)
        best_models_df.to_csv(destination, index=False)
        print(f"Best models summary exported to: {destination}")
        return best_models_df

In [16]:
def run_training(csv_file_path, labels_csv_path, models_save_path, test_size=0.2, force_retrain=False):
    """
    Run the complete training pipeline from separate feature and label CSVs.

    Args:
        csv_file_path: Path to the features CSV; must exist.
        labels_csv_path: Path to the labels CSV; must exist.
        models_save_path: Directory handed to the trainer for its outputs.
        test_size: Forwarded to ``train_all_models``.
        force_retrain: Forwarded to ``train_all_models``.

    Returns:
        The ``CSVRegressionModelTrainer`` instance, or ``None`` when an input
        file is missing or the trainer has no merged dataset.
    """
    print("CSV Regression Model Trainer")
    print("=" * 50)

    # Both inputs must be present before any work is attempted.
    if not os.path.exists(csv_file_path):
        print(f"Features CSV file not found: {csv_file_path}")
        return None
    if not os.path.exists(labels_csv_path):
        print(f"Labels CSV file not found: {labels_csv_path}")
        return None

    # Echo the effective configuration.
    for line in (
        f"Features CSV file: {csv_file_path}",
        f"Labels CSV file: {labels_csv_path}",
        f"Models directory: {models_save_path}",
        f"Test size: {test_size}",
        f"Force retrain: {force_retrain}",
    ):
        print(line)

    trainer = CSVRegressionModelTrainer(
        csv_file_path=csv_file_path,
        labels_csv_path=labels_csv_path,
        models_save_path=models_save_path,
    )
    if trainer.merged_df is None:
        print("Failed to load and merge data. Exiting.")
        return None

    print("\nStarting model training...")
    trainer.train_all_models(test_size=test_size, random_state=42, force_retrain=force_retrain)

    if trainer.results:
        print("\nExporting best models summary...")
        trainer.export_best_models_summary(top_n=10)

    print("\nGetting model summary...")
    model_summary = trainer.get_model_summary()
    if model_summary.empty:
        print("No models found.")
    else:
        print(f"Total trained models: {len(model_summary)}")
        print("\nModel distribution:")
        print(model_summary.groupby('model').size().to_string())
        print("\nLabel distribution:")
        print(model_summary.groupby('label').size().to_string())

    for line in (
        "\nAll processes completed successfully!",
        "\nNext steps:",
        "1. Check the results CSV file for performance analysis",
        "2. Use the best_models_summary.csv to identify top performers",
        "3. Load specific models using trainer.load_model_for_prediction() for predictions",
    ):
        print(line)

    return trainer

# --- Example Usage ---
# The three locations below are relative paths; adjust them to your own
# project layout before running this cell.
features_csv_path = "../preprocessing/scaling/scaled-features/power/all_features_power.csv"
labels_csv_path = "../../dataset/cleaned/cleaned-mos.csv"
models_path = "./trained_models"

# Launch the full pipeline with a 20% test split; `trainer` is None when
# run_training exits early.
trainer = run_training(
    features_csv_path,
    labels_csv_path,
    models_path,
    test_size=0.2,
    force_retrain=False,
)

CSV Regression Model Trainer
Features CSV file: ../preprocessing/scaling/scaled-features/power/all_features_power.csv
Labels CSV file: ../../dataset/cleaned/cleaned-mos.csv
Models directory: ./trained_models
Test size: 0.2
Force retrain: False
Generated 21 feature combinations
Level 1 combinations: 7
Level 2 combinations: 7
Combined L1+L2 combinations: 7
Features CSV loaded successfully. Shape: (1000, 1153)
Labels CSV loaded successfully. Shape: (1000, 7)
Merged dataset shape: (1000, 1159)

Starting model training...
STARTING COMPREHENSIVE MODEL TRAINING
Feature combinations breakdown:
 - Level 1: 7
 - Level 2: 7
 - Combined: 7
 - Total: 21

Total models to train: 21 × 6 × 11 = 1386 models

Processing feature combination 1/21: level1_U1
 Features: 256 features
--------------------------------------------------------------------------------

 Target label: TSV
 Training samples: 800, Test samples: 200

 Model: mlp_regressor
 Model already exists, loading...
 Loaded existing model: level

In [31]:
# Example usage (runs only when this code is executed as the main module)
if __name__ == "__main__":
    # Relative input/output locations; change them to match your setup.
    features_csv_path = "../preprocessing/scaling/scaled-features/power/all_features_power.csv"
    labels_csv_path = "../../dataset/cleaned/cleaned-mos.csv"  # labels CSV file
    models_path = "./trained_models"

    # Kick off the complete training pipeline.
    trainer = run_training(
        features_csv_path,
        labels_csv_path,
        models_path,
        test_size=0.2,
        force_retrain=False,
    )

CSV Regression Model Trainer
Features CSV file: ../preprocessing/scaling/scaled-features/power/all_features_power.csv
Labels CSV file: ../../dataset/cleaned/cleaned-mos.csv
Models directory: ./trained_models
Test size: 0.2
Force retrain: False
Generated 21 feature combinations
Level 1 combinations: 7
Level 2 combinations: 7
Combined L1+L2 combinations: 7
Features CSV loaded successfully. Shape: (1000, 1153)
Total columns in features: 1153
Labels CSV loaded successfully. Shape: (1000, 7)
Total columns in labels: 7
All expected feature columns found in features dataset
All expected label columns found in labels dataset
Merging features and labels datasets...
Merged dataset shape: (1000, 1159)
Successfully merged 1000 samples
Merge statistics:
  - Original features samples: 1000
  - Original labels samples: 1000
  - Merged samples: 1000
  - Samples lost: 0

Merged dataset info:
  - Rows: 1000
  - Features available: 1152/1152
  - Labels available: 6/6

Starting model training...
STARTING 