# Phase 2: Stacked Learner

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score
from sklearn.base import clone
import os
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)

def load_dataset_labels(dataset_path):
    """Build integer class labels from a one-folder-per-class image dataset.

    Each sub-directory of ``dataset_path`` is treated as one class. Classes are
    numbered by the alphabetical order of their folder names, and one label is
    emitted for every file directly inside a class folder whose lower-cased
    name ends with ``jpg``, ``jpeg`` or ``png``.

    Only directories take part in the class numbering, so stray files next to
    the class folders (a README, ``.DS_Store``, ...) neither consume a class
    index nor show up in the returned class names.

    Args:
        dataset_path: Directory whose sub-directories are the class folders.

    Returns:
        A ``(labels, class_names)`` tuple: ``labels`` is a 1-D integer NumPy
        array with one entry per image, grouped by class in index order;
        ``class_names`` is the sorted list of class folder names, so
        ``class_names[label]`` is the class of that image.
    """
    image_suffixes = ('jpg', 'jpeg', 'png')
    class_names = sorted(
        entry for entry in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, entry))
    )

    labels = []
    for idx, cls in enumerate(class_names):
        cls_folder = os.path.join(dataset_path, cls)
        n_images = sum(
            1 for img_file in os.listdir(cls_folder)
            if img_file.lower().endswith(image_suffixes)
        )
        labels.extend([idx] * n_images)

    # Explicit dtype keeps an empty dataset from producing a float array.
    return np.array(labels, dtype=int), class_names

def evaluate_model(y_true, y_pred):
    """Score a set of predictions against the ground truth.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.

    Returns:
        A dict with two entries: ``'accuracy'`` (from ``accuracy_score``) and
        ``'f1_macro'`` (from ``f1_score`` with ``average='macro'``).
    """
    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return dict(accuracy=accuracy, f1_macro=macro_f1)

def _take_rows(data, row_idx):
    """Select rows by position from an array-like or a pandas object."""
    if hasattr(data, 'iloc'):
        return data.iloc[row_idx]
    return data[row_idx]


def _cv_scores_for_params(classifier, params, X, y, skf):
    """Cross-validate one parameter setting.

    Returns the fold means ``(train accuracy, validation accuracy,
    validation macro-F1)``.
    """
    train_acc_scores, val_acc_scores, val_f1_scores = [], [], []
    for train_idx, val_idx in skf.split(X, y):
        X_tr, X_val = _take_rows(X, train_idx), _take_rows(X, val_idx)
        y_tr, y_val = y[train_idx], y[val_idx]

        # Fresh clone per fold so no state leaks between folds or settings.
        model = clone(classifier).set_params(**params)
        model.fit(X_tr, y_tr)

        y_tr_pred = model.predict(X_tr)
        y_val_pred = model.predict(X_val)

        train_acc_scores.append(accuracy_score(y_tr, y_tr_pred))
        val_acc_scores.append(accuracy_score(y_val, y_val_pred))
        val_f1_scores.append(f1_score(y_val, y_val_pred, average='macro'))

    return np.mean(train_acc_scores), np.mean(val_acc_scores), np.mean(val_f1_scores)


def tune_with_overfit_check(classifier, param_grid, X_train, y_train, overfit_thresh=0.8, cv=5):
    """Grid-search ``classifier`` while rejecting settings that overfit.

    Every combination of the values in ``param_grid`` is scored with
    ``cv``-fold stratified cross-validation (shuffled, ``random_state=42``).
    The overfitting gap of a combination is its mean training accuracy minus
    its mean validation accuracy over the folds.

    Selection rule:
      * among combinations whose gap is ``<= overfit_thresh``, the one with
        the highest mean validation macro-F1 wins (the earliest wins ties);
      * if no combination meets the threshold, the one with the smallest gap
        is used instead, regardless of its F1, and a warning is printed.

    Args:
        classifier: Unfitted estimator; it is cloned, never fitted in place.
        param_grid: Dict mapping parameter names to lists of candidate values.
            An empty dict evaluates the classifier with its current settings.
        X_train: Training features (NumPy array or pandas DataFrame).
        y_train: Training labels; converted to a NumPy array internally.
        overfit_thresh: Largest accepted train-minus-validation accuracy gap.
        cv: Number of stratified folds.

    Returns:
        ``(best_model, best_val_f1, best_gap)``: the selected configuration
        refitted on the full training set, its mean validation macro-F1 and
        its mean accuracy gap.
    """
    import itertools

    y_train = np.asarray(y_train)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    # Expand the grid by hand; product() of zero iterables yields one empty
    # tuple, so an empty grid becomes a single "{}" combination.
    keys = list(param_grid)
    param_combos = [
        dict(zip(keys, combo))
        for combo in itertools.product(*(param_grid[key] for key in keys))
    ]

    print(f"Testing {len(param_combos)} parameter combinations...")

    best_params = None
    best_val_f1 = -np.inf
    best_gap = np.inf

    for i, params in enumerate(param_combos):
        mean_train_acc, mean_val_acc, mean_val_f1 = _cv_scores_for_params(
            classifier, params, X_train, y_train, skf)
        acc_gap = mean_train_acc - mean_val_acc

        # Print progress for monitoring
        if (i + 1) % 10 == 0 or i == 0:
            print(f"  Combo {i+1}/{len(param_combos)}: Gap={acc_gap:.4f}, Val F1={mean_val_f1:.4f}")

        best_is_acceptable = best_gap <= overfit_thresh
        if best_params is None:
            take = True
        elif acc_gap <= overfit_thresh:
            # An acceptable candidate always beats a fallback; among
            # acceptable candidates the better validation F1 wins.
            take = (not best_is_acceptable) or mean_val_f1 > best_val_f1
        else:
            # Over-threshold candidates only compete while nothing acceptable
            # has been found, and then purely on the size of the gap.
            take = (not best_is_acceptable) and acc_gap < best_gap

        if take:
            best_params = params
            best_val_f1 = mean_val_f1
            best_gap = acc_gap

    if best_gap > overfit_thresh:
        print(f"Warning: no combination met overfit_thresh={overfit_thresh}; "
              f"using the smallest-gap combination (gap={best_gap:.4f}).")

    print(f"Selected params: {best_params}")
    best_model = clone(classifier).set_params(**best_params)
    best_model.fit(X_train, y_train)  # retrain on full training set
    return best_model, best_val_f1, best_gap

class StackedLearner:
    """Two-level stacking classifier.

    Pipeline: ``StandardScaler`` -> optional PCA (only when the input has more
    than 500 features; keeps 85% of the variance) -> base learners -> a
    meta-learner trained on the base learners' class probabilities.

    The meta-learner is fitted on out-of-fold probabilities obtained with
    ``cross_val_predict``, so it never sees predictions a base learner made on
    its own training rows. Every base learner must implement
    ``predict_proba``.

    Args:
        base_learners: Dict mapping a display name to an estimator. The
            estimators are fitted in place by ``fit``.
        meta_learner: Estimator trained on the stacked probabilities.
        use_original_features: If True, the preprocessed input features are
            appended to the stacked probabilities, but only when there are
            fewer than 100 of them.
        cv: Cross-validation setting forwarded to ``cross_val_predict``.
    """

    def __init__(self, base_learners, meta_learner, use_original_features=False, cv=5):
        self.base_learners = base_learners
        self.meta_learner = meta_learner
        self.use_original_features = use_original_features
        self.cv = cv
        self.scaler = StandardScaler()
        self.pca = None

    def _preprocess(self, X):
        """Apply the fitted scaler and, if one was fitted, the PCA projection."""
        X_scaled = self.scaler.transform(X)
        if self.pca is not None:
            return self.pca.transform(X_scaled)
        return X_scaled

    def _with_original_features(self, stacked_features, X_processed):
        """Append the preprocessed inputs to the meta-features when enabled."""
        if self.use_original_features and X_processed.shape[1] < 100:
            return np.column_stack([stacked_features, X_processed])
        return stacked_features

    def fit(self, X, y):
        """Fit the scaler, optional PCA, all base learners and the meta-learner.

        Args:
            X: Training features, one row per sample.
            y: Training labels.

        Returns:
            ``self``.

        Raises:
            ValueError: If no base learners were supplied.
        """
        from sklearn.model_selection import cross_val_predict

        if not self.base_learners:
            raise ValueError("StackedLearner needs at least one base learner")

        X_scaled = self.scaler.fit_transform(X)

        # Forget any projection from an earlier fit; otherwise refitting on
        # low-dimensional data would leave predict() using a stale PCA.
        self.pca = None
        if X_scaled.shape[1] > 500:
            self.pca = PCA(n_components=0.85, random_state=42)
            X_processed = self.pca.fit_transform(X_scaled)
            print(f"PCA reduced features from {X_scaled.shape[1]} to {X_processed.shape[1]}")
        else:
            X_processed = X_scaled

        base_predictions = []
        for name, learner in self.base_learners.items():
            print(f"Training base learner: {name}")
            # Full-data fit is what predict() uses later ...
            learner.fit(X_processed, y)
            # ... while the meta-learner trains on out-of-fold probabilities.
            cv_preds = cross_val_predict(
                learner, X_processed, y, cv=self.cv, method='predict_proba')
            base_predictions.append(cv_preds)

        stacked_features = np.column_stack(base_predictions)
        stacked_features = self._with_original_features(stacked_features, X_processed)

        self.meta_learner.fit(stacked_features, y)
        return self

    def predict(self, X):
        """Predict class labels for ``X`` with the fitted stack.

        Args:
            X: Features with the same columns as the data passed to ``fit``.

        Returns:
            Whatever the meta-learner's ``predict`` returns for the stacked
            base-learner probabilities.
        """
        X_processed = self._preprocess(X)

        base_predictions = [
            learner.predict_proba(X_processed)
            for learner in self.base_learners.values()
        ]
        stacked_features = np.column_stack(base_predictions)
        stacked_features = self._with_original_features(stacked_features, X_processed)

        return self.meta_learner.predict(stacked_features)

# ---- Main phase2 training & tuning loop ----

# Read the class labels from the image folders, then the pre-extracted feature
# matrices saved as .npy files (adjust both paths to your layout).
labels, class_names = load_dataset_labels('dataset')
print(f"Loaded {len(labels)} labels for {len(class_names)} classes: {class_names}")

feature_types = ['early', 'middle', 'high']
extracted_features = {}
for ftype in feature_types:
    feat_path = f"saved_features/{ftype}_features.npy"
    if not os.path.exists(feat_path):
        # A missing file is only reported; this feature type gets no entry.
        print(f"Warning: {feat_path} not found!")
        continue
    extracted_features[ftype] = np.load(feat_path)
    print(f"Loaded {ftype} features: {extracted_features[ftype].shape}")

# Per-feature-type metrics; the training loop stores one dict per feature type.
results_phase2 = {}

# Hyper-parameter search spaces, keyed by the base-learner names used in the
# training loop. Each inner dict maps an estimator parameter to its candidate
# values, and every combination of those values is evaluated. The ranges are
# deliberately skewed toward stronger regularization to limit overfitting.
param_grids = {
    'Random Forest': {  # 2 * 3 * 3 * 3 * 2 * 2 = 216 combinations
        'n_estimators': [50, 100],
        'max_depth': [3, 5, 8],  # Shallow trees only
        'min_samples_split': [10, 20, 30],  # Larger minimum split sizes
        'min_samples_leaf': [5, 10, 15],  # Larger minimum leaf sizes
        'max_features': ['sqrt', 'log2'],  # 'auto' is intentionally not searched
        'max_samples': [0.6, 0.8]  # Limits the bootstrap sample size per tree
    },
    'SVM (RBF)': {  # 3 * 3 = 9 combinations
        'C': [0.01, 0.1, 1],  # Small C values for more regularization
        'gamma': ['scale', 0.001, 0.01],  # Includes small explicit gamma values
    },
    'Logistic Regression': {  # 4 * 1 * 2 = 8 combinations
        'C': [0.001, 0.01, 0.1, 1],  # Small C values for more regularization
        'penalty': ['l2'],
        'solver': ['liblinear', 'lbfgs']
    },
    'Gradient Boosting': {  # 2 * 3 * 3 * 2 * 3 * 2 = 216 combinations
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.05, 0.1],  # Includes a low learning rate
        'max_depth': [2, 3, 4],  # Shallow trees only
        'subsample': [0.6, 0.8],  # Row subsampling below 1.0
        'min_samples_leaf': [5, 10, 15],  # Larger minimum leaf sizes
        'min_samples_split': [10, 20]  # Larger minimum split sizes
    }
}

# For every feature type: split, preprocess, tune each base learner with the
# overfit check, train the stacked model and record train/test metrics.
for ftype in feature_types:
    print(f"\n{'='*50}")
    print(f"PHASE 2 - STACKED LEARNER FOR {ftype.upper()} FEATURES")
    print(f"{'='*50}")

    # A feature file that was reported missing at load time has no entry;
    # skip it instead of failing with a KeyError.
    if ftype not in extracted_features:
        print(f"Skipping {ftype} features: they were not loaded.")
        continue

    X = extracted_features[ftype]
    X_train, X_test, y_train, y_test = train_test_split(
        X, labels, test_size=0.2, random_state=42, stratify=labels)

    # Scaler and PCA are fitted on the training split only, so nothing from
    # the test split leaks into hyper-parameter tuning.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Reduce very wide feature matrices, keeping 85% of the variance.
    if X_train_scaled.shape[1] > 500:
        pca = PCA(n_components=0.85, random_state=42)
        X_train_reduced = pca.fit_transform(X_train_scaled)
        X_test_reduced = pca.transform(X_test_scaled)
        print(f"PCA reduced features from {X_train_scaled.shape[1]} to {X_train_reduced.shape[1]}")
    else:
        X_train_reduced = X_train_scaled
        X_test_reduced = X_test_scaled

    base_learners = {
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=200),
        'Random Forest': RandomForestClassifier(random_state=42),
        'SVM (RBF)': SVC(kernel='rbf', probability=True, random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42)
    }

    tuned_learners = {}

    for name, learner in base_learners.items():
        print(f"\nTuning and selecting {name} with strict overfit prevention...")
        if name in param_grids:
            # Accept at most a 0.05 train-minus-validation accuracy gap.
            tuned_model, val_f1, acc_gap = tune_with_overfit_check(
                learner, param_grids[name], X_train_reduced, y_train,
                overfit_thresh=0.05, cv=5)
            print(f"Selected {name} model val F1: {val_f1:.4f}, train-val acc gap: {acc_gap:.4f}")
            tuned_learners[name] = tuned_model
        else:
            # No search space defined: use the learner with its defaults.
            learner.fit(X_train_reduced, y_train)
            tuned_learners[name] = learner

    # Use more regularized meta-learner
    meta_learner = LogisticRegression(random_state=42, max_iter=200, C=0.1)
    stacked_model = StackedLearner(tuned_learners, meta_learner, use_original_features=False)

    # StackedLearner scales and reduces internally, so it receives the raw
    # split and refits the tuned learners on its own preprocessed features.
    print("\nTraining stacked learner...")
    stacked_model.fit(X_train, y_train)

    y_train_pred = stacked_model.predict(X_train)
    y_test_pred = stacked_model.predict(X_test)

    train_metrics = evaluate_model(y_train, y_train_pred)
    test_metrics = evaluate_model(y_test, y_test_pred)

    # Calculate final overfitting gap
    overfitting_gap = train_metrics['accuracy'] - test_metrics['accuracy']

    results_phase2[ftype] = {
        'feature_type': ftype,
        'train_accuracy': train_metrics['accuracy'],
        'train_f1_macro': train_metrics['f1_macro'],
        'test_accuracy': test_metrics['accuracy'],
        'test_f1_macro': test_metrics['f1_macro'],
        'overfitting_gap': overfitting_gap,
        'base_learners': list(tuned_learners.keys())
    }

    print(f"\nStacked Learner Results for {ftype} features:")
    print(f"Train Accuracy: {train_metrics['accuracy']:.4f}")
    print(f"Test Accuracy: {test_metrics['accuracy']:.4f}")
    print(f"Test F1 Macro: {test_metrics['f1_macro']:.4f}")
    print(f"Overfitting Gap: {overfitting_gap:.4f}")
    print(f"Base learners used: {', '.join(tuned_learners.keys())}")

# Final report: one table row per feature type that produced results,
# followed by a per-type verdict on the train/test accuracy gap.
print(f"\n{'='*60}")
print("PHASE 2 RESULTS SUMMARY")
print(f"{'='*60}")

if results_phase2:
    phase2_df = pd.DataFrame(results_phase2).T
    print("\nStacked Learner Performance:")
    print(phase2_df[['feature_type', 'test_accuracy', 'test_f1_macro', 'overfitting_gap']])
else:
    # Nothing was trained; selecting columns from an empty table would raise.
    phase2_df = pd.DataFrame()
    print("\nNo stacked learner results to report.")

# Check if overfitting is resolved
print("\nOverfitting Analysis:")
# Iterate over the results actually produced, not over every requested
# feature type, so a type without results cannot raise a KeyError.
for ftype, result in results_phase2.items():
    gap = result['overfitting_gap']
    if gap <= 0.05:
        status = "✓ Good"
    elif gap <= 0.10:
        status = "⚠ Needs attention"
    else:
        status = "✗ Overfitting"
    print(f"{ftype}: Gap = {gap:.4f} - {status}")

Loaded 606 labels for 3 classes: ['cats', 'dogs', 'horses']
Loaded early features: (606, 802816)
Loaded middle features: (606, 802816)
Loaded high features: (606, 100352)

PHASE 2 - STACKED LEARNER FOR EARLY FEATURES
PCA reduced features from 802816 to 282

Tuning and selecting Logistic Regression with strict overfit prevention...
Testing 8 parameter combinations...
  Combo 1/8: Gap=0.4907, Val F1=0.5060
Selected params: {'C': 0.001, 'penalty': 'l2', 'solver': 'lbfgs'}
Selected Logistic Regression model val F1: 0.5625, train-val acc gap: 0.4376

Tuning and selecting Random Forest with strict overfit prevention...
Testing 216 parameter combinations...
  Combo 1/216: Gap=0.2913, Val F1=0.5284
  Combo 10/216: Gap=0.2717, Val F1=0.5153
  Combo 20/216: Gap=0.3110, Val F1=0.4929
  Combo 30/216: Gap=0.2536, Val F1=0.5305
  Combo 40/216: Gap=0.4242, Val F1=0.4897
  Combo 50/216: Gap=0.3719, Val F1=0.5311
  Combo 60/216: Gap=0.3869, Val F1=0.4860
  Combo 70/216: Gap=0.3621, Val F1=0.5110
  Comb