# Polynomial Feature Expansion Experiment
This notebook implements Polynomial Feature Expansion to capture non-linear relationships and improve R-squared scores, supplementing existing hyperparameter optimization.


In [8]:
# 1. Setup Project Path and Imports
from pathlib import Path
import sys
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import make_pipeline

# Set project root
p = Path.cwd().parent
if str(p) not in sys.path:
    sys.path.insert(0, str(p))
print("Project root set to:", p)


Project root set to: c:\ComfyUI\trainer


In [9]:
# 2. Load and Reload Configuration Modules
import importlib
import shared.config
import training.run
import training.data_utils

# Reload to ensure fresh state
importlib.reload(shared.config)
from shared.config import config

importlib.reload(training.run)
importlib.reload(training.data_utils)

from training.helpers import resolve_path
from training.data_utils import get_filtered_data


In [10]:
# 3. Load and Preprocess Data
# Load Configuration Paths
vectors_path = resolve_path(config["vectors_file"])
scores_path = resolve_path(config["scores_file"])

print(f"Loading data from:\nVectors: {vectors_path}\nScores: {scores_path}")

# Load and Filter Data
# X is the feature matrix, y is the target scores
X, y, kept_indices = get_filtered_data(vectors_path, scores_path)

print(f"Original Data Shape: X={X.shape}, y={y.shape}")


Loading data from:
Vectors: c:\ComfyUI\trainer\prepare\output\vectors.jsonl
Scores: c:\ComfyUI\trainer\prepare\output\scores.jsonl
Loading filtered data from cache: C:\ComfyUI\trainer\training\output\filtered_data_cache.npz
Data ready (cached). Filtered shape: (6105, 1308)
Original Data Shape: X=(6105, 1308), y=(6105,)


In [11]:
# 4. Generate Additive Interaction Features (Batched)
import psutil
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import f_regression
import gc
# Use standard tqdm which works well in VS Code text output if widgets aren't available
from tqdm import tqdm 

print("Generating Interaction Features to ADD to original data...")

# Configuration
# Strategy Change: Instead of filtering everything, we KEEP the original X (baseline)
# and only append the top K *new* interaction features.
# This ensures we don't perform worse than baseline, only better or equal.
target_new_k = 200 # Add top 500 strongest interactions
degrees = 2

# Memory Check
AVAILABLE_RAM = psutil.virtual_memory().available
RAM_LIMIT = AVAILABLE_RAM * 0.90
print(f"Available RAM: {AVAILABLE_RAM/1024**3:.2f} GB. Using Limit: {RAM_LIMIT/1024**3:.2f} GB")

n_features_in = X.shape[1]
n_samples = X.shape[0]

# We use interaction_only=True because x^2 is often redundant with x after scaling,
# but x*y captures unique relationships.
poly = PolynomialFeatures(degree=degrees, include_bias=False, interaction_only=True)

# Calculate expected total features (Original + Interactions)
# n + n(n-1)/2
n_features_total = n_features_in + (n_features_in * (n_features_in - 1)) // 2
n_interactions = n_features_total - n_features_in

print(f"Input Features: {n_features_in}")
print(f"Total Potential Interactions: {n_interactions}")
print(f"Target Additive Features: {target_new_k}")

estimated_matrix_size = n_samples * n_features_total * 8 
print(f"Estimated Full Matrix Size (if instantiated): {estimated_matrix_size/1024**3:.2f} GB")

# Batch Size Calculation
# We need to compute stats for the interaction columns. 
# We'll generate the full poly expansion per batch, but only accumulate stats for the interaction part.
BATCH_MEMORY_TARGET = 1 * 1024**3 
bytes_per_row = n_features_total * 8
batch_size = int(BATCH_MEMORY_TARGET / bytes_per_row)
if batch_size < 1: batch_size = 1
if batch_size > 5000: batch_size = 5000 

print(f"Processing in batches of {batch_size} samples...")

# Accumulators for correlation calculation (Only for interactions)
# We only care about columns [n_features_in : ]
sum_x = np.zeros(n_interactions)
sum_x_sq = np.zeros(n_interactions)
sum_xy = np.zeros(n_interactions)
sum_y = 0
sum_y_sq = 0
N = 0

# Check if single row is too big (rare with interaction_only, but possible)
if bytes_per_row > RAM_LIMIT * 0.8:
    print("Warning: Matrix row size is extremely large. This might be slow or OOM.")

# Pass 1: Statistics on Interactions
print("Pass 1: Computing Interaction Correlations...")

with tqdm(total=n_samples, desc="Scanning Interactions", unit="samples") as pbar:
    for i in range(0, n_samples, batch_size):
        end_idx = min(i + batch_size, n_samples)
        
        X_batch = X[i:end_idx]
        y_batch = y[i:end_idx]
        
        # This returns [X_batch, Interactions]
        # We only want the interaction part
        X_poly_full = poly.fit_transform(X_batch)
        X_inter_batch = X_poly_full[:, n_features_in:]
        
        # Accumulate
        sum_x += np.sum(X_inter_batch, axis=0)
        sum_x_sq += np.sum(X_inter_batch ** 2, axis=0)
        sum_xy += np.dot(X_inter_batch.T, y_batch)
        
        # Only do Y stats once (redundant but cheap)
        if N == 0: 
            # Actually we need sum over all batches for Y, doing incrementally is fine
            pass
        sum_y += np.sum(y_batch)
        sum_y_sq += np.sum(y_batch ** 2)
        
        N += (end_idx - i)
        
        del X_poly_full, X_inter_batch
        gc.collect()
        pbar.update(end_idx - i)

# Compute Correlations
print("Computing scores...")
numerator = (N * sum_xy) - (sum_x * sum_y)
denominator_x = (N * sum_x_sq) - (sum_x ** 2)
denominator_y = (N * sum_y_sq) - (sum_y ** 2)

denominator_x[denominator_x <= 0] = 1e-10
denominator = np.sqrt(denominator_x * denominator_y)
correlation = numerator / denominator
f_scores = (correlation ** 2) / (1 - correlation ** 2 + 1e-10) * (N - 2)
f_scores = np.nan_to_num(f_scores, nan=0.0)

# Select Top K Interactions
k = min(target_new_k, n_interactions)
print(f"Selecting Top {k} interactions...")
top_k_indices_local = np.argsort(f_scores)[-k:]
top_k_indices_local = np.sort(top_k_indices_local) # Indices relative to the interaction-only block

# Pass 2: Construction
print("Pass 2: Constructing Augmented Dataset...")
# Result will be Original X + Selected Interactions
X_interactions = np.zeros((n_samples, k), dtype=X.dtype)

with tqdm(total=n_samples, desc="Building Matrix", unit="samples") as pbar:
    for i in range(0, n_samples, batch_size):
        end_idx = min(i + batch_size, n_samples)
        X_batch = X[i:end_idx]
        
        X_poly_full = poly.fit_transform(X_batch)
        # Select from the interaction part
        # Shift indices? No, top_k_indices_local is 0-based relative to the interaction slice
        X_interactions[i:end_idx] = X_poly_full[:, n_features_in:][:, top_k_indices_local]
        
        del X_poly_full
        gc.collect()
        pbar.update(end_idx - i)

print("Concatenating with original features...")
X_selected = np.hstack([X, X_interactions])

print(f"Final Data Shape: {X_selected.shape}")
print(f"Added {X_selected.shape[1] - X.shape[1]} interaction features.")

Generating Interaction Features to ADD to original data...
Available RAM: 22.57 GB. Using Limit: 20.31 GB
Input Features: 1308
Total Potential Interactions: 854778
Target Additive Features: 200
Estimated Full Matrix Size (if instantiated): 38.94 GB
Processing in batches of 156 samples...
Pass 1: Computing Interaction Correlations...


Scanning Interactions: 100%|██████████| 6105/6105 [00:40<00:00, 152.28samples/s]


Computing scores...
Selecting Top 200 interactions...
Pass 2: Constructing Augmented Dataset...


Building Matrix: 100%|██████████| 6105/6105 [00:12<00:00, 474.88samples/s]

Concatenating with original features...
Final Data Shape: (6105, 1508)
Added 200 interaction features.





In [12]:
# 5. Review Selection (Already done in batched step)
print("Feature selection completed in previous step due to memory optimizations.")
print(f"X_selected shape: {X_selected.shape}")
# We skip re-running SelectKBest because we manually computed it to save RAM.


Feature selection completed in previous step due to memory optimizations.
X_selected shape: (6105, 1508)


In [15]:
# 6. Evaluate Base vs Polynomial Models
print("Evaluating Models using 5-Fold Cross-Validation...")

from sklearn.model_selection import KFold, cross_val_score
import lightgbm as lgb
import numpy as np

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Load Top Config for LightGBM
try:
    training_cfg = config["training"]
    # Access 'top' directly if it exists
    if "top" in training_cfg:
        top_config = training_cfg["top"]
    else:
        top_config = None
except KeyError:
    top_config = None

if top_config:
    print("Loaded Top LightGBM Config:")
    # Filter out metadata keys AND early_stopping_rounds
    ignore_keys = ['best_score', 'training_time', 'early_stopping_rounds']
    clean_params = {k: v for k, v in top_config.items() if k not in ignore_keys}
    
    # Handle device
    device_setting = "cpu"
    if "device" in training_cfg:
        device_setting = training_cfg["device"]
        if device_setting == "cuda":
            device_setting = "gpu"
            
    clean_params['device'] = device_setting
    clean_params['verbosity'] = -1
    print(clean_params)
else:
    print("Warning: No 'top' config found. Using defaults.")
    clean_params = {"n_estimators": 500, "learning_rate": 0.05, "verbosity": -1}

print("\nEvaluating Baseline (LightGBM on Original X)...")
# Note: LightGBM doesn't usually need StandardScaler, but it doesn't hurt. 
# We'll pass raw data to LGBM to be idiomatic.
model_base = lgb.LGBMRegressor(**clean_params)
scores_base = cross_val_score(model_base, X, y, cv=kf, scoring='r2')

print("\nEvaluating Enhanced (LightGBM on Original + Interactions)...")
# We use the same params. Ideally we might re-tune for more features, but this is a direct comparison.
model_poly = lgb.LGBMRegressor(**clean_params)
scores_poly = cross_val_score(model_poly, X_selected, y, cv=kf, scoring='r2')

print(f"\nBaseline R2: {np.mean(scores_base):.6f} (+/- {np.std(scores_base):.6f})")
print(f"Polynomial R2: {np.mean(scores_poly):.6f} (+/- {np.std(scores_poly):.6f})")

improvement = np.mean(scores_poly) - np.mean(scores_base)
print(f"Improvement: {improvement:.6f}")

Evaluating Models using 5-Fold Cross-Validation...
Loaded Top LightGBM Config:
{'learning_rate': 0.44550000000000006, 'n_estimators': 675, 'num_leaves': 372, 'max_depth': 1, 'min_child_samples': 90, 'reg_alpha': 8.1, 'reg_lambda': 4.099784786849741, 'subsample': 0.11979000000000002, 'colsample_bytree': 0.1, 'min_split_gain': 0.5, 'device': 'gpu', 'verbosity': -1}

Evaluating Baseline (LightGBM on Original X)...





Evaluating Enhanced (LightGBM on Original + Interactions)...





Baseline R2: 0.384593 (+/- 0.014043)
Polynomial R2: 0.391540 (+/- 0.009532)
Improvement: 0.006946




In [16]:
# 7. Save Enhanced Feature Configuration
if improvement > 0.001:
    print("Improvement found! Saving transformed dataset...")
    
    # Save the transformed dataset for the Training Loop to pick up
    out_dir = resolve_path("training/output")
    import os
    os.makedirs(out_dir, exist_ok=True)
    output_path = resolve_path("training/output/poly_selected_data.npz")
    
    np.savez(output_path, X=X_selected, y=y)
    print(f"Saved transformed feature set to: {output_path}")
    print("You can modify 'training/data_utils.py' or 'training/run.py' to load this file if it exists.")
else:
    print("No significant improvement. Polynomial features might not be necessary or need tuning, or LightGBM handles interactions well enough already.")


Improvement found! Saving transformed dataset...
Saved transformed feature set to: c:\ComfyUI\trainer\training\output\poly_selected_data.npz
You can modify 'training/data_utils.py' or 'training/run.py' to load this file if it exists.
