# Incremental Feature Engineering (12 features per step)

参考: 
- Chris Deotte's 1st Place Solution
- Feature Importance Chart from Phase 1

12個ずつ特徴量を追加し、各ステップでCVスコアを評価します。
CVスコアが改善しない場合は、そのステップの特徴量は追加しません。

In [1]:
# Library Setup
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from datetime import datetime
import time
import warnings
import gc
warnings.filterwarnings('ignore')

In [2]:
# Data Setup
train = pd.read_csv('/workspace/competitions/playground-series-s5e12/data/input/train.csv')
test = pd.read_csv('/workspace/competitions/playground-series-s5e12/data/input/test.csv')

print(f'Train Shape: {train.shape}')
print(f'Test Shape: {test.shape}')

Train Shape: (700000, 26)
Test Shape: (300000, 25)


In [3]:
# Baseline Setup
X_base = train.drop("diagnosed_diabetes", axis=1).copy()
y = train["diagnosed_diabetes"].copy()
test_base = test.copy()

# Separate categorical and numerical columns
cat_cols_base = X_base.select_dtypes(include="object").columns.tolist()
num_cols_base = X_base.select_dtypes(include=[np.number]).columns.tolist()

# Handle NaN for numerical columns first
for col in num_cols_base:
    X_base[col] = X_base[col].fillna(0)
    test_base[col] = test_base[col].fillna(0)

# Handle NaN for categorical columns (add 'Missing' category first)
for col in cat_cols_base:
    # Add 'Missing' category if there are NaN values
    if X_base[col].isna().any() or test_base[col].isna().any():
        X_base[col] = X_base[col].astype("category").cat.add_categories(['Missing'])
        test_base[col] = test_base[col].astype("category").cat.add_categories(['Missing'])
        X_base[col] = X_base[col].fillna('Missing')
        test_base[col] = test_base[col].fillna('Missing')
    else:
        X_base[col] = X_base[col].astype("category")
        test_base[col] = test_base[col].astype("category")

print(f"Baseline features: {len(X_base.columns)}")
print(f"Categorical columns: {len(cat_cols_base)}")
print(f"Numerical columns: {len(X_base.select_dtypes(include=[np.number]).columns)}")

Baseline features: 25
Categorical columns: 6
Numerical columns: 19


In [4]:
# Quick CV Evaluation Function
def quick_cv_eval(X, y, name="Model", n_folds=3):
    """Quick CV evaluation to check feature engineering impact"""
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.05,
        'num_leaves': 31,
        'n_estimators': 200,  # Reduced for memory efficiency
        'colsample_bytree': 0.7,
        'subsample': 0.7,
        'random_state': 42,
        'n_jobs': -1,  # Use all CPUs (memory is the constraint, not CPU)
        'verbosity': -1
    }
    
    scores = []
    for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y), 1):
        X_tr, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[trn_idx], y.iloc[val_idx]
        
        model = lgb.LGBMClassifier(**params)
        callbacks = [lgb.early_stopping(stopping_rounds=50, verbose=False), lgb.log_evaluation(0)]
        model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=callbacks)
        
        y_pred = model.predict_proba(X_val)[:, 1]
        score = roc_auc_score(y_val, y_pred)
        scores.append(score)
    
    mean_score = np.mean(scores)
    std_score = np.std(scores)
    print(f"{name} - CV AUC: {mean_score:.5f} (+/- {std_score:.5f})")
    return mean_score, std_score

In [6]:
# Evaluate Baseline
print("="*70)
print("Baseline Evaluation")
print("="*70)
baseline_score, baseline_std = quick_cv_eval(X_base, y, "Baseline", n_folds=3)
print("="*70 + "\n")

# Store results
results = []
results.append({
    'step': 'Baseline',
    'n_features': len(X_base.columns),
    'cv_auc': baseline_score,
    'cv_std': baseline_std,
    'improvement': 0,
    'features_added': 0
})

# Initialize current state
train_current = train.copy()
test_current = test.copy()
X_current = X_base.copy()
test_current_fe = test_base.copy()
current_score = baseline_score
current_std = baseline_std
step_num = 1

Baseline Evaluation
Baseline - CV AUC: 0.72260 (+/- 0.00061)



In [6]:
# Step 1: Groupby Aggregations (mean, std) - 12個を6個ずつバッチ処理
print(f"\n{'='*70}")
print(f"[Step {step_num}] Groupby Aggregations (mean, std) - Target: 12 features (6 per batch)")
print("-"*70)

cat_cols = train_current.select_dtypes(include="object").columns.tolist()
num_cols = train_current.select_dtypes(include=[np.number]).columns.tolist()
if 'diagnosed_diabetes' in num_cols:
    num_cols.remove('diagnosed_diabetes')

# 高重要度特徴量を優先
high_importance_cats = ['family_history_diabetes', 'gender', 'ethnicity'] if 'family_history_diabetes' in cat_cols else cat_cols[:3]
high_importance_nums = ['physical_activity_minutes_per_week', 'age', 'bmi', 'triglycerides', 'ldl_cholesterol'] if 'physical_activity_minutes_per_week' in num_cols else num_cols[:5]

# Generate feature combinations (without creating them yet)
feature_combinations = []
stats = ['mean', 'std']
for cat_col in high_importance_cats[:2]:
    for num_col in high_importance_nums[:3]:
        for stat in stats:
            if len(feature_combinations) >= 12:
                break
            new_col = f'{cat_col}_{num_col}_{stat}'
            if new_col not in X_current.columns:
                feature_combinations.append((cat_col, num_col, stat, new_col))
        if len(feature_combinations) >= 12:
            break
    if len(feature_combinations) >= 12:
        break

# Process in batches of 6
batch_size = 6
total_kept = 0

for batch_idx in range(0, len(feature_combinations), batch_size):
    batch = feature_combinations[batch_idx:batch_idx + batch_size]
    print(f"\n  Processing batch {batch_idx // batch_size + 1}/{(len(feature_combinations) + batch_size - 1) // batch_size} ({len(batch)} features)...")
    
    # Create features in this batch
    batch_features = []
    for cat_col, num_col, stat, new_col in batch:
        try:
            # Calculate statistics on train only (memory efficient)
            grouped = train_current.groupby(cat_col)[num_col].agg(stat)
            train_current[new_col] = train_current[cat_col].map(grouped).fillna(0)
            test_current[new_col] = test_current[cat_col].map(grouped).fillna(0)
            batch_features.append(new_col)
            del grouped
        except:
            pass
    
    if len(batch_features) == 0:
        continue
    
    # Prepare data for CV evaluation
    X_batch = train_current.drop("diagnosed_diabetes", axis=1)
    test_batch_fe = test_current.copy()
    
    # Handle NaN for numerical and categorical columns
    num_cols_batch = X_batch.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols_batch = X_batch.select_dtypes(include=["object", "category"]).columns.tolist()
    
    for col in num_cols_batch:
        X_batch[col] = X_batch[col].fillna(0)
        test_batch_fe[col] = test_batch_fe[col].fillna(0)
    
    for col in cat_cols_batch:
        if X_batch[col].isna().any() or test_batch_fe[col].isna().any():
            if X_batch[col].dtype.name == 'category':
                X_batch[col] = X_batch[col].cat.add_categories(['Missing'])
                test_batch_fe[col] = test_batch_fe[col].cat.add_categories(['Missing'])
            else:
                X_batch[col] = X_batch[col].astype("category").cat.add_categories(['Missing'])
                test_batch_fe[col] = test_batch_fe[col].astype("category").cat.add_categories(['Missing'])
            X_batch[col] = X_batch[col].fillna('Missing')
            test_batch_fe[col] = test_batch_fe[col].fillna('Missing')
        else:
            if X_batch[col].dtype.name != 'category':
                X_batch[col] = X_batch[col].astype("category")
                test_batch_fe[col] = test_batch_fe[col].astype("category")
    
    # Evaluate batch
    score, std = quick_cv_eval(X_batch, y, f"  Batch {batch_idx // batch_size + 1}", n_folds=3)
    improvement = score - current_score
    
    if improvement > 0:
        current_score = score
        current_std = std
        X_current = X_batch.copy()
        test_current_fe = test_batch_fe.copy()
        total_kept += len(batch_features)
        print(f"  ✓ Batch improved CV score! (+{improvement:+.5f}) - Keeping {len(batch_features)} features")
    else:
        # Revert this batch
        for col in batch_features:
            if col in train_current.columns:
                train_current = train_current.drop(col, axis=1)
            if col in test_current.columns:
                test_current = test_current.drop(col, axis=1)
        print(f"  ✗ Batch did not improve ({improvement:+.5f}) - Reverting {len(batch_features)} features")
    
    # Free memory
    del X_batch, test_batch_fe
    gc.collect()

print(f"\nTotal features kept: {total_kept}")
print(f"Final CV AUC: {current_score:.5f} (+/- {current_std:.5f})")
print(f"Total improvement from baseline: {current_score - baseline_score:+.5f}")

results.append({
    'step': f'Step {step_num}: Groupby (mean, std)',
    'n_features': len(X_current.columns),
    'cv_auc': current_score,
    'cv_std': current_std,
    'improvement': current_score - baseline_score,
    'features_added': total_kept
})
step_num += 1


[Step 1] Groupby Aggregations (mean, std) - Target: 12 features (6 per batch)
----------------------------------------------------------------------

  Processing batch 1/2 (6 features)...
  Batch 1 - CV AUC: 0.72281 (+/- 0.00067)
  ✓ Batch improved CV score! (++0.00021) - Keeping 6 features

  Processing batch 2/2 (6 features)...
  Batch 2 - CV AUC: 0.72298 (+/- 0.00071)
  ✓ Batch improved CV score! (++0.00017) - Keeping 6 features

Total features kept: 12
Final CV AUC: 0.72298 (+/- 0.00071)
Total improvement from baseline: +0.00038


In [7]:
# Step 2: Groupby Aggregations (count, min, max) - 12個
print(f"\n{'='*70}")
print(f"[Step {step_num}] Groupby Aggregations (count, min, max) - Target: 12 features")
print("-"*70)

cat_cols = train_current.select_dtypes(include="object").columns.tolist()
num_cols = train_current.select_dtypes(include=[np.number]).columns.tolist()
if 'diagnosed_diabetes' in num_cols:
    num_cols.remove('diagnosed_diabetes')

high_importance_cats = ['family_history_diabetes', 'gender', 'ethnicity'] if 'family_history_diabetes' in cat_cols else cat_cols[:3]
high_importance_nums = ['physical_activity_minutes_per_week', 'age', 'bmi', 'triglycerides', 'ldl_cholesterol'] if 'physical_activity_minutes_per_week' in num_cols else num_cols[:5]

# Memory-efficient approach: calculate statistics on train only, then map to test
new_features = []
stats = ['count', 'min', 'max']
for cat_col in high_importance_cats[:2]:
    for num_col in high_importance_nums[:4]:
        for stat in stats:
            if len(new_features) >= 12:
                break
            try:
                new_col = f'{cat_col}_{num_col}_{stat}'
                if new_col not in X_current.columns:
                    # Calculate statistics on train only (memory efficient)
                    grouped = train_current.groupby(cat_col)[num_col].agg(stat)
                    train_current[new_col] = train_current[cat_col].map(grouped).fillna(0)
                    test_current[new_col] = test_current[cat_col].map(grouped).fillna(0)
                    new_features.append(new_col)
                    # Free memory
                    del grouped
            except:
                pass
        if len(new_features) >= 12:
            break
    if len(new_features) >= 12:
        break

gc.collect()

X_current = train_current.drop("diagnosed_diabetes", axis=1)
# Update test_current_fe to match test_current (with new features)
test_current_fe = test_current.copy()

# Handle NaN for numerical and categorical columns separately
num_cols_current = X_current.select_dtypes(include=[np.number]).columns.tolist()
cat_cols_current = X_current.select_dtypes(include=["object", "category"]).columns.tolist()

# Fill NaN for numerical columns
for col in num_cols_current:
    X_current[col] = X_current[col].fillna(0)
    test_current_fe[col] = test_current_fe[col].fillna(0)

# Handle NaN for categorical columns and ensure they are category type
for col in cat_cols_current:
    if X_current[col].isna().any() or test_current_fe[col].isna().any():
        # Add 'Missing' category if there are NaN values
        if X_current[col].dtype.name == 'category':
            X_current[col] = X_current[col].cat.add_categories(['Missing'])
            test_current_fe[col] = test_current_fe[col].cat.add_categories(['Missing'])
        else:
            X_current[col] = X_current[col].astype("category").cat.add_categories(['Missing'])
            test_current_fe[col] = test_current_fe[col].astype("category").cat.add_categories(['Missing'])
        X_current[col] = X_current[col].fillna('Missing')
        test_current_fe[col] = test_current_fe[col].fillna('Missing')
    else:
        # Ensure category type even if no NaN
        if X_current[col].dtype.name != 'category':
            X_current[col] = X_current[col].astype("category")
            test_current_fe[col] = test_current_fe[col].astype("category")

score, std = quick_cv_eval(X_current, y, f"Step {step_num}", n_folds=3)
improvement = score - current_score
print(f"Features added: {len(new_features)}")
print(f"Improvement: {improvement:+.5f}")
print(f"Total improvement from baseline: {score - baseline_score:+.5f}")

if improvement > 0:
    current_score = score
    current_std = std
    print(f"✓ Step {step_num} improved CV score! Keeping features.")
else:
    print(f"✗ Step {step_num} did not improve. Reverting...")
    for col in new_features:
        if col in X_current.columns:
            X_current = X_current.drop(col, axis=1)
            test_current_fe = test_current_fe.drop(col, axis=1)

results.append({
    'step': f'Step {step_num}: Groupby (count, min, max)',
    'n_features': len(X_current.columns),
    'cv_auc': current_score,
    'cv_std': current_std,
    'improvement': improvement if improvement > 0 else 0,
    'features_added': len(new_features) if improvement > 0 else 0
})
step_num += 1


[Step 2] Groupby Aggregations (count, min, max) - Target: 12 features
----------------------------------------------------------------------
Step 2 - CV AUC: 0.72264 (+/- 0.00082)
Features added: 12
Improvement: -0.00034
Total improvement from baseline: +0.00003
✗ Step 2 did not improve. Reverting...


In [8]:
# Step 3: Quantile Features - 12個
print(f"\n{'='*70}")
print(f"[Step {step_num}] Quantile Features - Target: 12 features")
print("-"*70)

cat_cols = train_current.select_dtypes(include="object").columns.tolist()
num_cols = train_current.select_dtypes(include=[np.number]).columns.tolist()
if 'diagnosed_diabetes' in num_cols:
    num_cols.remove('diagnosed_diabetes')

high_importance_cats = ['family_history_diabetes', 'gender'] if 'family_history_diabetes' in cat_cols else cat_cols[:2]
high_importance_nums = ['physical_activity_minutes_per_week', 'age', 'bmi'] if 'physical_activity_minutes_per_week' in num_cols else num_cols[:3]

# Memory-efficient approach: calculate quantiles on train only, then map to test
new_features = []
quantiles = [5, 10, 25, 40, 45, 55, 60, 75, 90, 95]
for cat_col in high_importance_cats[:1]:
    for num_col in high_importance_nums[:2]:
        try:
            # Calculate quantiles on train only (memory efficient)
            grouped = train_current.groupby(cat_col)[num_col]
            for q in quantiles:
                if len(new_features) >= 12:
                    break
                quantile_vals = grouped.quantile(q / 100.0)
                new_col = f'{cat_col}_{num_col}_q{q}'
                train_current[new_col] = train_current[cat_col].map(quantile_vals).fillna(0)
                test_current[new_col] = test_current[cat_col].map(quantile_vals).fillna(0)
                new_features.append(new_col)
                # Free memory
                del quantile_vals
        except:
            pass
        if len(new_features) >= 12:
            break
    if len(new_features) >= 12:
        break

del grouped
gc.collect()

X_current = train_current.drop("diagnosed_diabetes", axis=1)
# Update test_current_fe to match test_current (with new features)
test_current_fe = test_current.copy()

# Handle NaN for numerical and categorical columns separately
num_cols_current = X_current.select_dtypes(include=[np.number]).columns.tolist()
cat_cols_current = X_current.select_dtypes(include=["object", "category"]).columns.tolist()

# Fill NaN for numerical columns
for col in num_cols_current:
    X_current[col] = X_current[col].fillna(0)
    test_current_fe[col] = test_current_fe[col].fillna(0)

# Handle NaN for categorical columns and ensure they are category type
for col in cat_cols_current:
    if X_current[col].isna().any() or test_current_fe[col].isna().any():
        # Add 'Missing' category if there are NaN values
        if X_current[col].dtype.name == 'category':
            X_current[col] = X_current[col].cat.add_categories(['Missing'])
            test_current_fe[col] = test_current_fe[col].cat.add_categories(['Missing'])
        else:
            X_current[col] = X_current[col].astype("category").cat.add_categories(['Missing'])
            test_current_fe[col] = test_current_fe[col].astype("category").cat.add_categories(['Missing'])
        X_current[col] = X_current[col].fillna('Missing')
        test_current_fe[col] = test_current_fe[col].fillna('Missing')
    else:
        # Ensure category type even if no NaN
        if X_current[col].dtype.name != 'category':
            X_current[col] = X_current[col].astype("category")
            test_current_fe[col] = test_current_fe[col].astype("category")

score, std = quick_cv_eval(X_current, y, f"Step {step_num}", n_folds=3)
improvement = score - current_score
print(f"Features added: {len(new_features)}")
print(f"Improvement: {improvement:+.5f}")
print(f"Total improvement from baseline: {score - baseline_score:+.5f}")

if improvement > 0:
    current_score = score
    current_std = std
    print(f"✓ Step {step_num} improved CV score! Keeping features.")
else:
    print(f"✗ Step {step_num} did not improve. Reverting...")
    for col in new_features:
        if col in X_current.columns:
            X_current = X_current.drop(col, axis=1)
            test_current_fe = test_current_fe.drop(col, axis=1)

results.append({
    'step': f'Step {step_num}: Quantiles',
    'n_features': len(X_current.columns),
    'cv_auc': current_score,
    'cv_std': current_std,
    'improvement': improvement if improvement > 0 else 0,
    'features_added': len(new_features) if improvement > 0 else 0
})
step_num += 1


[Step 3] Quantile Features - Target: 12 features
----------------------------------------------------------------------
Step 3 - CV AUC: 0.72272 (+/- 0.00059)
Features added: 12
Improvement: -0.00026
Total improvement from baseline: +0.00011
✗ Step 3 did not improve. Reverting...


In [9]:
# Step 4: Binning Features - 12個
print(f"\n{'='*70}")
print(f"[Step {step_num}] Binning Features - Target: 12 features")
print("-"*70)

num_cols = train_current.select_dtypes(include=[np.number]).columns.tolist()
if 'diagnosed_diabetes' in num_cols:
    num_cols.remove('diagnosed_diabetes')

important_nums = ['physical_activity_minutes_per_week', 'age', 'bmi', 'triglycerides'] if 'physical_activity_minutes_per_week' in num_cols else num_cols[:4]
bins = [1, 2, 5, 10, 20, 50]

new_features = []
for num_col in important_nums[:2]:
    for bin_size in bins:
        if len(new_features) >= 12:
            break
        try:
            new_col = f'{num_col}_bin_{bin_size}'
            train_current[new_col] = (train_current[num_col] / bin_size).round() * bin_size
            test_current[new_col] = (test_current[num_col] / bin_size).round() * bin_size
            new_features.append(new_col)
        except:
            pass
    if len(new_features) >= 12:
        break

X_current = train_current.drop("diagnosed_diabetes", axis=1)
# Update test_current_fe to match test_current (with new features)
test_current_fe = test_current.copy()

# Handle NaN for numerical and categorical columns separately
num_cols_current = X_current.select_dtypes(include=[np.number]).columns.tolist()
cat_cols_current = X_current.select_dtypes(include=["object", "category"]).columns.tolist()

# Fill NaN for numerical columns
for col in num_cols_current:
    X_current[col] = X_current[col].fillna(0)
    test_current_fe[col] = test_current_fe[col].fillna(0)

# Handle NaN for categorical columns and ensure they are category type
for col in cat_cols_current:
    if X_current[col].isna().any() or test_current_fe[col].isna().any():
        # Add 'Missing' category if there are NaN values
        if X_current[col].dtype.name == 'category':
            X_current[col] = X_current[col].cat.add_categories(['Missing'])
            test_current_fe[col] = test_current_fe[col].cat.add_categories(['Missing'])
        else:
            X_current[col] = X_current[col].astype("category").cat.add_categories(['Missing'])
            test_current_fe[col] = test_current_fe[col].astype("category").cat.add_categories(['Missing'])
        X_current[col] = X_current[col].fillna('Missing')
        test_current_fe[col] = test_current_fe[col].fillna('Missing')
    else:
        # Ensure category type even if no NaN
        if X_current[col].dtype.name != 'category':
            X_current[col] = X_current[col].astype("category")
            test_current_fe[col] = test_current_fe[col].astype("category")

score, std = quick_cv_eval(X_current, y, f"Step {step_num}", n_folds=3)
improvement = score - current_score
print(f"Features added: {len(new_features)}")
print(f"Improvement: {improvement:+.5f}")
print(f"Total improvement from baseline: {score - baseline_score:+.5f}")

if improvement > 0:
    current_score = score
    current_std = std
    print(f"✓ Step {step_num} improved CV score! Keeping features.")
else:
    print(f"✗ Step {step_num} did not improve. Reverting...")
    for col in new_features:
        if col in X_current.columns:
            X_current = X_current.drop(col, axis=1)
            test_current_fe = test_current_fe.drop(col, axis=1)

results.append({
    'step': f'Step {step_num}: Binning',
    'n_features': len(X_current.columns),
    'cv_auc': current_score,
    'cv_std': current_std,
    'improvement': improvement if improvement > 0 else 0,
    'features_added': len(new_features) if improvement > 0 else 0
})
step_num += 1


[Step 4] Binning Features - Target: 12 features
----------------------------------------------------------------------
Step 4 - CV AUC: 0.72177 (+/- 0.00086)
Features added: 12
Improvement: -0.00121
Total improvement from baseline: -0.00084
✗ Step 4 did not improve. Reverting...


In [10]:
# Step 5: Digit Extraction - 12個
print(f"\n{'='*70}")
print(f"[Step {step_num}] Digit Extraction Features - Target: 12 features")
print("-"*70)

num_cols = train_current.select_dtypes(include=[np.number]).columns.tolist()
if 'diagnosed_diabetes' in num_cols:
    num_cols.remove('diagnosed_diabetes')

important_nums = ['physical_activity_minutes_per_week', 'age', 'bmi'] if 'physical_activity_minutes_per_week' in num_cols else num_cols[:3]

new_features = []
for num_col in important_nums[:4]:
    for digit_pos in range(1, 4):
        if len(new_features) >= 12:
            break
        try:
            new_col = f'{num_col}_digit_{digit_pos}'
            train_current[new_col] = ((train_current[num_col] * (10 ** digit_pos)).astype(int) % 10).astype(float)
            test_current[new_col] = ((test_current[num_col] * (10 ** digit_pos)).astype(int) % 10).astype(float)
            new_features.append(new_col)
        except:
            pass
    if len(new_features) >= 12:
        break

X_current = train_current.drop("diagnosed_diabetes", axis=1)
# Update test_current_fe to match test_current (with new features)
test_current_fe = test_current.copy()

# Handle NaN for numerical and categorical columns separately
num_cols_current = X_current.select_dtypes(include=[np.number]).columns.tolist()
cat_cols_current = X_current.select_dtypes(include=["object", "category"]).columns.tolist()

# Fill NaN for numerical columns
for col in num_cols_current:
    X_current[col] = X_current[col].fillna(0)
    test_current_fe[col] = test_current_fe[col].fillna(0)

# Handle NaN for categorical columns and ensure they are category type
for col in cat_cols_current:
    if X_current[col].isna().any() or test_current_fe[col].isna().any():
        # Add 'Missing' category if there are NaN values
        if X_current[col].dtype.name == 'category':
            X_current[col] = X_current[col].cat.add_categories(['Missing'])
            test_current_fe[col] = test_current_fe[col].cat.add_categories(['Missing'])
        else:
            X_current[col] = X_current[col].astype("category").cat.add_categories(['Missing'])
            test_current_fe[col] = test_current_fe[col].astype("category").cat.add_categories(['Missing'])
        X_current[col] = X_current[col].fillna('Missing')
        test_current_fe[col] = test_current_fe[col].fillna('Missing')
    else:
        # Ensure category type even if no NaN
        if X_current[col].dtype.name != 'category':
            X_current[col] = X_current[col].astype("category")
            test_current_fe[col] = test_current_fe[col].astype("category")

score, std = quick_cv_eval(X_current, y, f"Step {step_num}", n_folds=3)
improvement = score - current_score
print(f"Features added: {len(new_features)}")
print(f"Improvement: {improvement:+.5f}")
print(f"Total improvement from baseline: {score - baseline_score:+.5f}")

if improvement > 0:
    current_score = score
    current_std = std
    print(f"✓ Step {step_num} improved CV score! Keeping features.")
else:
    print(f"✗ Step {step_num} did not improve. Reverting...")
    for col in new_features:
        if col in X_current.columns:
            X_current = X_current.drop(col, axis=1)
            test_current_fe = test_current_fe.drop(col, axis=1)

results.append({
    'step': f'Step {step_num}: Digit Extraction',
    'n_features': len(X_current.columns),
    'cv_auc': current_score,
    'cv_std': current_std,
    'improvement': improvement if improvement > 0 else 0,
    'features_added': len(new_features) if improvement > 0 else 0
})
step_num += 1


[Step 5] Digit Extraction Features - Target: 12 features
----------------------------------------------------------------------
Step 5 - CV AUC: 0.72166 (+/- 0.00064)
Features added: 9
Improvement: -0.00132
Total improvement from baseline: -0.00094
✗ Step 5 did not improve. Reverting...


In [11]:
# Step 6: Ratio Features (特徴量重要度チャートを参考) - 12個
print(f"\n{'='*70}")
print(f"[Step {step_num}] Ratio Features (Based on Feature Importance) - Target: 12 features")
print("-"*70)

new_features = []
# 高重要度特徴量を中心にratio特徴量を作成
important_pairs = [
    ('age', 'physical_activity_minutes_per_week'),  # age_activity_ratio (最重要)
    ('bmi', 'physical_activity_minutes_per_week'),  # bmi_activity_ratio
    ('age', 'bmi'),
    ('triglycerides', 'ldl_cholesterol'),
    ('age', 'triglycerides'),
    ('age', 'ldl_cholesterol'),
    ('bmi', 'triglycerides'),
    ('cholesterol_total', 'hdl_cholesterol'),
    ('cholesterol_total', 'ldl_cholesterol'),
    ('systolic_bp', 'diastolic_bp'),
    ('heart_rate', 'physical_activity_minutes_per_week'),
    ('diet_score', 'physical_activity_minutes_per_week')
]

for col1, col2 in important_pairs:
    if len(new_features) >= 12:
        break
    if col1 in train_current.columns and col2 in train_current.columns:
        try:
            new_col = f'{col1}_{col2}_ratio'
            if new_col not in X_current.columns:
                train_current[new_col] = train_current[col1] / (train_current[col2] + 1e-8)
                test_current[new_col] = test_current[col1] / (test_current[col2] + 1e-8)
                new_features.append(new_col)
        except:
            pass

X_current = train_current.drop("diagnosed_diabetes", axis=1)
# Update test_current_fe to match test_current (with new features)
test_current_fe = test_current.copy()

# Handle NaN for numerical and categorical columns separately
num_cols_current = X_current.select_dtypes(include=[np.number]).columns.tolist()
cat_cols_current = X_current.select_dtypes(include=["object", "category"]).columns.tolist()

# Fill NaN for numerical columns
for col in num_cols_current:
    X_current[col] = X_current[col].fillna(0)
    test_current_fe[col] = test_current_fe[col].fillna(0)

# Handle NaN for categorical columns and ensure they are category type
for col in cat_cols_current:
    if X_current[col].isna().any() or test_current_fe[col].isna().any():
        # Add 'Missing' category if there are NaN values
        if X_current[col].dtype.name == 'category':
            X_current[col] = X_current[col].cat.add_categories(['Missing'])
            test_current_fe[col] = test_current_fe[col].cat.add_categories(['Missing'])
        else:
            X_current[col] = X_current[col].astype("category").cat.add_categories(['Missing'])
            test_current_fe[col] = test_current_fe[col].astype("category").cat.add_categories(['Missing'])
        X_current[col] = X_current[col].fillna('Missing')
        test_current_fe[col] = test_current_fe[col].fillna('Missing')
    else:
        # Ensure category type even if no NaN
        if X_current[col].dtype.name != 'category':
            X_current[col] = X_current[col].astype("category")
            test_current_fe[col] = test_current_fe[col].astype("category")

score, std = quick_cv_eval(X_current, y, f"Step {step_num}", n_folds=3)
improvement = score - current_score
print(f"Features added: {len(new_features)}")
print(f"Improvement: {improvement:+.5f}")
print(f"Total improvement from baseline: {score - baseline_score:+.5f}")

if improvement > 0:
    current_score = score
    current_std = std
    print(f"✓ Step {step_num} improved CV score! Keeping features.")
else:
    print(f"✗ Step {step_num} did not improve. Reverting...")
    for col in new_features:
        if col in X_current.columns:
            X_current = X_current.drop(col, axis=1)
            test_current_fe = test_current_fe.drop(col, axis=1)

results.append({
    'step': f'Step {step_num}: Ratio Features',
    'n_features': len(X_current.columns),
    'cv_auc': current_score,
    'cv_std': current_std,
    'improvement': improvement if improvement > 0 else 0,
    'features_added': len(new_features) if improvement > 0 else 0
})
step_num += 1


[Step 6] Ratio Features (Based on Feature Importance) - Target: 12 features
----------------------------------------------------------------------
Step 6 - CV AUC: 0.72114 (+/- 0.00058)
Features added: 12
Improvement: -0.00184
Total improvement from baseline: -0.00147
✗ Step 6 did not improve. Reverting...


In [12]:
# Step 7: Interaction Features (特徴量重要度チャートを参考) - 12個
print(f"\n{'='*70}")
print(f"[Step {step_num}] Interaction Features (Based on Feature Importance) - Target: 12 features")
print("-"*70)

new_features = []
# 高重要度特徴量の相互作用
important_interactions = [
    ('family_history_diabetes', 'age'),  # family_history_age (高重要度)
    ('family_history_diabetes', 'bmi'),  # family_history_bmi (高重要度)
    ('family_history_diabetes', 'triglycerides'),  # family_history_triglycerides
    ('age', 'bmi'),  # age_bmi_interaction (高重要度)
    ('age', 'triglycerides'),  # age_triglycerides_interaction
    ('age', 'ldl_cholesterol'),  # age_ldl_interaction
    ('physical_activity_minutes_per_week', 'age'),
    ('physical_activity_minutes_per_week', 'bmi'),
    ('family_history_diabetes', 'cholesterol_total'),
    ('family_history_diabetes', 'hdl_cholesterol'),
    ('family_history_diabetes', 'heart_rate'),
    ('bmi', 'triglycerides')
]

for col1, col2 in important_interactions:
    if len(new_features) >= 12:
        break
    if col1 in train_current.columns and col2 in train_current.columns:
        try:
            new_col = f'{col1}_{col2}_interaction'
            if new_col not in X_current.columns:
                train_current[new_col] = train_current[col1] * train_current[col2]
                test_current[new_col] = test_current[col1] * test_current[col2]
                new_features.append(new_col)
        except:
            pass

X_current = train_current.drop("diagnosed_diabetes", axis=1)
# Update test_current_fe to match test_current (with new features)
test_current_fe = test_current.copy()

# Handle NaN for numerical and categorical columns separately
num_cols_current = X_current.select_dtypes(include=[np.number]).columns.tolist()
cat_cols_current = X_current.select_dtypes(include=["object", "category"]).columns.tolist()

# Fill NaN for numerical columns
for col in num_cols_current:
    X_current[col] = X_current[col].fillna(0)
    test_current_fe[col] = test_current_fe[col].fillna(0)

# Handle NaN for categorical columns and ensure they are category type
for col in cat_cols_current:
    if X_current[col].isna().any() or test_current_fe[col].isna().any():
        # Add 'Missing' category if there are NaN values
        if X_current[col].dtype.name == 'category':
            X_current[col] = X_current[col].cat.add_categories(['Missing'])
            test_current_fe[col] = test_current_fe[col].cat.add_categories(['Missing'])
        else:
            X_current[col] = X_current[col].astype("category").cat.add_categories(['Missing'])
            test_current_fe[col] = test_current_fe[col].astype("category").cat.add_categories(['Missing'])
        X_current[col] = X_current[col].fillna('Missing')
        test_current_fe[col] = test_current_fe[col].fillna('Missing')
    else:
        # Ensure category type even if no NaN
        if X_current[col].dtype.name != 'category':
            X_current[col] = X_current[col].astype("category")
            test_current_fe[col] = test_current_fe[col].astype("category")

score, std = quick_cv_eval(X_current, y, f"Step {step_num}", n_folds=3)
improvement = score - current_score
print(f"Features added: {len(new_features)}")
print(f"Improvement: {improvement:+.5f}")
print(f"Total improvement from baseline: {score - baseline_score:+.5f}")

if improvement > 0:
    current_score = score
    current_std = std
    print(f"✓ Step {step_num} improved CV score! Keeping features.")
else:
    print(f"✗ Step {step_num} did not improve. Reverting...")
    for col in new_features:
        if col in X_current.columns:
            X_current = X_current.drop(col, axis=1)
            test_current_fe = test_current_fe.drop(col, axis=1)

results.append({
    'step': f'Step {step_num}: Interaction Features',
    'n_features': len(X_current.columns),
    'cv_auc': current_score,
    'cv_std': current_std,
    'improvement': improvement if improvement > 0 else 0,
    'features_added': len(new_features) if improvement > 0 else 0
})
step_num += 1


[Step 7] Interaction Features (Based on Feature Importance) - Target: 12 features
----------------------------------------------------------------------
Step 7 - CV AUC: 0.72137 (+/- 0.00062)
Features added: 12
Improvement: -0.00161
Total improvement from baseline: -0.00123
✗ Step 7 did not improve. Reverting...


In [7]:
# Step 8: Groupby (nunique) - 12個
print(f"\n{'='*70}")
print(f"[Step {step_num}] Groupby Aggregations (nunique) - Target: 12 features")
print("-"*70)

cat_cols = train_current.select_dtypes(include="object").columns.tolist()
num_cols = train_current.select_dtypes(include=[np.number]).columns.tolist()
if 'diagnosed_diabetes' in num_cols:
    num_cols.remove('diagnosed_diabetes')

high_importance_cats = ['family_history_diabetes', 'gender', 'ethnicity'] if 'family_history_diabetes' in cat_cols else cat_cols[:3]
high_importance_nums = ['physical_activity_minutes_per_week', 'age', 'bmi', 'triglycerides'] if 'physical_activity_minutes_per_week' in num_cols else num_cols[:4]

# Memory-efficient approach: calculate nunique on train only, then map to test
new_features = []
for cat_col in high_importance_cats[:3]:
    for num_col in high_importance_nums[:4]:
        if len(new_features) >= 12:
            break
        try:
            new_col = f'{cat_col}_{num_col}_nunique'
            if new_col not in X_current.columns:
                # Calculate nunique on train only (memory efficient)
                grouped = train_current.groupby(cat_col)[num_col].nunique()
                train_current[new_col] = train_current[cat_col].map(grouped).fillna(0)
                test_current[new_col] = test_current[cat_col].map(grouped).fillna(0)
                new_features.append(new_col)
                # Free memory
                del grouped
        except:
            pass
    if len(new_features) >= 12:
        break

gc.collect()

X_current = train_current.drop("diagnosed_diabetes", axis=1)
# Update test_current_fe to match test_current (with new features)
test_current_fe = test_current.copy()

# Handle NaN for numerical and categorical columns separately
num_cols_current = X_current.select_dtypes(include=[np.number]).columns.tolist()
cat_cols_current = X_current.select_dtypes(include=["object", "category"]).columns.tolist()

# Fill NaN for numerical columns
for col in num_cols_current:
    X_current[col] = X_current[col].fillna(0)
    test_current_fe[col] = test_current_fe[col].fillna(0)

# Handle NaN for categorical columns and ensure they are category type
for col in cat_cols_current:
    if X_current[col].isna().any() or test_current_fe[col].isna().any():
        # Add 'Missing' category if there are NaN values
        if X_current[col].dtype.name == 'category':
            X_current[col] = X_current[col].cat.add_categories(['Missing'])
            test_current_fe[col] = test_current_fe[col].cat.add_categories(['Missing'])
        else:
            X_current[col] = X_current[col].astype("category").cat.add_categories(['Missing'])
            test_current_fe[col] = test_current_fe[col].astype("category").cat.add_categories(['Missing'])
        X_current[col] = X_current[col].fillna('Missing')
        test_current_fe[col] = test_current_fe[col].fillna('Missing')
    else:
        # Ensure category type even if no NaN
        if X_current[col].dtype.name != 'category':
            X_current[col] = X_current[col].astype("category")
            test_current_fe[col] = test_current_fe[col].astype("category")

score, std = quick_cv_eval(X_current, y, f"Step {step_num}", n_folds=3)
improvement = score - current_score
print(f"Features added: {len(new_features)}")
print(f"Improvement: {improvement:+.5f}")
print(f"Total improvement from baseline: {score - baseline_score:+.5f}")

if improvement > 0:
    current_score = score
    current_std = std
    print(f"✓ Step {step_num} improved CV score! Keeping features.")
else:
    print(f"✗ Step {step_num} did not improve. Reverting...")
    for col in new_features:
        if col in X_current.columns:
            X_current = X_current.drop(col, axis=1)
            test_current_fe = test_current_fe.drop(col, axis=1)

results.append({
    'step': f'Step {step_num}: Groupby (nunique)',
    'n_features': len(X_current.columns),
    'cv_auc': current_score,
    'cv_std': current_std,
    'improvement': improvement if improvement > 0 else 0,
    'features_added': len(new_features) if improvement > 0 else 0
})
step_num += 1


[Step 1] Groupby Aggregations (nunique) - Target: 12 features
----------------------------------------------------------------------
Step 1 - CV AUC: 0.72304 (+/- 0.00074)
Features added: 12
Improvement: +0.00043
Total improvement from baseline: +0.00043
✓ Step 1 improved CV score! Keeping features.


In [8]:
# Final Summary
print("\n" + "="*70)
print("Feature Engineering Summary")
print("="*70)
print(f"{'Step':<35} {'Features':<12} {'CV AUC':<12} {'Improvement':<12} {'Added':<8}")
print("-"*85)
for r in results:
    improvement_str = f"{r.get('improvement', 0):+.5f}" if 'improvement' in r else "-"
    added_str = f"{r.get('features_added', 0)}" if 'features_added' in r else "-"
    print(f"{r['step']:<35} {r['n_features']:<12} {r['cv_auc']:<12.5f} {improvement_str:<12} {added_str:<8}")
print("="*85)

print(f"\n{'='*70}")
print("Final Results")
print(f"{'='*70}")
print(f"Final feature count: {len(X_current.columns)}")
print(f"Final CV AUC: {current_score:.5f} (+/- {current_std:.5f})")
print(f"Total improvement from baseline: {current_score - baseline_score:+.5f}")
print(f"{'='*70}")

# Calculate total features added
total_added = sum(r.get('features_added', 0) for r in results)
print(f"\nTotal features added (kept): {total_added}")
print(f"Baseline feature count: {len(X_base.columns)}")
print(f"Net increase: {len(X_current.columns) - len(X_base.columns)}")

# List all added features
baseline_cols = set(X_base.columns)
final_cols = set(X_current.columns)
added_features = sorted(list(final_cols - baseline_cols))

print(f"\nAdded Features ({len(added_features)} total):")
for i, feat in enumerate(added_features, 1):
    print(f"  {i:2d}. {feat}")


Feature Engineering Summary
Step                                Features     CV AUC       Improvement  Added   
-------------------------------------------------------------------------------------
Baseline                            25           0.72260      +0.00000     0       
Step 1: Groupby (nunique)           37           0.72304      +0.00043     12      

Final Results
Final feature count: 37
Final CV AUC: 0.72304 (+/- 0.00074)
Total improvement from baseline: +0.00043

Total features added (kept): 12
Baseline feature count: 25
Net increase: 12

Added Features (12 total):
   1. education_level_age_nunique
   2. education_level_bmi_nunique
   3. education_level_physical_activity_minutes_per_week_nunique
   4. education_level_triglycerides_nunique
   5. ethnicity_age_nunique
   6. ethnicity_bmi_nunique
   7. ethnicity_physical_activity_minutes_per_week_nunique
   8. ethnicity_triglycerides_nunique
   9. gender_age_nunique
  10. gender_bmi_nunique
  11. gender_physical_activity_

In [None]:
# Final feature set ready for model training
X_final = X_current.copy()
test_final = test_current_fe.copy()

print("="*70)
print("Final Feature Set Ready")
print("="*70)
print(f"Use X_final and test_final for model training.")
print(f"Final feature count: {len(X_final.columns)}")
print(f"Final CV AUC: {current_score:.5f}")
print("="*70)