# 05 - Feature Selection

**Objective**: Select the optimal 25-30 features for modeling.

**Methods**:
1. Variance filter (drop features with variance < 0.01)
2. Correlation filter (drop one feature from each pair with |r| > 0.85)
3. Mutual Information scores (keep the top 40)
4. RFE with cross-validation

In [None]:
# Data handling, plotting and filesystem/serialisation helpers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import json
import warnings
# No category or module filter is given, so this silences every warning
# raised for the rest of the session.
warnings.filterwarnings('ignore')

# scikit-learn components used by the variance filter, the mutual-information
# ranking and the cross-validated recursive feature elimination
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif, RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

print('Libraries loaded!')

In [None]:
# Data-layer directories, all resolved relative to the working directory
DATA_ROOT = Path('../data')
FEATURE_PATH = DATA_ROOT / '04_feature'
MODEL_INPUT_PATH = DATA_ROOT / '05_model_input'
MODEL_PATH = DATA_ROOT / '06_models'
REPORTING_PATH = DATA_ROOT / '08_reporting'

# Output directories may not exist yet; the feature directory is read-only here
for out_dir in (MODEL_INPUT_PATH, MODEL_PATH, REPORTING_PATH):
    out_dir.mkdir(parents=True, exist_ok=True)

# Engineered train / holdout tables
df_train = pd.read_csv(FEATURE_PATH / 'engineered_train.csv')
df_holdout = pd.read_csv(FEATURE_PATH / 'engineered_holdout.csv')

# Every training column except the target is a candidate feature
TARGET = 'Churn'
feature_cols = [col for col in df_train.columns if col != TARGET]

print(f"Initial features: {len(feature_cols)}")
print(f"Train samples: {len(df_train):,}")

In [None]:
# Feature matrix and target vector taken from the training frame
X, y = df_train[feature_cols], df_train[TARGET]

## 1. Variance Filter (Remove < 0.01)

In [None]:
# Rescale every feature to [0, 1] so variances are comparable across units.
# Standardising to unit variance (StandardScaler) would force every
# non-constant column to variance 1, so the threshold below could only ever
# remove constant columns; min-max scaling keeps the relative spread.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Variance threshold (applied to the [0, 1]-scaled features)
var_threshold = 0.01
selector = VarianceThreshold(threshold=var_threshold)
selector.fit(X_scaled)

# Features the selector would remove
low_var_mask = ~selector.get_support()
low_var_features = X.columns[low_var_mask].tolist()

# Report the variances the selector itself compared against the threshold
print(f" Low variance features (< {var_threshold}): {len(low_var_features)}")
for f, variance in zip(low_var_features, selector.variances_[low_var_mask]):
    print(f"   - {f}: variance = {variance:.4f}")

In [None]:
# Keep the original column order, minus anything flagged as low variance
low_var_set = set(low_var_features)
features_after_var = [name for name in feature_cols if name not in low_var_set]
print(f"\n Features after variance filter: {len(features_after_var)}")

## 2. Correlation Filter (Drop One Feature from Each Pair with |r| > 0.85)

In [None]:
def remove_correlated_features(df, features, target, threshold=0.85):
    """Drop one feature from every highly correlated pair.

    Pairs are visited in column order. For each pair whose absolute Pearson
    correlation exceeds ``threshold``, the feature with the lower absolute
    correlation to ``target`` is dropped (on a tie the later column is
    dropped). A feature that has already been dropped is never used to
    eliminate another one, so a feature is only removed because of a
    feature that is actually kept.

    Args:
        df: DataFrame containing the feature columns and the target column.
        features: Column names to examine.
        target: Name of the target column in ``df``.
        threshold: Absolute correlation above which a pair counts as redundant.

    Returns:
        A tuple ``(features_kept, dropped, drop_log)``: the surviving
        features and the dropped features (both in the order of
        ``features``), and one human-readable message per dropped feature.
    """
    features = list(features)
    if not features:
        return [], [], []

    corr_matrix = df[features].corr().abs()
    target_corr = df[features].corrwith(df[target]).abs()

    # Plain arrays/lists avoid repeated pandas scalar indexing in the O(n^2) scan.
    corr_values = corr_matrix.to_numpy()
    columns = corr_matrix.columns.tolist()

    to_drop = set()
    drop_log = []

    for i, feat1 in enumerate(columns):
        if feat1 in to_drop:
            continue
        for j in range(i + 1, len(columns)):
            feat2 = columns[j]
            if feat2 in to_drop:
                continue

            r = corr_values[i, j]
            # Written as "not >" so NaN correlations (e.g. constant columns) are skipped.
            if not r > threshold:
                continue

            if target_corr[feat1] < target_corr[feat2]:
                to_drop.add(feat1)
                drop_log.append(f"Dropped {feat1} (r={r:.2f} with {feat2})")
                break  # feat1 is gone; its remaining pairs are irrelevant
            to_drop.add(feat2)
            drop_log.append(f"Dropped {feat2} (r={r:.2f} with {feat1})")

    features_kept = [f for f in features if f not in to_drop]
    dropped = [f for f in features if f in to_drop]
    return features_kept, dropped, drop_log

# Run the correlation filter on the variance-filtered features
corr_result = remove_correlated_features(df_train, features_after_var, TARGET, threshold=0.85)
features_after_corr, dropped_corr, drop_log = corr_result

print(f" Dropped for high correlation: {len(dropped_corr)}")

# Show at most ten log entries and summarise the remainder
shown, hidden = drop_log[:10], drop_log[10:]
for entry in shown:
    print(f"   {entry}")
if hidden:
    print(f"   ... and {len(hidden)} more")

print(f"\n Features after correlation filter: {len(features_after_corr)}")

## 3. Mutual Information Scores

In [None]:
# Mutual information between each surviving feature and the target.
# Missing values are replaced with 0 before scoring.
X_mi = df_train.loc[:, features_after_corr].fillna(0)
mi_scores = mutual_info_classif(X_mi, y, random_state=42)

# One row per feature, highest score first
mi_df = pd.DataFrame({'Feature': features_after_corr, 'MI_Score': mi_scores})
mi_df = mi_df.sort_values('MI_Score', ascending=False)

print(" TOP 20 FEATURES BY MUTUAL INFORMATION:")
display(mi_df.head(20))

In [None]:
# Visualize MI scores.
# DataFrame.plot opens its own default-size figure unless it is handed an
# Axes, so create the 12x8 figure explicitly and draw into it; otherwise the
# sized figure stays empty and the saved chart ignores the requested size.
fig, ax = plt.subplots(figsize=(12, 8))
mi_df.head(25).sort_values('MI_Score').plot(
    kind='barh', x='Feature', y='MI_Score',
    color='#3498db', edgecolor='black', legend=False, ax=ax
)
ax.set_title('Top 25 Features by Mutual Information', fontsize=14, fontweight='bold')
ax.set_xlabel('Mutual Information Score')
fig.tight_layout()
fig.savefig(REPORTING_PATH / 'mi_scores.png', dpi=150)
plt.show()

In [None]:
# Shortlist for RFE: the 40 highest-MI features (mi_df is sorted descending)
top_mi_features = mi_df['Feature'].head(40).tolist()
print(f"\n Top 40 MI features selected for RFE")

## 4. Recursive Feature Elimination (RFE)

In [None]:
# Design matrix for RFE: shortlisted columns, NaN -> 0, then standardised.
# NOTE(review): the scaler is fit on the full training set before the CV split.
X_rfe = df_train[top_mi_features].fillna(0)
rfe_scaler = StandardScaler()
X_rfe_scaled = rfe_scaler.fit_transform(X_rfe)

# Logistic regression ranks features; stratified 5-fold CV scores each subset size
estimator = LogisticRegression(max_iter=1000, solver='liblinear', random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rfecv_params = dict(
    estimator=estimator,
    step=1,                      # remove one feature per iteration
    cv=cv,
    scoring='roc_auc',
    min_features_to_select=15,
    n_jobs=-1,                   # use all available cores
)

print("Running RFECV (this may take a few minutes)...")
rfecv = RFECV(**rfecv_params)
rfecv.fit(X_rfe_scaled, y)

print(f"\n Optimal number of features: {rfecv.n_features_}")

In [None]:
# Map RFECV's boolean support mask back onto the shortlisted feature names
selected_mask = rfecv.support_
selected_features = [
    name for idx, name in enumerate(top_mi_features) if selected_mask[idx]
]

print(f" SELECTED FEATURES ({len(selected_features)}):")
for name in selected_features:
    print(f"    {name}")

In [None]:
# Mean CV score per candidate subset size, starting at the configured minimum
cv_auc = rfecv.cv_results_['mean_test_score']
first_n = rfecv.min_features_to_select
n_feature_axis = range(first_n, len(cv_auc) + first_n)

plt.figure(figsize=(10, 6))
plt.plot(n_feature_axis, cv_auc, marker='o')
plt.xlabel('Number of Features')
plt.ylabel('Cross-Validation AUC-ROC')
plt.title('RFE Feature Selection', fontsize=14, fontweight='bold')

# Mark the subset size RFECV settled on
best_n = rfecv.n_features_
plt.axvline(x=best_n, color='r', linestyle='--', label=f'Optimal: {best_n}')
plt.legend()
plt.tight_layout()
plt.savefig(REPORTING_PATH / 'rfe_curve.png', dpi=150)
plt.show()

## 5. Final Feature List

In [None]:
# Start from the RFECV selection and coerce the list into the 25-30 range
final_features = list(selected_features)

# Too few: top up with the best-ranked MI features not already included
shortfall = 25 - len(final_features)
if shortfall > 0:
    additional = [f for f in top_mi_features if f not in final_features][:shortfall]
    final_features.extend(additional)

# Too many: keep only the first 30
final_features = final_features[:30]

# Feature -> MI score; the first occurrence wins, features without a score show 0
mi_lookup = {}
for name, score in zip(mi_df['Feature'], mi_df['MI_Score']):
    mi_lookup.setdefault(name, score)

print(f" FINAL SELECTED FEATURES ({len(final_features)}):")
print("="*60)
for rank, name in enumerate(final_features, start=1):
    mi = mi_lookup.get(name, 0)
    print(f"{rank:2d}. {name:<35} (MI: {mi:.4f})")

## 6. Save Selected Features

In [None]:
# Output locations
train_out = MODEL_INPUT_PATH / 'selected_train.csv'
holdout_out = MODEL_INPUT_PATH / 'selected_holdout.csv'
feature_list_out = MODEL_PATH / 'feature_list.json'

# Train keeps the target column; holdout carries the selected features only
X_train_final = df_train[[*final_features, TARGET]]
X_holdout_final = df_holdout[final_features]

X_train_final.to_csv(train_out, index=False)
X_holdout_final.to_csv(holdout_out, index=False)

# Persist the ordered feature names as JSON
with open(feature_list_out, 'w') as fh:
    json.dump(final_features, fh, indent=2)

print(" Saved:")
for saved in (train_out, holdout_out, feature_list_out):
    print(f"   - {saved}")

In [None]:
# Summary: number of features remaining after each stage.
# The RFECV line reports RFECV's own selection; the final list is reported
# separately because it may have been topped up to 25 or trimmed to 30.
print("\n" + "="*60)
print(" FEATURE SELECTION SUMMARY")
print("="*60)
print(f"Initial features: {len(feature_cols)}")
print(f"After variance filter: {len(features_after_var)} (dropped {len(low_var_features)})")
print(f"After correlation filter: {len(features_after_corr)} (dropped {len(dropped_corr)})")
print(f"After MI top-40 filter: {len(top_mi_features)}")
print(f"After RFECV: {len(selected_features)}")
print(f"Final feature list: {len(final_features)}")
print("\n NEXT: Proceed to 06_Model_Training.ipynb")
print("="*60)