# Feature Selection - Stress Level Prediction

This notebook performs feature selection to identify the most important features for stress level prediction.

## Objectives:
1. Load the cleaned dataset
2. Apply correlation-based feature selection
3. Apply univariate feature selection
4. Apply Recursive Feature Elimination (RFE)
5. Analyze feature importance
6. Apply mutual information-based selection
7. Compare the different selection methods
8. Select the final feature set by consensus and save it

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path

# Set up plotting: global matplotlib style and seaborn colour palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# NOTE(review): this silences every warning category for the rest of the
# session, including any emitted by the selection methods below.
warnings.filterwarnings('ignore')

# Import custom modules
import sys
# '../src' is a relative entry, so it is resolved against the current working
# directory; the imports below only work when the notebook is run from a
# directory that has the project's `src` folder as a sibling.
sys.path.append('../src')
# NOTE(review): wildcard import. PROCESSED_DATA_DIR, used by later cells, is
# not defined anywhere in this notebook, so it presumably comes from here —
# confirm and consider importing it by name.
from utils.config import *
from features.feature_selector import FeatureSelector

print("Libraries imported successfully!")

## 1. Load Cleaned Dataset

In [None]:
# Read the cleaned dataset produced by the cleaning notebook. When the CSV is
# missing, build a seeded synthetic dataset so the rest of the notebook can
# still be demonstrated end to end.
try:
    cleaned_data_path = PROCESSED_DATA_DIR / "cleaned_stress_data.csv"
    df = pd.read_csv(cleaned_data_path)
except FileNotFoundError:
    print("Cleaned dataset not found. Please run the data cleaning notebook first.")

    # Fixed seed so the demonstration data is identical on every run
    np.random.seed(42)
    n_samples = 800

    # Standard-normal columns, mimicking features that were already scaled.
    # The draw order matters for reproducibility, so the names stay in this order.
    continuous_columns = [
        'heart_rate',
        'sleep_hours',
        'exercise_minutes',
        'caffeine_intake',
        'work_hours',
        'age',
        'bmi',
        'blood_pressure_sys',
        'blood_pressure_dia',
    ]
    sample_data = {
        name: np.random.normal(0, 1, n_samples) for name in continuous_columns
    }
    sample_data['gender_Male'] = np.random.choice([0, 1], n_samples)
    # Encoded target with three classes drawn with probabilities 0.3 / 0.5 / 0.2
    sample_data['stress_level'] = np.random.choice([0, 1, 2], n_samples, p=[0.3, 0.5, 0.2])

    df = pd.DataFrame(sample_data)

    # Shift a few columns for specific target classes so that the selection
    # methods have a real signal to find: (column, target class, offset)
    class_shifts = [
        ('heart_rate', 2, 1.5),
        ('work_hours', 2, 1.2),
        ('sleep_hours', 0, 1.0),
    ]
    for column, level, offset in class_shifts:
        df.loc[df['stress_level'] == level, column] += offset
else:
    print(f"Loaded cleaned dataset from: {cleaned_data_path}")

print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
df.head()

In [None]:
# Split the frame into the target vector y and the feature matrix X
# (every column except the target).
target_col = 'stress_level'
y = df[target_col]
X = df.drop(columns=target_col)

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Feature columns: {X.columns.tolist()}")

# Class counts, ordered by class label rather than by frequency
print("\nTarget distribution:")
class_counts = y.value_counts().sort_index()
print(class_counts)

## 2. Initialize Feature Selector

In [None]:
# Per-method results, keyed by method name; each selection cell adds one entry
selection_results = {}

# One selector instance is shared by all selection cells below
selector = FeatureSelector()

print("Feature selector initialized!")

## 3. Correlation-Based Feature Selection

In [None]:
# Correlation-based filtering: let the selector drop highly correlated
# features, then record which columns survived and which were removed.
# NOTE(review): the pruning rule lives in FeatureSelector.correlation_analysis;
# presumably 0.8 is a cutoff on pairwise correlation — confirm.
print("Applying correlation-based feature selection...")

df_corr_reduced = selector.correlation_analysis(df, target_col, threshold=0.8)
X_corr = df_corr_reduced.drop(columns=[target_col])

# Keep the removed features as a list in original column order. A set of
# strings has no stable iteration order across interpreter runs, which made
# both the printed output and the stored list non-reproducible.
surviving_columns = set(X_corr.columns)
removed_by_correlation = [col for col in X.columns if col not in surviving_columns]

print(f"Original features: {len(X.columns)}")
print(f"Features after correlation filtering: {len(X_corr.columns)}")
print(f"Features removed: {removed_by_correlation}")

selection_results['correlation_based'] = {
    'selected_features': list(X_corr.columns),
    'n_features': len(X_corr.columns),
    'removed_features': list(removed_by_correlation)
}

# Visualize correlation matrix of remaining features
# (a heatmap of a single feature carries no information, so skip it)
if len(X_corr.columns) > 1:
    plt.figure(figsize=(10, 8))
    corr_matrix = X_corr.corr()
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
                square=True, linewidths=0.5)
    plt.title('Correlation Matrix After Correlation-Based Selection')
    plt.tight_layout()
    plt.show()

## 4. Univariate Feature Selection

In [None]:
# Univariate selection: ask the selector for the top-k features, scored with
# the ANOVA F-test (f_classif).
from sklearn.feature_selection import f_classif

print("Applying univariate feature selection...")

k_features = min(8, len(X.columns))  # Select top 8 features or all if less
univariate_features = selector.univariate_selection(X, y, k=k_features, score_func=f_classif)

# Snapshot the scores now. Later cells also read and store
# selector.feature_scores, so keeping only a reference would let a later
# selection method overwrite these values if the selector reuses the same
# dict. `or {}` keeps the cell working when no scores were produced.
univariate_scores = dict(selector.feature_scores or {})

selection_results['univariate'] = {
    'selected_features': univariate_features,
    'n_features': len(univariate_features),
    'feature_scores': univariate_scores
}

print(f"Selected {len(univariate_features)} features using univariate selection")
print(f"Selected features: {univariate_features}")

# Visualize feature scores (sorted ascending so the best feature is drawn on top)
if univariate_scores:
    plt.figure(figsize=(12, 6))
    scores_df = pd.DataFrame(list(univariate_scores.items()),
                             columns=['Feature', 'Score'])
    scores_df = scores_df.sort_values('Score', ascending=True)

    plt.barh(scores_df['Feature'], scores_df['Score'])
    plt.title('Univariate Feature Scores (F-test)')
    plt.xlabel('F-Score')
    plt.tight_layout()
    plt.show()

## 5. Recursive Feature Elimination (RFE)

In [None]:
# Recursive Feature Elimination: request six features, or every feature when
# the dataset has fewer than six columns.
print("Applying Recursive Feature Elimination...")

n_features_rfe = min(6, X.shape[1])
rfe_features = selector.recursive_feature_elimination(X, y, n_features=n_features_rfe)
n_rfe_selected = len(rfe_features)

selection_results['rfe'] = dict(
    selected_features=rfe_features,
    n_features=n_rfe_selected,
)

print(f"Selected {n_rfe_selected} features using RFE")
print(f"Selected features: {rfe_features}")

## 6. Feature Importance-Based Selection

In [None]:
# Importance-based selection: ask the selector for the features whose
# importance passes a threshold. The plot title labels the scores as
# random-forest importances; the model itself is chosen inside FeatureSelector.
print("Applying feature importance-based selection...")

# Single source of truth for the cutoff: it feeds the selector call, the
# stored result, and the reference line/legend in the plot below.
importance_threshold = 0.05
importance_features = selector.feature_importance_selection(
    X, y, threshold=importance_threshold
)

# Snapshot the scores now. Other selection cells also read and store
# selector.feature_scores, so keeping only a reference would let another
# method overwrite these values if the selector reuses the same dict.
importance_scores = dict(selector.feature_scores or {})

selection_results['importance_based'] = {
    'selected_features': importance_features,
    'n_features': len(importance_features),
    'feature_scores': importance_scores,
    'threshold': importance_threshold
}

print(f"Selected {len(importance_features)} features using importance threshold")
print(f"Selected features: {importance_features}")

# Visualize feature importance (sorted ascending so the top feature is drawn on top)
if importance_scores:
    plt.figure(figsize=(12, 6))
    importance_df = pd.DataFrame(list(importance_scores.items()),
                                 columns=['Feature', 'Importance'])
    importance_df = importance_df.sort_values('Importance', ascending=True)

    plt.barh(importance_df['Feature'], importance_df['Importance'])
    plt.title('Random Forest Feature Importance')
    plt.xlabel('Importance')
    plt.axvline(x=importance_threshold, color='red', linestyle='--',
                label=f'Threshold ({importance_threshold})')
    plt.legend()
    plt.tight_layout()
    plt.show()

## 7. Mutual Information-Based Selection

In [None]:
# Mutual-information selection: ask the selector for the top-k features
# ranked by their mutual information with the target.
print("Applying mutual information-based selection...")

k_features_mi = min(7, len(X.columns))  # Select top 7 features or all if less
mi_features = selector.mutual_information_selection(X, y, k=k_features_mi)

# Snapshot the scores now. Other selection cells also read and store
# selector.feature_scores, so keeping only a reference would let another
# method overwrite these values if the selector reuses the same dict.
mi_scores = dict(selector.feature_scores or {})

selection_results['mutual_information'] = {
    'selected_features': mi_features,
    'n_features': len(mi_features),
    'feature_scores': mi_scores
}

print(f"Selected {len(mi_features)} features using mutual information")
print(f"Selected features: {mi_features}")

# Visualize mutual information scores (sorted ascending so the best is drawn on top)
if mi_scores:
    plt.figure(figsize=(12, 6))
    mi_df = pd.DataFrame(list(mi_scores.items()),
                         columns=['Feature', 'MI_Score'])
    mi_df = mi_df.sort_values('MI_Score', ascending=True)

    plt.barh(mi_df['Feature'], mi_df['MI_Score'])
    plt.title('Mutual Information Scores')
    plt.xlabel('Mutual Information Score')
    plt.tight_layout()
    plt.show()

## 8. Compare Selection Methods

In [None]:
# Side-by-side report of what each selection method picked, followed by a
# feature x method 0/1 matrix and a heatmap of that matrix.
print("FEATURE SELECTION COMPARISON")
print("=" * 50)

for method, results in selection_results.items():
    heading = method.upper().replace('_', ' ')
    print(f"\n{heading}:")
    print(f"  Features selected: {results['n_features']}")
    print(f"  Selected features: {results['selected_features']}")

# One row per original feature, one 0/1 column per method
all_features = list(X.columns)
feature_index = pd.Index(all_features)
membership = {
    method: feature_index.isin(results['selected_features']).astype(int)
    for method, results in selection_results.items()
}
comparison_df = pd.DataFrame(membership, index=all_features)

# Vote count per feature, then rank the features by that count
comparison_df = comparison_df.assign(
    total_selections=comparison_df.sum(axis=1)
).sort_values('total_selections', ascending=False)

print("\nFeature Selection Comparison Matrix:")
print(comparison_df)

# Heatmap with methods on the rows and features on the columns; the vote
# total is left out so that only the 0/1 flags are drawn
votes_by_method = comparison_df.drop(columns='total_selections').transpose()

plt.figure(figsize=(12, 8))
sns.heatmap(votes_by_method,
            annot=True, cmap='RdYlBu', cbar_kws={'label': 'Selected (1) / Not Selected (0)'})
plt.title('Feature Selection Methods Comparison')
plt.xlabel('Features')
plt.ylabel('Selection Methods')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 9. Consensus-Based Feature Selection

In [None]:
# Consensus vote: keep every feature that at least `min_votes` of the
# selection methods picked.
min_votes = 2

has_enough_votes = comparison_df['total_selections'] >= min_votes
consensus_features = comparison_df.loc[has_enough_votes].index.tolist()

print(f"Consensus-based feature selection (min votes: {min_votes}):")
print(f"Selected {len(consensus_features)} features: {consensus_features}")

# For each consensus feature, list the methods that voted for it
print("\nConsensus features selection details:")
for feature in consensus_features:
    selecting_methods = [
        method
        for method in selection_results
        if comparison_df.loc[feature, method] == 1
    ]
    print(f"  {feature}: selected by {selecting_methods} ({len(selecting_methods)} votes)")

# Record the consensus outcome next to the individual methods
selection_results['consensus'] = dict(
    selected_features=consensus_features,
    n_features=len(consensus_features),
    min_votes=min_votes,
)

## 10. Final Feature Set Selection

In [None]:
# Settle on the final feature set: the consensus features, unless fewer than
# three were agreed on, in which case the importance-based selection is used.
consensus_too_small = len(consensus_features) < 3
if consensus_too_small:
    print("Consensus selected too few features. Using importance-based selection.")
    final_features = selection_results['importance_based']['selected_features']
else:
    final_features = consensus_features

print("\nFINAL FEATURE SET:")
print(f"Selected {len(final_features)} features: {final_features}")

# Final modelling inputs: the chosen feature columns and the unchanged target
X_final = X.loc[:, final_features]
y_final = y

print(f"\nFinal feature matrix shape: {X_final.shape}")
print(f"Final target vector shape: {y_final.shape}")

# Correlation of every final feature with the target, strongest first
# regardless of sign
final_df = pd.concat([X_final, y_final], axis=1)
target_correlations = final_df.corr()[target_col]
feature_correlations = target_correlations.drop(target_col)
correlations_with_target = feature_correlations.sort_values(key=abs, ascending=False)

plt.figure(figsize=(10, 6))
correlations_with_target.plot(kind='barh')
plt.title('Final Features Correlation with Target Variable')
plt.xlabel('Correlation Coefficient')
# Faint zero line to separate positive from negative correlations
plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
plt.tight_layout()
plt.show()

print("\nCorrelations with target:")
for name, value in correlations_with_target.items():
    print(f"  {name}: {value:.3f}")

## 11. Save Selected Features

In [None]:
# Persist the selected features, the target, the combined dataset, and a JSON
# summary of the whole selection run under PROCESSED_DATA_DIR.
import json
from pathlib import Path

# The loading cell tolerates a missing cleaned CSV, so the output directory
# may not exist yet; create it instead of letting the first write fail.
Path(PROCESSED_DATA_DIR).mkdir(parents=True, exist_ok=True)

selected_features_path = PROCESSED_DATA_DIR / "selected_features.csv"
final_target_path = PROCESSED_DATA_DIR / "final_target.csv"
final_dataset_path = PROCESSED_DATA_DIR / "final_dataset.csv"

# Save individual components; the target header reuses `target_col` so it
# cannot drift from the column name used everywhere else in the notebook.
X_final.to_csv(selected_features_path, index=False)
y_final.to_csv(final_target_path, index=False, header=[target_col])
final_df.to_csv(final_dataset_path, index=False)

print(f"Selected features saved to: {selected_features_path}")
print(f"Target variable saved to: {final_target_path}")
print(f"Final dataset saved to: {final_dataset_path}")

# Report the strategy that actually produced `final_features`: the final
# selection falls back to the importance-based list when the consensus is
# too small, so the label must not be hard-coded to 'consensus_based'.
if final_features == consensus_features:
    selection_strategy = 'consensus_based'
else:
    selection_strategy = 'importance_based'

# Save feature selection results
selection_summary = {
    'original_features': list(X.columns),
    'n_original_features': len(X.columns),
    'final_features': final_features,
    'n_final_features': len(final_features),
    'selection_methods_used': list(selection_results.keys()),
    # Only the feature lists and counts are exported per method
    'detailed_results': {
        method: {
            'selected_features': results['selected_features'],
            'n_features': results['n_features']
        } for method, results in selection_results.items()
    },
    'feature_selection_strategy': selection_strategy,
    'correlations_with_target': correlations_with_target.to_dict()
}

selection_results_path = PROCESSED_DATA_DIR / "feature_selection_results.json"
# Explicit encoding so the file does not depend on the platform default.
# default=str stringifies any value json cannot serialise natively.
with open(selection_results_path, 'w', encoding='utf-8') as f:
    json.dump(selection_summary, f, indent=2, default=str)

print(f"Feature selection results saved to: {selection_results_path}")

## 12. Feature Selection Summary

In [None]:
# Closing report: how many features were kept, which ones, which methods
# contributed, and what comes next.
n_original = len(X.columns)
n_final = len(final_features)
n_dropped = n_original - n_final

print("FEATURE SELECTION SUMMARY")
print("=" * 50)
print(f"Original features: {n_original}")
print(f"Final features: {n_final}")
print(f"Reduction: {n_dropped} features ({(n_dropped / n_original * 100):.1f}%)")

# Numbered list of the kept features with their correlation to the target
print("\nSelected Features:")
for rank, feature in enumerate(final_features, start=1):
    corr = correlations_with_target[feature]
    print(f"  {rank:2d}. {feature:20s} (correlation: {corr:6.3f})")

print("\nFeature Selection Methods Applied:")
for method in selection_results:
    pretty_name = method.replace('_', ' ').title()
    print(f"  ✓ {pretty_name}")

closing_lines = (
    "\nDataset is ready for model training!",
    "\nNext steps:",
    "1. Train-test split",
    "2. Model training and comparison",
    "3. Hyperparameter tuning",
    "4. Model evaluation",
)
for line in closing_lines:
    print(line)