In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import chi2_contingency
from scipy.stats import spearmanr, pearsonr
warnings.filterwarnings('ignore')

from feature_engine.selection import DropCorrelatedFeatures, SmartCorrelatedSelection
import toad

# 1. LOAD DATA
# Read the source CSV and report its dimensions.

print("Loading data...")
df = pd.read_csv("../../data/data.csv")
print(f"Dataset shape: {df.shape}")
print(f"Total features: {df.shape[1]}")

# 2. FEATURE CATEGORIZATION

print("\n")
print("CATEGORIZING FEATURES")

# Numerical features are the int64 / float64 columns of the frame
numerical_features = list(df.select_dtypes(include=['int64', 'float64']).columns)

# A numerical column with exactly two distinct values is treated as binary;
# every other numerical column is treated as continuous
binary_features = []
continuous_features = []
for col in numerical_features:
    if df[col].nunique() == 2:
        binary_features.append(col)
    else:
        continuous_features.append(col)

print(f"\nNumerical features: {len(numerical_features)}")
print(f"  - Continuous: {len(continuous_features)}")
print(f"  - Binary: {len(binary_features)}")

# 3. TARGET VARIABLE

TARGET = 'spell_length_of_stay_hours'
print(f"\nTarget variable: {TARGET}")

# The target is not a candidate feature: take it out of the continuous and
# numerical lists (binary_features is intentionally left untouched)
for feature_list in (continuous_features, numerical_features):
    if TARGET in feature_list:
        feature_list.remove(TARGET)

# 4. CORRELATION ANALYSIS FUNCTIONS

def calculate_correlation_matrix(data, method='spearman'):
    """Return the pairwise correlation matrix of the columns of ``data``.

    Args:
        data: DataFrame whose columns are correlated against each other.
        method: Either 'spearman' or 'pearson'.

    Returns:
        The DataFrame produced by ``data.corr`` for the requested method.

    Raises:
        ValueError: If ``method`` is neither 'spearman' nor 'pearson'.
    """
    # Reject unsupported methods up front so the happy path is one call
    if method not in ('spearman', 'pearson'):
        raise ValueError("Method must be 'spearman' or 'pearson'")
    return data.corr(method=method)

# 5. FEATURE GROUPING FOR VISUALIZATION

# Candidate members of each thematic group. Two groups are derived from
# column-name substrings; the rest are fixed lists of expected names.
_group_candidates = {
    'Demographics': [
        'ethnic_origin_description',
        'patient_age_on_admission',
        'sex_national_code',
        'Deprivation Decile',
    ],
    'Admission_Info': [
        'elective_admission_flag',
        'non_elective_admission_flag',
        'IP_admission',
        'ward_type_admission',
        'general_medical_practice_desc',
    ],
    'Clinical_Diagnosis': [
        'spell_primary_diagnosis_encoded',
        'spell_secondary_diagnosis_encoded',
        'spell_primary_diagnosis_description',
        'spell_dominant_proc_encoded',
        'spell_dominant_proc_description',
        'hrg_group',
        'hrg_sub_group_encoded',
    ],
    'Comorbidities': [name for name in df.columns if 'comorbidity_' in name],
    'Chronic_Conditions': [name for name in df.columns if 'chronic_condition_' in name],
    'Special_Flags': [
        'dementia_diagnosis_flag',
        'covid19_diagnosis_flag',
        'frailty_score',
        'comorbidity_score',
    ],
    'Emergency_Department': [
        'attendancetype',
        'arrival_mode_description',
        'place_of_incident',
        'source_of_ref_description',
        'presenting_complaint_encoded',
        'acuity_code',
        'inj_or_ail',
        'NEWS2',
        'ae_unplanned_attendance',
        'location',
    ],
}

# Keep only the candidates that are actually present in the loaded data
feature_groups = {}
for group_name, candidates in _group_candidates.items():
    feature_groups[group_name] = [name for name in candidates if name in df.columns]

print("\n")
print("FEATURE GROUPS")
for group, cols in feature_groups.items():
    print(f"\n{group}: {len(cols)} features")

# 6. CORRELATION ANALYSIS - NUMERICAL FEATURES

print("\n")
print("NUMERICAL FEATURE CORRELATION ANALYSIS")

# Independent copy restricted to the numerical feature columns
numerical_data = df.loc[:, numerical_features].copy()

# Rank-based (Spearman) matrix first, then the linear (Pearson) one
print("\nCalculating Spearman correlation...")
spearman_corr = calculate_correlation_matrix(numerical_data, 'spearman')

print("Calculating Pearson correlation...")
pearson_corr = calculate_correlation_matrix(numerical_data, 'pearson')

# 7. IDENTIFY CORRELATED FEATURE GROUPS

def find_correlated_groups(corr_matrix, threshold=0.7):
    """Identify groups of correlated features.

    Columns are visited in order. Every column that has not yet been placed
    in a group seeds a new one, made of the seed plus each column whose
    absolute correlation with the seed is strictly greater than
    ``threshold``. Only the choice of seed is gated on earlier membership,
    so a feature may still be listed in more than one group.

    Args:
        corr_matrix: Square correlation DataFrame with the same labels on
            rows and columns. It is not modified.
        threshold: Absolute correlation a pair must exceed to be grouped.

    Returns:
        A list of dicts with keys ``'features'`` (seed first), ``'size'``
        and ``'avg_correlation'`` (mean absolute correlation over the
        distinct pairs of the group).
    """
    # Zero the diagonal on a private, writable copy. Writing through
    # ``DataFrame.values`` is unreliable: under pandas Copy-on-Write the
    # returned array is read-only and the in-place fill raises.
    abs_values = corr_matrix.abs().to_numpy(copy=True)
    np.fill_diagonal(abs_values, 0)
    corr_matrix_abs = pd.DataFrame(
        abs_values, index=corr_matrix.index, columns=corr_matrix.columns
    )

    groups = []
    processed = set()

    for col in corr_matrix.columns:
        if col in processed:
            continue

        # Find all features correlated with this one
        column_corr = corr_matrix_abs[col]
        correlated = column_corr[column_corr > threshold].index.tolist()
        if not correlated:
            continue

        group = [col] + correlated
        # Strict upper triangle = each unordered pair counted exactly once
        pair_rows, pair_cols = np.triu_indices(len(group), k=1)
        group_block = corr_matrix_abs.loc[group, group].to_numpy()
        groups.append({
            'features': group,
            'size': len(group),
            'avg_correlation': group_block[pair_rows, pair_cols].mean(),
        })
        processed.update(group)

    return groups

# Group features whose absolute Spearman correlation exceeds 0.7
correlated_groups_spearman = find_correlated_groups(
    corr_matrix=spearman_corr,
    threshold=0.7,
)
print(f"Found {len(correlated_groups_spearman)} correlated groups")

# 8. FEATURE SELECTION USING FEATURE-ENGINE

print("\n")
print("FEATURE SELECTION - FEATURE-ENGINE")

# Method 1: DropCorrelatedFeatures
print("\nMethod 1: DropCorrelatedFeatures (Spearman, threshold=0.7)")
dcf = DropCorrelatedFeatures(
    variables=None,
    method='spearman',
    threshold=0.7,
    missing_values='ignore'
)

dcf.fit(numerical_data)
features_to_drop_dcf = dcf.features_to_drop_
print(f"Features to drop: {len(features_to_drop_dcf)}")

# Method 2: SmartCorrelatedSelection
# This step used to be guarded by `TARGET in numerical_data.columns`.
# TARGET is removed from numerical_features before numerical_data is built,
# so the guard was always False and the selector never ran. With
# selection_method='variance' the selector is fitted on the features alone,
# so it is now run unconditionally.
print("\nMethod 2: SmartCorrelatedSelection")
scs = SmartCorrelatedSelection(
    variables=None,
    method='spearman',
    threshold=0.7,
    missing_values='ignore',
    selection_method='variance',
    estimator=None
)

scs.fit(numerical_data)
features_to_drop_scs = scs.features_to_drop_
print(f"Features to drop: {len(features_to_drop_scs)}")

# 9. CORRELATION WITH TARGET

if TARGET in df.columns:
    print("\n")
    print(f"CORRELATION WITH TARGET VARIABLE: {TARGET}")

    target_corr_spearman = []
    target_corr_pearson = []

    # The zero-filled target is identical for every feature: build it once.
    # NOTE(review): missing values are replaced with 0 here, whereas
    # DataFrame.corr (used for the feature-feature matrices) excludes them
    # pairwise - confirm this imputation is intended.
    target_values = df[TARGET].fillna(0)

    for col in numerical_features:
        if col == TARGET:
            continue

        feature_values = df[col].fillna(0)
        try:
            s_corr, s_pval = spearmanr(feature_values, target_values)
            p_corr, p_pval = pearsonr(feature_values, target_values)
        except Exception as exc:
            # Stay best-effort, but say which feature was skipped and why
            # instead of silently swallowing the failure
            print(f"  Skipping {col}: {type(exc).__name__}: {exc}")
            continue

        # Append only after both statistics succeeded so that the two
        # result tables always describe the same set of features
        target_corr_spearman.append({
            'feature': col,
            'correlation': s_corr,
            'p_value': s_pval
        })
        target_corr_pearson.append({
            'feature': col,
            'correlation': p_corr,
            'p_value': p_pval
        })

    # Explicit columns keep sort_values working when no feature produced a
    # result (an empty frame would otherwise have no 'correlation' column)
    result_columns = ['feature', 'correlation', 'p_value']
    target_corr_spearman_df = pd.DataFrame(
        target_corr_spearman, columns=result_columns
    ).sort_values('correlation', key=abs, ascending=False)
    target_corr_pearson_df = pd.DataFrame(
        target_corr_pearson, columns=result_columns
    ).sort_values('correlation', key=abs, ascending=False)

    print("\nTop 10 features correlated with target (Spearman):")
    print(target_corr_spearman_df.head(10))

# 10. CREATE EXCEL REPORT
print("\n")
print("GENERATING EXCEL REPORT")

# The writer does not create missing parent directories, so make sure the
# output folder exists before opening the workbook
os.makedirs('./output', exist_ok=True)

# Sets give constant-time membership tests when labelling every feature
dcf_drop_set = set(features_to_drop_dcf)
scs_drop_set = set(features_to_drop_scs)

with pd.ExcelWriter('./output/feature_selection_report.xlsx', engine='openpyxl') as writer:

    # Sheet 1: Summary
    summary_data = {
        'Metric': [
            'Total Features',
            'Numerical Features',
            'Binary Features',
            'Continuous Features',
            'Features to Drop (DropCorrelatedFeatures)',
            'Features to Drop (SmartCorrelatedSelection)',
            'Correlated Groups Found',
            'Target Variable'
        ],
        'Value': [
            df.shape[1],
            len(numerical_features),
            len(binary_features),
            len(continuous_features),
            len(features_to_drop_dcf),
            len(features_to_drop_scs),
            len(correlated_groups_spearman),
            TARGET
        ]
    }
    pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary', index=False)

    # Sheet 2: Features to Drop/Keep (DropCorrelatedFeatures)
    drop_keep_dcf = pd.DataFrame({
        'Feature': numerical_features,
        'Decision': ['DROP' if f in dcf_drop_set else 'KEEP' for f in numerical_features],
        'Method': 'DropCorrelatedFeatures'
    })
    drop_keep_dcf.to_excel(writer, sheet_name='Drop_Keep_DCF', index=False)

    # Sheet 3: Features to Drop/Keep (SmartCorrelatedSelection)
    # Written only when the selector flagged at least one feature
    if features_to_drop_scs:
        drop_keep_scs = pd.DataFrame({
            'Feature': numerical_features,
            'Decision': ['DROP' if f in scs_drop_set else 'KEEP' for f in numerical_features],
            'Method': 'SmartCorrelatedSelection'
        })
        drop_keep_scs.to_excel(writer, sheet_name='Drop_Keep_SCS', index=False)

    # Sheet 4: Correlated Groups (one row per feature per group)
    groups_data = []
    for i, group in enumerate(correlated_groups_spearman, 1):
        for feature in group['features']:
            groups_data.append({
                'Group_ID': i,
                'Feature': feature,
                'Group_Size': group['size'],
                'Avg_Correlation': round(group['avg_correlation'], 4),
                'Method': 'Spearman'
            })

    if groups_data:
        pd.DataFrame(groups_data).to_excel(writer, sheet_name='Correlated_Groups', index=False)

    # Sheets 5 and 6 exist only when the target is present: the `and`
    # short-circuits, so the frames are never read when they were not built

    # Sheet 5: Correlation with Target (Spearman)
    if TARGET in df.columns and len(target_corr_spearman_df) > 0:
        target_corr_spearman_df.to_excel(writer, sheet_name='Target_Corr_Spearman', index=False)

    # Sheet 6: Correlation with Target (Pearson)
    if TARGET in df.columns and len(target_corr_pearson_df) > 0:
        target_corr_pearson_df.to_excel(writer, sheet_name='Target_Corr_Pearson', index=False)

    # Sheet 7: Full Correlation Matrix (Spearman)
    spearman_corr.to_excel(writer, sheet_name='Full_Corr_Spearman')

    # Sheet 8: Full Correlation Matrix (Pearson)
    pearson_corr.to_excel(writer, sheet_name='Full_Corr_Pearson')

    # Sheet 9: Methods Used
    methods_info = pd.DataFrame({
        'Analysis_Type': [
            'Numerical-Numerical Correlation',
            'Numerical-Numerical Correlation',
            'Feature Selection Method 1',
            'Feature Selection Method 2',
            'Target Correlation Analysis',
        ],
        'Method': [
            'Spearman',
            'Pearson',
            'DropCorrelatedFeatures (Spearman)',
            'SmartCorrelatedSelection (Spearman)',
            'Spearman & Pearson',
        ],
        'Threshold': [
            '0.7',
            '0.7',
            '0.7',
            '0.7',
            'N/A',
        ],
        'Description': [
            'Non-parametric correlation for continuous and ordinal variables',
            'Parametric correlation for continuous variables (assumes normality)',
            'Drops one feature from each correlated pair',
            # The selector is configured with selection_method='variance';
            # the previous wording wrongly claimed target correlation was used
            'Keeps the highest-variance feature from each correlated group',
            'Correlation of features with target variable',
        ]
    })
    methods_info.to_excel(writer, sheet_name='Methods_Used', index=False)

print("Excel report saved: feature_selection_report.xlsx")

# 11. VISUALIZATION - HEATMAPS BY GROUP

# Section banner: a blank spacer followed by the section title
for banner_line in ("\n", "GENERATING HEATMAPS"):
    print(banner_line)

def plot_correlation_heatmap(corr_matrix, title, figsize=(12, 10)):
    """Draw a lower-triangle heatmap of ``corr_matrix`` on a new figure.

    Args:
        corr_matrix: Square correlation matrix to display.
        title: Text shown above the heatmap.
        figsize: (width, height) passed to ``plt.figure``.

    Returns:
        The ``matplotlib.pyplot`` module, with the new figure current, so
        the caller can save or close it.
    """
    plt.figure(figsize=figsize)

    # True strictly above the diagonal: those cells are hidden, leaving the
    # diagonal and the lower triangle visible
    hidden_cells = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)

    heatmap_options = {
        'mask': hidden_cells,
        'annot': False,
        'cmap': 'coolwarm',
        'center': 0,
        'vmin': -1,
        'vmax': 1,
        'square': True,
        'linewidths': 0.5,
        'cbar_kws': {"shrink": 0.7},
    }
    sns.heatmap(corr_matrix, **heatmap_options)

    plt.title(title, fontsize=16, fontweight='bold', pad=20)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    return plt

# Plot heatmaps for each feature group

# savefig does not create missing directories
os.makedirs('./heatmaps', exist_ok=True)

numerical_feature_set = set(numerical_features)

for group_name, group_features in feature_groups.items():
    # Get numerical features in this group
    group_num_features = [f for f in group_features if f in numerical_feature_set]

    if len(group_num_features) > 1:
        print(f"\nGenerating heatmap for {group_name}...")

        # Get correlation submatrix
        group_corr = spearman_corr.loc[group_num_features, group_num_features]

        # Scale the figure with the number of features, capped at 20x18 and
        # floored at 6x5 so that small groups (2-3 features) still get a
        # legible figure instead of one under two inches wide
        n_group = len(group_num_features)
        group_figsize = (
            max(6, min(n_group * 0.8, 20)),
            max(5, min(n_group * 0.7, 18)),
        )

        # Plot
        plot_correlation_heatmap(
            group_corr,
            f'Correlation Heatmap: {group_name} (Spearman)',
            figsize=group_figsize
        )
        plt.savefig(f'./heatmaps/heatmap_{group_name}.png', dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved: heatmap_{group_name}.png")

# Overall heatmap for high correlation pairs
print("\nGenerating overall high correlation heatmap...")

# Exclude self-correlations by position rather than by value: filtering on
# `abs < 1.0` also discarded genuinely perfect off-diagonal pairs (|r| == 1)
off_diagonal = ~np.eye(len(spearman_corr), dtype=bool)
high_corr_mask = (spearman_corr.abs() > 0.7) & off_diagonal
high_corr_features = high_corr_mask.any(axis=1)
high_corr_subset = spearman_corr.loc[high_corr_features, high_corr_features]

if len(high_corr_subset) > 0:
    plot_correlation_heatmap(
        high_corr_subset,
        'High Correlation Features (|r| > 0.7, Spearman)',
        figsize=(16, 14)
    )
    plt.savefig('./heatmaps/heatmap_high_correlations.png', dpi=300, bbox_inches='tight')
    plt.close()
    print("   Saved: heatmap_high_correlations.png")

# 12. FINAL RECOMMENDATIONS

print("\n")
print("FEATURE SELECTION RECOMMENDATIONS")
print("\n")

# Combine recommendations
dcf_drop_set = set(features_to_drop_dcf)
scs_drop_set = set(features_to_drop_scs)
drop_union = dcf_drop_set | scs_drop_set

# Emit the combined list in the order of numerical_features so that every
# run writes the same output; converting the set union straight to a list
# gave an order that depends on string hashing. Any name outside
# numerical_features is appended, sorted, so that nothing is lost.
features_to_drop_final = [f for f in numerical_features if f in drop_union]
features_to_drop_final += sorted(drop_union - set(numerical_features))
features_to_keep = [f for f in numerical_features if f not in drop_union]

print(f"\nTotal features analyzed: {len(numerical_features)}")
print(f"Recommended to DROP: {len(features_to_drop_final)}")
print(f"Recommended to KEEP: {len(features_to_keep)}")

print("\n Complete")

# Save final recommendations to CSV
final_recommendations = pd.DataFrame({
    'Feature': numerical_features,
    'Recommendation': ['DROP' if f in drop_union else 'KEEP'
                       for f in numerical_features],
    'Dropped_by_DCF': [f in dcf_drop_set for f in numerical_features],
    'Dropped_by_SCS': [f in scs_drop_set for f in numerical_features]
})

# to_csv does not create missing parent directories
os.makedirs('./output', exist_ok=True)
final_recommendations.to_csv('./output/feature_selection_recommendations.csv', index=False)

# 13. APPLY FEATURE DROPPING

for banner_line in ("\n", "APPLYING FEATURE DROPPING", "\n"):
    print(banner_line)

# Copy the original frame and remove the recommended columns from the copy
df_reduced = df.copy().drop(columns=features_to_drop_final)

print(f"Original dataset shape: {df.shape}")
print(f"Reduced dataset shape: {df_reduced.shape}")
print(f"Features removed: {df.shape[1] - df_reduced.shape[1]}")

# Persist the reduced dataset next to the source data
df_reduced.to_csv('../../data/data_reduced.csv', index=False)
print("\n✓ Reduced dataset saved: data_reduced.csv")

# Keep a record of what was dropped; every entry carries the same reason
dropped_features_df = pd.DataFrame({
    'Dropped_Feature': features_to_drop_final,
    'Reason': ['High correlation with other features' for _ in features_to_drop_final]
})
dropped_features_df.to_csv('./output/dropped_features_list.csv', index=False)
print(" Dropped features list saved: dropped_features_list.csv")

# Column counts before and after, plus the share of columns retained
for banner_line in ("\n", "FEATURE RETENTION SUMMARY", "\n"):
    print(banner_line)
print(f"Original features: {df.shape[1]}")
print(f"Retained features: {df_reduced.shape[1]}")
print(f"Retention rate: {(df_reduced.shape[1] / df.shape[1]) * 100:.1f}%")

Loading data...
Dataset shape: (41846, 88)
Total features: 88


CATEGORIZING FEATURES



Numerical features: 81
  - Continuous: 34
  - Binary: 47

Target variable: spell_length_of_stay_hours


FEATURE GROUPS



Demographics: 4 features

Admission_Info: 5 features

Clinical_Diagnosis: 7 features

Comorbidities: 18 features

Chronic_Conditions: 4 features

Special_Flags: 4 features

Emergency_Department: 10 features


NUMERICAL FEATURE CORRELATION ANALYSIS



Calculating Spearman correlation...
Calculating Pearson correlation...
Found 8 correlated groups


FEATURE SELECTION - FEATURE-ENGINE



Method 1: DropCorrelatedFeatures (Spearman, threshold=0.7)
Features to drop: 17

Method 2: SmartCorrelatedSelection
Target not in numerical features, skipping SmartCorrelatedSelection


CORRELATION WITH TARGET VARIABLE: spell_length_of_stay_hours



Top 10 features correlated with target (Spearman):
                                feature  correlation  p_value
38  spell_primary_diagnosis_descriptio