# 📘 Chi-Squared Feature Selection for Categorical Target: 'HasDetections'

"""
This script applies the Chi-squared (χ²) statistical test to evaluate the relevance 
of each feature with respect to the categorical target variable 'HasDetections'.

Chi-squared is suitable when both the features and the target are categorical 
(or when numerical features are discretized into bins).
"""

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2



# In [None]:
# 🎯 Split the DataFrame into the label column and the predictor columns
# `df` must already exist in the session; it is not created in this cell.
target_col = 'HasDetections'
y = df[target_col]                  # label Series
X = df.drop(columns=[target_col])   # every remaining column is a candidate feature

# 🏷️ Encode every feature as non-negative integer codes
# sklearn's chi2 rejects negative values, NaN and non-numeric input, so every
# column has to leave this loop as a non-negative integer column:
#   * non-numeric columns (object, category, datetime, ...) and low-cardinality
#     numeric columns are label encoded;
#   * high-cardinality numeric columns are discretized into quantile bins.
MAX_CATEGORIES = 100  # numeric columns with fewer distinct values are treated as categorical
N_BINS = 10           # number of quantile bins for high-cardinality numeric columns

X_encoded = X.copy()
for col in X_encoded.columns:
    column = X_encoded[col]
    if not pd.api.types.is_numeric_dtype(column) or column.nunique() < MAX_CATEGORIES:
        # astype(str) gives missing values their own category and keeps
        # LabelEncoder from failing on columns that mix types.
        X_encoded[col] = LabelEncoder().fit_transform(column.astype(str))
    else:
        # labels=False returns bin indices; duplicates='drop' merges repeated
        # quantile edges on heavily skewed columns instead of raising.
        bin_index = pd.qcut(column, q=N_BINS, labels=False, duplicates='drop')
        # Missing values stay NaN after qcut: map them to code 0 and shift the
        # real bins to 1..N so all codes are non-negative integers.
        X_encoded[col] = bin_index.fillna(-1).astype('int64') + 1

# ⚙️ Score every feature against the target with the Chi-squared test
# k='all' keeps every column: the selector is used here only to compute scores,
# not to drop features.
# NOTE(review): sklearn's chi2 is documented for non-negative counts/booleans;
# scores computed on integer label codes depend on how the codes were assigned.
# Consider one-hot encoding before scoring -- confirm this is acceptable here.
chi2_selector = SelectKBest(score_func=chi2, k='all').fit(X_encoded, y)

# 📊 Rank the scores, labelled by feature name, highest first
chi2_scores = (
    pd.Series(chi2_selector.scores_, index=X_encoded.columns)
    .sort_values(ascending=False)
)

# 🏆 Report the ten highest-scoring features
print("🔝 Top 10 Features by Chi-squared Score:")
print(chi2_scores.iloc[:10])

# Optional: horizontal bar chart of the 15 highest-scoring features
import matplotlib.pyplot as plt

top_scores = chi2_scores.head(15)
ax = top_scores.plot.barh(figsize=(10, 6), title='Top 15 Features by Chi-Squared Score')
ax.invert_yaxis()  # barh draws the first row at the bottom; flip so the best feature is on top
ax.set_xlabel("Chi-squared Score")
ax.figure.tight_layout()
plt.show()
