In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw table from disk.
df = pd.read_csv('hypothetical_dataset.csv')

# The 'target' column is the label; every other column is a feature.
y = df['target']
X = df.drop(columns='target')

# Hold out 20% of the rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 1. Correlation Analysis
def correlation_analysis(X, threshold=0.95):
    """Drop one column from every pair of highly correlated features.

    Only the strict upper triangle of the absolute correlation matrix is
    inspected, so for each offending pair the column that appears later in
    ``X`` is the one removed.

    Args:
        X: DataFrame of features.
        threshold: Absolute correlation above which a column is dropped.
            Defaults to 0.95.

    Returns:
        A new DataFrame equal to ``X`` without the dropped columns; ``X``
        itself is not modified.
    """
    corr_matrix = X.corr().abs()
    # k=1 excludes the diagonal, so a column is never compared with itself.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)
    # NaN entries (masked-out cells) compare False and are ignored.
    exceeds = (upper > threshold).any(axis=0)
    to_drop = [column for column, flagged in zip(upper.columns, exceeds) if flagged]
    print(f"Features to drop based on high correlation: {to_drop}")
    return X.drop(columns=to_drop)

# Remove highly correlated columns from the training features.
X_uncorrelated = correlation_analysis(X=X_train)

# Heatmap of the correlations that remain after the filter.
plt.figure(figsize=(12, 10))
sns.heatmap(
    X_uncorrelated.corr(),
    cmap='coolwarm',
    annot=False,
)
plt.title('Correlation Matrix of Features')
plt.show()

# 2. Variance Threshold
from sklearn.feature_selection import VarianceThreshold

def variance_threshold_selection(X, threshold=0.1):
    """Return the column labels of ``X`` kept by a ``VarianceThreshold`` filter.

    Args:
        X: DataFrame of features.
        threshold: Value passed to ``VarianceThreshold``.

    Returns:
        Index of the column labels flagged by the fitted selector's support mask.
    """
    variance_filter = VarianceThreshold(threshold)
    variance_filter.fit(X)
    kept_mask = variance_filter.get_support()
    return X.columns[kept_mask]

# NOTE: despite the name, this holds the columns that the variance filter KEPT.
low_variance_features = variance_threshold_selection(X=X_train)
print(f"Features selected based on variance threshold: {low_variance_features}")

# 3. Univariate Feature Selection
def univariate_feature_selection(X, y, k=10):
    """Return the ``k`` column labels picked by ``SelectKBest`` with ``f_classif``.

    Args:
        X: DataFrame of features.
        y: Target labels.
        k: Number of features requested from ``SelectKBest``.

    Returns:
        Index of the selected column labels.
    """
    k_best = SelectKBest(score_func=f_classif, k=k)
    k_best.fit(X, y)
    chosen_mask = k_best.get_support()
    return X.columns[chosen_mask]

# Score the training features with f_classif and report the selected columns.
univariate_features = univariate_feature_selection(X=X_train, y=y_train)
print(f"Top features based on univariate selection: {univariate_features}")

# 4. Mutual Information
def mutual_info_feature_selection(X, y, k=10):
    """Return the ``k`` column labels picked by ``SelectKBest`` with ``mutual_info_classif``.

    Args:
        X: DataFrame of features.
        y: Target labels.
        k: Number of features requested from ``SelectKBest``.

    Returns:
        Index of the selected column labels.
    """
    k_best = SelectKBest(score_func=mutual_info_classif, k=k)
    k_best.fit(X, y)
    chosen_mask = k_best.get_support()
    return X.columns[chosen_mask]

# Select training features by mutual information with the target.
mi_features = mutual_info_feature_selection(X=X_train, y=y_train)
print(f"Top features based on mutual information: {mi_features}")

# 5. Recursive Feature Elimination (RFE)
def rfe_feature_selection(X, y, n_features_to_select=10):
    """Select columns with recursive feature elimination around a logistic regression.

    Args:
        X: DataFrame of features.
        y: Target labels.
        n_features_to_select: Number of features ``RFE`` is asked to keep.

    Returns:
        Index of the column labels marked in the fitted ``RFE.support_`` mask.
    """
    eliminator = RFE(LogisticRegression(), n_features_to_select=n_features_to_select)
    eliminator = eliminator.fit(X, y)
    kept_mask = eliminator.support_
    return X.columns[kept_mask]

# Run recursive feature elimination on the training split.
rfe_features = rfe_feature_selection(X=X_train, y=y_train)
print(f"Features selected by RFE: {rfe_features}")

# 6. Lasso (L1 Regularization)
def lasso_feature_selection(X, y, alpha=1.0):
    """Return the columns whose Lasso coefficient is non-zero.

    The features are standardized before fitting so the L1 penalty is applied
    on a common scale.

    NOTE(review): ``Lasso`` is a regressor fitted directly on ``y``; with
    standardized inputs a large ``alpha`` may shrink every coefficient to
    zero and return an empty selection -- confirm the default is intended.

    Args:
        X: DataFrame of features.
        y: Target values.
        alpha: Regularization strength passed to ``Lasso``.

    Returns:
        Index of the column labels with a non-zero coefficient.
    """
    standardized = StandardScaler().fit_transform(X)
    model = Lasso(alpha=alpha)
    model.fit(standardized, y)
    nonzero_mask = model.coef_ != 0
    return X.columns[nonzero_mask]

# Keep the training columns that survive the L1 penalty.
lasso_features = lasso_feature_selection(X=X_train, y=y_train)
print(f"Features selected by Lasso: {lasso_features}")

# 7. Random Forest Feature Importance
def rf_feature_importance(X, y):
    """Rank the columns of ``X`` by random-forest feature importance.

    Args:
        X: DataFrame of features.
        y: Target labels.

    Returns:
        DataFrame with columns ``'feature'`` and ``'importance'``, sorted by
        importance in descending order.
    """
    forest = RandomForestClassifier()
    forest.fit(X, y)
    ranking = pd.DataFrame(
        {
            'feature': X.columns,
            'importance': forest.feature_importances_,
        }
    )
    return ranking.sort_values(by='importance', ascending=False)

# Fit the forest on the training split and show the ten strongest features.
rf_importance = rf_feature_importance(X=X_train, y=y_train)
print("Top 10 features based on Random Forest importance:")
print(rf_importance.head(10))

# Horizontal bar chart of the twenty highest-ranked features.
plt.figure(figsize=(10, 6))
sns.barplot(
    data=rf_importance.head(20),
    x='importance',
    y='feature',
)
plt.title('Top 20 Features by Random Forest Importance')
plt.show()

# 8. Forward Feature Selection
from mlxtend.feature_selection import SequentialFeatureSelector

def forward_feature_selection(X, y, k_features=10):
    """Pick ``k_features`` columns by sequential forward selection.

    A logistic regression is scored by 5-fold cross-validated accuracy.

    Args:
        X: DataFrame of features.
        y: Target labels.
        k_features: Number of features the selector is asked to return.

    Returns:
        List of the selected feature names.
    """
    base_model = LogisticRegression()
    selector = SequentialFeatureSelector(
        base_model,
        k_features=k_features,
        forward=True,
        floating=False,
        scoring='accuracy',
        cv=5,
    )
    selector.fit(X, y)
    chosen_names = selector.k_feature_names_
    return list(chosen_names)

# Sequential forward selection on the training split.
forward_features = forward_feature_selection(X=X_train, y=y_train)
print(f"Features selected by Forward Selection: {forward_features}")

# 9. Backward Feature Elimination
def backward_feature_elimination(X, y, k_features=10):
    """Pick ``k_features`` columns by sequential backward elimination.

    A logistic regression is scored by 5-fold cross-validated accuracy.

    Args:
        X: DataFrame of features.
        y: Target labels.
        k_features: Number of features the selector is asked to keep.

    Returns:
        List of the retained feature names.
    """
    base_model = LogisticRegression()
    selector = SequentialFeatureSelector(
        base_model,
        k_features=k_features,
        forward=False,
        floating=False,
        scoring='accuracy',
        cv=5,
    )
    selector.fit(X, y)
    retained_names = selector.k_feature_names_
    return list(retained_names)

# Sequential backward elimination on the training split.
backward_features = backward_feature_elimination(X=X_train, y=y_train)
print(f"Features selected by Backward Elimination: {backward_features}")

# 10. Feature Selection using PCA
from sklearn.decomposition import PCA

def pca_feature_selection(X, n_components=0.95):
    """Return how many principal components a fitted PCA retains.

    The features are standardized before the PCA is fitted.

    Args:
        X: DataFrame of features.
        n_components: Value forwarded to ``PCA(n_components=...)``.

    Returns:
        The fitted model's ``n_components_`` attribute.
    """
    standardized = StandardScaler().fit_transform(X)
    decomposition = PCA(n_components=n_components)
    decomposition.fit(standardized)
    return decomposition.n_components_

# Count the principal components retained for the training features.
n_pca_components = pca_feature_selection(X=X_train)
print(f"Number of PCA components explaining 95% variance: {n_pca_components}")

# 11. Stability Selection
from stability_selection import StabilitySelection

def stability_selection(X, y, n_features=10):
    """Select up to ``n_features`` columns by stability selection.

    ``StabilitySelection`` does not accept an ``n_features`` constructor
    argument (passing one raises ``TypeError``), so the cap is applied here:
    features are ranked by their best stability score over the lambda grid
    and the top ``n_features`` are kept.

    Args:
        X: DataFrame of features.
        y: Target labels.
        n_features: Maximum number of features to return.

    Returns:
        Index of the selected column labels, in their original column order.
        May hold fewer than ``n_features`` entries (or none) when few
        features were ever selected during bootstrapping.
    """
    selector = StabilitySelection(
        base_estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        lambda_name='C',
        lambda_grid=np.logspace(-5, -1, 25),
    )
    selector.fit(X, y)

    # stability_scores_ holds one row per feature and one column per lambda;
    # a feature's rank is its best score anywhere on the grid.
    best_scores = np.asarray(selector.stability_scores_).max(axis=1)
    n_keep = max(0, min(n_features, len(best_scores)))
    # Stable sort keeps the original column order among tied scores.
    ranked = np.argsort(-best_scores, kind='stable')[:n_keep]

    keep_mask = np.zeros(len(best_scores), dtype=bool)
    keep_mask[ranked] = True
    # A feature that was never selected in any bootstrap is not "stable".
    keep_mask &= best_scores > 0
    return X.columns[keep_mask]

# Stability selection on the training split.
stability_features = stability_selection(X=X_train, y=y_train)
print(f"Features selected by Stability Selection: {stability_features}")

# Evaluate different feature sets
def evaluate_feature_set(X_train, X_test, y_train, y_test, features):
    """Score a logistic regression restricted to the given columns.

    Args:
        X_train: Training features (DataFrame).
        X_test: Held-out features (DataFrame).
        y_train: Training labels.
        y_test: Held-out labels.
        features: Column labels used to subset both feature frames.

    Returns:
        Accuracy of the fitted model on the held-out data.
    """
    classifier = LogisticRegression()
    classifier.fit(X_train[features], y_train)
    predictions = classifier.predict(X_test[features])
    accuracy = accuracy_score(y_test, predictions)
    return accuracy

# Candidate column subsets, keyed by the method that produced them.
feature_sets = {
    'All Features': X_train.columns,
    'Uncorrelated': X_uncorrelated.columns,
    'Univariate': univariate_features,
    'Mutual Information': mi_features,
    'RFE': rfe_features,
    'Lasso': lasso_features,
    'Random Forest': rf_importance['feature'][:10],
    'Forward Selection': forward_features,
    'Backward Elimination': backward_features,
    'Stability Selection': stability_features
}

results = {}
for name, features in feature_sets.items():
    # Normalise Index / Series / list inputs to a plain list of column labels.
    features = list(features)
    if not features:
        # A selector can return nothing (e.g. a strongly regularised Lasso);
        # fitting on zero columns would raise and abort the whole comparison.
        print(f"{name} - skipped: no features selected")
        continue
    score = evaluate_feature_set(X_train, X_test, y_train, y_test, features)
    results[name] = score
    print(f"{name} - Accuracy: {score:.4f}")

# Visualize results (only the methods that were actually evaluated)
plt.figure(figsize=(12, 6))
# Pass concrete lists: dict views are not reliably accepted as bar positions/heights.
plt.bar(list(results.keys()), list(results.values()))
plt.title('Model Performance with Different Feature Selection Methods')
plt.xlabel('Feature Selection Method')
plt.ylabel('Accuracy')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()