In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, f_classif, RFE
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.feature_selection import mutual_info_classif
import pandas as pd

# Load Breast Cancer dataset
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data for certain methods
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature selection using Variance Threshold
print("1. Variance Threshold:")
selector = VarianceThreshold(threshold=0.1)
X_var_threshold = selector.fit_transform(X_train_scaled)
selected_features_var_threshold = cancer.feature_names[selector.get_support()]
print("Selected Feature Names:", selected_features_var_threshold)
print()

# Feature selection using SelectKBest with chi-squared test
print("2. SelectKBest with chi-squared:")
selector_chi2 = SelectKBest(chi2, k=15)
X_chi2 = selector_chi2.fit_transform(X_train, y_train)
selected_features_chi2 = cancer.feature_names[selector_chi2.get_support()]
print("Selected Feature Names:", selected_features_chi2)
print()

# Feature selection using SelectKBest with ANOVA F-statistic
print("3. SelectKBest with ANOVA F-statistic:")
selector_f = SelectKBest(f_classif, k=15)
X_f = selector_f.fit_transform(X_train, y_train)
selected_features_f = cancer.feature_names[selector_f.get_support()]
print("Selected Feature Names:", selected_features_f)
print()

# Feature selection using Recursive Feature Elimination (RFE)
print("4. Recursive Feature Elimination (RFE):")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
selector_rfe = RFE(estimator=rf_model, n_features_to_select=15)
X_rfe = selector_rfe.fit_transform(X_train, y_train)
selected_features_rfe = cancer.feature_names[selector_rfe.get_support()]
print("Selected Feature Names:", selected_features_rfe)
print()

# Feature selection using LASSO (L1 Regularization)
print("5. LASSO (L1 Regularization):")
lasso_model = Lasso(alpha=0.01)
lasso_model.fit(X_train_scaled, y_train)
selected_features_lasso = cancer.feature_names[lasso_model.coef_ != 0]
print("Selected Feature Names:", selected_features_lasso)
print()

# Feature selection using Principal Component Analysis (PCA)
print("6. Principal Component Analysis (PCA):")
pca = PCA(n_components=15)
X_pca = pca.fit_transform(X_train_scaled)
selected_features_pca = [f"PC{i+1}" for i in range(pca.n_components_)]
print("Selected Feature Names:", selected_features_pca)
print()

# Feature selection using t-Distributed Stochastic Neighbor Embedding (t-SNE)
print("7. t-Distributed Stochastic Neighbor Embedding (t-SNE):")
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_train_scaled)
selected_features_tsne = ["t-SNE Component 1", "t-SNE Component 2"]
print("Selected Feature Names:", selected_features_tsne)
print()

# Feature selection using Mutual Information
print("8. Mutual Information:")
mutual_info_scores = mutual_info_classif(X_train, y_train)
selected_features_mutual_info = cancer.feature_names[mutual_info_scores > 0.05]
print("Selected Feature Names:", selected_features_mutual_info)
print()


1. Variance Threshold:
Selected Feature Names: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']

2. SelectKBest with chi-squared:
Selected Feature Names: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean concavity' 'radius error' 'perimeter error' 'area error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst compactness' 'worst concavity' 'worst concave points']

3. SelectKBest with ANOVA F-statistic:
Selected Feature Names: ['mean radius' 'mean perime