In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, f_classif, RFE
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.feature_selection import mutual_info_classif
import pandas as pd

# Load Breast Cancer dataset
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data for certain methods
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature selection using Variance Threshold
print("1. Variance Threshold:")
selector = VarianceThreshold(threshold=0.1)
X_var_threshold = selector.fit_transform(X_train_scaled)
print(f"Original Features: {X_train.shape[1]}, Selected Features: {X_var_threshold.shape[1]}\n")

# Feature selection using SelectKBest with chi-squared test
print("2. SelectKBest with chi-squared:")
selector_chi2 = SelectKBest(chi2, k=15)
X_chi2 = selector_chi2.fit_transform(X_train, y_train)
print(f"Original Features: {X_train.shape[1]}, Selected Features: {X_chi2.shape[1]}\n")

# Feature selection using SelectKBest with ANOVA F-statistic
print("3. SelectKBest with ANOVA F-statistic:")
selector_f = SelectKBest(f_classif, k=15)
X_f = selector_f.fit_transform(X_train, y_train)
print(f"Original Features: {X_train.shape[1]}, Selected Features: {X_f.shape[1]}\n")

# Feature selection using Recursive Feature Elimination (RFE)
print("4. Recursive Feature Elimination (RFE):")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
selector_rfe = RFE(estimator=rf_model, n_features_to_select=15)
X_rfe = selector_rfe.fit_transform(X_train, y_train)
print(f"Original Features: {X_train.shape[1]}, Selected Features: {X_rfe.shape[1]}\n")

# Feature selection using LASSO (L1 Regularization)
print("5. LASSO (L1 Regularization):")
lasso_model = Lasso(alpha=0.01)
lasso_model.fit(X_train_scaled, y_train)
selected_features_lasso = [feature for feature, coef in zip(cancer.feature_names, lasso_model.coef_) if coef != 0]
X_lasso = X_train[:, cancer.feature_names.tolist().index(selected_features_lasso[0])]
print(f"Original Features: {X_train.shape[1]}, Selected Features: {len(selected_features_lasso)}\n")

# Feature selection using Principal Component Analysis (PCA)
print("6. Principal Component Analysis (PCA):")
pca = PCA(n_components=15)
X_pca = pca.fit_transform(X_train_scaled)
print(f"Original Features: {X_train.shape[1]}, Selected Features (Principal Components): {X_pca.shape[1]}\n")

# Feature selection using t-Distributed Stochastic Neighbor Embedding (t-SNE)
print("7. t-Distributed Stochastic Neighbor Embedding (t-SNE):")
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_train_scaled)
print(f"Original Features: {X_train.shape[1]}, Selected Features (t-SNE components): {X_tsne.shape[1]}\n")

# Feature selection using Mutual Information
print("8. Mutual Information:")
mutual_info_scores = mutual_info_classif(X_train, y_train)
selected_features_mutual_info = [feature for feature, score in zip(cancer.feature_names, mutual_info_scores) if score > 0.05]
X_mutual_info = X_train[:, cancer.feature_names.tolist().index(selected_features_mutual_info[0])]
print(f"Original Features: {X_train.shape[1]}, Selected Features: {len(selected_features_mutual_info)}\n")


1. Variance Threshold:
Original Features: 30, Selected Features: 30

2. SelectKBest with chi-squared:
Original Features: 30, Selected Features: 15

3. SelectKBest with ANOVA F-statistic:
Original Features: 30, Selected Features: 15

4. Recursive Feature Elimination (RFE):
Original Features: 30, Selected Features: 15

5. LASSO (L1 Regularization):
Original Features: 30, Selected Features: 13

6. Principal Component Analysis (PCA):
Original Features: 30, Selected Features (Principal Components): 15

7. t-Distributed Stochastic Neighbor Embedding (t-SNE):
Original Features: 30, Selected Features (t-SNE components): 2

8. Mutual Information:
Original Features: 30, Selected Features: 25

