In [1]:
# ------------------------------
# FILTER METHOD (SelectKBest)
# ------------------------------

# Step 1: Import required libraries
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest, chi2

In [6]:
# Step 2: Load a sample dataset
# The returned object is read by later cells through its `.data`,
# `.feature_names` and `.target` attributes.
data = load_breast_cancer()

In [7]:
# Wrap the raw arrays in pandas objects so they are easier to inspect
X = pd.DataFrame(data=data.data, columns=data.feature_names)   # independent variables, one column per feature name
y = pd.Series(data=data.target)                                # target labels as a Series

In [14]:

# ------------------------------
# VISUALIZE THE DATASET
# ------------------------------
# Leading ten feature rows (positional slice, same rows as head(10)).
print("===== First 10 Rows of the Dataset =====")
print(X.iloc[:10])

# Every column name, rendered as a plain Python list.
print("\n===== Feature Names =====")
print(X.columns.tolist())

# Leading ten entries of the target series.
print("\n===== Target Values (0 = Malignant, 1 = Benign) =====")
print(y.iloc[:10])

===== First 10 Rows of the Dataset =====
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   
5        12.45         15.70           82.57      477.1          0.12780   
6        18.25         19.98          119.60     1040.0          0.09463   
7        13.71         20.83           90.20      577.9          0.11890   
8        13.00         21.82           87.50      519.8          0.12730   
9        12.46         24.04           83.97      475.9          0.11860   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760         0.30010            

In [8]:

# Step 3: Build the filter-method selector (SelectKBest)
# Scoring function: chi2, the Chi-Square statistical test
# k=5: keep only the five highest-scoring features
selector = SelectKBest(chi2, k=5)

In [9]:
# Step 4: Fit the selector on (X, y), then reduce X to the selected columns
# NOTE(review): this fits chi2 on the unscaled X, whereas the later
# "Chi-Square Test" section min-max scales X before calling chi2 —
# confirm which of the two is intended.
X_new = selector.fit(X, y).transform(X)


In [10]:
# Step 5: Map the selector's support mask back onto the column labels
_support_mask = selector.get_support()
selected_features = X.columns[_support_mask]

In [11]:
# Step 6: Report the selected feature names (heading and values on separate lines)
print(
    "Top 5 Selected Features using Filter Method (Chi-Square):",
    selected_features,
    sep="\n",
)

Top 5 Selected Features using Filter Method (Chi-Square):
Index(['mean perimeter', 'mean area', 'area error', 'worst perimeter',
       'worst area'],
      dtype='object')


In [None]:
# 1. Missing Value Filter

In [15]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
import numpy as np

# Load dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)

# Introduce missing values for demo
# The seed only fixes NumPy's global RNG state; nothing in this cell draws
# random numbers, the NaN positions below are chosen deterministically.
np.random.seed(0)
X.iloc[0:30, 2] = np.nan        # example missing: rows 0-29 of the column at position 2
X.iloc[10:50, 5] = np.nan       # rows 10-49 of the column at position 5
# NOTE(review): with these row ranges neither column appears to cross the
# threshold below (every feature is kept in the recorded output) — enlarge
# the ranges if the demo is meant to actually drop a column.

# Threshold: remove columns having > 20% missing
threshold = 0.2

# Share of missing entries per column, as a fraction in [0, 1]
# (isnull() gives a boolean frame; its column-wise mean is that fraction).
missing_percent = X.isnull().mean()

# Keep every column whose missing share is at or below the threshold, so that
# only columns with strictly more than `threshold` missing are removed —
# matching the rule stated above (a column at exactly 20% is kept).
selected_features_missing = X.columns[missing_percent <= threshold]

print("Selected Features After Missing Value Filter:")
print(selected_features_missing)


Selected Features After Missing Value Filter:
Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')


In [None]:
# 2. Information Gain (Mutual Information)

In [2]:
from sklearn.feature_selection import mutual_info_classif

# Fresh copy of the features (as a DataFrame) and the class labels.
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Calculate Information Gain Score
# The mutual-information estimate depends on the estimator's random state, so
# it is pinned here; without it the scores (and the order of near-ties) can
# change from one run to the next.
info_gain = mutual_info_classif(X, y, random_state=0)

# One row per feature, highest information gain first.
info_gain_df = pd.DataFrame({
    "Feature": X.columns,
    "Information_Gain": info_gain
}).sort_values(by="Information_Gain", ascending=False)

print("Information Gain Scores:")
print(info_gain_df)


Information Gain Scores:
                    Feature  Information_Gain
22          worst perimeter          0.476303
23               worst area          0.465316
20             worst radius          0.454042
7       mean concave points          0.440097
27     worst concave points          0.437878
2            mean perimeter          0.403753
6            mean concavity          0.373430
0               mean radius          0.366186
3                 mean area          0.361112
13               area error          0.341904
26          worst concavity          0.316249
12          perimeter error          0.276060
10             radius error          0.247468
25        worst compactness          0.225611
5          mean compactness          0.209308
17     concave points error          0.125466
21            worst texture          0.125076
16          concavity error          0.117446
24         worst smoothness          0.098092
28           worst symmetry          0.097457
1        

In [None]:
# 3. Chi-Square Test

In [3]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler

# Fresh copy of the features (as a DataFrame) and the class labels.
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Chi-square requires non-negative values, so min-max scale the features first
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# chi2 returns a pair: the test statistics and their p-values, one per feature.
chi_scores = chi2(X_scaled, y)
chi2_stats, p_values = chi_scores

# Assemble the per-feature table, then order it by descending statistic.
chi_df = pd.DataFrame(
    {"Feature": X.columns, "Chi2_Score": chi2_stats, "P_value": p_values}
)
chi_df = chi_df.sort_values(by="Chi2_Score", ascending=False)

print("Chi-Square Scores:", chi_df, sep="\n")


Chi-Square Scores:
                    Feature  Chi2_Score       P_value
7       mean concave points   52.405743  4.514020e-13
27     worst concave points   46.341648  9.933002e-12
6            mean concavity   46.186395  1.075210e-11
23               worst area   35.043882  3.223582e-09
22          worst perimeter   34.438091  4.400270e-09
20             worst radius   34.124937  5.168467e-09
26          worst concavity   31.563031  1.930704e-08
3                 mean area   29.328594  6.108761e-08
2            mean perimeter   26.528902  2.596246e-07
0               mean radius   24.897293  6.046728e-07
25        worst compactness   20.992541  4.610749e-06
5          mean compactness   20.353176  6.438614e-06
13               area error   19.676975  9.169962e-06
10             radius error   17.324128  3.151598e-05
12          perimeter error   16.044344  6.187617e-05
21            worst texture    8.741628  3.110268e-03
1              mean texture    6.394071  1.145021e-02
17     co

In [None]:
# 4. Fisher's Score


In [4]:
import numpy as np

# Fresh copy of the features (as a DataFrame) and the class labels.
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Split the rows by class label once, up front. The split does not depend on
# the column being scored, so doing it inside the loop would re-filter the
# whole frame for every feature.
class0_rows = X[y == 0]
class1_rows = X[y == 1]

fisher_scores = []

for col in X.columns:
    class0 = class0_rows[col]
    class1 = class1_rows[col]

    # Two-class score used here:
    #   (mean_0 - mean_1)^2 / (var_0 + var_1)
    # i.e. squared gap between the class means relative to the summed
    # within-class variances; larger means the feature separates the
    # classes better.
    mean_diff = (class0.mean() - class1.mean()) ** 2
    var_sum = class0.var() + class1.var()

    fisher_score = mean_diff / var_sum
    fisher_scores.append(fisher_score)

# One row per feature, highest score first.
fisher_df = pd.DataFrame({
    "Feature": X.columns,
    "Fisher_Score": fisher_scores
}).sort_values(by="Fisher_Score", ascending=False)

print("Fisher Scores:")
print(fisher_df)


Fisher Scores:
                    Feature  Fisher_Score
27     worst concave points      3.391648
22          worst perimeter      2.812874
7       mean concave points      2.703003
20             worst radius      2.699922
2            mean perimeter      2.253563
0               mean radius      2.103591
23               worst area      1.939571
6            mean concavity      1.751138
3                 mean area      1.732786
26          worst concavity      1.535963
5          mean compactness      1.045566
25        worst compactness      0.983969
10             radius error      0.801870
12          perimeter error      0.750703
13               area error      0.691208
21            worst texture      0.563899
1              mean texture      0.450222
24         worst smoothness      0.449966
17     concave points error      0.429469
28           worst symmetry      0.386940
4           mean smoothness      0.319615
8             mean symmetry      0.254161
29  worst fractal d