In [None]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
# 1 Load Dataset
# Read the CSV, then separate the label column from the remaining columns.

df = pd.read_csv("Dataset/dataset-uci.csv")

# y is the "Gallstone Status" column; X is every other column of df.
y = df["Gallstone Status"]
X = df.drop("Gallstone Status", axis=1)

In [None]:
# 2 Define 10-Fold CV
# Stratified, shuffled 10-way splitter shared by every evaluation below;
# the fixed random_state keeps the fold assignment the same on each run.
cv = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [None]:
# 3 Evaluate model on a feature subset

def eval_subset(features):
    """Cross-validate a linear SVM and a random forest on a subset of columns.

    Reads the notebook-level ``X`` (feature frame), ``y`` (labels) and ``cv``
    (cross-validation splitter).

    Parameters
    ----------
    features : list
        Column labels of ``X`` to keep for this evaluation.

    Returns
    -------
    dict
        Fold-averaged scores under the keys ``"SVM Acc"``, ``"SVM AUC"``,
        ``"RF Acc"`` and ``"RF AUC"`` (inserted in that order).
    """
    X_sub = X[features]

    models = {
        # The SVM is scaled inside the pipeline so the scaler is re-fitted on
        # each training fold. probability=True is intentionally not set: the
        # "roc_auc" scorer ranks with decision_function and "accuracy" uses
        # predict, so the extra probability-calibration fit would be unused.
        "SVM": Pipeline([
            ("scaler", StandardScaler()),
            ("model", SVC(kernel="linear")),
        ]),
        "RF": RandomForestClassifier(n_estimators=300, random_state=42),
    }

    results = {}
    for label, estimator in models.items():
        # One cross_validate call scores both metrics from the same fitted
        # fold models, instead of re-fitting every fold once per metric.
        scores = cross_validate(
            estimator, X_sub, y, cv=cv, scoring=("accuracy", "roc_auc")
        )
        results[f"{label} Acc"] = scores["test_accuracy"].mean()
        results[f"{label} AUC"] = scores["test_roc_auc"].mean()

    return results

In [None]:
# 4 Feature Importance based on ANOVA F-score
# k="all" retains every column, so the selector is used here only as a
# convenient way to obtain one F-score per column of X.

select_f = SelectKBest(score_func=f_classif, k="all").fit(X, y)

# Label each score with its column name, then order from highest to lowest.
f_scores = pd.Series(select_f.scores_, index=X.columns)
f_ranked = f_scores.sort_values(ascending=False)

In [None]:
# 5 Feature Importance using Mutual Information

def _mi_seeded(features, target):
    """Return mutual_info_classif scores computed with a pinned seed.

    mutual_info_classif accepts a random_state because its estimate is
    randomized; leaving it unset can yield different scores, and therefore a
    different feature ranking, every time the notebook is run.
    """
    return mutual_info_classif(features, target, random_state=42)


# k="all" retains every column; the selector is only used to collect scores.
select_mi = SelectKBest(score_func=_mi_seeded, k="all")
select_mi.fit(X, y)

# Label each score with its column name, then order from highest to lowest.
mi_scores = pd.Series(select_mi.scores_, index=X.columns)
mi_ranked = mi_scores.sort_values(ascending=False)

In [None]:
# 6 Feature Importance using Random Forest
# Fit a seeded 300-tree forest on all of X, y and read its per-column
# feature_importances_.

rf_full = RandomForestClassifier(n_estimators=300, random_state=42).fit(X, y)

# Label each importance with its column name, then order from highest to lowest.
rf_imp = pd.Series(rf_full.feature_importances_, index=X.columns)
rf_ranked = rf_imp.sort_values(ascending=False)

In [None]:
# 7 Combine rankings (F-score + MI + RF)
# Convert each score series to ranks (1 = highest score), average the three
# ranks per feature, and order features from best to worst mean rank.

rank_df = (
    pd.DataFrame({
        "F": f_ranked.rank(ascending=False),
        "MI": mi_ranked.rank(ascending=False),
        "RF": rf_ranked.rank(ascending=False),
    })
    # The mean is taken over the three rank columns only, because MeanRank
    # does not exist yet when the callable is evaluated.
    .assign(MeanRank=lambda ranks: ranks.mean(axis=1))
    .sort_values("MeanRank")
)

# Feature names, best combined rank first.
top_order = rank_df.index.tolist()

In [None]:
# 8 Evaluate different feature subset sizes
#
# NOTE(review): top_order comes from scores fitted on the full X, y in the
# ranking cells above, so the rows that serve as CV test folds here also
# influenced which features were picked. The scores below are therefore likely
# optimistic; an unbiased estimate would need the ranking recomputed inside
# each training fold.

n_features_total = len(X.columns)

# Clamp every requested size to the number of available columns and drop
# duplicates, so a narrow dataset neither re-evaluates the same subset nor
# reports a size larger than the subset that was actually used.
subset_sizes = sorted(
    {min(size, n_features_total) for size in [5, 8, 12, 15, 20, n_features_total]}
)

results = []

for k in subset_sizes:
    feats = top_order[:k]
    res = eval_subset(feats)
    # Record the true subset length rather than the requested k.
    res["#Features"] = len(feats)
    results.append(res)

results_df = pd.DataFrame(results)
results_df = results_df[["#Features","SVM Acc","SVM AUC","RF Acc","RF AUC"]]
results_df