In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [None]:
# Load dataset

df = pd.read_csv("Dataset/Dataset.csv").drop("FILENAME", axis=1)

# Numeric only
numeric = df.select_dtypes(include=[np.number]).columns.tolist()
numeric = [c for c in numeric if c != "label"]

X = df[numeric]
y = df["label"]

In [None]:
# 2 Train-test split

X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

In [8]:
# 3 Evaluation Pipeline. This Step also does a stratified split on the 70% of the training data for the few shot sampling.

train_sizes = [0.01, 0.05, 0.10, 0.20, 0.30, 0.40, 0.50]

results = []

def evaluate_and_print(model_name, y_true, y_pred):
    print(f"{model_name} → Acc: {accuracy_score(y_true, y_pred):.6f}")
    print(f"{model_name} → Precision: {precision_score(y_true, y_pred):.6f}")
    print(f"{model_name} → Recall: {recall_score(y_true, y_pred):.6f}")
    print(f"{model_name} → F1: {f1_score(y_true, y_pred):.6f}")
    print(f"{model_name} → ROC-AUC: {roc_auc_score(y_true, y_pred):.6f}")
    print("=========================================")

for size in train_sizes:

    sss = StratifiedShuffleSplit(n_splits=1, train_size=size, random_state=42)
    for idx_train_small, _ in sss.split(X_train_full, y_train_full):
        X_train_small = X_train_full.iloc[idx_train_small]
        y_train_small = y_train_full.iloc[idx_train_small]

    print(f"\n=== Training on {int(size*100)}% of data ({len(X_train_small)} samples) ===")

    # Logistic Regression (scaled)
    lr = LogisticRegression(max_iter=10000)
    lr.fit(X_train_small, y_train_small)
    preds = lr.predict(X_test)
    evaluate_and_print("Logistic Regression", y_test, preds)

    # Extra Trees
    et = ExtraTreesClassifier(n_estimators=200)
    et.fit(X_train_small, y_train_small)
    preds = et.predict(X_test)
    evaluate_and_print("Extra Trees", y_test, preds)

    # Decision Tree
    dt = DecisionTreeClassifier()
    dt.fit(X_train_small, y_train_small)
    preds = dt.predict(X_test)
    evaluate_and_print("Decision Tree", y_test, preds)

    # Random Forest
    rf = RandomForestClassifier(n_estimators=200)
    rf.fit(X_train_small, y_train_small)
    preds = rf.predict(X_test)
    evaluate_and_print("Random Forest", y_test, preds)

    # store RF results for learning curve
    results.append((size, accuracy_score(y_test, preds)))


=== Training on 1% of data (1650 samples) ===
Logistic Regression → Acc: 0.996522
Logistic Regression → Precision: 0.998068
Logistic Regression → Recall: 0.995847
Logistic Regression → F1: 0.996956
Logistic Regression → ROC-AUC: 0.996636
Extra Trees → Acc: 0.999731
Extra Trees → Precision: 0.999555
Extra Trees → Recall: 0.999975
Extra Trees → F1: 0.999765
Extra Trees → ROC-AUC: 0.999690
Decision Tree → Acc: 0.999802
Decision Tree → Precision: 0.999654
Decision Tree → Recall: 1.000000
Decision Tree → F1: 0.999827
Decision Tree → ROC-AUC: 0.999769
Random Forest → Acc: 0.999915
Random Forest → Precision: 0.999876
Random Forest → Recall: 0.999975
Random Forest → F1: 0.999926
Random Forest → ROC-AUC: 0.999905

=== Training on 5% of data (8252 samples) ===
Logistic Regression → Acc: 0.999491
Logistic Regression → Precision: 0.999284
Logistic Regression → Recall: 0.999827
Logistic Regression → F1: 0.999555
Logistic Regression → ROC-AUC: 0.999435
Extra Trees → Acc: 0.999915
Extra Trees → Prec