In [1]:
# A2_randomized_search_fast.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings("ignore")

# Load dataset
df = pd.read_csv("embedded_dataset_deberta.csv")

# Drop rows with missing target
df = df.dropna(subset=[df.columns[-1]])

X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Models & parameter grids (SVM replaced with LinearSVC)
models = {
    "LogisticRegression": (LogisticRegression(max_iter=500),
        {"model__C": np.logspace(-2, 2, 10)}),
    "LinearSVM": (LinearSVC(max_iter=2000),
        {"model__C": np.logspace(-2, 2, 5)}),
    "DecisionTree": (DecisionTreeClassifier(),
        {"model__max_depth": [3, 5, 10, None]}),
    "RandomForest": (RandomForestClassifier(),
        {"model__n_estimators": [100, 200, 400], "model__max_depth": [None, 5, 10]}),
    "AdaBoost": (AdaBoostClassifier(),
        {"model__n_estimators": [50, 100, 200]}),
    "MLP": (MLPClassifier(max_iter=500),
        {"model__hidden_layer_sizes": [(64,), (128,)], "model__alpha": [1e-4, 1e-3]})
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("\n Randomized Search Results \n")
for name, (model, params) in models.items():
    print(f"➡ Running {name} ...")
    pipe = Pipeline([("scaler", StandardScaler()), ("model", model)])
    search = RandomizedSearchCV(pipe, params, n_iter=5, cv=cv,
                                scoring="f1_macro", random_state=42, n_jobs=-1)
    search.fit(X_train, y_train)
    y_pred = search.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")

    print(f"\n{name}")
    print(f"   Best Params : {search.best_params_}")
    print(f"   CV F1       : {search.best_score_:.4f}")
    print(f"   Test Acc    : {acc:.4f}")
    print(f"   Test F1     : {f1:.4f}")
    print("-" * 50)

print("\n A2 Finished\n")


 Randomized Search Results 

➡ Running LogisticRegression ...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


LogisticRegression
   Best Params : {'model__C': np.float64(0.027825594022071243)}
   CV F1       : 0.4790
   Test Acc    : 0.5732
   Test F1     : 0.5124
--------------------------------------------------
➡ Running LinearSVM ...





LinearSVM
   Best Params : {'model__C': np.float64(0.01)}
   CV F1       : 0.4646
   Test Acc    : 0.5701
   Test F1     : 0.5104
--------------------------------------------------
➡ Running DecisionTree ...

DecisionTree
   Best Params : {'model__max_depth': 10}
   CV F1       : 0.3860
   Test Acc    : 0.4480
   Test F1     : 0.3830
--------------------------------------------------
➡ Running RandomForest ...

RandomForest
   Best Params : {'model__n_estimators': 200, 'model__max_depth': None}
   CV F1       : 0.4044
   Test Acc    : 0.5747
   Test F1     : 0.4305
--------------------------------------------------
➡ Running AdaBoost ...

AdaBoost
   Best Params : {'model__n_estimators': 200}
   CV F1       : 0.4210
   Test Acc    : 0.5445
   Test F1     : 0.4196
--------------------------------------------------
➡ Running MLP ...

MLP
   Best Params : {'model__hidden_layer_sizes': (64,), 'model__alpha': 0.0001}
   CV F1       : 0.4775
   Test Acc    : 0.5400
   Test F1     : 0.4736
-

In [3]:
# A3_classifiers_benchmark_fast.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings("ignore")

# Load dataset
df = pd.read_csv("embedded_dataset_deberta.csv")

# Drop rows with missing target
df = df.dropna(subset=[df.columns[-1]])

X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Classifiers (SVM replaced with LinearSVC for speed)
models = [
    ("Perceptron", Perceptron()),
    ("LogisticRegression", LogisticRegression(max_iter=600)),
    ("LinearSVM", LinearSVC(max_iter=2000)),
    ("DecisionTree", DecisionTreeClassifier()),
    ("RandomForest", RandomForestClassifier()),
    ("AdaBoost", AdaBoostClassifier()),
    ("MLP", MLPClassifier(max_iter=600)),
    ("NaiveBayes", GaussianNB())
]

print("\n Benchmarking Results \n")
for name, model in models:
    print(f"➡ Running {name} ...")
    pipe = Pipeline([("scaler", StandardScaler()), ("model", model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average="macro", zero_division=0)
    rec = recall_score(y_test, y_pred, average="macro", zero_division=0)
    f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)

    print(f"\n{name}")
    print(f"   Accuracy  : {acc:.4f}")
    print(f"   Precision : {prec:.4f}")
    print(f"   Recall    : {rec:.4f}")
    print(f"   F1-score  : {f1:.4f}")
    print("-" * 50)

print("\n A3 Finished\n")


 Benchmarking Results 

➡ Running Perceptron ...

Perceptron
   Accuracy  : 0.5158
   Precision : 0.4498
   Recall    : 0.4476
   F1-score  : 0.4486
--------------------------------------------------
➡ Running LogisticRegression ...

LogisticRegression
   Accuracy  : 0.5098
   Precision : 0.4458
   Recall    : 0.4508
   F1-score  : 0.4476
--------------------------------------------------
➡ Running LinearSVM ...

LinearSVM
   Accuracy  : 0.5023
   Precision : 0.4338
   Recall    : 0.4386
   F1-score  : 0.4349
--------------------------------------------------
➡ Running DecisionTree ...

DecisionTree
   Accuracy  : 0.4389
   Precision : 0.3752
   Recall    : 0.3767
   F1-score  : 0.3750
--------------------------------------------------
➡ Running RandomForest ...

RandomForest
   Accuracy  : 0.5792
   Precision : 0.6015
   Recall    : 0.4398
   F1-score  : 0.4380
--------------------------------------------------
➡ Running AdaBoost ...

AdaBoost
   Accuracy  : 0.5249
   Precision : 0.4