In [None]:
import pandas as pd
# Read the training set from the Colab working directory and print its first
# five rows as a quick sanity check of the columns.
df = pd.read_csv(filepath_or_buffer="/content/training_set.csv")
print(df.head(n=5))

   feature1  feature2  feature3  feature4  feature5  Authenticated
0  0.599758  0.012166  0.209270  1.087926  4.088241              0
1 -5.512324 -0.426576 -0.092336  0.501431  3.844389              0
2 -3.387749  2.786477  0.195597  1.540859  4.148680              0
3 -4.690195 -1.024081 -0.101571  0.981619  3.712296              0
4  7.990252  0.398343 -0.302286  1.309536  3.870292              0


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Separate the five input columns (feature1 .. feature5) from the target label.
X = df[[f"feature{idx}" for idx in range(1, 6)]]
y = df["Authenticated"]

In [None]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions so the test rows do not influence it.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Seed shared by every estimator that draws random numbers, so the grid search
# and the reported metrics are reproducible across kernel restarts.
RANDOM_STATE = 42

# Candidate classifiers, keyed by display name.
# Each value is a pair: (unfitted estimator, hyper-parameter grid for GridSearchCV).
models = {
    "Logistic Regression": (LogisticRegression(max_iter=1000), {
        "C": [0.1, 1, 10],
        "solver": ["lbfgs"]
    }),
    "Decision Tree": (DecisionTreeClassifier(random_state=RANDOM_STATE), {
        "max_depth": [None, 5, 10],
        "min_samples_split": [2, 5]
    }),
    "Random Forest": (RandomForestClassifier(random_state=RANDOM_STATE), {
        "n_estimators": [100, 200],
        "max_depth": [None, 10],
    }),
    "Gradient Boosting": (GradientBoostingClassifier(random_state=RANDOM_STATE), {
        "n_estimators": [100, 200],
        "learning_rate": [0.05, 0.1],
        "max_depth": [3, 5]
    }),
    "KNN": (KNeighborsClassifier(), {
        "n_neighbors": [3, 5, 7],
        "weights": ["uniform", "distance"]
    }),
    # probability=True enables predict_proba; its internal calibration is
    # randomized, hence the seed.
    "SVM": (SVC(probability=True, random_state=RANDOM_STATE), {
        "C": [0.1, 1, 10],
        "kernel": ["linear", "rbf"]
    }),
    # No hyper-parameters are tuned: the empty grid yields a single candidate.
    "Naive Bayes": (GaussianNB(), {})
}

In [None]:
# Metrics computed for every grid-search candidate. Precision, recall and F1
# are macro-averaged so that each class contributes equally regardless of how
# many samples it has.
scoring = {
    "accuracy": make_scorer(accuracy_score),
    "precision": make_scorer(precision_score, average="macro"),
    "recall": make_scorer(recall_score, average="macro"),
    "f1": make_scorer(f1_score, average="macro"),
    # Use the built-in scorer name rather than
    # make_scorer(roc_auc_score, needs_proba=True): `needs_proba` is deprecated
    # in recent scikit-learn releases, and without `multi_class` the metric
    # errors on targets with more than two classes. "roc_auc_ovr" is the
    # macro-averaged one-vs-rest ROC AUC and also works for binary targets.
    "roc_auc": "roc_auc_ovr",
}

In [None]:
# One record per model: tuned hyper-parameters, best cross-validated F1 and
# the hold-out metrics of the refitted best estimator.
results = []

for name, (model, params) in models.items():
    print(f"\n Đang chạy GridSearch cho {name}...")

    # GridSearchCV clones `model` for each candidate and, because a refit
    # metric is given, retrains the best candidate on the whole training set.
    # A separate fit of the base estimator is therefore unnecessary.
    grid = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring=scoring,
        refit="f1",
        cv=5,
        n_jobs=-1,
        verbose=0,
    )
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    # Hold-out metrics based on hard class predictions.
    y_pred = best_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average="macro")
    rec = recall_score(y_test, y_pred, average="macro")
    f1 = f1_score(y_test, y_pred, average="macro")

    # Scores for ROC AUC must come from the tuned estimator, the same one
    # that produced `y_pred`, not from the untuned base model.
    if hasattr(best_model, "predict_proba"):
        y_proba = best_model.predict_proba(X_test)
        if y_proba.shape[1] == 2:
            # Binary target: roc_auc_score expects the positive-class column.
            y_proba = y_proba[:, 1]
    elif hasattr(best_model, "decision_function"):
        y_proba = best_model.decision_function(X_test)
    else:
        y_proba = None

    roc = None
    if y_proba is not None:
        try:
            # `multi_class` and `labels` only take effect for multiclass
            # targets; a binary target is scored as usual.
            roc = roc_auc_score(
                y_test,
                y_proba,
                average="macro",
                multi_class="ovr",
                labels=best_model.classes_,
            )
        except ValueError as exc:
            # E.g. a class is absent from the hold-out set, or the scores are
            # not probabilities. Report the reason instead of failing silently.
            print(f" ROC AUC unavailable for {name}: {exc}")

    results.append({
        "Model": name,
        "Best Params": grid.best_params_,
        "CV Best f1": grid.best_score_,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1,
        "ROC_AUC": roc
    })


 Đang chạy GridSearch cho Logistic Regression...

 Đang chạy GridSearch cho Decision Tree...

 Đang chạy GridSearch cho Random Forest...

 Đang chạy GridSearch cho Gradient Boosting...

 Đang chạy GridSearch cho KNN...

 Đang chạy GridSearch cho SVM...

 Đang chạy GridSearch cho Naive Bayes...


In [None]:
# Turn the per-model records into a table and print it ranked by hold-out
# macro F1, best model first.
results_df = pd.DataFrame(data=results)
print(results_df.sort_values("F1", ascending=False))

                 Model                                        Best Params  \
0  Logistic Regression                      {'C': 0.1, 'solver': 'lbfgs'}   
1        Decision Tree        {'max_depth': None, 'min_samples_split': 5}   
2        Random Forest           {'max_depth': None, 'n_estimators': 100}   
3    Gradient Boosting  {'learning_rate': 0.05, 'max_depth': 3, 'n_est...   
4                  KNN           {'n_neighbors': 3, 'weights': 'uniform'}   
5                  SVM                     {'C': 0.1, 'kernel': 'linear'}   
6          Naive Bayes                                                 {}   

   CV Best f1  Accuracy  Precision    Recall        F1 ROC_AUC  
0    0.798742      0.98   0.326667  0.333333  0.329966    None  
1    0.698113      0.98   0.326667  0.333333  0.329966    None  
2    0.798742      0.98   0.326667  0.333333  0.329966    None  
3    0.698113      0.98   0.326667  0.333333  0.329966    None  
4    0.798742      0.98   0.326667  0.333333  0.329966    