In [1]:
# load all libraries
import numpy as np
import pandas as pd
from statsmodels.tools.tools import add_constant
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_auc_score,
    accuracy_score,
    f1_score,
    roc_curve,
    recall_score,
    precision_score,
    classification_report,
)
from sklearn.model_selection import (
    KFold,
    GridSearchCV,
    train_test_split,
    cross_val_score,
    RandomizedSearchCV,
)
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import (
    VotingClassifier,
    RandomForestClassifier,
    BaggingClassifier,
    GradientBoostingClassifier,
)
from sklearn.base import BaseEstimator, ClassifierMixin
from scipy.stats import pearsonr, uniform, randint
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import time
import os
import joblib
import seaborn as sns
from sklearn.svm import SVC
import warnings

# Suppress every warning category for the rest of the session so cell output
# stays readable.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

## Quick Summary:

- [1. Loading data & Cleaning](#1-loading-data--cleaning)
    - [1.1 Loading ready-for-model dataset](#11-loading-ready-for-model-dataset)
    - [1.2 Cleaning, train-test splitting](#12-cleaning-train-test-splitting)

- [2. Classification Model Training ](#2-classification-model-training)
    - [2.1 Training all models](#21-training-all-models)
        - [2.1.1 Baseline - Logistic Regression](#211-baseline---logistic-regression)
        - [2.1.2 SVM Classifier](#212-svm-classifier)
        - [2.1.3 Decision Tree](#213-decision-tree) 
        - [2.1.4 Random Forest](#214-random-forest)
        - [2.1.5 Gradient Boosting](#215-gradient-boosting) 

        
    - [2.2 Comparison of Classifiers](#22-comparison-of-classifiers)
        - [2.2.1 Comparison Table](#221-comparison-table)
        - [2.2.2 Comparison ROC curve](#222-comparison-roc-curve)
- [3. Model extraction](#3-model-extraction)

Important things to keep in mind:
- A false positive (a legitimate package flagged as spam) is the more dangerous error: blocking a package that software depends on can break that software or make it crash.
- Later we can add a lightweight classifier that estimates how likely a maintainer's username is to be spam, and feed its output into the main ML model. Spam maintainers tend to create usernames that follow a pattern.

## 1. Loading data & Cleaning
#### 1.1 Loading ready-for-model dataset

In [3]:
from feature_engineering.load_data import load_json

# Show every column whenever a frame is displayed (the dataset has dozens of columns).
pd.options.display.max_columns = None

# load_json returns a pair; only its first element (the DataFrame) is used here.
df, _ = load_json(
    "../data/20251207-bq-results-with-mid-pkgs.jsonl",
    lines=True,
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15587 entries, 0 to 15586
Data columns (total 52 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   pkg_name                            15587 non-null  object 
 1   n_name_len                          15587 non-null  int64  
 2   has_digit_in_name                   15587 non-null  bool   
 3   has_dash_or_underscore              15587 non-null  bool   
 4   cat_name_case                       15587 non-null  object 
 5   n_summary_len                       15587 non-null  int64  
 6   n_desc_len                          15587 non-null  int64  
 7   n_desc_lines                        15587 non-null  int64  
 8   has_code_block_in_desc              15587 non-null  bool   
 9   n_urls_in_desc                      15587 non-null  int64  
 10  has_suspicious_kw                   15587 non-null  bool   
 11  pct_non_ascii_desc                  15587

#### 1.2 Cleaning, train-test splitting
[Click here to go to 'Quick Summary'](#quick-summary)

In [None]:
# ----------------------------------------------------
# Feature matrix / target
# ----------------------------------------------------
# "pkg_name" and the "is_spam" label are excluded from the features.
drop_cols = ["pkg_name", "is_spam"]
X = df.drop(columns=[name for name in drop_cols if name in df.columns], errors="ignore")
y = df["is_spam"].astype(int)

# Column groups by dtype. Only `categorical_cols` is handed to the encoder below;
# everything else reaches the models through the transformer's passthrough.
categorical_cols = list(X.select_dtypes(include=["object"]).columns)
numerical_cols = list(X.select_dtypes(include=[np.number]).columns)

# ----------------------------------------------------
# Stratified 80/20 train-test split
# ----------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# ----------------------------------------------------
# Dummy encoding
# ----------------------------------------------------
# One-hot encode the object columns (first level dropped); every other column
# is passed through unchanged.
preprocessor = ColumnTransformer(
    remainder="passthrough",
    transformers=[
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first"),
            categorical_cols,
        )
    ],
)

# Fit the encoder on the training rows only, then reuse it for the test rows.
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

# Names of the encoded categories followed by the passed-through columns.
feature_names_out = preprocessor.get_feature_names_out().tolist()

# Wrap the encoded arrays as DataFrames carrying the feature names and the
# original row index.
X_train_encoded_df = pd.DataFrame(
    X_train_encoded, index=X_train.index, columns=feature_names_out
)
X_test_encoded_df = pd.DataFrame(
    X_test_encoded, index=X_test.index, columns=feature_names_out
)

# ----------------------------------------------------
# Standardisation
# ----------------------------------------------------
# The scaler learns its statistics from the training rows only.
# NOTE: by default `transform` returns a NumPy array, so from here on the
# `*_encoded_df` names hold arrays rather than DataFrames.
scaler = StandardScaler()
scaler.fit(X_train_encoded_df)
X_train_encoded_df = scaler.transform(X_train_encoded_df)
X_test_encoded_df = scaler.transform(X_test_encoded_df)

# Add an explicit constant column (has_constant="add" adds it unconditionally).
X_train_const = add_constant(X_train_encoded_df, has_constant="add")
X_test_const = add_constant(X_test_encoded_df, has_constant="add")
print(X_train_const.shape, X_test_const.shape)

## 2. Classification Model Training
### 2.1 Training all models

##### 2.1.1 Baseline - Logistic Regression

[Click here to go to 'Quick Summary'](#quick-summary)

In [None]:
# ----------------------------------------------------
# 2.1.1. Logistic Regression (baseline)
# ----------------------------------------------------
# Fit an untuned logistic regression on the scaled design matrix.
print("\n--- Model Evaluation (Logistic Regression) ---")
log_reg_base = LogisticRegression()
log_reg_base.fit(X_train_const, y_train)

# Hard class predictions (0/1) drive accuracy and the classification report.
y_train_pred_log = log_reg_base.predict(X_train_const)
y_test_pred_log = log_reg_base.predict(X_test_const)

# Evaluate the model
train_accuracy_log = accuracy_score(y_train, y_train_pred_log)
test_accuracy_log = accuracy_score(y_test, y_test_pred_log)
print(f"Training Accuracy: {train_accuracy_log:.4f}")
print(f"Testing Accuracy: {test_accuracy_log:.4f}")

print("\n--- Classification Report (Test Set) ---")
print(classification_report(y_test, y_test_pred_log))

# ROC and AUC: rank the test rows by the predicted probability of class 1.
# Scoring with hard 0/1 labels would collapse the ROC curve to a single
# operating point and understate the model's ranking quality.
y_test_proba_log = log_reg_base.predict_proba(X_test_const)[:, 1]
fpr_log, tpr_log, thresh_log = roc_curve(y_test, y_test_proba_log)
auc_log = roc_auc_score(y_test, y_test_proba_log)


##### 2.1.2 SVM Classifier

[Click here to go to 'Quick Summary'](#quick-summary)

In [None]:
# ----------------------------------------------------
# 2.1.2. Support Vector Machine - linear kernel
# ----------------------------------------------------
# 1. Initialize the SVM model
svm_model = SVC(kernel="linear", C=1.0, random_state=42)

# 2. Fit the SVM model
# Uses the encoded data WITHOUT the added constant, as SVC handles the intercept.
svm_model.fit(X_train_encoded_df, y_train)

# 3. Predict on training and testing sets (hard 0/1 labels)
y_train_pred_svm = svm_model.predict(X_train_encoded_df)
y_test_pred_svm = svm_model.predict(X_test_encoded_df)

# 4. Evaluate the model
print("\n--- Model Evaluation (Support Vector Machine) ---")

train_accuracy_svm = accuracy_score(y_train, y_train_pred_svm)
test_accuracy_svm = accuracy_score(y_test, y_test_pred_svm)
print(f"Training Accuracy: {train_accuracy_svm:.4f}")
print(f"Testing Accuracy: {test_accuracy_svm:.4f}")

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred_svm))

# ROC and AUC: the model is built without probability=True, so rank the test
# rows by their signed distance to the separating hyperplane instead. Scoring
# with hard 0/1 labels would collapse the ROC curve to a single operating point.
y_test_score_svm = svm_model.decision_function(X_test_encoded_df)
fpr_svm, tpr_svm, thresh_svm = roc_curve(y_test, y_test_score_svm)
auc_svm = roc_auc_score(y_test, y_test_score_svm)

##### 2.1.3 Decision Tree
[Click here to go to 'Quick Summary'](#quick-summary)

In [None]:
# ----------------------------------------------------
# 2.1.3. CART - cost-complexity pruning search
# ----------------------------------------------------
# Only ccp_alpha is really searched (201 evenly spaced values in [0, 0.10]);
# the other entries are single-value lists that pin the remaining tree settings.
grid_values = dict(
    ccp_alpha=np.linspace(0, 0.10, 201),
    min_samples_leaf=[5],
    min_samples_split=[20],
    max_depth=[30],
    random_state=[88],
)

dtc = DecisionTreeClassifier()

# cv=10 folds; the candidate with the best mean accuracy is kept.
dtc_cv_acc = GridSearchCV(
    estimator=dtc,
    param_grid=grid_values,
    scoring="accuracy",
    cv=10,
    verbose=1,
)
dtc_cv_acc.fit(X_train_const, y_train)


In [None]:
# --- 1. Retrieve the Best Model ---
# best_estimator_ holds the model refit with the best parameters found by the search.
dtc_best_model = dtc_cv_acc.best_estimator_

# Print the best parameters found by GridSearchCV for verification
print("Best parameters found:")
print(dtc_cv_acc.best_params_)
print(
    "Grid best parameter ccp_alpha (max. accuracy): ",
    dtc_cv_acc.best_params_["ccp_alpha"],
)
print("Grid best score (accuracy): ", dtc_cv_acc.best_score_)

# --- 2. Make Predictions ---
# Hard class predictions (0 or 1) for accuracy and the classification report.
y_train_pred_dtc = dtc_best_model.predict(X_train_const)
y_test_pred_dtc = dtc_best_model.predict(X_test_const)

# Positive-class probabilities for ROC/AUC. A decision tree classifier always
# exposes predict_proba, so no label-based fallback is needed; falling back to
# hard labels would silently change what the AUC measures.
y_test_proba_dtc = dtc_best_model.predict_proba(X_test_const)[:, 1]

# --- 3. Evaluation ---
print("\n--- Model Evaluation (Decision Tree Classifier) ---")
# Compute Accuracy
train_accuracy_dtc = accuracy_score(y_train, y_train_pred_dtc)
test_accuracy_dtc = accuracy_score(y_test, y_test_pred_dtc)
print(f"Training Accuracy: {train_accuracy_dtc:.4f}")
print(f"Testing Accuracy: {test_accuracy_dtc:.4f}")

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred_dtc))

# Compute ROC and AUC from the probabilities
fpr_dtc, tpr_dtc, thresh_dtc = roc_curve(y_test, y_test_proba_dtc)
auc_dtc = roc_auc_score(y_test, y_test_proba_dtc)


##### 2.1.4 Random Forest

[Click here to go to 'Quick Summary'](#quick-summary)

In [None]:
# --------------------------------------------------
# 2.1.4. Random Forest - search over max_features
# --------------------------------------------------
# Only max_features (1 through 18) is searched; the single-value lists pin the
# leaf size, the number of trees and the seed for every candidate.
grid_values = dict(
    max_features=np.arange(1, 19, dtype="int32"),
    min_samples_leaf=[5],
    n_estimators=[500],
    random_state=[88],
)

rf = RandomForestClassifier()
# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=333)
# verbose=2 logs each fit (with its running time) as the cross validation progresses.
rf_cv = GridSearchCV(
    estimator=rf,
    param_grid=grid_values,
    scoring="accuracy",
    cv=cv,
    verbose=2,
)

# Wall-clock timing around the whole search.
tic = time.time()
rf_cv.fit(X_train_const, y_train)
toc = time.time()

print("Best params:", rf_cv.best_params_)
print("Best CV accuracy:", rf_cv.best_score_)
print("Time:", round(toc - tic, 2), "s")

In [None]:
# Fit Random Forest with the hyperparameters chosen by the grid search.
# The max_features value is read from the candidate with the highest mean CV accuracy.
max_features = rf_cv.cv_results_["param_max_features"].data
cv_scores = rf_cv.cv_results_["mean_test_score"]
best_idx = np.argmax(cv_scores)

rf_final = RandomForestClassifier(
    n_estimators=rf_cv.best_params_.get("n_estimators", 100),
    max_features=max_features[best_idx],
    min_samples_leaf=rf_cv.best_params_.get("min_samples_leaf", 1),
    random_state=42,
)

rf_final.fit(X_train_const, y_train)
y_train_pred_rf = rf_final.predict(X_train_const)
y_test_pred_rf = rf_final.predict(X_test_const)

# Evaluate the model. These must use the Random Forest predictions; scoring the
# logistic-regression predictions here would report the wrong model's numbers.
train_accuracy_rf = accuracy_score(y_train, y_train_pred_rf)
test_accuracy_rf = accuracy_score(y_test, y_test_pred_rf)
print(f"Training Accuracy: {train_accuracy_rf:.4f}")
print(f"Testing Accuracy: {test_accuracy_rf:.4f}")

print("\n--- Classification Report (Test Set) ---")
print(classification_report(y_test, y_test_pred_rf))

# ROC and AUC: rank the test rows by the predicted probability of class 1.
# Scoring with hard 0/1 labels would collapse the ROC curve to a single
# operating point.
y_test_proba_rf = rf_final.predict_proba(X_test_const)[:, 1]
fpr_rf, tpr_rf, thresh_rf = roc_curve(y_test, y_test_proba_rf)
auc_rf = roc_auc_score(y_test, y_test_proba_rf)

In [None]:
# 1. Feature names in training-matrix order.
# Assumes the constant added before training sits in the first column, ahead of
# the preprocessor's output columns.
feature_names = ["const"] + preprocessor.get_feature_names_out().tolist()

# 2. Importances reported by the fitted forest (one value per training column).
importances = rf_final.feature_importances_

# 3. Build the table, drop the constant (its importance is irrelevant/misleading),
#    and rank the remaining features from most to least important.
feature_importance_df = (
    pd.DataFrame({"Feature": feature_names, "Importance": importances})
    .loc[lambda frame: frame["Feature"] != "const"]
    .sort_values(by="Importance", ascending=False)
    .reset_index(drop=True)
)

# 4. Display the 15 highest-ranked features as a markdown table.
top_features = feature_importance_df.head(15)
print("\n--- Top 15 Feature Importances (Random Forest) ---")
print(top_features.to_markdown(index=False))

# 5. Visualization: horizontal bars, one per top feature.
plt.figure(figsize=(12, 8))
sns.barplot(data=top_features, x="Importance", y="Feature", color="#f39c12")
plt.title("Top 15 Feature Importances for PyPI Spam Detection")
plt.xlabel("Feature Importance (Gini Importance)")
plt.ylabel("Feature Name")
plt.show()

##### 2.1.5 Gradient Boosting
[Click here to go to 'Quick Summary'](#quick-summary)

In [None]:
# Base estimator; the fixed seed keeps each candidate fit reproducible.
gb = GradientBoostingClassifier(random_state=42)

# Random search space.
# scipy conventions: randint(low, high) excludes `high`, and uniform(loc, scale)
# spans [loc, loc + scale].
# A wider space could also sample subsample, min_samples_split and
# min_samples_leaf; those are left at their defaults here.
param_dist = dict(
    n_estimators=randint(50, 300),  # 50-299 trees; usually 100-200 is enough
    learning_rate=uniform(0.01, 0.2),  # 0.01-0.21, a narrow realistic range
    max_depth=randint(2, 5),  # depths 2-4; shallow trees work best for GB
)

# 50 random draws from the space, each scored by 5-fold CV accuracy.
# n_iter can be lowered (e.g. to 30) for a faster search.
gb_random = RandomizedSearchCV(
    estimator=gb,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring="accuracy",
    verbose=2,
    random_state=42,
)
gb_random.fit(X_train_const, y_train)

# --------------------------------------
# Best params and model
# --------------------------------------
print("\nBest Params:")
print(gb_random.best_params_)

print("\nBest CV Score:", gb_random.best_score_)

# Estimator fitted with the winning parameter draw.
gb_best_model = gb_random.best_estimator_


In [None]:
# -------------------------------------------
# Pull the tuned hyperparameters off the best CV estimator
# (falling back to fixed defaults if a key is absent)
# -------------------------------------------
best_params = gb_best_model.get_params()
learning_rate, n_estimators, max_depth = (
    best_params.get(param_name, fallback)
    for param_name, fallback in (
        ("learning_rate", 0.1),
        ("n_estimators", 100),
        ("max_depth", 3),
    )
)

# -------------------------------------------
# Fit the final Gradient Boosting model on the training set
# -------------------------------------------
gb_final = GradientBoostingClassifier(
    random_state=42,
    max_depth=max_depth,
    n_estimators=n_estimators,
    learning_rate=learning_rate,
)
gb_final.fit(X_train_const, y_train)

# Hard class predictions for accuracy and the classification report.
y_train_pred_gb = gb_final.predict(X_train_const)
y_test_pred_gb = gb_final.predict(X_test_const)

# -------------------------------------------
# Accuracy and per-class report
# -------------------------------------------
train_accuracy_gb = accuracy_score(y_train, y_train_pred_gb)
test_accuracy_gb = accuracy_score(y_test, y_test_pred_gb)

print(f"Training Accuracy: {train_accuracy_gb:.4f}")
print(f"Testing Accuracy:  {test_accuracy_gb:.4f}")

print("\n--- Classification Report (Test Set) ---")
print(classification_report(y_test, y_test_pred_gb))

# -------------------------------------------
# ROC & AUC from the predicted probability of class 1
# -------------------------------------------
y_test_proba_gb = gb_final.predict_proba(X_test_const)[:, 1]
fpr_gb, tpr_gb, thresh_gb = roc_curve(y_test, y_test_proba_gb)
auc_gb = roc_auc_score(y_test, y_test_proba_gb)

print(f"AUC-ROC (GB): {auc_gb:.4f}")


### 2.2 Comparison of Classifiers
#### 2.2.1 Comparison Table 
[Click here to go to 'Quick Summary'](#quick-summary)

In [None]:
import pandas as pd
from sklearn.metrics import recall_score, accuracy_score, roc_auc_score
import numpy as np


def structure_model_metrics(
    y_train_true, y_test_true, y_train_pred, y_test_pred, train_acc, test_acc, auc_score
):
    """Bundle one model's headline metrics into a flat dict (one comparison-table row).

    Accuracy and AUC are passed in already computed and are returned unchanged;
    only recall for the positive class (label 1) is calculated here, once on the
    training split and once on the test split.

    Args:
        y_train_true: True labels of the training split.
        y_test_true: True labels of the test split.
        y_train_pred: Predicted labels for the training split.
        y_test_pred: Predicted labels for the test split.
        train_acc: Precomputed training accuracy.
        test_acc: Precomputed test accuracy.
        auc_score: Precomputed test AUC.

    Returns:
        dict mapping metric name to value, in table-column order.
    """

    def _recall_class_1(y_true, y_pred):
        # zero_division=0 yields 0 instead of a warning when class 1 has no true samples.
        return recall_score(y_true, y_pred, pos_label=1, zero_division=0)

    metrics = {}
    metrics["Train Accuracy"] = train_acc
    metrics["Test Accuracy"] = test_acc
    metrics["Train Recall (Class 1)"] = _recall_class_1(y_train_true, y_train_pred)
    metrics["Test Recall (Class 1)"] = _recall_class_1(y_test_true, y_test_pred)
    metrics["Test AUC"] = auc_score
    return metrics


# --- 2. Calculate Metrics for Each Model ---
# Accuracy and AUC values are passed through exactly as computed in each
# model's own evaluation cell; only recall is computed here.

# A. Logistic Regression
logreg_metrics = structure_model_metrics(
    y_train,
    y_test,
    y_train_pred_log,
    y_test_pred_log,
    train_accuracy_log,
    test_accuracy_log,
    auc_log,
)

# B. SVM Classifier
svm_metrics = structure_model_metrics(
    y_train,
    y_test,
    y_train_pred_svm,
    y_test_pred_svm,
    train_accuracy_svm,
    test_accuracy_svm,
    auc_svm,
)

# C. Decision Tree
# (Must not be reassigned below: the Decision Tree row has to keep these values.)
dtc_metrics = structure_model_metrics(
    y_train,
    y_test,
    y_train_pred_dtc,
    y_test_pred_dtc,
    train_accuracy_dtc,
    test_accuracy_dtc,
    auc_dtc,
)

# D. Random Forest
rf_metrics = structure_model_metrics(
    y_train,
    y_test,
    y_train_pred_rf,
    y_test_pred_rf,
    train_accuracy_rf,
    test_accuracy_rf,
    auc_rf,
)

# E. Gradient Boosting
gradient_metrics = structure_model_metrics(
    y_train,
    y_test,
    y_train_pred_gb,
    y_test_pred_gb,
    train_accuracy_gb,
    test_accuracy_gb,
    auc_gb,
)

# --- 3. Consolidate into the Final DataFrame ---
# One row per model, one column per metric.
all_results = {
    "Logistic Regression": logreg_metrics,
    "SVM Classifier": svm_metrics,
    "Decision Tree": dtc_metrics,
    "Random Forest": rf_metrics,
    "Gradient Boosting": gradient_metrics,
}

results_df = pd.DataFrame(all_results).T

# --- 4. Display the Formatted Table ---
print(
    "----------------------------------------------------\n--- CONSOLIDATED CLASSIFICATION MODEL COMPARISON ---"
)

# Format every cell to 3 decimal places ("-" for missing values).
# Column-wise Series.map is used instead of the deprecated DataFrame.applymap.
results_df_formatted = results_df.apply(
    lambda column: column.map(lambda x: f"{x:.3f}" if pd.notna(x) else "-")
)

# Last expression of the cell, so the table is rendered by the notebook.
results_df_formatted

#### 2.2.2 Comparison ROC curve
[Click here to go to 'Quick Summary'](#quick-summary)

In [None]:
# 2. Comparison Graph: one ROC panel per classifier, side by side.
# Each entry is (display name, false-positive rates, true-positive rates, AUC),
# listed in the order the models were trained.
roc_panels = [
    ("Logistic Regression", fpr_log, tpr_log, auc_log),
    ("SVM", fpr_svm, tpr_svm, auc_svm),
    ("Decision Tree", fpr_dtc, tpr_dtc, auc_dtc),
    ("Random Forest", fpr_rf, tpr_rf, auc_rf),
    ("Gradient Boosting", fpr_gb, tpr_gb, auc_gb),
]

# --- Setup for Multiple Plots ---
fig, axes = plt.subplots(1, len(roc_panels), figsize=(30, 6))

# Ensure axes is flat for easy iteration
axes = axes.flatten()

for panel_no, (ax, (model_name, fpr, tpr, model_auc)) in enumerate(
    zip(axes, roc_panels), start=1
):
    ax.plot(fpr, tpr, label=f"ROC curve (AUC = {model_auc:.3f})")
    # Diagonal reference line: the expected curve of a random classifier.
    ax.plot([0, 1], [0, 1], "k--", label="Random guess")
    ax.set_xlabel("False Positive Rate (FPR)")
    ax.set_ylabel("True Positive Rate (TPR)")
    ax.set_title(f"{panel_no}. ROC Curve - {model_name}")
    # Without an explicit legend call the AUC label is never drawn.
    ax.legend(loc="lower right")
    ax.grid(True)

# --- Final Display ---
plt.tight_layout()
plt.show()

## 3. Model Extraction

[Click here to go to 'Quick Summary'](#quick-summary)

In [None]:
from datetime import datetime
from pathlib import Path

# Output folder for the serialized artifacts (created if it does not exist yet),
# plus a single timestamp shared by every file saved from this run.
models_dir = Path("..") / "models"
models_dir.mkdir(exist_ok=True)
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"

def save_model(model, base_name, models_dir, timestamp):
    """Persist ``model`` twice: a timestamped snapshot and an un-stamped 'latest' copy.

    Args:
        model: Object to serialize with joblib.
        base_name: File stem shared by both output files.
        models_dir: Directory (path object) that receives the files.
        timestamp: String appended to the stem of the snapshot file.

    Returns:
        Path of the timestamped snapshot.
    """
    filepath = models_dir / f"{base_name}_{timestamp}.joblib"
    # Same stem without the timestamp; overwritten on every save.
    latest_filepath = models_dir / f"{base_name}.joblib"

    for target in (filepath, latest_filepath):
        joblib.dump(model, target)

    print(f"Saved: {filepath} (and {latest_filepath})")
    return filepath

# Everything to persist for this run, keyed by output file stem: first the
# fitted preprocessing objects, then each trained classifier.
artifacts = {
    "fitted_preprocessor": preprocessor,
    "fitted_scaler": scaler,
    "log_reg_spam_classifier": log_reg_base,
    "svm_linear_spam_classifier": svm_model,
    "dtc_spam_classifier": dtc_best_model,
    "rf_spam_classifier": rf_final,
    "gb_spam_classifier": gb_final,
}

# Every artifact is written with the same timestamp.
for base_name, fitted_obj in artifacts.items():
    save_model(fitted_obj, base_name, models_dir, timestamp)

print(f"\nAll models saved with timestamp: {timestamp}")
