<a href="https://colab.research.google.com/github/siriwatsc-debug/FinalProject-ML/blob/main/TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report,confusion_matrix
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import StandardScaler
import joblib

# HashingVectorizer

In [None]:
# Fetch the labelled training e-mails from the project repository.
file_path = "https://github.com/siriwatsc-debug/FinalProject-ML/raw/main/train.csv"
df = pd.read_csv(file_path)

# Model input: subject and body joined by one space; missing parts become "".
subject_text = df["Subject"].fillna("")
body_text = df["Body"].fillna("")
df["Combined"] = subject_text + " " + body_text

# Binary target: 1 for rows labelled exactly "Phishing", 0 for anything else.
df["Label_Binary"] = df["Label"].map(lambda label: 1 if label == "Phishing" else 0)

In [None]:
# Stratified 80/20 hold-out split; the fixed seed makes it repeatable.
features = df["Combined"]
targets = df["Label_Binary"]
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=42, shuffle=True, stratify=targets
)

In [None]:
# Text -> hashed n-gram features -> SVD-reduced features -> standardised -> linear SVM.
hashing_step = HashingVectorizer(
    n_features=2**18,      # size of the hash space
    alternate_sign=False,  # keep feature values non-negative
    ngram_range=(1, 2),    # unigrams and bigrams
    norm="l2",
)
svd_step = TruncatedSVD(n_components=300, random_state=42)  # PCA-like reduction
svm_step = SGDClassifier(loss="hinge", random_state=42)     # hinge loss = linear SVM

pipeline = Pipeline(
    steps=[
        ("hash", hashing_step),
        ("svd", svd_step),
        ("scale", StandardScaler()),
        ("svm", svm_step),
    ]
)

In [None]:
# 4. Hyperparameter search, cross-validated over the whole pipeline.
# Only the regularisation strength varies; max_iter is a single fixed value.
param_grid = dict(
    svm__alpha=[1e-5, 1e-4, 1e-3],
    svm__max_iter=[2000],
)

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1,
)

print("Training leak-proof SVM model...")
grid.fit(X_train, y_train)

print("\nBest parameters:", grid.best_params_)

Training leak-proof SVM model...

Best parameters: {'svm__alpha': 0.001, 'svm__max_iter': 2000}


In [None]:
# 5. Score the tuned pipeline on the hold-out part of the train/test split.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

print("\n=== True Leak-Proof Evaluation ===")
headline_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)
for caption, metric in headline_metrics:
    print(caption, round(metric(y_test, y_pred), 4))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


=== True Leak-Proof Evaluation ===
Accuracy: 0.85
Precision: 0.7692
Recall: 1.0
F1 Score: 0.8696

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.70      0.82        10
           1       0.77      1.00      0.87        10

    accuracy                           0.85        20
   macro avg       0.88      0.85      0.85        20
weighted avg       0.88      0.85      0.85        20



In [None]:
# Evaluate the tuned pipeline on the separate test.csv file.
TEST_URL = "https://github.com/siriwatsc-debug/FinalProject-ML/raw/main/test.csv"
df_test = pd.read_csv(TEST_URL)
df_test["Combined"] = df_test["Subject"].fillna("") + " " + df_test["Body"].fillna("")
df_test["Label_Binary"] = df_test["Label"].apply(lambda x: 1 if x == "Phishing" else 0)

# Use dedicated names for the external file. Rebinding X_test / y_test here
# would silently replace the hold-out split, so re-running the hold-out
# evaluation cell afterwards would score a different dataset.
X_external = df_test["Combined"]
y_external = df_test["Label_Binary"]

print(f"Test data shape: {df_test.shape}")
print("Class distribution:\n", df_test["Label_Binary"].value_counts())

# ============================================
# 6. Predict & Evaluate on the external file
# ============================================
# y_pred keeps its name because the export step reads it.
y_pred = best_model.predict(X_external)

accuracy  = accuracy_score(y_external, y_pred)
precision = precision_score(y_external, y_pred)
recall    = recall_score(y_external, y_pred)
f1        = f1_score(y_external, y_pred)

print("\n========== LEAK-PROOF TEST RESULTS ==========")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-Score : {f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_external, y_pred, target_names=["Legitimate (0)", "Phishing (1)"]))

print("Confusion Matrix:")
print(confusion_matrix(y_external, y_pred))

Test data shape: (26, 5)
Class distribution:
 Label_Binary
0    13
1    13
Name: count, dtype: int64

Accuracy : 0.8462
Precision: 0.8000
Recall   : 0.9231
F1-Score : 0.8571

Classification Report:
                precision    recall  f1-score   support

Legitimate (0)       0.91      0.77      0.83        13
  Phishing (1)       0.80      0.92      0.86        13

      accuracy                           0.85        26
     macro avg       0.85      0.85      0.85        26
  weighted avg       0.85      0.85      0.85        26

Confusion Matrix:
[[10  3]
 [ 1 12]]


In [None]:
# ============================================
# 7. Export Test Dataset with Predictions
# ============================================
# Store the numeric prediction and a readable label next to the original columns.
df_test["Predicted_Label"] = y_pred
df_test["Predicted_Label_Text"] = [
    "Phishing" if predicted == 1 else "Legitimate"
    for predicted in df_test["Predicted_Label"]
]

results_csv_path = "test_with_predictions.csv"
df_test.to_csv(results_csv_path, index=False)
print(f"\n✓ Test dataset with predictions exported to {results_csv_path}")


✓ Test dataset with predictions exported to test_with_predictions.csv


# TF-IDF

In [None]:
# TF-IDF -> TruncatedSVD (PCA for sparse) -> SVM
# Cleaned, debugged, with GridSearchCV and unseen evaluation

import re
import joblib
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, classification_report, confusion_matrix)

# -----------------------
# Config / URLs / Seeds
# -----------------------
# Seed shared by the split, the SVD step and the SVM so runs are repeatable.
RANDOM_STATE = 42

# File the tuned pipeline is written to (and later reloaded from).
MODEL_OUT_PATH = "svm_tfidf_svd_pipeline.joblib"

# Labelled data for training/tuning, and a separate file for the final check.
TRAIN_URL = "https://github.com/siriwatsc-debug/FinalProject-ML/raw/main/train.csv"
UNSEEN_URL = "https://github.com/siriwatsc-debug/FinalProject-ML/raw/main/test.csv"

# -----------------------
# Utility: text combine & cleaning
# -----------------------
def combine_and_clean(subject, body):
    """Join an e-mail's subject and body into one normalised lowercase string.

    Values that ``pd.isna`` reports as missing count as empty text. After
    lowercasing, whitespace-free runs starting at ``http`` or ``www`` become
    the token ``url`` and whitespace-free runs containing ``@`` become
    ``email``. Every remaining character other than ``a-z``, ``0-9`` or
    whitespace is replaced by a space, and whitespace runs are collapsed.

    Args:
        subject: Subject line; converted with ``str`` unless missing.
        body: Message body; converted with ``str`` unless missing.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    pieces = ["" if pd.isna(piece) else str(piece) for piece in (subject, body)]
    text = " ".join(pieces).strip().lower()

    # Order matters: URLs and addresses are tokenised before punctuation is
    # stripped, because stripping first would break the patterns they match.
    substitutions = (
        (r"http\S+|www\S+", " url "),
        (r"\S+@\S+", " email "),
        (r"[^a-z0-9\s]", " "),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# -----------------------
# 1. Load data
# -----------------------
print("="*80)
print("Loading training data...")
print("="*80)
df = pd.read_csv(TRAIN_URL)

# Build the model input by handing each row's subject and body to
# combine_and_clean. The helper converts missing values to empty text and
# non-string values to str itself, so the columns need no pre-processing here.
df["Combined_Text"] = [
    combine_and_clean(subject, body)
    for subject, body in zip(df["Subject"], df["Body"])
]

# Label encoding: "legitimate" (case and surrounding spaces ignored) -> 0, anything else -> 1
df["Label_Binary"] = df["Label"].apply(lambda x: 0 if str(x).strip().lower() == "legitimate" else 1)
print(f"Dataset shape: {df.shape}")
print("Class distribution:")
print(df["Label"].value_counts())

# -----------------------
# 2. Train/test split
# -----------------------
# Cleaned text is the feature, the 0/1 label is the target.
X, y = df["Combined_Text"].values, df["Label_Binary"].values

# Stratified 80/20 hold-out split driven by the shared seed.
split_options = dict(test_size=0.2, random_state=RANDOM_STATE, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print("\nTrain / Test sizes:", len(X_train), len(X_test))

# -----------------------
# 3. Pipeline: TF-IDF -> TruncatedSVD -> StandardScaler -> SVM
# -----------------------
# Each stage is named so the grid search can address it as "<name>__<param>".
text_vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.95,
)
dimension_reducer = TruncatedSVD(n_components=200, random_state=RANDOM_STATE)
classifier = SVC(probability=True, random_state=RANDOM_STATE)

pipeline = Pipeline(
    steps=[
        ("tfidf", text_vectorizer),
        ("svd", dimension_reducer),
        ("scaler", StandardScaler()),  # SVD output is dense; scaling helps the SVM
        ("svm", classifier),
    ]
)

# -----------------------
# 4. Hyperparameter grid
# -----------------------
# SVD dimensionality is tuned together with the SVM settings.
# The grid is a list of two sub-grids so that gamma is varied only for the RBF
# kernel. A linear SVC ignores gamma, so crossing gamma with kernel="linear"
# would fit every linear candidate twice and report a meaningless gamma value
# in best_params_.
_shared_grid = {
    "svd__n_components": [100, 200, 300],     # adjust based on your dataset size
    "svm__C": [0.1, 1, 10],
}
param_grid = [
    {**_shared_grid, "svm__kernel": ["linear"]},
    {**_shared_grid, "svm__kernel": ["rbf"], "svm__gamma": ["scale", "auto"]},
]

# GridSearchCV
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=2
)

# -----------------------
# 5. Train (Grid search)
# -----------------------
print("\nStarting GridSearchCV training (this may take a while)...")
grid.fit(X_train, y_train)

# The winning pipeline, refitted by GridSearchCV.
best_model = grid.best_estimator_
print("\nBest params:", grid.best_params_)
print("Best CV score:", grid.best_score_)

# Sanity check: cross-validate the selected pipeline on the training split again.
cv_scores = cross_val_score(
    estimator=best_model,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
print("Cross-val accuracies on training set:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean(), "+/-", cv_scores.std())

# -----------------------
# 6. Evaluate on hold-out test set
# -----------------------
print("\nEvaluating on hold-out test set...")
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1] if hasattr(best_model, "predict_proba") else None

# Rank samples by the SVM margin when it is available. SVC's probabilities come
# from a separate Platt-scaling fit and can disagree with predict() on small
# datasets, which can yield a misleading AUC even when every prediction is right.
if hasattr(best_model, "decision_function"):
    y_score = best_model.decision_function(X_test)
else:
    y_score = y_prob

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, zero_division=0))
print("Recall:", recall_score(y_test, y_pred, zero_division=0))
print("F1:", f1_score(y_test, y_pred, zero_division=0))
if y_score is not None:
    try:
        print("AUC:", roc_auc_score(y_test, y_score))
    except Exception as exc:
        # Stay best-effort, but report the failure instead of hiding it.
        print("AUC could not be computed:", exc)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["Legitimate", "Phishing"]))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# -----------------------
# 7. Save the final model and vectorizer+svd pipeline
# -----------------------
# Persist the tuned pipeline. It contains every fitted step, so nothing
# needs to be saved separately.
joblib.dump(value=best_model, filename=MODEL_OUT_PATH)
print("\nSaved best pipeline to: " + MODEL_OUT_PATH)

# -----------------------
# 8. Evaluate on unseen data (test.csv)
# -----------------------
print("\n" + "="*80)
print("Loading unseen data and evaluating saved model...")
print("="*80)

# The whole step is best-effort: a failure is reported and the notebook continues.
try:
    df_unseen = pd.read_csv(UNSEEN_URL)
    # Same cleaning as the training data: combine_and_clean handles missing
    # and non-string values itself, so it is called once per row.
    df_unseen["Combined_Text"] = [
        combine_and_clean(subject, body)
        for subject, body in zip(df_unseen["Subject"], df_unseen["Body"])
    ]
    df_unseen["Label_Binary"] = df_unseen["Label"].apply(lambda x: 0 if str(x).strip().lower() == "legitimate" else 1)

    X_unseen = df_unseen["Combined_Text"].values
    y_unseen = df_unseen["Label_Binary"].values

    # Load model (we already have best_model in memory, but show loading example).
    # NOTE: joblib.load unpickles the file, so only load files from a trusted source.
    loaded = joblib.load(MODEL_OUT_PATH)

    y_pred_unseen = loaded.predict(X_unseen)
    y_prob_unseen = loaded.predict_proba(X_unseen)[:, 1] if hasattr(loaded, "predict_proba") else None

    # Rank by the SVM margin when available: SVC's Platt-scaled probabilities
    # can disagree with predict() on small datasets and distort the AUC.
    if hasattr(loaded, "decision_function"):
        y_score_unseen = loaded.decision_function(X_unseen)
    else:
        y_score_unseen = y_prob_unseen

    acc_u = accuracy_score(y_unseen, y_pred_unseen)
    prec_u = precision_score(y_unseen, y_pred_unseen, zero_division=0)
    rec_u = recall_score(y_unseen, y_pred_unseen, zero_division=0)
    f1_u = f1_score(y_unseen, y_pred_unseen, zero_division=0)

    print(f"\nUnseen set performance - Accuracy: {acc_u:.4f} Precision: {prec_u:.4f} Recall: {rec_u:.4f} F1: {f1_u:.4f}")
    if y_score_unseen is not None:
        try:
            auc_u = roc_auc_score(y_unseen, y_score_unseen)
            print(f"AUC (unseen): {auc_u:.4f}")
        except Exception as auc_error:
            # Report the failure instead of hiding it.
            print("AUC (unseen) could not be computed:", auc_error)

    print("\nClassification Report (unseen):")
    print(classification_report(y_unseen, y_pred_unseen, target_names=["Legitimate", "Phishing"]))

    print("Confusion Matrix (unseen):")
    print(confusion_matrix(y_unseen, y_pred_unseen))

except Exception as e:
    print("Error loading or evaluating unseen data:", e)


Loading training data...
Dataset shape: (100, 5)
Class distribution:
Label
Legitimate    50
Phishing      50
Name: count, dtype: int64

Train / Test sizes: 80 20

Starting GridSearchCV training (this may take a while)...
Fitting 5 folds for each of 36 candidates, totalling 180 fits

Best params: {'svd__n_components': 100, 'svm__C': 0.1, 'svm__gamma': 'scale', 'svm__kernel': 'linear'}
Best CV score: 0.9875
Cross-val accuracies on training set: [1.     1.     1.     0.9375 1.    ]
Mean CV accuracy: 0.9875 +/- 0.024999999999999998

Evaluating on hold-out test set...
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1: 1.0
AUC: 0.0

Classification Report:
              precision    recall  f1-score   support

  Legitimate       1.00      1.00      1.00        10
    Phishing       1.00      1.00      1.00        10

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

Confusion Matrix:
[[1