# LLM Pipeline for OpenAI models

In [1]:
import os
import time
import re
import json
import pandas as pd
import backoff
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain_core.rate_limiters import InMemoryRateLimiter
from pydantic import BaseModel, Field
import openai
import openai._exceptions as openai_error
from bs4 import MarkupResemblesLocatorWarning
import warnings

In [None]:
# Load environment variables from a local .env file (presumably the OpenAI API key).
load_dotenv()

# Query the OpenAI API for the models available to this account (network call).
models = openai.models.list()

print("Available models")
# NOTE: the loop variable is deliberately not called `model`. The global name
# `model` holds the model identifier string used by the evaluation cells, and
# re-running this cell would otherwise overwrite it with an API model object.
for available_model in models.data:
    print("-", available_model.id)

## Load Datasets

In [None]:
# Silence BeautifulSoup's MarkupResemblesLocatorWarning for the rest of the session.
# Presumably it is triggered by plain-text email fields that look like file names or URLs.
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

#cleaning function for email subject and body
def bert_cleaning(text):
    """Return *text* as plain ASCII with markup removed and whitespace collapsed.

    Missing values (anything ``pd.isna`` reports as missing) yield "".
    Otherwise the input is parsed with BeautifulSoup's "html.parser" and reduced
    to its text content, every run of non-ASCII characters is replaced by a
    single space, and every run of whitespace is collapsed to one space with
    leading/trailing whitespace removed.
    """
    if pd.isna(text):
        return ""

    plain = BeautifulSoup(text, "html.parser").get_text()
    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', plain)
    collapsed = re.sub(r'\s+', ' ', ascii_only)
    return collapsed.strip()

folder_path = "../Datasets/processed_datasets"
full_datasets = {}

#load datasets from CSV files and apply cleaning + filtering
#(sorted: os.listdir returns entries in arbitrary order, which would make the
# dataset order differ between machines/runs)
for file in sorted(os.listdir(folder_path)):
    if not file.endswith(".csv"):
        continue

    name = file.removesuffix(".csv")
    df = pd.read_csv(os.path.join(folder_path, file))

    #skip files without the expected columns instead of aborting the whole load
    missing_cols = {"subject", "text", "label"} - set(df.columns)
    if missing_cols:
        print(f"{name} skipped: missing columns {sorted(missing_cols)}")
        continue

    # clean labels, ensuring they are integers
    df["label"] = pd.to_numeric(df["label"], errors="coerce")
    df = df.dropna(subset=["label"]).copy()
    df["label"] = df["label"].astype(int)

    #only keep entries with valid (non-empty string) subject and text
    has_subject = df["subject"].apply(lambda x: isinstance(x, str) and x.strip() != "")
    has_text = df["text"].apply(lambda x: isinstance(x, str) and x.strip() != "")
    #.copy() so the column assignments below write to an independent frame
    #rather than to a filtered slice (avoids chained-assignment warnings)
    df = df[has_subject & has_text].copy()

    #clean subject and text separately
    df["subject"] = df["subject"].apply(bert_cleaning)
    df["text"] = df["text"].apply(bert_cleaning)

    #remove duplicates based on subject and text
    df = df.drop_duplicates(subset=["subject", "text"]).reset_index(drop=True)

    #ensure both label classes are present
    label_counts = df["label"].value_counts()
    if 0 in label_counts and 1 in label_counts:
        #store an independent copy so later cells never operate on a slice of df
        full_datasets[name] = df[["subject", "text", "label"]].copy()
        print(f"{name} accepted: {len(df)} samples")
    else:
        print(f"{name} skipped: missing class 0 or 1")

print(f"\nLoaded and cleaned {len(full_datasets)} datasets.")

In [None]:
# Quick check of the column layout of one loaded dataset.
# NOTE(review): raises KeyError if no dataset named "Al-Subaiey_CEAS" is present in full_datasets.
print(full_datasets["Al-Subaiey_CEAS"].columns)

In [None]:
# Build a per-dataset overview: sample count, class counts and phishing ratio.
dataset_summaries = []

for name, df in full_datasets.items():
    # make sure the labels are valid integers; this works on a separate Series so
    # the frames stored in full_datasets are not modified by this overview cell
    labels = pd.to_numeric(df["label"], errors="coerce").dropna().astype(int)

    num_samples = len(labels)
    num_phishing = int((labels == 1).sum())
    num_benign = int((labels == 0).sum())
    phishing_ratio = num_phishing / num_samples if num_samples > 0 else 0

    dataset_summaries.append({
        "dataset": name,
        "samples": num_samples,
        "benign": num_benign,
        "phishing": num_phishing,
        "phishing_ratio": round(phishing_ratio, 3)
    })

summary_df = pd.DataFrame(dataset_summaries)
print("\nDataset overview:")
print(summary_df.to_string(index=False))

# Show a few random rows of the first dataset as a sanity check.
if full_datasets:
    example_dataset = next(iter(full_datasets))
    example_df = full_datasets[example_dataset]
    # DataFrame.sample raises if asked for more rows than exist, so cap at the dataset size
    num_examples = min(5, len(example_df))
    print(f"\n{num_examples} random samples from '{example_dataset}':")
    print(example_df.sample(num_examples, random_state=28))
else:
    print("\nNo datasets loaded; nothing to sample.")

### Load Caripoti dataset

In [None]:
#load the raw JSON file into a DataFrame
with open("../Datasets/13_Pajola/emails_with_languages.json", "r", encoding="utf-8") as f:
    data = json.load(f)

df_caripoti = pd.DataFrame(data)
df_caripoti = df_caripoti[["Subject", "Body", "type", "Language"]]

#filter for English language only
df_caripoti = df_caripoti[df_caripoti["Language"] == "en"].copy()

#ensure consistent column names
df_caripoti = df_caripoti.rename(columns={
    "Subject": "subject",
    "Body": "text",
    "type": "label"
})

#ensure labels are int, and drop NaNs to avoid issues
#(.copy() after dropna so the assignment below targets an independent frame)
df_caripoti["label"] = pd.to_numeric(df_caripoti["label"], errors="coerce")
df_caripoti = df_caripoti.dropna(subset=["label"]).copy()
df_caripoti["label"] = df_caripoti["label"].astype(int)

#filter out rows whose subject or text is missing, non-string or blank
has_subject = df_caripoti["subject"].apply(lambda x: isinstance(x, str) and x.strip() != "")
has_text = df_caripoti["text"].apply(lambda x: isinstance(x, str) and x.strip() != "")
#.copy() so the cleaning assignments below do not write to a filtered slice
df_caripoti = df_caripoti[has_subject & has_text].copy()

#clean with bert_cleaning function
df_caripoti["subject"] = df_caripoti["subject"].apply(bert_cleaning)
df_caripoti["text"] = df_caripoti["text"].apply(bert_cleaning)

#remove duplicate values
df_caripoti = df_caripoti.drop_duplicates(subset=["subject", "text"]).reset_index(drop=True)

#check if both classes are present
label_counts = df_caripoti["label"].value_counts()
if 0 in label_counts and 1 in label_counts:
    #store an independent copy so later cells never operate on a slice
    full_datasets["Caripoti"] = df_caripoti[["subject", "text", "label"]].copy()
    print(f"Caripoti accepted: {len(df_caripoti)} samples")
else:
    print("Caripoti skipped: missing class 0 or 1")

## Initialize the model 

In [10]:
# Make the variables from the local .env file available before any client is built.
load_dotenv()

# Structured output format expected from the LLM: a single integer field.
# (Documented with comments on purpose: a class docstring would be added to the
# JSON schema that is embedded in the prompt via the format instructions.)
class EmailClassificationResponse(BaseModel):
    classification: int = Field(description="0 for benign, 1 for phishing")

# Identifier of the OpenAI chat model under evaluation.
model = "gpt-4.1-mini-2025-04-14"

# LangChain in-memory rate limiter shared by all requests of the client.
rate_limiter = InMemoryRateLimiter(
    requests_per_second=20,
    check_every_n_seconds=2,
    max_bucket_size=40,
)

# Parser used to generate the format instructions appended to each prompt.
parser = PydanticOutputParser(pydantic_object=EmailClassificationResponse)

# Chat client (temperature 0, rate limited) returning EmailClassificationResponse
# objects via JSON mode.
client = ChatOpenAI(
    model=model,
    temperature=0,
    rate_limiter=rate_limiter,
).with_structured_output(EmailClassificationResponse, method="json_mode")

#API errors that fail identically on every attempt (invalid credentials, missing
#permission, malformed/oversized request, unknown model, unprocessable input);
#retrying them only delays the evaluation
_NON_RETRYABLE_ERRORS = (
    openai_error.AuthenticationError,
    openai_error.PermissionDeniedError,
    openai_error.BadRequestError,
    openai_error.NotFoundError,
    openai_error.UnprocessableEntityError,
)

#function to call the LLM; the backoff decorator handles rate limiting and retries
@backoff.on_exception(
    backoff.expo,
    (
        openai_error.RateLimitError,
        openai_error.APIError,
        openai_error.APIConnectionError
    ),
    max_tries=6,
    jitter=backoff.full_jitter,
    giveup=lambda e: isinstance(e, _NON_RETRYABLE_ERRORS)
)
def call_llm(prompt):
    """Send *prompt* to the structured-output chat client and return its response.

    OpenAI API errors are retried with exponential backoff and full jitter, up
    to 6 attempts in total. Errors listed in ``_NON_RETRYABLE_ERRORS`` are
    re-raised immediately; any other exception propagates to the caller.
    """
    return client.invoke(prompt)

#email classification function
def classify_email_numeric(row):
    """Classify one email with the LLM and pair the prediction with its true label.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing "subject",
            "text" and "label".

    Returns:
        ``{"true_label": int, "predicted_label": int}`` on success, or
        ``{"error": str}`` if the LLM call failed or the model answered with a
        value other than 0 or 1.
    """
    subject = row["subject"]
    text = row["text"]
    true_label = int(row["label"])

    prompt = (
        "You are a cybersecurity expert. Classify the following email as phishing (1) or benign (0). "
        "Output only a JSON with the integer field 'classification'. "
        "Base your decision on the subject and body only. Mark as phishing if the content is suspicious, manipulative, unexpected, or spam-like. "
        "Look for urgency, requests for credentials, threatening tone, excessive capitalization, odd formatting, or pressure to act. "
        "Note that modern phishing emails may not always follow traditional patterns and can be more targeted towards the receiver.\n\n"
        f"Subject: {subject}\n\n"
        f"Body:\n{text}\n\n"
        f"{parser.get_format_instructions()}"
    )

    #deliberately broad: a single failing email must not abort a long evaluation run;
    #the caller logs the error and moves on
    try:
        response = call_llm(prompt)
        predicted_label = response.classification
    except Exception as e:
        return {"error": str(e)}

    #the schema only guarantees an integer; any value other than 0/1 would make the
    #binary sklearn metrics fail later, so report it as an error for this sample
    if predicted_label not in (0, 1):
        return {"error": f"Unexpected classification value: {predicted_label!r}"}

    return {
        "true_label": true_label,
        "predicted_label": predicted_label
    }

## Evaluation loop

In [None]:
seed = 28
results_dir = "llm_eval_results"
os.makedirs(results_dir, exist_ok=True)

#write a checkpoint after this many newly classified samples
save_every = 250

#option to evaluate only a specific dataset (uncomment to use)
# full_datasets = {
#     dataset_name: dataset_df
#     for dataset_name, dataset_df in full_datasets.items()
#     if dataset_name == "Giri_LingSpam"
# }

#loop through each dataset for evaluation
for name, df in full_datasets.items():
    print(f"\nProcessing dataset: {name}")

    #create stratified train-test split for evaluation (only the 30% test part is used)
    _, test_df = train_test_split(df, test_size=0.3, random_state=seed, stratify=df["label"])
    test_df = test_df.reset_index(drop=True)

    #check if a results file already exists and resume from where the evaluation was interrupted
    #NOTE(review): the file name does not contain the model name, so results written
    #for a different model would be resumed here - confirm before switching models
    result_path = os.path.join(results_dir, f"{name}_testsplit_results.csv")
    results = []
    done_indices = set()
    if os.path.exists(result_path):
        try:
            existing_df = pd.read_csv(result_path)
            if "index" in existing_df.columns:
                #resume by stored sample index, not by row count: failed samples are
                #never stored, so the row count does not equal the next index
                existing_df = existing_df.drop_duplicates(subset="index")
                done_indices = set(existing_df["index"].astype(int))
            else:
                done_indices = set(range(len(existing_df)))
            results = existing_df.to_dict("records")
            print(f"Resuming: {len(done_indices)}/{len(test_df)} samples already classified")
        except Exception as e:
            print(f"Could not read existing results from {result_path} ({e}). Starting from scratch.")
            results = []
            done_indices = set()

    #evaluate each email in the test set that has no stored result yet
    unsaved = 0
    for i in range(len(test_df)):
        if i in done_indices:
            continue

        result = classify_email_numeric(test_df.iloc[i])
        if "error" in result:
            print(f"Error at index {i}: {result['error']}")
            time.sleep(5)
            continue

        result["index"] = i
        results.append(result)
        unsaved += 1

        #save progress periodically
        if unsaved >= save_every:
            pd.DataFrame(results).to_csv(result_path, index=False)
            print(f"Progress saved at sample {i + 1}")
            unsaved = 0

    #always persist the remaining results, even if the last sample failed
    if unsaved:
        pd.DataFrame(results).to_csv(result_path, index=False)
        print(f"Progress saved: {len(results)} results in total")

    #evaluate the results
    eval_df = pd.DataFrame(results)
    if eval_df.empty:
        print(f"No successful classifications for {name}; skipping metrics.")
        continue

    acc = accuracy_score(eval_df["true_label"], eval_df["predicted_label"])
    prec = precision_score(eval_df["true_label"], eval_df["predicted_label"])
    rec = recall_score(eval_df["true_label"], eval_df["predicted_label"])
    f1 = f1_score(eval_df["true_label"], eval_df["predicted_label"])

    #append the metrics of this dataset to the shared summary csv
    llm_split_results_df = pd.DataFrame({
        "dataset": [name],
        "model": [model],
        "accuracy": [acc],
        "precision": [prec],
        "recall": [rec],
        "f1_score": [f1],
        "num_samples": [len(eval_df)],
    })

    llm_split_results_path = os.path.join(results_dir, "llm_split_results.csv")
    llm_split_results_df.to_csv(
        llm_split_results_path,
        mode="a",
        index=False,
        #write the header only when the summary file is created
        header=not os.path.exists(llm_split_results_path)
    )
    print(f"Results appended for {name}")

## Code to evaluate a single full dataset without a train/test split

In [None]:
# select the single dataset that is evaluated in full (no train/test split)
full_dataset_no_split = {
    dataset_name: dataset_df
    for dataset_name, dataset_df in full_datasets.items()
    if dataset_name == "Chatuat_Enhancing_Phishing_Detection"
}

results_dir = "llm_eval_results"
os.makedirs(results_dir, exist_ok=True)
#model identifiers may contain "/", which is not allowed inside a file name
safe_model_name = model.replace("/", "-")

#write a checkpoint after this many newly classified samples
save_every = 250

#evaluate on the full dataset without splitting
for name, df in full_dataset_no_split.items():
    print(f"\nProcessing full dataset: {name}")

    test_df = df.reset_index(drop=True)

    #check if a results file already exists and resume from where the evaluation was interrupted
    result_path = os.path.join(results_dir, f"{name}_{safe_model_name}_full_results.csv")
    results = []
    done_indices = set()
    if os.path.exists(result_path):
        try:
            existing_df = pd.read_csv(result_path)
            if "index" in existing_df.columns:
                #resume by stored sample index, not by row count: failed samples are
                #never stored, so the row count does not equal the next index
                existing_df = existing_df.drop_duplicates(subset="index")
                done_indices = set(existing_df["index"].astype(int))
            else:
                done_indices = set(range(len(existing_df)))
            results = existing_df.to_dict("records")
            print(f"Resuming: {len(done_indices)}/{len(test_df)} samples already classified")
        except Exception as e:
            print(f"Could not read existing results from {result_path} ({e}). Starting from scratch.")
            results = []
            done_indices = set()

    #classification for each email that has no stored result yet
    unsaved = 0
    for i in range(len(test_df)):
        if i in done_indices:
            continue

        result = classify_email_numeric(test_df.iloc[i])
        if "error" in result:
            print(f"Error at index {i}: {result['error']}")
            time.sleep(5)
            continue

        result["index"] = i
        results.append(result)
        unsaved += 1

        #save progress periodically
        if unsaved >= save_every:
            pd.DataFrame(results).to_csv(result_path, index=False)
            print(f"Progress saved at sample {i + 1}")
            unsaved = 0

    #always persist the remaining results, even if the last sample failed
    if unsaved:
        pd.DataFrame(results).to_csv(result_path, index=False)
        print(f"Progress saved: {len(results)} results in total")

    #evaluate results
    eval_df = pd.DataFrame(results)
    if eval_df.empty:
        print(f"No successful classifications for {name}; skipping metrics.")
        continue

    acc = accuracy_score(eval_df["true_label"], eval_df["predicted_label"])
    prec = precision_score(eval_df["true_label"], eval_df["predicted_label"])
    rec = recall_score(eval_df["true_label"], eval_df["predicted_label"])
    f1 = f1_score(eval_df["true_label"], eval_df["predicted_label"])

    #append the metrics to the shared summary csv, marked with the "_Full" suffix
    llm_split_results_df = pd.DataFrame({
        "dataset": [f"{name}_Full"],
        "model": [model],
        "accuracy": [acc],
        "precision": [prec],
        "recall": [rec],
        "f1_score": [f1],
        "num_samples": [len(eval_df)],
    })

    llm_split_results_path = os.path.join(results_dir, "llm_split_results.csv")
    llm_split_results_df.to_csv(
        llm_split_results_path,
        mode="a",
        index=False,
        #write the header only when the summary file is created
        header=not os.path.exists(llm_split_results_path)
    )

    print(f"Full dataset results appended for {name}")