In [4]:
import os
import re, random

import joblib
import torch
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from transformers import AutoModelForCausalLM, AutoTokenizer

**Baseline Test:** Using Naive Bayes + TF-IDF, we can establish a quick baseline that tells us whether simple lexical cues already separate human-written text from AI-generated text.

In [None]:
# Relative file paths where the fitted Naive Bayes classifier and its TF-IDF
# vectorizer are written with joblib at the end of the training script.
NAIVE_BAYES_MODEL_PATH = "baseline_saved_model/naive_bayes_model.joblib"
NAIVE_BAYES_VECTORIZER_PATH = "baseline_saved_model/tfidf_vectorizer.joblib"

def preprocess(text):
    """Normalize a raw text string for the TF-IDF baseline.

    The text is lower-cased, every character that is not an ASCII lowercase
    letter, a digit, or whitespace is removed, and runs of whitespace are
    collapsed to single spaces (leading/trailing whitespace is dropped).

    Args:
        text: The raw input string.

    Returns:
        The normalized string; empty if nothing survives the filtering.
    """
    lowered = text.lower()
    # Deleting (not replacing) disallowed characters glues together the
    # fragments around punctuation, e.g. "don't" -> "dont".
    alnum_only = re.sub(r"[^a-z0-9\s]", "", lowered)
    # split() with no argument discards empty tokens, so joining on a single
    # space also normalizes tabs, newlines and repeated spaces.
    return " ".join(alnum_only.split())

def get_data_from_jsonl(path, n_per_split=10000, seed=42):
    """Load, clean and shuffle a human-vs-AI text sample from a JSONL file.

    Records are read in file order. A record's ``"real"`` value is collected
    as a human-written text and its ``"gpt2"`` value as an AI-generated text,
    until ``n_per_split`` texts of each kind have been gathered. Every text is
    passed through ``preprocess`` and dropped if it normalizes to an empty
    string, so the two classes can end up with slightly different sizes.

    Args:
        path: Path (or paths) of the JSONL file, forwarded to
            ``load_dataset("json", data_files=path, split="train")``.
        n_per_split: Maximum number of raw texts collected per class.
        seed: Seed for the shuffle. The default of 42 yields the same
            ordering as seeding the global ``random`` module with 42.

    Returns:
        A ``(texts, labels)`` tuple of two parallel lists, shuffled together.
        Label 0 marks human text, label 1 marks AI-generated text. Both lists
        are empty when no usable text was found.
    """
    raw_dataset = load_dataset("json", data_files=path, split="train")

    human_raw = []
    ai_raw = []

    for item in raw_dataset:
        # A row may expose the key with a None value when the field is
        # missing from that record, so validate the value itself rather than
        # only testing key presence; None would crash preprocess().
        real_text = item.get("real")
        if isinstance(real_text, str) and len(human_raw) < n_per_split:
            human_raw.append(real_text)

        gpt2_text = item.get("gpt2")
        if isinstance(gpt2_text, str) and len(ai_raw) < n_per_split:
            ai_raw.append(gpt2_text)

        # Stop scanning as soon as both classes are full.
        if len(human_raw) >= n_per_split and len(ai_raw) >= n_per_split:
            break

    # Normalize, then discard texts that became empty after cleaning.
    human = [cleaned for cleaned in map(preprocess, human_raw) if cleaned]
    ai = [cleaned for cleaned in map(preprocess, ai_raw) if cleaned]

    texts = human + ai
    labels = [0] * len(human) + [1] * len(ai)

    # zip(*[]) cannot be unpacked into two names, so handle "no data" here.
    if not texts:
        return [], []

    # Shuffle texts and labels together. A private Random instance keeps the
    # result reproducible without reseeding the process-wide generator.
    data = list(zip(texts, labels))
    random.Random(seed).shuffle(data)
    texts, labels = zip(*data)

    return list(texts), list(labels)

# Train, evaluate and persist the TF-IDF + Multinomial Naive Bayes baseline.
if __name__ == "__main__":
    TRAINING_DATA_PATH = "data/real-vs-gpt2-sentences.jsonl"
    texts, labels = get_data_from_jsonl(TRAINING_DATA_PATH, n_per_split=10000)

    # Hold out 20% of the samples for local validation; stratifying on the
    # labels keeps the human/AI ratio the same in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels,
        test_size=0.2,
        random_state=42,
        stratify=labels
    )
    print(f"Training samples: {len(X_train)}, Local validation samples: {len(X_test)}")

    # Vectorize
    print("Vectorizing text data...")
    vec = TfidfVectorizer(
        ngram_range=(1, 2),   # unigrams and bigrams
        max_features=20000,   # cap on vocabulary size
        min_df=3,             # ignore terms seen in fewer than 3 documents
        max_df=0.9            # ignore terms seen in more than 90% of documents
    )
    # Fit the vocabulary on the training split only, then reuse it for the
    # held-out split so no validation text influences the features.
    Xtr_tfidf = vec.fit_transform(X_train)
    Xdv_tfidf = vec.transform(X_test)
    print("Vectorization complete.")

    # Train Naive Bayes classifier
    print("Training Naive Bayes classifier...")
    clf = MultinomialNB(alpha=0.1)
    clf.fit(Xtr_tfidf, y_train)
    print("Training complete.")

    # Evaluate on the local test set
    print("Evaluating on local test set...")
    preds = clf.predict(Xdv_tfidf)
    acc = accuracy_score(y_test, preds)
    # Precision/recall/F1 are reported for the AI-generated class (label 1).
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_test, preds, average="binary", pos_label=1, zero_division=0
    )

    print(f"Baseline → Acc: {acc:.4f}, Prec: {prec:.4f}, Rec: {rec:.4f}, F1: {f1:.4f}")

    # Make sure the output directories exist first: joblib.dump does not
    # create missing parent directories, and failing here would throw away
    # the model that was just trained.
    for artifact_path in (NAIVE_BAYES_MODEL_PATH, NAIVE_BAYES_VECTORIZER_PATH):
        artifact_dir = os.path.dirname(artifact_path)
        if artifact_dir:
            os.makedirs(artifact_dir, exist_ok=True)

    # Save the model and vectorizer
    print(f"Saving Naive Bayes model to: {NAIVE_BAYES_MODEL_PATH}")
    joblib.dump(clf, NAIVE_BAYES_MODEL_PATH)

    print(f"Saving TF-IDF vectorizer to: {NAIVE_BAYES_VECTORIZER_PATH}")
    joblib.dump(vec, NAIVE_BAYES_VECTORIZER_PATH)

    print("Model and vectorizer saved successfully.")

Training samples: 16000, Local validation samples: 4000
Vectorizing text data...
Vectorization complete.
Training Naive Bayes classifier...
Training complete.
Evaluating on local test set...
Baseline → Acc: 0.6085, Prec: 0.6146, Rec: 0.5820, F1: 0.5978
Saving Naive Bayes model to: baseline_saved_model/naive_bayes_model.joblib
Saving TF-IDF vectorizer to: baseline_saved_model/tfidf_vectorizer.joblib
Model and vectorizer saved successfully.
