In [None]:
# author : Clarice Shim
# 00_setup (eval & export)

import os, random, sys
from pathlib import Path
import numpy as np

# quieter TF logs + reproducibility
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this assignment does not
# change hashing in the already-running kernel; it is only inherited by child processes.
os.environ["PYTHONHASHSEED"] = str(SEED)

# ---- Paths ----
MODEL_NAME = "cnn"

# Notebook CWD = <repo>/Models/CNN
# CSVs live     = <repo>/Final Data/*.csv
DATA   = Path("../..") / "Final Data"

# Outputs stay inside Models/CNN/
MODELS = Path("models") / MODEL_NAME        # Models/CNN/models/cnn/...
REPORT = Path("reports") / MODEL_NAME       # Models/CNN/reports/cnn/...
PRED   = Path("preds") / MODEL_NAME         # Models/CNN/preds/cnn/...
EMB    = Path("embeddings") / MODEL_NAME    # Models/CNN/embeddings/cnn/...

# Create every output directory up front (no-op when it already exists)
for p in [MODELS, REPORT, PRED, EMB]:
    p.mkdir(parents=True, exist_ok=True)

# Sanity checks: later cells read train/val/test CSVs and the trained model,
# so verify all of them here and name exactly which CSVs are missing.
missing_csvs = [name for name in ("train.csv", "val.csv", "test.csv") if not (DATA / name).exists()]
assert not missing_csvs, f"Missing CSVs in {DATA.resolve()}: {missing_csvs}. Fix DATA path."
assert (MODELS / "cnn_best.keras").exists(), "Missing models/cnn/cnn_best.keras. Run training first."

# ---- TensorFlow imports ----
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Seed TensorFlow as well, matching the Python/NumPy seeding done at the top of this cell.
tf.random.set_seed(SEED)

# Hide every GPU from TensorFlow so the evaluation runs on CPU.
# Delete this try/except to evaluate on GPU instead.
try:
    tf.config.set_visible_devices([], "GPU")
except Exception as err:
    # Best effort only: keep going, but say so instead of failing silently.
    print("WARN: could not hide GPUs from TensorFlow:", repr(err))

# Echo the resolved environment so path mistakes are visible in the cell output
print("CWD:", Path.cwd())
print("TF:", tf.__version__, "| Keras:", keras.__version__)
print("DATA:", (DATA).resolve())
print("Model path exists:", (MODELS / "cnn_best.keras").resolve(), (MODELS / "cnn_best.keras").exists())
print("Will write to:", {
    "REPORT": str(REPORT.resolve()),
    "PRED":   str(PRED.resolve()),
    "EMB":    str(EMB.resolve()),
})


CWD: c:\Users\Clarice Shim\Desktop\COS30049 Computing Technology Innovation project\COS30047_Session7_Group4\SUBMISSION\Models\CNN
TF: 2.19.0 | Keras: 3.11.3
DATA: C:\Users\Clarice Shim\Desktop\COS30049 Computing Technology Innovation project\COS30047_Session7_Group4\SUBMISSION\Final Data
Model path exists: C:\Users\Clarice Shim\Desktop\COS30049 Computing Technology Innovation project\COS30047_Session7_Group4\SUBMISSION\Models\CNN\models\cnn\cnn_best.keras True
Will write to: {'REPORT': 'C:\\Users\\Clarice Shim\\Desktop\\COS30049 Computing Technology Innovation project\\COS30047_Session7_Group4\\SUBMISSION\\Models\\CNN\\reports\\cnn', 'PRED': 'C:\\Users\\Clarice Shim\\Desktop\\COS30049 Computing Technology Innovation project\\COS30047_Session7_Group4\\SUBMISSION\\Models\\CNN\\preds\\cnn', 'EMB': 'C:\\Users\\Clarice Shim\\Desktop\\COS30049 Computing Technology Innovation project\\COS30047_Session7_Group4\\SUBMISSION\\Models\\CNN\\embeddings\\cnn'}


In [2]:
# 01_load_artifacts — model, vectorizer, uniform vectorize() helper

# The vocab.txt fallback below parses metadata.json; no earlier cell imports json,
# so import it here to keep this cell runnable on a fresh kernel.
import json

# Load best model
model = keras.models.load_model(MODELS / "cnn_best.keras")

# Try SavedModel dir → .keras file → rebuild from vocab.txt
vec = None
vec_dir  = MODELS / "text_vectorizer_model"
vec_file = MODELS / "text_vectorizer_model.keras"
vocab_txt = MODELS / "vocab.txt"

if vec_dir.exists():
    vec = tf.saved_model.load(str(vec_dir))
    print("Loaded vectorizer (SavedModel dir).")
elif vec_file.exists():
    vec = keras.models.load_model(vec_file)
    print("Loaded vectorizer (.keras file).")
else:
    assert vocab_txt.exists(), "No vectorizer found; vocab.txt is also missing."
    # Rebuild from vocab (+ seq_len from metadata if present)
    seq_len = 300
    meta_fp = MODELS / "metadata.json"
    if meta_fp.exists():
        try:
            seq_len = int(json.loads(meta_fp.read_text(encoding="utf-8")).get("seq_len", 300))
        except (OSError, ValueError, TypeError, AttributeError) as err:
            # Unreadable/invalid metadata: keep the default, but make the fallback visible.
            print(f"WARN: could not read seq_len from {meta_fp} ({err!r}); using {seq_len}.")
    tv = keras.layers.TextVectorization(
        max_tokens=None,
        output_mode="int",
        output_sequence_length=seq_len,
        standardize="lower_and_strip_punctuation",
    )
    vocab = vocab_txt.read_text(encoding="utf-8").splitlines()
    tv.set_vocabulary(vocab)
    # Wrap: (batch,1) strings -> ids, so this branch accepts the same input shape
    # that vectorize_strings() feeds to the loaded vectorizers.
    string_in = keras.Input(shape=(1,), dtype="string")
    squeeze   = keras.layers.Lambda(lambda t: tf.squeeze(t, axis=1))(string_in)
    ids_out   = tv(squeeze)
    vec = keras.Model(string_in, ids_out)
    print("Rebuilt vectorizer from vocab.txt")

def vectorize_strings(batch_strings):
    """Convert a batch of raw strings to token ids using the loaded ``vec``.

    The batch is turned into a tensor and given an extra axis at position 1
    before being passed on. When ``vec`` exposes ``signatures`` its
    ``serving_default`` signature is called and the first output is returned;
    otherwise ``vec`` is called directly.
    """
    column = tf.expand_dims(tf.convert_to_tensor(batch_strings), axis=1)  # (batch,1) strings

    if not hasattr(vec, "signatures"):
        # Keras Functional model: plain call
        return vec(column)

    # SavedModel dir: go through the serving signature and take its first output
    outputs = vec.signatures["serving_default"](column)
    return [*outputs.values()][0]

Loaded vectorizer (SavedModel dir).


In [4]:
# 02_load_data & datasets — builds ds_val and ds_test

import pandas as pd
import tensorflow as tf
import numpy as np

BATCH = 64
AUTOTUNE = tf.data.AUTOTUNE

# DATA is defined in the setup cell and points to <repo>/Final Data
train = pd.read_csv(DATA / "train.csv")
val   = pd.read_csv(DATA / "val.csv")
test  = pd.read_csv(DATA / "test.csv")

# Fail fast with a readable message if a split lacks the columns this notebook reads:
# "text" is vectorized for every split, "fake" is the label for val/test,
# and "id" is written to the test predictions CSV.
for split_name, split_df, needed_cols in (
    ("train", train, {"text"}),
    ("val",   val,   {"text", "fake"}),
    ("test",  test,  {"id", "text", "fake"}),
):
    missing_cols = needed_cols - set(split_df.columns)
    assert not missing_cols, f"{split_name}.csv is missing columns: {sorted(missing_cols)}"
    assert split_df["text"].notna().all(), f"{split_name}.csv contains rows with missing 'text'"

def ds_from_df(df):
    """Build a batched ``(token_ids, label)`` dataset from a split DataFrame.

    Reads the ``text`` and ``fake`` columns of ``df`` (labels cast to int32),
    batches rows by ``BATCH`` and vectorizes each batch of strings with
    ``vectorize_strings``. No shuffle is applied here.
    """
    texts = df["text"].values
    labels = df["fake"].values.astype("int32")

    def _vectorize_batch(text_batch, label_batch):
        # Labels pass through untouched; only the strings are transformed.
        return vectorize_strings(text_batch), label_batch

    batched = tf.data.Dataset.from_tensor_slices((texts, labels)).batch(BATCH)
    vectorized = batched.map(_vectorize_batch, num_parallel_calls=AUTOTUNE)
    return vectorized.prefetch(AUTOTUNE)

# Evaluation datasets for the validation and test splits
ds_val, ds_test = ds_from_df(val), ds_from_df(test)

# Pull one validation batch: running it proves the pipeline works end to end
peek_batch = next(iter(ds_val.take(1)))
ids_b, y_b = peek_batch
print("Vectorized batch:", ids_b.shape, "| labels:", y_b.shape)


Vectorized batch: (64, 300) | labels: (64,)


In [5]:
# 03_choose_threshold — sweep thresholds on val to maximize F1

from sklearn.metrics import precision_recall_fscore_support

# Validation probabilities from the model and the matching ground-truth labels
val_proba = model.predict(ds_val, verbose=0).ravel()
y_val = val["fake"].values


def _val_f1(threshold):
    """Binary F1 on val when predicting 1 wherever proba >= threshold."""
    hard_pred = (val_proba >= threshold).astype(int)
    scores = precision_recall_fscore_support(y_val, hard_pred, average="binary", zero_division=0)
    return scores[2]


# Candidate thresholds, scored one by one
ths = np.linspace(0.05, 0.95, 181)
f1_by_thr = [_val_f1(candidate) for candidate in ths]

# argmax returns the first maximum, so ties resolve to the lowest threshold
best_idx = int(np.argmax(f1_by_thr))
best_thr, best_f1 = ths[best_idx], f1_by_thr[best_idx]

print(f"Chosen threshold (F1-opt on val): {best_thr:.3f}")

Chosen threshold (F1-opt on val): 0.470


In [7]:
# 04_evaluate_on_test — AUC/Accuracy/Precision/Recall/F1 and save metrics JSON

import json
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# Score the test split, then binarize with the threshold chosen on validation
test_proba = model.predict(ds_test, verbose=0).ravel()
y_true = test["fake"].values
y_pred = (test_proba >= best_thr).astype(int)

auc = roc_auc_score(y_true, test_proba)
rep = classification_report(y_true, y_pred, digits=4, output_dict=True)

# Headline precision/recall/F1 are taken from the report entry keyed "1"
positive = rep["1"]
metrics = {
    "threshold": float(best_thr),
    "test_auc": float(auc),
    "accuracy": float(rep["accuracy"]),
    "precision": float(positive["precision"]),
    "recall": float(positive["recall"]),
    "f1": float(positive["f1-score"]),
    "full_report": rep,
}

# Persist everything, but only echo the headline numbers
metrics_fp = REPORT / "test_metrics.json"
metrics_fp.write_text(json.dumps(metrics, indent=2), encoding="utf-8")

headline_keys = ["threshold", "test_auc", "accuracy", "precision", "recall", "f1"]
headline = {key: metrics[key] for key in headline_keys}
print(json.dumps(headline, indent=2))
print("Saved:", metrics_fp)


{
  "threshold": 0.4699999999999999,
  "test_auc": 0.9629379277978923,
  "accuracy": 0.8999721370855391,
  "precision": 0.9033412887828163,
  "recall": 0.8848626534190532,
  "f1": 0.8940064954236788
}
Saved: reports\cnn\test_metrics.json


In [9]:
# 05_confusion_matrix — save PNG

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix  # if not already imported

# Pin the label order so the matrix is always 2x2 (rows = actual, cols = predicted),
# even if one class happens to be absent from y_true/y_pred.
class_labels = [0, 1]
cm = confusion_matrix(y_true, y_pred, labels=class_labels)

fig, ax = plt.subplots()
# Colormap is set explicitly because the text-colour rule below assumes
# viridis (dark for low counts, light for high counts).
im = ax.imshow(cm, cmap="viridis")
ax.set_title("Confusion Matrix (test)")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")

# Name the classes on both axes so the figure can be read on its own;
# the labels are the values of the "fake" column.
tick_positions = list(range(len(class_labels)))
tick_names = [f"fake={label}" for label in class_labels]
ax.set_xticks(tick_positions)
ax.set_xticklabels(tick_names)
ax.set_yticks(tick_positions)
ax.set_yticklabels(tick_names)
fig.colorbar(im, ax=ax)

# Annotate each cell with its count, choosing a text colour that contrasts
# with the cell background.
midpoint = cm.max() / 2
for (i, j), v in np.ndenumerate(cm):
    ax.text(j, i, str(v), ha="center", va="center",
            color="black" if v > midpoint else "white")

fig.tight_layout()
fig.savefig(REPORT / "confusion_matrix.png", dpi=160)
plt.close(fig)  # release the figure; only the PNG is needed
print("Saved:", REPORT / "confusion_matrix.png")


Saved: reports\cnn\confusion_matrix.png


In [10]:
# 06_save_predictions — per-row predictions CSV

# Start from the identifying columns of the test split, then attach
# the true labels, predicted probabilities and thresholded predictions.
preds_df = test[["id", "text"]].assign(
    y_true=y_true,
    proba=test_proba,
    y_pred=y_pred,
)

preds_fp = PRED/"test_preds.csv"
preds_df.to_csv(preds_fp, index=False, encoding="utf-8")
print("Saved:", preds_fp, f"({len(preds_df)} rows)")

Saved: preds\cnn\test_preds.csv (3589 rows)


In [11]:
# 07_export_embeddings — penultimate-layer features for clustering teammates

# Feature extractor that stops at the second-to-last layer of the classifier.
# NOTE(review): the original note describes that layer as Dense(128) before the sigmoid —
# confirm against the trained architecture and adjust the index if it changed.
penultimate_layer = model.layers[-2]
emb_model = keras.Model(inputs=model.input, outputs=penultimate_layer.output)

def embed_texts(df):
    """Return ``emb_model`` predictions for the ``text`` column of ``df``.

    Texts are batched by ``BATCH``, vectorized with ``vectorize_strings`` and
    passed to ``emb_model.predict``. No shuffle is applied here.
    """
    raw_text = tf.data.Dataset.from_tensor_slices(df["text"].values)
    token_ids = raw_text.batch(BATCH).map(vectorize_strings, num_parallel_calls=tf.data.AUTOTUNE)
    return emb_model.predict(token_ids.prefetch(tf.data.AUTOTUNE), verbose=0)

# One .npy file per split, written in train → val → test order
for emb_filename, split_df in (
    ("emb_train.npy", train),
    ("emb_val.npy", val),
    ("emb_test.npy", test),
):
    np.save(EMB/emb_filename, embed_texts(split_df))
print("Saved embeddings ->", EMB)


Saved embeddings -> embeddings\cnn
