In [None]:
# author: Clarice Shim
# 00_setup (train)

import os, random, sys
from pathlib import Path
import numpy as np

# quieter TF logs + reproducibility
# TF_CPP_MIN_LOG_LEVEL has to be in the environment before TensorFlow is imported below.
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")
SEED = 42
random.seed(SEED); np.random.seed(SEED)
# NOTE: setting PYTHONHASHSEED here cannot change hashing in the already-running
# kernel; it is only inherited by processes started from this notebook.
os.environ["PYTHONHASHSEED"] = str(SEED)

# Set to False to let TensorFlow use any available GPU.
FORCE_CPU = True

# ---- Paths ----
MODEL_NAME = "cnn"

# Notebook CWD = <repo>/Models/CNN
# CSVs live     = <repo>/Final Data/*.csv   (go up two levels, then into Final Data)
DATA   = Path("../..") / "Final Data"

# Outputs stay inside Models/CNN/
MODELS = Path("models") / MODEL_NAME        # Models/CNN/models/cnn/...
REPORT = Path("reports") / MODEL_NAME       # Models/CNN/reports/cnn/...
PRED   = Path("preds") / MODEL_NAME         # Models/CNN/preds/cnn/...

for p in [MODELS, REPORT, PRED]:
    p.mkdir(parents=True, exist_ok=True)

# Sanity checks (train needs train/val)
assert (DATA / "train.csv").exists() and (DATA / "val.csv").exists(), "Missing train/val CSVs. Fix DATA path."

# ---- TensorFlow imports ----
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Seed Python `random`, NumPy and the TensorFlow backend in one call. Without this,
# TensorFlow itself (e.g. tf.data shuffling) stays unseeded and runs are not repeatable.
keras.utils.set_random_seed(SEED)

# (optional) force CPU if needed
if FORCE_CPU:
    try:
        tf.config.set_visible_devices([], "GPU")
    except Exception as exc:
        # Best effort: keep going, but say so instead of failing silently.
        print("Could not hide GPUs; TensorFlow may still use them:", exc)

print("CWD:", Path.cwd())
print("TF:", tf.__version__, "| Keras:", keras.__version__)
print("DATA:", (DATA).resolve())
print("Will write to:", {"MODELS": str(MODELS.resolve()), "REPORT": str(REPORT.resolve()), "PRED": str(PRED.resolve())})


CWD: c:\Users\Clarice Shim\Desktop\COS30049 Computing Technology Innovation project\COS30047_Session7_Group4\SUBMISSION\Models\CNN
TF: 2.19.0 | Keras: 3.11.3
DATA: C:\Users\Clarice Shim\Desktop\COS30049 Computing Technology Innovation project\COS30047_Session7_Group4\SUBMISSION\Final Data
Will write to: {'MODELS': 'C:\\Users\\Clarice Shim\\Desktop\\COS30049 Computing Technology Innovation project\\COS30047_Session7_Group4\\SUBMISSION\\Models\\CNN\\models\\cnn', 'REPORT': 'C:\\Users\\Clarice Shim\\Desktop\\COS30049 Computing Technology Innovation project\\COS30047_Session7_Group4\\SUBMISSION\\Models\\CNN\\reports\\cnn', 'PRED': 'C:\\Users\\Clarice Shim\\Desktop\\COS30049 Computing Technology Innovation project\\COS30047_Session7_Group4\\SUBMISSION\\Models\\CNN\\preds\\cnn'}


In [3]:
# 01_load_data
import pandas as pd

# Nullable pandas dtypes: a missing value shows up as <NA> and is caught by the
# checks below instead of being coerced silently.
CSV_DTYPES = {"id": "string", "text": "string", "fake": "Int8"}

train = pd.read_csv(DATA / "train.csv", dtype=CSV_DTYPES)
val   = pd.read_csv(DATA / "val.csv",   dtype=CSV_DTYPES)

for df, name in [(train,"train"), (val,"val")]:
    assert {"id","text","fake"}.issubset(df.columns), f"Columns missing in {name}"
    # Fail fast on empty text: a null would otherwise only blow up later, when the
    # column is converted to a TensorFlow string tensor.
    assert df["text"].isna().sum()==0, f"Missing text in {name}"
    assert df["fake"].isna().sum()==0, f"Missing labels in {name}"
    assert set(df["fake"].unique()) <= {0,1}, f"Unexpected labels in {name}: {df['fake'].unique()}"

# Labels are verified to be non-null 0/1 above, so the plain uint8 cast is safe.
train["fake"] = train["fake"].astype("uint8")
val["fake"]   = val["fake"].astype("uint8")

print(f"Train: {train.shape}  Val: {val.shape}")
print("Train balance:", train["fake"].value_counts(normalize=True).round(4).to_dict())
print("Val   balance:", val["fake"].value_counts(normalize=True).round(4).to_dict())

Train: (12918, 3)  Val: (1436, 3)
Train balance: {0: 0.5233, 1: 0.4767}
Val   balance: {0: 0.5237, 1: 0.4763}


In [4]:
# 02_text_vectorizer
# Tokenizer hyper-parameters (BATCH is the batch size used by later cells).
VOCAB_SIZE, SEQ_LEN, BATCH = 30_000, 300, 64

vectorizer_config = {
    "max_tokens": VOCAB_SIZE,
    "output_mode": "int",
    "output_sequence_length": SEQ_LEN,
    "standardize": "lower_and_strip_punctuation",
}
text_vec = layers.TextVectorization(**vectorizer_config)

# The vocabulary is learned from the TRAIN split only.
train_texts = train["text"].values
text_vec.adapt(train_texts)

# Persist the learned vocabulary, one token per line, so it can be rebuilt later.
vocab = text_vec.get_vocabulary()
vocab_path = MODELS / "vocab.txt"
vocab_path.write_text("\n".join(vocab), encoding="utf-8")
print("Vectorizer ready. Vocab size:", len(vocab))


Vectorizer ready. Vocab size: 21087


In [5]:
# 03_build_datasets
AUTOTUNE = tf.data.AUTOTUNE

def make_ds(df, shuffle=False):
    """Build a batched, vectorized tf.data pipeline from a DataFrame.

    Args:
        df: DataFrame with a "text" column (raw strings) and a "fake" column (labels).
        shuffle: When True, examples are reshuffled on every pass over the dataset.

    Returns:
        A tf.data.Dataset yielding (token_ids, labels) batches of up to BATCH examples,
        where token_ids is the output of `text_vec` and labels are int32.
    """
    texts = df["text"].values
    labels = df["fake"].values.astype("int32")
    ds = tf.data.Dataset.from_tensor_slices((texts, labels))

    # Vectorize in batches, then return to single examples so that the cache
    # stores per-example tensors rather than fixed batches.
    ds = ds.batch(BATCH).map(lambda x, y: (text_vec(x), y), num_parallel_calls=AUTOTUNE)
    ds = ds.unbatch().cache()

    # Shuffle AFTER the cache: caching a shuffled dataset replays the first
    # epoch's order forever, which defeats reshuffle_each_iteration.
    if shuffle:
        # The shuffle buffer size must be positive, even for an empty frame.
        ds = ds.shuffle(max(len(df), 1), reshuffle_each_iteration=True)

    # prefetch goes last so batch preparation overlaps with the consumer.
    return ds.batch(BATCH).prefetch(AUTOTUNE)

# Training data is shuffled; validation data keeps file order (shuffle defaults to False).
ds_train = make_ds(train, shuffle=True)
ds_val = make_ds(val)

# Pull a single batch as a smoke test of the pipeline's output shapes.
first_batch = ds_train.take(1)
x_b, y_b = next(iter(first_batch))
print("Batch ids:", x_b.shape, "labels:", y_b.shape)

Batch ids: (64, 300) labels: (64,)


In [6]:
# 04_define_model
def build_cnn(vocab_size=VOCAB_SIZE, seq_len=SEQ_LEN, emb_dim=128, filters=192, kernel=5, dense_units=128, dropout=0.3):
    """Assemble the (uncompiled) 1-D CNN binary text classifier.

    Pipeline: token ids -> embedding -> Conv1D (ReLU, "same" padding) -> global
    max pooling -> dropout -> dense (ReLU) -> single sigmoid unit.

    Args:
        vocab_size: Input dimension of the embedding table.
        seq_len: Number of token ids per example.
        emb_dim: Embedding vector size.
        filters: Number of convolution filters.
        kernel: Convolution kernel width.
        dense_units: Width of the hidden dense layer.
        dropout: Dropout rate applied after pooling.

    Returns:
        A keras.Model named "cnn_text_classifier".
    """
    token_ids = layers.Input(shape=(seq_len,), dtype="int32", name="ids")

    embedded = layers.Embedding(vocab_size, emb_dim, name="embedding")(token_ids)
    convolved = layers.Conv1D(
        filters,
        kernel,
        padding="same",
        activation="relu",
        name="conv",
    )(embedded)
    pooled = layers.GlobalMaxPooling1D(name="gmp")(convolved)
    regularized = layers.Dropout(dropout, name="drop")(pooled)
    hidden = layers.Dense(dense_units, activation="relu", name="dense")(regularized)
    probability = layers.Dense(1, activation="sigmoid", name="out")(hidden)

    return keras.Model(token_ids, probability, name="cnn_text_classifier")

# Instantiate with the default hyper-parameters and compile for binary classification.
model = build_cnn()
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss="binary_crossentropy",
    # The metric name "auc" is what the callbacks refer to as "val_auc".
    metrics=[keras.metrics.AUC(name="auc"), "accuracy"],
)
model.summary()

In [11]:
# 06_callbacks_and_weights  (run this BEFORE the fit cell)
from tensorflow import keras
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# ── Callbacks ─────────────────────────────────────────────────────────────────
# Checkpointing and early stopping both track validation AUC (higher is better);
# the monitored name must match the AUC metric name given at compile time.
auc_watch = {"monitor": "val_auc", "mode": "max"}
best_model_path = MODELS / "cnn_best.keras"

ckpt = keras.callbacks.ModelCheckpoint(
    filepath=str(best_model_path),
    save_best_only=True,
    verbose=1,
    **auc_watch,
)
early = keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True, **auc_watch)
# Learning-rate decay follows validation loss rather than AUC.
reduce = keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=1,
    min_lr=1e-6,
    verbose=1,
)
csvlog = keras.callbacks.CSVLogger(str(REPORT / "train_log.csv"))

callbacks = [ckpt, early, reduce, csvlog]

# ── Class weights (optional; comment out if you don't want weighting) ────────
# Computed from the training labels held in the `train` DataFrame.
y_train = train["fake"].values.astype(int)

w = compute_class_weight(class_weight="balanced", classes=np.array([0, 1]), y=y_train)
# Keras expects a plain {class_index: weight} mapping of Python floats.
class_weight = {label: float(weight) for label, weight in zip((0, 1), w)}

print("Callbacks ready. Class weights:", class_weight)


Callbacks ready. Class weights: {0: 0.955473372781065, 1: 1.048879506333225}


In [12]:
# 07_train
# Fit on the training pipeline and validate on the held-out split after each epoch.
fit_options = {
    "validation_data": ds_val,
    "epochs": 12,
    "callbacks": callbacks,
    "class_weight": class_weight,   # or set to None if you don't want weighting
    "verbose": 1,
}
history = model.fit(ds_train, **fit_options)

# Best per-epoch validation scores; falls back to 0.0 when a metric was not logged.
logged = history.history
best_val_auc = float(max(logged.get("val_auc", [0.0])))
best_val_acc = float(max(logged.get("val_accuracy", [0.0])))
print({"best_val_auc": best_val_auc, "best_val_acc": best_val_acc})



Epoch 1/12
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step - accuracy: 0.7229 - auc: 0.8077 - loss: 0.5206
Epoch 1: val_auc improved from None to 0.95917, saving model to models\cnn\cnn_best.keras
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 124ms/step - accuracy: 0.8159 - auc: 0.9057 - loss: 0.3912 - val_accuracy: 0.9004 - val_auc: 0.9592 - val_loss: 0.2586 - learning_rate: 0.0010
Epoch 2/12
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step - accuracy: 0.9275 - auc: 0.9786 - loss: 0.1830
Epoch 2: val_auc improved from 0.95917 to 0.96142, saving model to models\cnn\cnn_best.keras

Epoch 2: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 115ms/step - accuracy: 0.9501 - auc: 0.9887 - loss: 0.1342 - val_accuracy: 0.8969 - val_auc: 0.9614 - val_loss: 0.2878 - learning_rate: 0.0010
Epoch 3/12
[1m202/202[0m [32m━━━━━━━━━━━━━━━━

In [13]:
# 08_save_artifacts
# Persist the model as it stands after training (the checkpoint callback has
# already written the best-scoring one separately).
final_path = MODELS / "cnn_final.keras"
model.save(final_path)

# Wrap the fitted TextVectorization layer in a small model that accepts a
# (batch, 1) string input, and export it in SavedModel directory format.
string_input = keras.Input(shape=(1,), dtype="string")
squeezed = layers.Lambda(lambda t: tf.squeeze(t, axis=1))(string_input)
vec_out = text_vec(squeezed)
vec_model = keras.Model(string_input, vec_out, name="text_vectorizer_model")
vec_dir = MODELS / "text_vectorizer_model"
tf.saved_model.save(vec_model, str(vec_dir))  # directory format (Windows-safe)

# Metadata: run settings, best validation scores, and where each artifact lives.
import json

artifact_locations = {
    "best": MODELS / "cnn_best.keras",
    "final": final_path,
    "vectorizer_model_dir": vec_dir,
    "vocab_txt": MODELS / "vocab.txt",
}
metadata = {
    "seed": SEED,
    "vocab_size": len(text_vec.get_vocabulary()),
    "seq_len": int(SEQ_LEN),
    "batch": int(BATCH),
    "best_val_auc": best_val_auc,
    "best_val_acc": best_val_acc,
    # Stored as absolute paths, resolved against the notebook's working directory.
    "paths": {label: str(location.resolve()) for label, location in artifact_locations.items()},
}
metadata_json = json.dumps(metadata, indent=2)
(MODELS / "metadata.json").write_text(metadata_json, encoding="utf-8")
print("Saved:", final_path, "| vec dir:", vec_dir, "| metadata.json")


INFO:tensorflow:Assets written to: models\cnn\text_vectorizer_model\assets


INFO:tensorflow:Assets written to: models\cnn\text_vectorizer_model\assets


Saved: models\cnn\cnn_final.keras | vec dir: models\cnn\text_vectorizer_model | metadata.json


In [14]:
# 09_plot_learning_curves
import matplotlib.pyplot as plt

def plot_history(h, key, title=None):
    """Plot one training metric (and its validation twin, if logged) and save it as a PNG.

    Args:
        h: Object exposing a `.history` dict of per-epoch metric lists
            (as returned by `model.fit`).
        key: Metric name to plot, e.g. "loss"; "val_<key>" is added when present.
        title: Optional figure title.

    Returns:
        Path of the written image: REPORT / "curve_<key>.png".
    """
    # Explicit figure/axes instead of pyplot's implicit "current figure" state.
    fig, ax = plt.subplots()
    try:
        ax.plot(h.history.get(key, []), label=f"train_{key}")
        val_key = f"val_{key}"
        if val_key in h.history:
            ax.plot(h.history[val_key], label=val_key)
        ax.set_xlabel("Epoch")
        ax.set_ylabel(key)
        if title:
            ax.set_title(title)
        ax.legend()
        ax.grid(True)
        out = REPORT / f"curve_{key}.png"
        fig.savefig(out, dpi=160, bbox_inches="tight")
    finally:
        # Always release the figure, even when plotting or saving fails;
        # otherwise it stays open in the kernel.
        plt.close(fig)
    return out

# One curve image per tracked metric; a failure on one metric is reported and
# does not stop the remaining plots.
outs = []
for metric_name in ("loss", "accuracy", "auc"):
    try:
        saved_to = plot_history(history, metric_name, f"Training curves: {metric_name}")
    except Exception as e:
        print(f"Could not plot {metric_name}: {e}")
    else:
        outs.append(str(saved_to))

print("Saved plots:", outs)

Saved plots: ['reports\\cnn\\curve_loss.png', 'reports\\cnn\\curve_accuracy.png', 'reports\\cnn\\curve_auc.png']
