In [None]:
# Mount Google Drive into the Colab filesystem (only run if not mounted).
# Later cells read the model and test files from, and write results to, paths under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


"""
============================================================
 test.py
 -----------------------------------------------------------
 PURPOSE:
   - Load trained DistilBERT model from local directory
   - Run inference on test dataset (IMDB or custom examples)
   - Save outputs (reports, metrics, misclassifications) to Drive
   - Ensure reproducibility and auditability for downstream governance

 AI SECURITY & GOVERNANCE:
   - Dataset Integrity: SHA256 hash of test CSV logged for tamper detection
   - Provenance: Training metadata.json included in test reports
   - Audit Logs: Human-readable + JSON reports generated
   - Environment Logging: Python, Torch, CUDA info saved
   - Misclassification Analysis: Saved for error inspection
   - Reproducibility: All outputs stored in model_dir/test_results/

 DEVELOPMENT NOTE:
   - For Colab, we temporarily load model + data from Drive.
   - Final deployment will use Docker with embedded model,
     ensuring offline reproducibility without Drive/HF dependencies.
============================================================
"""


In [None]:
# Set paths and verify the model folder exists
# Centralize locations and keep outputs inside the model folder for provenance

import os
from pathlib import Path
MODEL_DIR = Path("/content/drive/MyDrive/quick_distilbert_model")  # adjust if different
TEST_CSV  = MODEL_DIR / "imdb_test_clean.csv"                      # test dataset, stored inside the model folder
TEST_RESULTS_DIR = MODEL_DIR / "test_results"                      # where test outputs will be saved

# Verify the model folder BEFORE creating anything below it: mkdir(parents=True)
# would otherwise create a missing MODEL_DIR, and the "exists" check that follows
# could never report False.
if not MODEL_DIR.is_dir():
    raise FileNotFoundError(f"Model folder not found at {MODEL_DIR}")

# Create results folder if it doesn't exist
TEST_RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Sanity checks
print("Model folder exists:", MODEL_DIR.exists())
print("Test CSV exists:", TEST_CSV.exists())
print("Test results folder:", TEST_RESULTS_DIR)
print("Listing model folder sample files:", os.listdir(MODEL_DIR)[:20])


Model folder exists: True
Test CSV exists: True
Test results folder: /content/drive/MyDrive/quick_distilbert_model/test_results
Listing model folder sample files: ['imdb_test_clean.csv', 'config.json', 'model.safetensors', 'tokenizer_config.json', 'special_tokens_map.json', 'vocab.txt', 'tokenizer.json', 'training_args.bin', 'metadata.json', 'eval_outputs', 'test_results']


In [None]:
# Governance utilities
# These help with reproducibility and audit trails (dataset hashes, metadata, environment)

import json, hashlib, platform, datetime

def file_hash(path: Path) -> str:
    """Return the SHA-256 hex digest of the file at ``path``.

    The file is opened in binary mode and consumed in 8192-byte chunks, so it
    is never held in memory as a whole.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(8192)
            if not block:
                # An empty read means end of file.
                break
            digest.update(block)
    return digest.hexdigest()

def load_metadata(model_dir: Path):
    """Load ``metadata.json`` from ``model_dir``.

    Args:
        model_dir: Directory expected to contain ``metadata.json``.

    Returns:
        The parsed JSON content, or ``None`` when the file is missing or
        cannot be read/parsed (a ``[GOV WARN]`` line is printed in the
        latter case).
    """
    md = model_dir / "metadata.json"
    if not md.exists():
        return None
    try:
        # Decode explicitly as UTF-8 so parsing does not depend on the
        # locale's default encoding.
        return json.loads(md.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
        # OSError covers read failures.
        print(f"[GOV WARN] metadata.json exists but could not be parsed: {e}")
        return None

def env_info():
    """Collect a JSON-serializable snapshot of the runtime environment.

    Returns:
        dict with the Python version, torch version, the CUDA version torch
        reports (``None`` if it reports none), CUDA availability and device
        count, the platform string, and a UTC timestamp formatted as
        ``YYYY-MM-DDTHH:MM:SS[.ffffff]Z``.
    """
    # Imported locally so this function does not rely on another cell having
    # imported torch before it is called.
    import torch

    cuda_available = torch.cuda.is_available()
    # datetime.utcnow() is deprecated; take an aware UTC time and drop tzinfo
    # so isoformat() keeps the original "...Z" shape instead of "+00:00".
    now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    return {
        "python": platform.python_version(),
        "torch_version": str(torch.__version__),
        "torch_cuda_version": torch.version.cuda,
        "torch_cuda_available": cuda_available,
        "torch_cuda_device_count": torch.cuda.device_count() if cuda_available else 0,
        "platform": platform.platform(),
        "timestamp_utc": now_utc.isoformat() + "Z"
    }

# quick gov output: load the training metadata once and report whether any was found
metadata = load_metadata(MODEL_DIR)
print("Loaded metadata?:", bool(metadata))


Loaded metadata?: True


In [None]:
# Load tokenizer and model from the local model directory (local_files_only=True)
# Development uses the Drive copy; local_files_only keeps the HF hub out of the loop.

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Kept as a plain string: it is also written into the JSON reports later on.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("[INFO] Using device:", device)

# Both artifacts are read from the contents of MODEL_DIR
model_path = str(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained(model_path, local_files_only=True)
model.to(device)   # move weights to the selected device
model.eval()       # switch to evaluation mode
print("[INFO] Loaded tokenizer and model from Drive.")


[INFO] Using device: cuda
[INFO] Loaded tokenizer and model from Drive.


In [None]:
#  Prepare test inputs
# support running a full CSV or a small set of custom examples for smoke/adversarial checks

import pandas as pd

USE_CSV = True   # set False to run custom_examples below instead of CSV
custom_examples = [
    "I absolutely loved this movie, it was brilliant and moving.",
    "Worst movie ever. Boring plot and bad acting.",
    "",  # empty input test
    "<script>alert('x')</script>",  # markup/script-like input test
    "😊👍"  # emoji-only input test (restored from a mis-decoded UTF-8 literal)
]

if USE_CSV:
    if not TEST_CSV.exists():
        raise FileNotFoundError(f"Test CSV not found at {TEST_CSV}")
    df_test = pd.read_csv(TEST_CSV)
    # Ensure expected columns
    if "review" not in df_test.columns or "sentiment" not in df_test.columns:
        raise ValueError("Test CSV must contain 'review' and 'sentiment' columns.")
    # NOTE: astype(str) turns a missing review into the literal string "nan";
    # astype(int) raises if a sentiment value is missing or non-numeric.
    texts = df_test["review"].astype(str).tolist()
    true_labels = df_test["sentiment"].astype(int).tolist()
else:
    texts = custom_examples
    true_labels = [None] * len(texts)  # unknown labels for custom tests

print(f"[INFO] Number of test examples: {len(texts)}")


[INFO] Number of test examples: 10000


In [None]:
# Batched inference loop (runs on device)
# Efficient inference with batching; collects probs, preds, and logs

from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm

class SimpleTextDataset(Dataset):
    """Dataset that tokenizes one text per item on access.

    Each item is a dict holding the tokenizer's outputs with their leading
    dimension squeezed away, plus the text itself (after ``str()``) under
    the ``"text"`` key.
    """

    def __init__(self, texts, tokenizer, max_length=256):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        sample = {}
        for name, tensor in encoded.items():
            # squeeze(0) drops the leading dimension of each returned tensor
            sample[name] = tensor.squeeze(0)
        sample["text"] = text
        return sample

BATCH_SIZE = 32
dataset = SimpleTextDataset(texts, tokenizer, max_length=256)
# shuffle=False keeps predictions in the same order as `texts` / `true_labels`
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

softmax = torch.nn.Softmax(dim=1)

# Accumulators filled batch by batch, in dataset order
all_preds, all_probs, all_texts = [], [], []

with torch.no_grad():
    for batch in tqdm(dataloader, desc="Running Tests"):
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        )
        # Class probabilities, brought back to the CPU as a NumPy array
        probs = softmax(outputs.logits).cpu().numpy()
        all_preds += np.argmax(probs, axis=1).tolist()
        all_probs += probs.tolist()
        all_texts += batch["text"]


Running Tests: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 313/313 [01:17<00:00,  4.01it/s]


In [None]:
# Metrics and report creation
# compute metrics if labels exist, otherwise produce inference-only log

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
import pandas as pd

# Labels count as present only if there is at least one and none is None.
# all() on an empty list is True, which would send an empty run down the metrics path.
has_labels = bool(true_labels) and all(label is not None for label in true_labels)

if has_labels:
    # Predictions and labels must line up one-to-one (CSV case)
    assert len(true_labels) == len(all_preds)
    acc = accuracy_score(true_labels, all_preds)
    prec, rec, f1, _ = precision_recall_fscore_support(true_labels, all_preds, average="binary", zero_division=0)
    class_report = classification_report(true_labels, all_preds, digits=4, zero_division=0)
    cm = confusion_matrix(true_labels, all_preds)
else:
    acc = prec = rec = f1 = None
    class_report = None
    cm = None

# Build test summary for human + audit
# datetime.utcnow() is deprecated; take an aware UTC time and drop tzinfo so the
# string keeps the "...Z" shape that is also used in the output file names.
test_timestamp = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat() + "Z"
test_summary_txt = f"Test run timestamp: {test_timestamp}\nModel dir: {MODEL_DIR}\nDevice: {device}\n"
if has_labels:
    test_summary_txt += f"Accuracy: {acc:.4f}\nPrecision: {prec:.4f}\nRecall: {rec:.4f}\nF1: {f1:.4f}\n\n"
    test_summary_txt += "Classification report:\n" + class_report + "\n"
    test_summary_txt += "Confusion matrix:\n" + (str(cm) if cm is not None else "None") + "\n"
else:
    test_summary_txt += "No true labels provided (inference-only run).\n"

# Structured JSON summary (machine-readable)
test_summary_json = {
    "timestamp_utc": test_timestamp,
    "model_dir": str(MODEL_DIR),
    "device": device,
    "num_examples": len(all_texts),
    "has_labels": has_labels,
    "metrics": {
        "accuracy": float(acc) if acc is not None else None,
        "precision": float(prec) if prec is not None else None,
        "recall": float(rec) if rec is not None else None,
        "f1": float(f1) if f1 is not None else None
    },
    "env": env_info(),
    "metadata_present": bool(metadata),
    "metadata": metadata if metadata else None,
    # Record the CSV hash only when the CSV was actually the input of this run;
    # a hash next to custom-example results would misstate what was tested.
    "test_csv_hash": file_hash(TEST_CSV) if (USE_CSV and TEST_CSV.exists()) else None
}


  test_timestamp = datetime.datetime.utcnow().isoformat() + "Z"
  "timestamp_utc": datetime.datetime.utcnow().isoformat() + "Z"


In [None]:
# Save outputs to Drive inside the model folder
# centralize model + evaluation + test artifacts for reproducibility

import csv, json
out_dir = TEST_RESULTS_DIR
out_dir.mkdir(parents=True, exist_ok=True)

# 1) human-readable txt (fixed file name: overwritten on every run)
txt_path = out_dir / "test_report.txt"
# Explicit UTF-8 so the report's bytes do not depend on the locale default.
with open(txt_path, "w", encoding="utf-8") as f:
    f.write(test_summary_txt)
print(f"[INFO] Saved human-readable test report -> {txt_path}")

# 2) structured JSON (timestamped; ':' replaced to keep the file name portable)
json_path = out_dir / f"test_report_{test_timestamp.replace(':','-')}.json"
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(test_summary_json, f, indent=2)
print(f"[INFO] Saved structured test report -> {json_path}")

# 3) predictions CSV (all inputs, preds, prob scores, optional true labels)
preds_df = pd.DataFrame({
    "text": all_texts,
    "pred_label": all_preds,
    # one comma-separated string of per-class probabilities per row
    "pred_scores": [", ".join([f"{s:.4f}" for s in p]) for p in all_probs]
})
if has_labels:
    preds_df["true_label"] = true_labels

preds_csv_path = out_dir / "test_predictions.csv"
preds_df.to_csv(preds_csv_path, index=False)
print(f"[INFO] Saved predictions CSV -> {preds_csv_path}")

# 4) misclassified examples (if labels available)
if has_labels:
    mis_df = preds_df[preds_df["true_label"] != preds_df["pred_label"]].copy()
    mis_csv_path = out_dir / "test_misclassified.csv"
    mis_df.to_csv(mis_csv_path, index=False)
    print(f"[INFO] Saved misclassified examples -> {mis_csv_path}")
else:
    # Em dash restored from a mis-decoded UTF-8 sequence in the original message.
    print("[INFO] No true labels available — skipping misclassified examples file.")


[INFO] Saved human-readable test report -> /content/drive/MyDrive/quick_distilbert_model/test_results/test_report.txt
[INFO] Saved structured test report -> /content/drive/MyDrive/quick_distilbert_model/test_results/test_report_2025-09-13T12-42-44.187470Z.json
[INFO] Saved predictions CSV -> /content/drive/MyDrive/quick_distilbert_model/test_results/test_predictions.csv
[INFO] Saved misclassified examples -> /content/drive/MyDrive/quick_distilbert_model/test_results/test_misclassified.csv


In [None]:
# Save a compact run log with sample entries (JSON)
# one artifact bundling up to 20 sample predictions with the full structured report

sample_predictions = []
for idx in range(min(20, len(all_texts))):
    label = true_labels[idx]
    sample_predictions.append({
        "text": all_texts[idx],
        "pred_label": int(all_preds[idx]),
        "pred_scores": all_probs[idx],
        # labels are None for inference-only runs
        "true_label": None if label is None else int(label),
    })

run_log = {
    "timestamp_utc": test_timestamp,
    "num_examples": len(all_texts),
    "sample_predictions": sample_predictions,
    "report_json": test_summary_json
}

# Same timestamp-derived suffix as the structured report file
run_log_name = f"test_run_log_{test_timestamp.replace(':','-')}.json"
run_log_path = out_dir / run_log_name
with open(run_log_path, "w") as f:
    json.dump(run_log, f, indent=2)
print(f"[INFO] Saved run log -> {run_log_path}")


[INFO] Saved run log -> /content/drive/MyDrive/quick_distilbert_model/test_results/test_run_log_2025-09-13T12-42-44.187470Z.json


In [None]:
# Print summary and show saved files

print("=== TEST RUN COMPLETE ===")
print("Saved files in", TEST_RESULTS_DIR)
print(list(TEST_RESULTS_DIR.iterdir()))
# show the first 2000 characters of the human-readable report
print("\n--- Report preview ---\n")
# Path.read_text opens and closes the file itself; the bare open(...).read()
# it replaces left the file handle unclosed.
print(txt_path.read_text(encoding="utf-8")[:2000])


=== TEST RUN COMPLETE ===
Saved files in /content/drive/MyDrive/quick_distilbert_model/test_results
[PosixPath('/content/drive/MyDrive/quick_distilbert_model/test_results/test_report.txt'), PosixPath('/content/drive/MyDrive/quick_distilbert_model/test_results/test_report_2025-09-13T12-42-44.187470Z.json'), PosixPath('/content/drive/MyDrive/quick_distilbert_model/test_results/test_predictions.csv'), PosixPath('/content/drive/MyDrive/quick_distilbert_model/test_results/test_misclassified.csv'), PosixPath('/content/drive/MyDrive/quick_distilbert_model/test_results/test_run_log_2025-09-13T12-42-44.187470Z.json')]

--- Report preview ---

Test run timestamp: 2025-09-13T12:42:44.187470Z
Model dir: /content/drive/MyDrive/quick_distilbert_model
Device: cuda
Accuracy: 0.8249
Precision: 0.8053
Recall: 0.8570
F1: 0.8303

Classification report:
              precision    recall  f1-score   support

           0     0.8472    0.7928    0.8191      5000
           1     0.8053    0.8570    0.8303   