# Reference Training Data & Stage 2 Gold Scaffold Builder

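This notebook converts a Word document of references into Stage 1 training data and Stage 2 gold annotation scaffolds.

Each reference must be its own paragraph and end with its type annotation, written as `[label]`, `(label)` or `type: label`. Several labels may be given in one annotation, separated by `,`, `|` or `/`. Paragraphs without a trailing annotation are ignored.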

In [None]:

# Install dependencies (run once). %pip installs into the environment of the running kernel.
# %pip install python-docx pandas scikit-learn


In [None]:

from docx import Document
import pandas as pd
import re
import json
import uuid
from pathlib import Path
from sklearn.model_selection import train_test_split
from datetime import datetime


## Configuration

In [None]:

# Word document to read and directory that receives all generated files.
INPUT_DOCX = "references.docx"
OUTPUT_DIR = "training_data"

# Fractions of the labelled rows assigned to each split.
TRAIN_RATIO = 0.7
VAL_RATIO = 0.15
TEST_RATIO = 0.15

# The split step derives its sizes from these values on the assumption that
# they form a complete partition, so fail fast here if they do not.
assert all(0 < ratio < 1 for ratio in (TRAIN_RATIO, VAL_RATIO, TEST_RATIO)), \
    "Each split ratio must be strictly between 0 and 1"
assert abs(TRAIN_RATIO + VAL_RATIO + TEST_RATIO - 1.0) < 1e-9, \
    "TRAIN_RATIO + VAL_RATIO + TEST_RATIO must sum to 1"

# Known label misspellings mapped to the canonical label.
# Keys are looked up after the label has been stripped and lower-cased.
LABEL_FIXES = {
    "wesite": "website"
}


## Load Word Document & Extract Labels

In [None]:

# Matches a type annotation at the very end of a reference paragraph, in one of
# three spellings (case-insensitive); exactly one named group is populated:
#   t1: "[label]"      t2: "(label)"      t3: "type: label"
#
# `search` returns the leftmost match, so the "type:" form is guarded by a
# lookbehind: "type" must not be the tail of a longer word. Without the guard a
# title such as "Prototype: a study [journal]" would match from "type:" and
# yield "a study [journal]" as the label. The compound "ArticleType:" is still
# accepted, as is "article_type:" (underscore is not a letter).
ARTICLE_TYPE_REGEX = re.compile(
    r"(?:\[\s*(?P<t1>[^\]]+)\s*\]|\(\s*(?P<t2>[^\)]+)\s*\)"
    r"|(?:(?<![a-z])|(?<=article))type\s*:\s*(?P<t3>.+))$",
    re.IGNORECASE
)

# Read the Word document paragraph by paragraph and turn every paragraph that
# ends in a type annotation into one Stage 1 row per label.
doc = Document(INPUT_DOCX)
rows = []
skipped_paragraphs = []  # non-empty paragraphs that produced no row, kept for inspection

for para in doc.paragraphs:
    text = para.text.strip()
    if not text:
        continue

    match = ARTICLE_TYPE_REGEX.search(text)
    if not match:
        skipped_paragraphs.append(text)
        continue

    # Each alternative of the pattern fills exactly one named group.
    label_block = next(v for v in match.groupdict().values() if v)

    # Normalise each label (trim, lower-case, apply spelling fixes).
    # dict.fromkeys drops repeats while keeping first-seen order, so an
    # annotation such as "[journal, journal]" does not emit duplicate rows.
    labels = list(dict.fromkeys(
        LABEL_FIXES.get(lbl.strip().lower(), lbl.strip().lower())
        for lbl in re.split(r"[,\|/]", label_block)
        if lbl.strip()
    ))

    # The pattern is anchored at the end of the (already stripped) text, so the
    # reference itself is everything before the match.
    raw_reference = text[:match.start()].strip()

    # A paragraph that is only an annotation, or whose annotation holds no
    # usable label, carries no training signal; do not emit an empty row.
    if not raw_reference or not labels:
        skipped_paragraphs.append(text)
        continue

    # One row per label: a multi-label reference appears in several rows.
    for label in labels:
        rows.append({
            "raw_reference": raw_reference,
            "stage_1_label": label,
            "confidence": 1.0
        })

print(
    f"Parsed {len(rows)} labelled rows; "
    f"skipped {len(skipped_paragraphs)} paragraphs without a usable label or reference text."
)

df = pd.DataFrame(rows)
assert not df.empty, "No labelled references found"

df.head()


## Train / Val / Test Split

In [None]:

SPLIT_SEED = 42  # fixed so that re-running the notebook reproduces the same partition


def _require_stratifiable(labels, stage):
    """Raise a descriptive ValueError if any label in `labels` occurs exactly once.

    A stratified split cannot place a single-row class on both sides; checking
    here lets the error name the offending labels.
    """
    counts = labels.value_counts()
    singletons = sorted(counts[counts == 1].index)
    if singletons:
        raise ValueError(
            f"Cannot stratify the {stage} split: these labels have only one row: {singletons}. "
            "Add more examples or remove the labels before splitting."
        )


# First cut: hold out everything that is not training data.
_require_stratifiable(df["stage_1_label"], "train/holdout")
train_df, temp_df = train_test_split(
    df,
    test_size=(1 - TRAIN_RATIO),
    stratify=df["stage_1_label"],
    random_state=SPLIT_SEED
)

# Second cut: divide the held-out rows between validation and test. The test
# share is expressed relative to the held-out portion, not the full data set.
_require_stratifiable(temp_df["stage_1_label"], "val/test")
val_df, test_df = train_test_split(
    temp_df,
    test_size=TEST_RATIO / (VAL_RATIO + TEST_RATIO),
    stratify=temp_df["stage_1_label"],
    random_state=SPLIT_SEED
)

assert len(train_df) + len(val_df) + len(test_df) == len(df), "Splits do not partition the data"

len(train_df), len(val_df), len(test_df)


## Stage 2 Gold Annotation Scaffolding

In [None]:

# Field names to scaffold for each reference type. Types that are not listed
# here get no Stage 2 record.
STAGE2_FIELDS_BY_TYPE = {
    "journal": ["authors", "year", "title", "journal", "volume", "issue", "pages", "doi"],
    "book": ["authors", "year", "title", "publisher", "place", "edition", "isbn"],
    "website": ["organisation", "year", "title", "url", "access_date"],
}


def build_stage2_gold(df):
    """Create blank Stage 2 gold-annotation records for the rows of `df`.

    Args:
        df: DataFrame with "raw_reference" and "stage_1_label" columns.

    Returns:
        A list of dicts, one per row whose label is a key of
        STAGE2_FIELDS_BY_TYPE, in row order. Each record has a fresh random
        UUID string as "reference_id", every field for its type set to
        {"value": None, "confidence": None}, and annotation metadata marked
        as not yet annotated. Rows with any other label are omitted.
    """

    def _blank_record(raw_reference, ref_type):
        # A separate dict per field so annotators can fill them independently.
        empty_fields = {
            name: dict(value=None, confidence=None)
            for name in STAGE2_FIELDS_BY_TYPE[ref_type]
        }
        return {
            "reference_id": str(uuid.uuid4()),
            "raw_reference": raw_reference,
            "reference_type": ref_type,
            "stage2_gold": {"fields": empty_fields},
            "annotation_meta": dict(
                annotated=False,
                annotator_id=None,
                annotated_at=None,
                notes="",
            ),
        }

    return [
        _blank_record(row["raw_reference"], row["stage_1_label"])
        for _, row in df.iterrows()
        if row["stage_1_label"] in STAGE2_FIELDS_BY_TYPE
    ]

# Build one scaffold list per split, in train / val / test order.
stage2_gold_train, stage2_gold_val, stage2_gold_test = (
    build_stage2_gold(split_frame) for split_frame in (train_df, val_df, test_df)
)

# Display the number of scaffold records per split.
tuple(len(records) for records in (stage2_gold_train, stage2_gold_val, stage2_gold_test))


## Save Outputs

In [None]:

# Write the Stage 1 CSV and the Stage 2 JSON scaffold for every split.
output_dir = Path(OUTPUT_DIR)
# parents=True so that a nested OUTPUT_DIR (e.g. "out/run1") is created in full.
output_dir.mkdir(parents=True, exist_ok=True)

split_outputs = {
    "train": (train_df, stage2_gold_train),
    "val": (val_df, stage2_gold_val),
    "test": (test_df, stage2_gold_test),
}

for split_name, (stage1_frame, stage2_records) in split_outputs.items():
    stage1_frame.to_csv(output_dir / f"stage1_{split_name}.csv", index=False)

    with open(output_dir / f"stage2_gold_{split_name}.json", "w", encoding="utf-8") as f:
        json.dump(stage2_records, f, indent=2)

print("Saved all outputs.")
