### Step 1: Import libraries and set file paths


In [7]:
# Step 1: imports + paths + quick sanity checks
import os
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

# --- Updated correct paths ---
BASE_DIR = r"D:\Data Science Projects\Data Citation Intent Classification"
TRAIN_XML_DIR = os.path.join(BASE_DIR, "data", "raw", "train", "XML")
LABELS_PATH = os.path.join(BASE_DIR, "data", "raw", "train_labels.csv")
OUTPUT_PATH = os.path.join(BASE_DIR, "data", "processed", "train_labeled.csv")

# --- Quick checks ---
if not os.path.isdir(TRAIN_XML_DIR):
    raise FileNotFoundError(f"Cannot find XML directory: {TRAIN_XML_DIR}")
else:
    print("✅ Found XML path:", TRAIN_XML_DIR)

if not os.path.exists(LABELS_PATH):
    print("⚠️ Warning: train_labels.csv not found at", LABELS_PATH)
else:
    print("✅ Found labels file:", LABELS_PATH)

# Count XML files
xml_files = [f for f in os.listdir(TRAIN_XML_DIR) if f.lower().endswith(".xml")]
print(f"Found {len(xml_files)} XML files in {TRAIN_XML_DIR}")


✅ Found XML path: D:\Data Science Projects\Data Citation Intent Classification\data\raw\train\XML
✅ Found labels file: D:\Data Science Projects\Data Citation Intent Classification\data\raw\train_labels.csv
Found 400 XML files in D:\Data Science Projects\Data Citation Intent Classification\data\raw\train\XML


### Step 2: Load labels and create article_id → (dataset_id, type) mapping

In [8]:
labels_map = {}  # article_id -> list of (dataset_id, type)

if os.path.exists(LABELS_PATH):
    labels_df = pd.read_csv(LABELS_PATH)
    labels_df.columns = [c.strip() for c in labels_df.columns]

    if {'article_id', 'dataset_id', 'type'}.issubset(labels_df.columns):
        labels_df['article_id'] = labels_df['article_id'].astype(str)
        grouped = labels_df.groupby('article_id')[['dataset_id', 'type']]
        labels_map = grouped.apply(lambda df: list(zip(df['dataset_id'], df['type']))).to_dict()
        
        print(f"✅ Loaded labels for {len(labels_map)} unique article_id(s).")
        display(labels_df.head())
    else:
        print("⚠️ Expected columns ['article_id', 'dataset_id', 'type'] but got:", list(labels_df.columns))
else:
    print("⚠️ Labels file not found; continuing without label mapping.")


✅ Loaded labels for 523 unique article_id(s).


Unnamed: 0,article_id,dataset_id,type
0,10.1002_2017jc013030,https://doi.org/10.17882/49388,Primary
1,10.1002_anie.201916483,Missing,Missing
2,10.1002_anie.202005531,Missing,Missing
3,10.1002_anie.202007717,Missing,Missing
4,10.1002_chem.201902131,Missing,Missing


### Step 3: Parse XML files and extract citation contexts

In [10]:
# ---------------------------------------------------------
# Step 3: Parse XML files and extract citation contexts
# ---------------------------------------------------------
import os
from bs4 import BeautifulSoup

# --- Step 3.1: Define correct absolute paths ---
BASE_DIR = r"D:\Data Science Projects\Data Citation Intent Classification"
TRAIN_XML_PATH = os.path.join(BASE_DIR, "data", "raw", "train", "XML")
TRAIN_PDF_PATH = os.path.join(BASE_DIR, "data", "raw", "train", "PDF")
TEST_XML_PATH = os.path.join(BASE_DIR, "data", "raw", "test", "XML")
TEST_PDF_PATH = os.path.join(BASE_DIR, "data", "raw", "test", "PDF")
LABELS_PATH = os.path.join(BASE_DIR, "data", "raw", "train_labels.csv")

# --- Step 3.2: Verify folder exists ---
if not os.path.isdir(TRAIN_XML_PATH):
    raise FileNotFoundError(f"❌ XML folder not found: {TRAIN_XML_PATH}")
else:
    xml_files = sorted([f for f in os.listdir(TRAIN_XML_PATH) if f.endswith(".xml")])
    print(f"✅ Found {len(xml_files)} XML files in {TRAIN_XML_PATH}")

# --- Step 3.3: Define extraction function ---
def extract_citation_contexts(xml_path, article_id):
    """
    Extract all citation contexts robustly from XML.
    Returns a list of dicts: article_id, ref_id, context
    """
    with open(xml_path, "r", encoding="utf-8") as f:
        xml_content = f.read()

    soup = BeautifulSoup(xml_content, "lxml-xml")
    contexts = []

    # Search all <xref> tags
    for xref in soup.find_all("xref"):
        ref_id = xref.get("rid") or xref.get("id")
        parent = xref.find_parent(["p", "sec", "title"])
        if parent:
            text = parent.get_text(" ", strip=True)
            contexts.append({
                "article_id": article_id,
                "ref_id": ref_id,
                "context": text
            })

    # Fallback: <ref> tags in back matter
    if not contexts:
        for ref in soup.find_all("ref"):
            text = ref.get_text(" ", strip=True)
            if text:
                contexts.append({
                    "article_id": article_id,
                    "ref_id": ref.get("id"),
                    "context": text
                })

    return contexts

# --- Step 3.4: Test on one XML file ---
test_xml = os.path.join(TRAIN_XML_PATH, xml_files[0])
sample_contexts = extract_citation_contexts(test_xml, article_id=xml_files[0].replace(".xml", ""))
print(f"✅ Extracted {len(sample_contexts)} contexts from one sample file.")
sample_contexts[:3]


✅ Found 400 XML files in D:\Data Science Projects\Data Citation Intent Classification\data\raw\train\XML
✅ Extracted 251 contexts from one sample file.


[{'article_id': '10.1002_2017jc013030',
  'ref_id': None,
  'context': '(Volk and Hoffert, 1985;'},
 {'article_id': '10.1002_2017jc013030',
  'ref_id': None,
  'context': 'Honjo et al., 2014;'},
 {'article_id': '10.1002_2017jc013030',
  'ref_id': None,
  'context': 'Legendre et al., 2015)'}]

### Step 4: Extract contexts for all XML files and save labeled dataset

In [None]:
import pandas as pd
from tqdm import tqdm

# --- Step 4.1: Load labels ---
labels_df = pd.read_csv(LABELS_PATH)
labels_df.columns = [c.strip() for c in labels_df.columns]

# Build a mapping: article_id -> list of (dataset_id, type)
labels_map = labels_df.groupby('article_id')[['dataset_id', 'type']].apply(lambda df: list(zip(df['dataset_id'], df['type']))).to_dict()
print(f"✅ Loaded labels for {len(labels_map)} unique article_id(s)")

# --- Step 4.2: Extract contexts for all XML files ---
all_contexts = []

for xml_file in tqdm(xml_files, desc="Processing XMLs"):
    article_id = xml_file.replace(".xml", "")
    xml_path = os.path.join(TRAIN_XML_PATH, xml_file)
    contexts = extract_citation_contexts(xml_path, article_id)
    
    # Attach labels for this article_id if exists
    for ctx in contexts:
        if article_id in labels_map:
            ctx['labels'] = labels_map[article_id]  # list of (dataset_id, type)
        else:
            ctx['labels'] = []
    
    all_contexts.extend(contexts)

# --- Step 4.3: Save to CSV ---
train_labeled_df = pd.DataFrame(all_contexts)
train_labeled_df.to_csv("train_labeled.csv", index=False)
print(f"✅ Saved {len(train_labeled_df)} extracted contexts to train_labeled.csv")
train_labeled_df.head()


✅ Loaded labels for 523 unique article_id(s)


Processing XMLs: 100%|██████████| 400/400 [00:37<00:00, 10.75it/s]


✅ Saved 44899 extracted contexts to train_labeled.csv


Unnamed: 0,article_id,ref_id,context,labels
0,10.1002_2017jc013030,,"(Volk and Hoffert, 1985;","[(https://doi.org/10.17882/49388, Primary)]"
1,10.1002_2017jc013030,,"Honjo et al., 2014;","[(https://doi.org/10.17882/49388, Primary)]"
2,10.1002_2017jc013030,,"Legendre et al., 2015)","[(https://doi.org/10.17882/49388, Primary)]"
3,10.1002_2017jc013030,,"(Riser and Johnson, 2008;","[(https://doi.org/10.17882/49388, Primary)]"
4,10.1002_2017jc013030,,"Graff et al., 2012","[(https://doi.org/10.17882/49388, Primary)]"
