In [1]:
import os

# CRITICAL: Disable vLLM V1 BEFORE importing vllm
# V1 does not support logits processors which we need for constrained generation
os.environ["VLLM_USE_V1"] = "0"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

import re
import fitz  # PyMuPDF
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import pickle
import vllm
import torch

# Import logits processor after setting environment
from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor

# set seed for reproducibility
torch.manual_seed(42)

<torch._C.Generator at 0x77cc95301250>

In [19]:
# Configure PDF directory for local development
pdf_directory = "/home/naohiro/MakeDataCount/dataset/train/PDF"  # Local data directory
text_span_len = 1000
# Define regex patterns for different identifier types
re_doi = re.compile(r"10\.\d{4}")
re_gsr = re.compile(r"GSE\d+|SR[APRX]\d+|PRJ[NAED][A-Z]?\d+")
re_ipe = re.compile(r"IPR\d{6}|PF\d{5}|EMPIAR-\d{5}", re.IGNORECASE)
re_c = re.compile(r"CHEMBL\d+|CVCL_[A-Z0-9]{4}")
re_e = re.compile(r"ENS[A-Z]{0,6}[GT]\d{11}")
re_r = re.compile(r"N[MC]_\d+(?:\.\d+)?|rs\d+")
re_u = re.compile(r"(?:uniprot:)?(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9][A-Z][A-Z0-9]{2}[0-9])", re.IGNORECASE)
re_g = re.compile(r"EPI(?:_ISL_)?\d+")
re_p = re.compile(r"PXD\d{6}|SAM[ND]\d+|ERR\d+")
relist = [re_gsr, re_ipe, re_c, re_e, re_r, re_g, re_p]

def remove_references_section(text):
    """Remove references section from academic text to focus on main content."""
    lines = text.split('\n')
    cut_index = -1
    
    # Search backwards from end of document starting at 70% mark
    for i in range(len(lines) - 1, max(0, int(len(lines) * 0.3)), -1):
        line = lines[i].strip()
        
        # Patterns matching common reference section headers
        reference_patterns = [
            r'^REFERENCES?$',
            r'^\d+\.?\s+REFERENCES?$',
            r'^\d+\.?\s+References?$',
            r'^References?:?$',
            r'^BIBLIOGRAPHY$',
            r'^\d+\.?\s+BIBLIOGRAPHY$',
            r'^\d+\.?\s+Bibliography$',
            r'^Bibliography:?$',
            r'^Literature\s+Cited$',
            r'^Works\s+Cited$'
        ]
        
        if any(re.match(pattern, line, re.IGNORECASE) for pattern in reference_patterns):
            # Verify following lines contain citation patterns
            following_lines = lines[i+1:i+4]
            has_citations = False
            
            for follow_line in following_lines:
                if follow_line.strip():
                    # Check for citation indicators
                    if (re.search(r'\(\d{4}\)', follow_line) or    # Year in parentheses
                        re.search(r'\d{4}\.', follow_line) or       # Year with period
                        'doi:' in follow_line.lower() or           # DOI identifier
                        ' et al' in follow_line.lower()):          # Author pattern
                        has_citations = True
                        break
            
            # Cut text if citations found or near document end
            if has_citations or i >= len(lines) - 3:
                cut_index = i
                break
    
    if cut_index != -1:
        return '\n'.join(lines[:cut_index]).strip()
    
    return text.strip()

# Process PDF files and extract text chunks
chunks = []
chunks2 = []
ids = []
if os.path.exists(pdf_directory) and os.listdir(pdf_directory):
    for filename in tqdm(os.listdir(pdf_directory), total=len(os.listdir(pdf_directory))):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_directory, filename)
            
            # Extract article identifier from filename
            article_id = filename.split(".pdf")[0]
            doc = fitz.open(pdf_path)
            text = ""
            for page in doc:
                page_text = page.get_text()
                text += page_text + "\n"
                
            doc.close()

            text = remove_references_section(text)

            # Extract DOI matches
            doi_matches = re_doi.finditer(text)
            for match in doi_matches:
                if match.group() in article_id: continue
                chunk = text[max(0, match.start() - text_span_len): match.start() + text_span_len]
                chunks.append((article_id, chunk))

            # Extract accession ID matches
            for rr in relist:
                matches = rr.finditer(text)
                for match in matches:
                    ids.append(match.group())
                    chunk = text[max(0, match.start() - text_span_len): match.start() + text_span_len]
                    chunks2.append((article_id, chunk))
else:
    print(f"Warning: PDF directory '{pdf_directory}' not found or empty. Please add PDF files to process.")

print(f"DOI chunks: {len(chunks)}")
print(f"Accession ID chunks: {len(chunks2)}")

  0%|          | 0/524 [00:00<?, ?it/s]

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: uns

In [20]:
chunks = chunks[:1000]  # Limit to first 1000 chunks for testing
chunks2 = chunks2[:1000]  # Limit to first 1000 chunks for testing
ids = ids[:1000]  # Limit to first 1000 IDs for testing

## Model Initialization

In [4]:
# Use local Qwen3-0.6B model instead of Kaggle input
model_path = "/home/naohiro/MakeDataCount/qwen3-8b-awq"  # Use HuggingFace model name for local download

# Initialize vLLM model with smaller resource requirements for 0.6B model
llm = vllm.LLM(
    model_path,
    tensor_parallel_size=1,  # Single GPU for small model
    gpu_memory_utilization=0.7,  # Lower memory usage for smaller model
    trust_remote_code=True,
    quantization="awq",  # Use AWQ quantization for smaller model
    dtype="half",
    enforce_eager=True,
    max_model_len=4096,  # Reduced context length for smaller model
    disable_log_stats=True,
    enable_prefix_caching=True
)
tokenizer = llm.get_tokenizer()

INFO 07-30 19:19:41 [__init__.py:235] Automatically detected platform cuda.
INFO 07-30 19:19:44 [config.py:1604] Using max model len 4096
INFO 07-30 19:19:45 [awq_marlin.py:120] Detected that the model can run with awq_marlin, however you specified quantization=awq explicitly, so forcing awq. Use quantization=awq_marlin for faster inference
INFO 07-30 19:19:46 [llm_engine.py:228] Initializing a V0 LLM engine (v0.10.0) with config: model='/home/naohiro/MakeDataCount/qwen3-8b-awq', speculative_config=None, tokenizer='/home/naohiro/MakeDataCount/qwen3-8b-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='xgramma

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 07-30 19:19:48 [default_loader.py:262] Loading weights took 1.29 seconds
INFO 07-30 19:19:48 [model_runner.py:1115] Model loading took 5.7071 GiB and 1.372671 seconds
INFO 07-30 19:19:50 [worker.py:295] Memory profiling takes 1.26 seconds
INFO 07-30 19:19:50 [worker.py:295] the current vLLM instance can use total_gpu_memory (11.72GiB) x gpu_memory_utilization (0.70) = 8.20GiB
INFO 07-30 19:19:50 [worker.py:295] model weights take 5.71GiB; non_torch_memory takes 0.02GiB; PyTorch activation peak memory takes 1.42GiB; the rest of the memory reserved for KV Cache is 1.06GiB.
INFO 07-30 19:19:50 [executor_base.py:113] # cuda blocks: 484, # CPU blocks: 1820
INFO 07-30 19:19:50 [executor_base.py:118] Maximum concurrency for 4096 tokens per request: 1.89x
INFO 07-30 19:19:51 [llm_engine.py:424] init engine (profile, create kv cache, warmup model) took 2.79 seconds


# Prompt Templates

In [21]:
SYS_PROMPT_DOI = """
You are given a piece of academic text. Your task is to identify a DOI citation that refers specifically to research data.

Only respond with either a full normalized DOI URL starting with "https://doi.org/" or the word "Irrelevant" (without quotes).

Do NOT include any other text or explanation.

If there is no DOI related to research data, respond with exactly "Irrelevant".

If multiple DOIs refer to research data, return any one of them.
"""

SYS_PROMPT_ACCESSION = """
You are given a piece of academic text. Your task is to determine whether the provided Accession ID refers to a dataset used in the study.

Classify the data associated with the Accession ID as:
A) Primary — if the data was generated specifically for this study.
B) Secondary — if the data was reused or derived from prior work.
C) None — if the ID is mentioned in a different context (e.g., not related to data use, or is unrelated to the study).

Respond with only one letter: A, B, or C.
"""

SYS_PROMPT_CLASSIFY_DOI = """
You are given a piece of academic text. Your task is to classify the data associated with the given DOI.

Classify the data as:
A) Primary: if the data was generated specifically for this study.
B) Secondary: if the data was reused or derived from prior work.
C) None: if the DOI is part of the References section of a paper, does not refer to research data or is unrelated.

Respond with only one letter: A, B, or C.
"""

## DOI Extraction

In [22]:
prompts = []
for article_id, academic_text in chunks:
    messages = [
        {"role": "system", "content": SYS_PROMPT_DOI},
        {"role": "user", "content": academic_text}
    ]

    prompt = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )
    
    prompts.append(prompt)

outputs = llm.generate(
    prompts,
    vllm.SamplingParams(
        seed=0,
        skip_special_tokens=True,
        max_tokens=80,
        temperature=0
    ),
    use_tqdm=True
)

responses = [output.outputs[0].text.strip() for output in outputs]

doi_pattern = re.compile(r'(10\.\d{4,9}/[-._;()/:A-Z0-9]+)', re.I)

doi_urls = []
for response in responses:
    if response.lower() == "irrelevant":
        doi_urls.append("Irrelevant")
    else:
        match = doi_pattern.search(response)
        if match:
            doi_urls.append("https://doi.org/" + match.group(1))
        else:
            doi_urls.append("Irrelevant")  # fallback

Adding requests:   0%|          | 0/1000 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1000 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s…



## DOI Classification

Constrained generation using MultipleChoiceLogitsProcessor to ensure valid class selection.

In [23]:
prompts = []
valid_indices = []
for i, (chunk, url) in enumerate(zip(chunks, doi_urls)):
    #if url == "Irrelevant":
    #    continue  # Skip irrelevant DOIs

    article_id, academic_text = chunk
    messages = [
        {"role": "system", "content": SYS_PROMPT_CLASSIFY_DOI},
        {"role": "user", "content": f"DOI: {url}\n\nAcademic text:\n{academic_text}"}
    ]

    prompt = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )
    prompts.append(prompt)
    valid_indices.append(i)

# Initialize logits processor for constrained generation
mclp = MultipleChoiceLogitsProcessor(tokenizer, choices=["A", "B", "C"])

outputs = llm.generate(
    prompts,
    vllm.SamplingParams(
        temperature=0.1,
        skip_special_tokens=True,
        max_tokens=1,
        logits_processors=[mclp],
        logprobs=len(mclp.choices)
    ),
    use_tqdm=True
)

# Extract log probabilities for each choice
logprobs = []
for lps in [output.outputs[0].logprobs[0].values() for output in outputs]:
    logprobs.append({lp.decoded_token: lp.logprob for lp in list(lps)})

logit_matrix = pd.DataFrame(logprobs)[["A", "B", "C"]].values
choices = ["Primary", "Secondary", None]
answers = [choices[pick] for pick in np.argmax(logit_matrix, axis=1)]

Adding requests:   0%|          | 0/1000 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1000 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s…

In [24]:
# Process accession IDs for classification
prompts = []
for chunk, acc_id in zip(chunks2, ids):
    article_id, academic_text = chunk
    messages = [
        {"role": "system", "content": SYS_PROMPT_ACCESSION},
        {"role": "user", "content": f"Accession ID: {acc_id}\n\nAcademic text:\n{academic_text}"}
    ]

    prompt = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )
    prompts.append(prompt)

outputs = llm.generate(
    prompts,
    vllm.SamplingParams(
        temperature=0.1,
        skip_special_tokens=True,
        max_tokens=1,
        logits_processors=[mclp],
        logprobs=len(mclp.choices)
    ),
    use_tqdm=True
)

# Extract log probabilities for accession ID classifications
logprobs2 = []
for lps in [output.outputs[0].logprobs[0].values() for output in outputs]:
    logprobs2.append({lp.decoded_token: lp.logprob for lp in list(lps)})

logit_matrix2 = pd.DataFrame(logprobs2)[["A", "B", "C"]].values
choices2 = ["Primary", "Secondary", None]
answers2 = [choices2[pick] for pick in np.argmax(logit_matrix2, axis=1)]

Adding requests:   0%|          | 0/1000 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1000 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s…

## Results Compilation

In [25]:
# Create submission dataframes
sub_df = pd.DataFrame()
sub_df["article_id"] = [c[0] for c in chunks]
sub_df["dataset_id"] = doi_urls
sub_df["dataset_id"] = sub_df["dataset_id"].str.lower()
sub_df["type"] = answers
sub_df = sub_df[sub_df["type"].notnull()].reset_index(drop=True)

sub_df2 = pd.DataFrame()
sub_df2["article_id"] = [c[0] for c in chunks2]
sub_df2["dataset_id"] = ids
sub_df2["type"] = answers2
sub_df2 = sub_df2[sub_df2["type"].notnull()].reset_index(drop=True)

# Combine and deduplicate results
sub_df = pd.concat([sub_df, sub_df2], ignore_index=True)
sub_df = sub_df[sub_df["type"].isin(["Primary", "Secondary"])].reset_index(drop=True)
sub_df = sub_df.sort_values(by=["article_id", "dataset_id", "type"], ascending=False)\
               .drop_duplicates(subset=['article_id', 'dataset_id'], keep="first")\
               .reset_index(drop=True)

# Generate submission file
sub_df['row_id'] = range(len(sub_df))
print(sub_df["type"].value_counts())

type
Primary    713
Name: count, dtype: int64


## Performance Evaluation

In [26]:
def f1_score(tp, fp, fn):
    return 2 * tp / (2 * tp + fp + fn) if (2 * tp + fp + fn) != 0 else 0.0
    
label_df = pd.read_csv("../dataset/train_labels.csv")
label_df = label_df[label_df['type'] != 'Missing'].reset_index(drop=True)

hits_df = label_df.merge(sub_df, on=["article_id", "dataset_id", "type"])

tp = hits_df.shape[0]
fp = sub_df.shape[0] - tp
fn = label_df.shape[0] - tp

print("TP:", tp)
print("FP:", fp)
print("FN:", fn)
print("F1 Score:", round(f1_score(tp, fp, fn), 3))

TP: 50
FP: 663
FN: 669
F1 Score: 0.07
