# 05.3 - Compare Prompting to Semantic Search for Scientific Paper Classification

In [None]:
import json
import pandas as pd

# Read the paper records from disk. The encoding is stated explicitly so the
# file is decoded as UTF-8 regardless of the platform's default locale.
with open("../data/modeling_papers.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Flatten the loaded JSON into a tabular DataFrame.
df = pd.json_normalize(data)

df.head(5)

In [None]:
# Keep only papers whose abstract is truthy, collecting (id, abstract) pairs in
# one pass so the two lists below cannot drift out of alignment.
# NOTE(review): `collection` is not created in any visible cell — confirm where
# the vector store is initialised before this cell runs.
papers_with_abstract = [(row.id, row.abstract) for row in df.itertuples() if row.abstract]

ids = [paper_id for paper_id, _ in papers_with_abstract]
documents = [abstract for _, abstract in papers_with_abstract]

collection.add(documents=documents, ids=ids)

In [None]:
# For every title, look up the single closest stored abstract and show it
# together with its distance.
# NOTE(review): `titles` is first assigned in a cell much further down and
# `is_model` is not assigned in any visible cell — confirm this cell's inputs.
for title in titles:
    results = collection.query(query_texts=[title], n_results=1)

    best_document = results["documents"][0][0]
    best_distance = results["distances"][0][0]

    print(title, best_document, best_distance, "\n", sep="\n")

print(f"identified {sum(is_model)} of {len(data)} modeling papers")

The goal of the model classification prompt is to instruct the LLM to return only YES or NO, depending on whether or not the authors of the paper employ an infectious disease modeling technique.

In [None]:
# Classification prompt asking for a bare YES/NO answer. `{abstract}` is the
# only placeholder and is filled with `str.format`. The word "dynamics" was
# misspelled in the text sent to the model and is corrected here.
MODEL_CLASSIFICATION_PROMPT_TEMPLATE = """
Given the following scientific paper abstract, identify if a disease modeling technique is used in the paper.
Some examples of disease modeling techniques include compartmental models, agent-based models, network-based models, and statistical models. 
Paper abstracts that reference dynamics, forecasting, or predictors also likely describe disease modeling techniques.
Only return YES or NO. Do not include any additional text or information.
Do not guess. Your answers must be accurate.

abstract:
{abstract}
"""

In [None]:
# Reworded classification prompt that defines "disease modeling technique"
# before asking for YES/NO. Assigning to this name replaces any template bound
# to it by an earlier cell; `{abstract}` is the only placeholder.
MODEL_CLASSIFICATION_PROMPT_TEMPLATE = """
Read the following scientific paper abstract. Based on the content, determine if the paper explicitly refers to or uses a disease modeling technique.
A disease modeling technique typically involves mathematical or computational models used to simulate, predict, or analyze the spread of a disease,
including factors such as transmission dynamics, interventions, or epidemiological parameters.
Consider whether the abstract mentions models, simulations, equations, or other approaches used to study or predict disease spread or control.
Answer "YES" if a disease modeling technique is mentioned or implied, and "NO" if it is not.
Do not include any additional text or information.

Abstract:
{abstract}
"""

In [None]:
# Prompt variant that names example model families (compartmental,
# statistical) and spells out when to answer YES versus NO. Assigning to this
# name replaces any template bound to it by an earlier cell; `{abstract}` is
# the only placeholder.
MODEL_CLASSIFICATION_PROMPT_TEMPLATE = """
Read the following scientific paper abstract. Based on the content, determine if the paper explicitly refers to or uses a disease modeling technique.
A disease modeling technique involves using mathematical, statistical, or computational methods to simulate, analyze, or predict the spread of a disease, its outcomes, or the effects of interventions.
These techniques may include models such as compartmental models (e.g., SIR, SEIR), statistical models (e.g., regression, q-statistical models), or other forms of modeling used to describe epidemic dynamics, transmission rates, or epidemic peaks.
Consider whether the abstract mentions models, simulations, equations, statistical forms, or other approaches used to study or predict disease spread, outcomes, or the impact of interventions.

If the abstract describes using such methods to analyze or forecast the course of a disease, answer "YES". If the abstract does not describe a modeling approach but instead focuses on observational data or non-modeling analysis, answer "NO".

Abstract:
{abstract}
"""

In [None]:
# Bound to `_PROMPT_TEMPLATE`, not `MODEL_CLASSIFICATION_PROMPT_TEMPLATE`, so
# the classification loop (which formats the latter) does not use this text,
# and no visible cell reads `_PROMPT_TEMPLATE`.
# NOTE(review): looks like a parked prompt variant — confirm the different
# name is intentional.
_PROMPT_TEMPLATE = """
Read the following scientific paper abstract. Based on the content, determine if the paper explicitly refers to or uses a disease modeling technique.
A disease modeling technique involves using mathematical, statistical, or computational methods to simulate, analyze, or predict the spread of a disease, its outcomes, or the effects of interventions.
These techniques may include models such as compartmental models (e.g., SIR, SEIR), statistical models (e.g., regression, q-statistical models), or other forms of modeling used to describe epidemic dynamics, transmission rates, or epidemic peaks.
Consider whether the abstract mentions models, simulations, equations, statistical forms, or other approaches used to study or predict disease spread, outcomes, or the impact of interventions.
If the abstract describes using such methods to analyze or forecast the course of a disease, answer "YES".

If the abstract does not describe a modeling approach but instead focuses on observational data or non-modeling analysis, answer "NO".

Abstract:
{abstract}
"""

In [None]:
# Last assignment to this name in the notebook, so this is the template the
# classification loop formats when cells run top to bottom. It lists the
# model families that count as disease modeling and states that statistical
# models qualify. `{abstract}` is the only placeholder.
MODEL_CLASSIFICATION_PROMPT_TEMPLATE = """
Read the following scientific paper abstract. Based on the content, determine if the paper explicitly refers to or uses a disease modeling technique.
A disease modeling technique involves mathematical, statistical, or computational methods used to simulate, analyze, predict, or interpret the dynamics of a disease, including its transmission, outcomes, and the effects of interventions.

These techniques can include:
- Compartmental models (e.g., SIR, SEIR models) to describe the spread of disease among populations, 
- Statistical models (e.g., regression, zero-inflated models) that analyze disease incidence, risk factors, and predict future trends.
- Simulation models to forecast disease outcomes under various scenarios.
- Mathematical equations or functional forms to estimate epidemic characteristics, such as reproduction numbers, peak timings, or growth rates.

Look for any mention of models, equations, simulations, statistical forms, or other methods used to analyze or predict disease transmission, risk factors, or the effects of interventions.

Even if the model is statistical or non-compartmental (e.g., regression models), if it is used to analyze the spread or dynamics of disease and predict outcomes, it qualifies as a disease modeling technique.

If the abstract describes or references any of these methods or similar approaches, answer "YES". If the abstract focuses on non-modeling analysis, such as reporting observational data without reference to disease modeling techniques, answer "NO".

Abstract:
{abstract}
"""

The following parameters are used when a model is instantiated. These determine where and how the model is stored in memory (system/CPU or GPU memory).

In [None]:
# Keyword arguments forwarded to `AutoModelForCausalLM.from_pretrained` when a
# model is loaded.
model_kwargs = dict(
    low_cpu_mem_usage=True,
    # load the model into GPUs sequentially, to avoid memory allocation issues with balancing
    device_map="sequential",
    torch_dtype="auto",
)

The following generator parameters are model hyperparameters used during text generation.

In [None]:
# Sampling options forwarded to `model.generate` for every classification
# request. Sampling is enabled, so repeated runs may give different answers.
generate_kwargs = dict(
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.75,
    top_k=50,
    top_p=0.95,
)

These are the Hugging Face identifiers for the models that we'll use for classifying documents.

In [None]:
# Each entry is passed to `load_model` in turn and is also used as that
# model's key in the results table. The commented-out entry is skipped.
MODEL_IDS = [
    # "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "mistralai/Ministral-8B-Instruct-2410",
    "Qwen/Qwen2.5-7B-Instruct",
]

Define functions for instantiating models and using models to generate text.

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM


def load_model(model_id, model_kwargs):
    """Load the tokenizer and causal language model for ``model_id``.

    Args:
        model_id: Identifier handed to both ``from_pretrained`` calls.
        model_kwargs: Mapping expanded as keyword arguments for the model's
            ``from_pretrained`` call only; the tokenizer gets none of them.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # The tokenizer is loaded first, so a bad identifier fails before the
    # model itself is loaded.
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_id)
    loaded_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
    return loaded_model, loaded_tokenizer

In [None]:
def generate_text(model, prompt, tokenizer, generate_kwargs):
    """Generate a chat-style reply to ``prompt`` and return only the new text.

    Args:
        model: Object exposing ``device`` and ``generate(input_ids, **kwargs)``.
        prompt: Text sent as the single user message of the conversation.
        tokenizer: Object exposing ``bos_token_id``, ``eos_token_id``,
            ``apply_chat_template`` and ``decode``.
        generate_kwargs: Mapping of generation options. It is copied, never
            modified, so one dict can be shared across models.

    Returns:
        The decoded continuation, without the prompt tokens and with special
        tokens skipped.
    """
    # Work on a copy: writing token ids into the caller's dict would leak one
    # tokenizer's ids into later calls made with a different tokenizer.
    kwargs = dict(generate_kwargs)
    kwargs["bos_token_id"] = tokenizer.bos_token_id
    kwargs["pad_token_id"] = tokenizer.eos_token_id
    kwargs["eos_token_id"] = tokenizer.eos_token_id

    messages = [{"role": "user", "content": prompt}]

    input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(
        model.device
    )

    # Only one unpadded sequence is encoded, so every position is a real token.
    # The mask is passed explicitly because the pad id equals the eos id and it
    # therefore cannot be inferred from the ids alone.
    kwargs.setdefault("attention_mask", input_ids.new_ones(input_ids.shape))

    outputs = model.generate(input_ids, **kwargs)

    # The output starts with the prompt tokens; keep only what follows them.
    response = outputs[0][input_ids.shape[-1] :]

    return tokenizer.decode(response, skip_special_tokens=True)

In [None]:
import gc
import re

import torch

from ipywidgets import IntProgress
from IPython.display import display

# Matches a standalone "yes" in any letter case, so answers such as "Yes." are
# counted while words that merely contain the letters (e.g. "eyes") are not.
YES_PATTERN = re.compile(r"\bYES\b", flags=re.IGNORECASE)

# Maps each model id to a list of booleans, one per row of `df`, in row order.
results = {}

for model_id in MODEL_IDS:
    print(model_id)

    progress_bar = IntProgress(min=0, max=len(df))
    display(progress_bar)

    model, tokenizer = load_model(model_id, model_kwargs)
    is_modeling = []

    for paper in df.itertuples():
        prompt = MODEL_CLASSIFICATION_PROMPT_TEMPLATE.format(abstract=paper.abstract)
        result = generate_text(model, prompt, tokenizer, generate_kwargs)

        is_modeling.append(YES_PATTERN.search(result) is not None)

        progress_bar.value += 1

    results[model_id] = is_modeling

    # Drop the model before loading the next one, then hand cached device
    # memory back on whichever accelerator backend is actually present.
    del model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()

In [None]:
# One column per model id, one row per classified paper.
df_results = pd.DataFrame.from_dict(results)
df_results

In [None]:
# Show, for each model, how many papers it labelled as modeling papers.
for model_id in MODEL_IDS:
    print(model_id)
    print(sum(df_results[model_id]))

Determine the positive rate for each paper: the fraction of models that classified it as a modeling paper.

In [None]:
# Positive rate per paper = share of the LLM classifiers that answered YES.
# Only the per-model columns are averaged, so re-running this cell, or running
# it after other columns have been added to `df_results`, gives the same rate.
pos_rate = df_results[MODEL_IDS].mean(axis=1).tolist()

df_results["pos_rate"] = pos_rate
df_results

In [None]:
# Persist the classification table next to the notebook.
df_results.to_json(path_or_buf="results.json")

In [None]:
# Rows where fewer than half of the classifiers answered YES.
df_results[df_results["pos_rate"] < 0.5]

In [None]:
# Ask the vector store for the 400 stored abstracts closest to the query text.
# The response gets its own name so it does not overwrite `results`, the
# per-model dict that `df_results` is built from.
chroma_results = collection.query(query_texts=["infectious disease modeling"], n_results=400)

# One list of ids per query text; only one query text is sent here.
match_ids = chroma_results["ids"]

In [None]:
# Flag every paper whose id is among the semantic-search hits.
# NOTE(review): `df_validate` is not defined in any visible cell; presumably it
# has one row per row of `df_results`, in the same order — confirm.
matched_ids = set(match_ids[0])  # hits for the single query; set membership is O(1)
is_modeling = [paper_id in matched_ids for paper_id in df_validate["id"]]

# Assigned as a plain list, so values are placed by position, not by index label.
df_results["chroma"] = is_modeling

In [None]:
# Number of papers flagged by the semantic search.
int(df_results["chroma"].sum())

In [None]:
# A paper counts as agreed only when every LLM classifier and the semantic
# search all flagged it. The columns come from MODEL_IDS, so adding or removing
# a model needs no change here.
agreement_columns = [*MODEL_IDS, "chroma"]
is_modeling_agree = df_results[agreement_columns].all(axis=1).tolist()
print(sum(is_modeling_agree))

In [None]:
titles_to_validate = []

# Substrings whose presence in a lower-cased title is treated as a sign that
# the paper is about modeling.
modeling_keywords = [
    "model",
    "spatial",
    "dynamics",
    "forecast",
    "simulate",
    "simulating",
    "quantify",
]
modeling_titles = []
titles = []

# Pair each title with its agreement flag by position, so the result does not
# depend on the DataFrame's index labels.
for agreed, title in zip(is_modeling_agree, df_validate["title"]):
    if not agreed:
        continue

    titles.append(title)

    # Record the title once, however many keywords it contains, so the count
    # printed below is a number of papers rather than of keyword hits.
    lowered_title = title.lower()
    if any(keyword in lowered_title for keyword in modeling_keywords):
        modeling_titles.append(title)

print(len(titles))
print(len(modeling_titles))

In [None]:
# List the agreed-upon titles that contain none of the modeling keywords.
for title in titles:
    if title in modeling_titles:
        continue
    print(title)