# 05.1 - _Training_ an LLM to Classify Disease Modeling Papers Using the Promptbreeder Technique

https://arxiv.org/abs/2309.16797

In [None]:
import json
import pandas as pd

# Load the paper records. The encoding is pinned to UTF-8 so that decoding does
# not depend on the platform's default locale encoding.
with open("../data/modeling_papers.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Flatten the (possibly nested) JSON records into a tabular DataFrame.
df_data = pd.json_normalize(data)

df_data.head(5)

The goal of the model classification prompt is to instruct the LLM to return only YES if the authors of the paper employ an infectious disease modeling technique, and NO otherwise.

In [None]:
# Task description for the classifier: the criteria the LLM should apply when
# deciding whether an abstract uses a disease modeling technique. It contains no
# format placeholders; it is concatenated with TASK_PROMPT_IO_TEMPLATE before use
# and is also the prompt handed to MUTATION_PROMPT_TEMPLATE for mutation.
TASK_PROMPT_TEMPLATE = """
Read the following scientific paper abstract. Based on the content, determine if the paper explicitly refers to or uses a disease modeling technique,
including but not limited to mathematical, statistical, or computational methods used to simulate, analyze, predict, or interpret the dynamics of a disease,
specifically in the context of estimating the probability of disease resurgence.

Consider the use of disease modeling if the abstract describes or references compartmental models, statistical models, simulation models, mathematical equations,
or functional forms to analyze or predict disease transmission, risk factors, or the effects of interventions.

Additionally, if the paper uses epidemiological modeling, disease forecasting, regression analysis, or statistical analysis to investigate associations
between disease characteristics and external factors, consider it a form of disease modeling technique.

If the abstract specifically mentions estimating the probability of disease resurgence using quantitative methods, such as statistical models or mathematical equations,
consider it a form of disease modeling technique.
"""

In [None]:
# Answer-format instructions (reply "YES" or "NO" only) followed by the
# `{abstract}` placeholder, which is filled in with str.format for each paper.
TASK_PROMPT_IO_TEMPLATE = """
If the abstract describes or references any of these methods or similar approaches, answer "YES".
If the abstract focuses on non-modeling analysis, such as reporting observational data without reference to disease modeling techniques, answer "NO".
Do not include any additional text or information.

Abstract:
{abstract}
"""

In [None]:
# Mutation prompt: asks the LLM to rewrite `{prompt}` so that an LLM given the
# rewritten prompt and `{abstract}` would classify the paper as using a disease
# modeling technique. Both placeholders are filled in with str.format.
MUTATION_PROMPT_TEMPLATE = """
Read the prompt and scientific paper abstract below. Modify the prompt so that, if an LLM were given the modified prompt and abstract, the LLM would say that the paper explicitly refers to or uses a disease modeling technique.

Prompt:
{prompt}

Abstract:
{abstract}
"""

The following parameters are used when a model is instantiated. These determine where and how the model is stored in memory (system/CPU or GPU memory).

In [None]:
# Keyword arguments forwarded to the model's `from_pretrained` call.
model_kwargs = dict(
    low_cpu_mem_usage=True,
    # load the model into GPUs sequentially, to avoid memory allocation issues with balancing
    device_map="sequential",
    torch_dtype="auto",
)

The following generation parameters are hyperparameters that control how the model samples text during generation.

In [None]:
# Sampling settings forwarded to `model.generate` for every generation call.
generate_kwargs = dict(
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.75,
    top_k=50,
    top_p=0.95,
)

This is the Hugging Face identifier for the model that we'll use for classifying documents.

In [None]:
# Hugging Face Hub identifier of the model loaded by `load_model` below.
MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"

Define functions for instantiating models and using models to generate text.

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM


def load_model(model_id, model_kwargs):
    """Load a causal language model and its tokenizer.

    Args:
        model_id: Identifier passed to both ``from_pretrained`` calls.
        model_kwargs: Keyword arguments forwarded to
            ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tok = AutoTokenizer.from_pretrained(model_id)
    lm = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
    return lm, tok

In [None]:
# from transformers import AutoProcessor


def generate_text(model, prompt, tokenizer, generate_kwargs):
    """Generate the model's reply to a single user prompt.

    The prompt is wrapped as one ``user`` chat message, rendered with the
    tokenizer's chat template, and passed to ``model.generate``. Only the newly
    generated tokens (everything after the prompt) are decoded and returned.

    Args:
        model: Model exposing ``device`` and ``generate``.
        prompt: Text of the user message.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``decode``,
            ``bos_token_id`` and ``eos_token_id``.
        generate_kwargs: Generation settings forwarded to ``model.generate``.
            This mapping is read but never modified.

    Returns:
        The decoded reply with special tokens stripped.
    """
    # Build a fresh dict so the caller's generate_kwargs is left untouched.
    # The special-token ids always come from the tokenizer; EOS doubles as the
    # padding token.
    gen_kwargs = {
        **generate_kwargs,
        "bos_token_id": tokenizer.bos_token_id,
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }

    messages = [{"role": "user", "content": prompt}]

    # return_dict=True yields both input_ids and attention_mask, so generate()
    # does not have to guess the mask while pad and EOS share the same id.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    # Caller-supplied settings take precedence over the tokenized inputs if the
    # same key appears in both.
    outputs = model.generate(**{**inputs, **gen_kwargs})

    # Strip the prompt tokens; keep only what the model generated.
    prompt_length = inputs["input_ids"].shape[-1]
    response = outputs[0][prompt_length:]

    return tokenizer.decode(response, skip_special_tokens=True)

In [None]:
# Instantiate the model and tokenizer once; the cells below reuse both objects.
model, tokenizer = load_model(MODEL_ID, model_kwargs)

In [None]:
import torch

from ipywidgets import IntProgress
from IPython.display import display

import re

# Progress widget: advanced by one for every abstract classified.
progress_bar = IntProgress(min=0, max=len(df_data))
display(progress_bar)

# Full classification prompt: task description, then the answer-format
# instructions with the `{abstract}` placeholder.
prompt_template = TASK_PROMPT_TEMPLATE + "\n\n" + TASK_PROMPT_IO_TEMPLATE

# Match "yes" only as a whole word (case-insensitive). A plain substring test
# would also fire on words that merely contain it, e.g. "Bayesian" or "eyes",
# turning a "NO ..." reply into a false positive.
yes_pattern = re.compile(r"\byes\b", re.IGNORECASE)

is_modeling = []

for paper in df_data.itertuples():
    prompt = prompt_template.format(abstract=paper.abstract)
    result = generate_text(model, prompt, tokenizer, generate_kwargs)

    is_modeling.append(yes_pattern.search(result) is not None)

    progress_bar.value += 1

# One boolean per row of df_data, in the same order.
results = {"is_modeling": is_modeling}
df_results = pd.DataFrame(results)
df_results

In [None]:
# Fraction of papers in df_data that were classified as modeling papers.
sum(is_modeling) / len(df_data)

In [None]:
# Rows of df_data whose classification came back negative.
not_modeling = df_results["is_modeling"].eq(False)
df_neg = df_data.loc[not_modeling]
df_neg

In [None]:
# Ask the model to rewrite the task prompt so that the first negatively
# classified abstract would be judged a modeling paper.
first_negative_abstract = df_neg.iloc[0].abstract
mutation_prompt = MUTATION_PROMPT_TEMPLATE.format(
    prompt=TASK_PROMPT_TEMPLATE, abstract=first_negative_abstract
)
result = generate_text(model, mutation_prompt, tokenizer, generate_kwargs)

result

In [None]:
# Feed the previous generation output back in as the prompt, followed by the
# first negatively classified abstract, and show the model's reply.
mutated_prompt_with_abstract = result + "\nAbstract:\n" + df_neg.iloc[0].abstract
result = generate_text(
    model, mutated_prompt_with_abstract, tokenizer, generate_kwargs
)
result

## Clean-up

In [None]:
import gc

# Drop this notebook's `model` name and run a collection pass so that memory
# no longer referenced elsewhere can be reclaimed before the cache is emptied.
del model
gc.collect()

# Release cached accelerator memory on whichever backend is available, rather
# than assuming Apple's MPS backend.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
elif torch.backends.mps.is_available():
    torch.mps.empty_cache()