# 05.2 - Classification of Scientific Papers Using Open Hugging Face Models

This notebook explores how open LLMs, such as Mistral, Llama, Gemma, Specter, etc., can be used for classifying scientific papers based on their content or abstracts. Specifically, these models will be used to detect papers that discuss infectious disease modeling, and further identify which modeling techniques are used.

In order to increase the accuracy of the classification, multiple models will be evaluated and employed.

Load the paper abstracts from the local cache if they have already been downloaded; otherwise fetch them from Hugging Face and cache them locally.

In [None]:
import pandas as pd
import os.path

def load_split(split, data_dir="../data", dataset="hf://datasets/krosenf/midas-abstracts"):
    """Return one split of the abstracts dataset, using a local JSON cache.

    Args:
        split: Split name; used for both the remote ``<split>.csv`` and the
            cached ``<split>.json`` file name.
        data_dir: Directory that holds the cached JSON files.
        dataset: Base location of the remote CSV files.

    Returns:
        The split as a ``pandas.DataFrame``. It is read from the cache when the
        cache file exists; otherwise it is read from ``dataset`` and written to
        the cache before being returned.
    """
    fname = f"{data_dir}/{split}.json"
    if os.path.isfile(fname):
        return pd.read_json(fname)

    frame = pd.read_csv(f"{dataset}/{split}.csv")
    # Writing the cache fails if the directory is missing (e.g. fresh checkout),
    # so create it before saving.
    os.makedirs(data_dir, exist_ok=True)
    frame.to_json(fname)
    return frame


df_train = load_split("train")
df_validate = load_split("validate")
df_test = load_split("test")

print(df_train.shape)
print(df_validate.shape)
print(df_test.shape)

# The rest of the notebook classifies the validation split.
df = df_validate

# df = pd.concat([df_train, df_validate, df_test])
# df.shape

In [None]:
from transformers import AutoProcessor


def generate_text(model, prompt, tokenizer, generate_kwargs):
    """Generate the model's reply to a single-turn user prompt.

    The prompt is wrapped as one ``user`` message, rendered through the
    tokenizer's chat template, and passed to ``model.generate``. Only the
    tokens produced after the prompt are decoded and returned.

    Args:
        model: Object exposing ``device`` and ``generate(input_ids, **kwargs)``.
        prompt: Text sent as the content of the user message.
        tokenizer: Object exposing ``bos_token_id``, ``eos_token_id``,
            ``apply_chat_template`` and ``decode``.
        generate_kwargs: Extra keyword arguments for ``model.generate``. The
            mapping is copied, so the caller's dict is left untouched; ``None``
            is treated as an empty mapping.

    Returns:
        The decoded continuation with special tokens skipped.
    """
    # Work on a copy: the token ids below are tokenizer-specific and must not
    # leak into a dict the caller may reuse with a different model.
    call_kwargs = dict(generate_kwargs or {})
    call_kwargs["bos_token_id"] = tokenizer.bos_token_id
    # The tokenizer's EOS id is reused as the padding id.
    call_kwargs["pad_token_id"] = tokenizer.eos_token_id
    call_kwargs["eos_token_id"] = tokenizer.eos_token_id

    messages = [{"role": "user", "content": prompt}]

    input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(
        model.device
    )

    outputs = model.generate(input_ids, **call_kwargs)

    # The first output sequence starts with the prompt tokens; slice them off
    # so only the newly generated tokens are decoded.
    prompt_length = input_ids.shape[-1]
    response = outputs[0][prompt_length:]

    return tokenizer.decode(response, skip_special_tokens=True)

In [None]:
import torch

from ipywidgets import IntProgress
from IPython.display import display

# NOTE(review): MODEL_IDS, load_model, model_kwargs, generate_kwargs and
# MODEL_CLASSIFICATION_PROMPT_TEMPLATE are not defined in the cells visible
# here; they must already exist in the kernel before this cell runs — confirm.

# Maps each model id to a list of booleans, one per row of `df`, in row order.
results = {}

for model_id in MODEL_IDS:
    print(model_id)

    progress_bar = IntProgress(min=0, max=len(df))
    display(progress_bar)

    model, tokenizer = load_model(model_id, model_kwargs)
    is_modeling = []

    for paper in df.itertuples():
        prompt = MODEL_CLASSIFICATION_PROMPT_TEMPLATE.format(abstract=paper.abstract)
        result = generate_text(model, prompt, tokenizer, generate_kwargs)

        # A paper is flagged when the reply contains the literal text "YES"
        # (case-sensitive substring match).
        is_modeling.append("YES" in result)

        progress_bar.value += 1

    results[model_id] = is_modeling

    # Drop the model before loading the next one, then release the cached
    # accelerator memory on whichever backend is actually in use.
    del model
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()
    elif torch.cuda.is_available():
        torch.cuda.empty_cache()

In [None]:
# Tabulate the votes: one column per model id, one row per classified abstract.
df_results = pd.DataFrame(data=results)
df_results

In [None]:
# For every model, print its id followed by the number of abstracts it flagged.
for model_name in MODEL_IDS:
    flagged_count = sum(df_results[model_name])
    print(model_name)
    print(flagged_count)

In [None]:
# Fraction of models that flagged each abstract.
# Only the per-model vote columns are used, so re-running this cell (when
# "pos_rate" already exists) or running it after other columns have been added
# to df_results still gives the same rates.
vote_columns = list(MODEL_IDS)
pos_rate = df_results[vote_columns].sum(axis=1).div(len(vote_columns)).tolist()

df_results["pos_rate"] = pos_rate
df_results

In [None]:
# Persist the vote table (including any derived columns added so far) as JSON
# in the current working directory.
df_results.to_json(path_or_buf="results.json")

In [None]:
# Rows that fewer than half of the models flagged.
low_agreement = df_results.query("pos_rate < .5")
low_agreement

In [None]:
# Ask the vector collection for the 400 entries closest to the topic phrase.
# NOTE(review): `collection` is not defined in the cells visible here, and this
# rebinds `results`, which previously held the per-model votes — confirm both
# are intended.
results = collection.query(
    n_results=400,
    query_texts=["infectious disease modeling"],
)

# Keep only the ids of the returned entries (indexed by query, see usage below).
match_ids = results["ids"]

In [None]:
# Flag every validation paper whose id was returned for the (single) query.
# A set gives constant-time membership checks instead of scanning the id list
# once per paper.
matched_ids = set(match_ids[0])
is_modeling = [paper_id in matched_ids for paper_id in df_validate["id"]]

# Assigned as a plain list, i.e. by position: row k of df_results receives the
# flag of row k of df_validate.
df_results["chroma"] = is_modeling

In [None]:
# Number of papers flagged by the vector search.
chroma_flags = df_results["chroma"]
sum(chroma_flags)

In [None]:
# A paper is accepted only when every model AND the vector search flagged it.
# All configured models take part, so the result does not depend on MODEL_IDS
# containing exactly three entries.
agreement_columns = [*MODEL_IDS, "chroma"]
is_modeling_agree = df_results[agreement_columns].all(axis=1).tolist()
print(sum(is_modeling_agree))

In [None]:
titles_to_validate = []

# Substrings (matched case-insensitively) that suggest a title is about modeling.
modeling_keywords = [
    "model",
    "spatial",
    "dynamics",
    "forecast",
    "simulate",
    "simulating",
    "quantify",
]
modeling_titles = []
titles = []

# The agreement flags are positional, so they must line up one-to-one with the
# rows of df_validate.
assert len(is_modeling_agree) == len(df_validate), "agreement flags do not match df_validate"

# Pair titles with flags by position rather than by index label, so a
# non-default DataFrame index cannot pick the wrong flag.
for title, agreed in zip(df_validate["title"], is_modeling_agree):
    if not agreed:
        continue

    titles.append(title)

    # Record each title at most once, even when several keywords match, so the
    # count printed below is a number of titles and not a number of keyword hits.
    lowered_title = title.lower()
    if any(keyword in lowered_title for keyword in modeling_keywords):
        modeling_titles.append(title)

print(len(titles))
print(len(modeling_titles))

In [None]:
# Show the accepted titles in which none of the modeling keywords was found.
for title in titles:
    if title in modeling_titles:
        continue
    print(title)