In [2]:
!pip install -r requirements.txt

Collecting accelerate==0.27.2 (from -r requirements.txt (line 1))
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/280.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m153.6/280.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting anyio==4.3.0 (from -r requirements.txt (line 5))
  Downloading anyio-4.3.0-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting bitsandbytes==0.42.0 (from -r requirements.txt (line 8))
  Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting clou

In [1]:
import numpy as np
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import set_seed
import torch
from typing import Callable
import random
import os

In [2]:
# Configuration variables for this whole notebook
class config:
    """Namespace of notebook-wide settings, read as class attributes (e.g. ``config.seed``)."""

    # Seed for the random number generators (applied right after this class).
    seed = 42

    # Model directory name; it matches the directory created by the
    # `git clone` cell ('Mistral-7B-OpenOrca').
    model = "Mistral-7B-OpenOrca"

    # 4-bit loading with the "nf4" quant type and double quantization enabled.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    # Presumably text-generation arguments — confirm where each one is consumed.
    top_k = 20
    top_p = 0.95
    do_sample = True
    num_return_sequences = 1
    max_new_tokens = 500
    temperature = 0.8
    repetition_penalty = 1.2
    penalty_alpha = 0.6


# The seed was defined (and `set_seed` imported) but never applied; seed the
# RNGs once here so sampled generations are repeatable across runs.
set_seed(config.seed)


In [None]:
!git lfs install
!git clone https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca

Git LFS initialized.
Cloning into 'Mistral-7B-OpenOrca'...
remote: Enumerating objects: 160, done.[K
remote: Total 160 (delta 0), reused 0 (delta 0), pack-reused 160[K
Receiving objects: 100% (160/160), 852.99 KiB | 32.81 MiB/s, done.
Resolving deltas: 100% (80/80), done.


In [None]:
# Load the abstracts table; the recorded output below shows the columns
# `Unnamed: 0.1`, `Unnamed: 0`, `pmid`, `abstract` and `label`.
km_output = pd.read_csv("./data.csv")
# Bare last expression so the notebook renders the DataFrame.
km_output

Unnamed: 0.1,Unnamed: 0,pmid,abstract,label
0,0,23956253,PMID: 23956253 Text: The first aim was to crit...,0
1,1,23444397,PMID: 23444397 Text: Niacin has potentially fa...,0
2,2,28886926,"PMID: 28886926 Text: In 2016, the American Col...",0
3,3,27701660,PMID: 27701660 Text: Low-density lipoprotein c...,1
4,4,19095139,PMID: 19095139 Text: This secondary analysis f...,0
5,5,21095263,PMID: 21095263 Text: Lowering low-density lipo...,0


In [None]:
# NOTE(review): this cell reads `ab_pmid_intersection`, `a_term` and `b_term`
# from `km_output`, but the output recorded for the `data.csv` load lists only
# `Unnamed: 0.1`, `Unnamed: 0`, `pmid`, `abstract` and `label`. It also calls
# `process_abstracts_data`, which is not defined in the visible cells. Confirm
# which input file and helper this cell expects before running it.
# Each `ab_pmid_intersection` value is handled as a string: strip leading/trailing
# square brackets, then split on ", " to obtain a list of PMID strings.
b_terms_pmids = km_output.ab_pmid_intersection.map(lambda pmid_list: pmid_list.strip('][').split(', '))
# Grab only the abstract from each list of pmids in the TSV
abstracts = [process_abstracts_data(config, pmid_list)[0] for pmid_list in b_terms_pmids] # Fetch abstracts from each b_term's PMID list
# There should only be one a_term, so it's safe to grab the first index
# (only the part before the first "&" is kept)
a_term = km_output.a_term.unique().tolist()[0].split("&")[0]
b_terms = km_output.b_term.unique().tolist()

In [None]:
# Hypothesis text that is interpolated into the prompt built by `retrieveZeroShotCoTPrompt`.
hyp = "ezetimibe may effectively alleviate or target key pathogenic mechanisms of diabetes potentially offering therapeutic benefits or slowing disease progression."
# Text added to the model inside the `system()` role block further down.
sys_prompt = "You are an incredibly brilliant biomedical researcher who has spent their lifetime reading all the papers in PubMed. You are focused on assisting other researchers in evaluating suggested hypotheses given abstracts in PubMed. The purpose of evaluating suggested hypothesis is to uncover novel connections between the insights found in various biomedical texts. Rely on what you know from the papers you have read, but think out of the box as well."

In [None]:
def retrieveZeroShotCoTPrompt(hyp: str, abstract: str) -> str:
  """Build the zero-shot chain-of-thought relevance prompt.

  Args:
    hyp: Hypothesis text placed after ``Hypothesis:``.
    abstract: Abstract text placed after ``Abstract:``.

  Returns:
    The prompt string. It starts with a newline, every non-blank line is
    indented by four spaces, and it ends with a line of two spaces; this
    whitespace is part of the returned text.
  """
  # The template is kept verbatim (including its indentation) so the emitted
  # prompt is unchanged; only the two placeholders are filled in.
  template = """
    Hypothesis: {hyp}
    Abstract: {abstract}

    Determine whether or not this abstract is relevant for scientifically evaluating the provided hypothesis.
    A relevant abstract should either support the given hypothesis or have evidence to refute the hypothesis.
    A relevant abstract must directly comment on the hypothesis.

    Let us think through this step by step.
  """
  return template.format(hyp=hyp, abstract=abstract)

In [None]:
# Build the prompt for the abstract at row label 2 of the loaded CSV.
# The frame is bound to `km_output` by the `pd.read_csv("./data.csv")` cell; no
# `data` variable is defined in the visible cells, so the previous `data[...]`
# lookup raised NameError on a fresh kernel. `.loc` keeps the same label-based
# row selection without chained indexing.
prompt = retrieveZeroShotCoTPrompt(hyp, km_output.loc[2, "abstract"])

# Benchmarking Outlines and Guidance



In [None]:
from guidance import models, gen, select, system, assistant, user

In [None]:
# `outlines` is referenced in this cell and when the JSON generator is built,
# but it was not imported in any visible cell (NameError on a fresh kernel).
import outlines

# Load the checkpoint through Outlines' transformers wrapper; `config.model`
# holds the same "Mistral-7B-OpenOrca" name that was previously repeated here.
mistral = outlines.models.transformers(config.model)

Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.52s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
from pydantic import BaseModel, field_validator, Field

In [None]:
# Show the installed pydantic version (the recorded output is '2.6.1').
import pydantic
pydantic.__version__

'2.6.1'

In [None]:
# Show the installed pyarrow version (the recorded output is '11.0.0').
# NOTE(review): pyarrow is not referenced by any other visible cell — confirm this check is still needed.
import pyarrow
pyarrow.__version__

'11.0.0'

In [None]:
class Output(BaseModel):
    # Schema for one structured result: a text field plus an integer score.
    # (Documented with comments rather than a class docstring so the model's
    # generated schema is left exactly as before.)
    chain_of_thought: str = Field(max_length=60)  # at most 60 characters
    answer: int  # restricted to 0 or 1 by the validator below

    @field_validator("answer")
    @classmethod
    def binary_check(cls, v):
        """Return `v` unchanged when it is 0 or 1; otherwise raise ValueError."""
        if v in (0, 1):
            return v
        raise ValueError("Has to be zero or one")

In [None]:
# Build an Outlines JSON generator from the loaded model and the `Output` pydantic model.
generator = outlines.generate.json(mistral, Output)

In [None]:
# Try the generator on a single-element list of prompts.
# NOTE(review): the recorded output for this cell is a KeyboardInterrupt, so the
# saved run did not finish — confirm the call completes before relying on it.
generator(["What is 2 - 2?"])

KeyboardInterrupt: 

In [None]:
# Chat-style relevance scoring: system prompt -> user prompt -> free-form
# reasoning -> answer restricted to the listed options.
# NOTE(review): `mistral + ...` together with the `system()` / `user()` /
# `assistant()` role blocks, `gen` and `select` are guidance constructs, while
# the visible cell that binds `mistral` builds it with
# `outlines.models.transformers`. Confirm `mistral` is a guidance model here.
with system():
    lm = mistral + sys_prompt

with user():
    lm += prompt

with assistant():
    # Token budget comes from the shared config (500) instead of a repeated literal.
    lm += gen(max_tokens=config.max_new_tokens, temperature=config.temperature, name="chain_of_thought")

with user():
    # `select` captures one of the two listed options under the name "answer".
    lm += "Give a score of either 0: (Not relevant) or 1: (Relevant) for the above abstract. Answer: " + select([0, 1], name="answer")

KeyboardInterrupt: 