# Experiment

This notebook further assesses an LLM's ability to produce PRISMA-like reviews by generating its responses to the three questions about [glutamate](https://en.wikipedia.org/wiki/Glutamate_(neurotransmitter)) in `glutamate/glutamate.ipynb`.

In [None]:
# The three review questions, asked of both the bare LLM and the RAG pipeline.
q1, q2, q3 = (
    "What is glutamate?",
    "How does glutamate change in response to exercise?",
    "What is log2 fold change of glutamate in response to exercise?",
)

### Requirements

In [None]:
import os

from apikey import PINECONE_API_KEY, PINECONE_ENV, HF_AUTH_TOKEN

In [None]:
!pip install -qU \
  transformers==4.31.0 \
  sentence-transformers==2.2.2 \
  pinecone-client==2.2.2 \
  accelerate==0.21.0 \
  einops==0.6.1 \
  langchain==0.0.240 \
  xformers==0.0.20 \
  bitsandbytes==0.41.0

### Inputs

In [None]:
# Hugging Face checkpoint name of the chat model under test.
# NOTE(review): the "Initialize LLM" cell rebinds `model` to the loaded model
# object, so this name only holds the string until that cell has run.
model = "meta-llama/Llama-2-13b-chat-hf"
# Passed as `temperature` to the text-generation pipeline and logged with every result.
temperature = 0.0

### Set Up Experiment

In [None]:
# Initialize Embedding Model

from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Sentence-transformers checkpoint whose `embed_query` is later handed to the vector store.
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

# Use the current CUDA device when one is available, otherwise fall back to CPU.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# The same device string is supplied both for loading the model and for encode calls.
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': 32}
)

In [None]:
import pinecone

# Authenticate the Pinecone client with the credentials imported from `apikey`.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)

# Open a handle to the index that backs the RAG retriever.
index_name = 'glutamate'
index = pinecone.Index(index_name)

# Last expression of the cell, so the index statistics are displayed as output.
# NOTE(review): the recorded output of this cell shows total_vector_count 0 —
# confirm the index is populated before relying on the RAG answers.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [None]:
# Initialize LLM

from torch import cuda, bfloat16
import transformers

# The Inputs cell binds `model` to the checkpoint name, but this cell rebinds
# `model` to the loaded model object further down. Only copy the name while
# `model` is still a string, so that re-running this cell reuses the
# previously saved `model_id` instead of passing a model object to
# `from_pretrained`.
if isinstance(model, str):
    model_id = model

# Device string used only for the status message printed at the end of the cell.
device = f"cuda:{cuda.current_device()}" if cuda.is_available() else "cpu"

# set quantization configuration to load large model with less GPU memory
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# begin initializing HF items
hf_auth = HF_AUTH_TOKEN
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

# NOTE(review): trust_remote_code=True permits running code shipped with the
# checkpoint repository — confirm it is actually required for this model.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map="auto",
    use_auth_token=hf_auth
)
# Switch to inference mode; this notebook only generates text.
model.eval()
print(f"Model loaded on {device}")

(…)a-2-13b-chat-hf/resolve/main/config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]



(…)esolve/main/model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

(…)t-hf/resolve/main/generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Model loaded on cuda:0


In [None]:
# Initialize Model Tokenizer

# Load the tokenizer from the same checkpoint id and auth token used for the model.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

(…)at-hf/resolve/main/tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]



tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

(…)-13b-chat-hf/resolve/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

(…)-hf/resolve/main/special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
# Initialize Pipeline

# Wrap the loaded model and tokenizer in a Hugging Face text-generation pipeline.
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,    # langchain expects the full text
    task='text-generation',
    # model parameters
    # NOTE(review): `temperature` is 0.0 in the Inputs cell and no sampling flag
    # is set here — confirm how the pinned transformers version treats that.
    temperature=temperature,
    max_new_tokens=512,
    repetition_penalty=1.1
)

In [None]:
# Implement in LangChain

from langchain.llms import HuggingFacePipeline

# Adapt the Hugging Face pipeline to a LangChain LLM object; it is later called
# directly with a question and also passed to the retrieval chain.
llm = HuggingFacePipeline(pipeline=generate_text)

In [None]:
# Initialize LangChain Vector Store

from langchain.vectorstores import Pinecone

# Metadata field that holds the text content of each stored vector.
text_field = "text"

# Positional arguments: the index handle, the query-embedding function,
# and the metadata field to read text from.
vectorstore = Pinecone(index, embed_model.embed_query, text_field)

In [None]:
# Create RAG Pipeline

from langchain.chains import RetrievalQA

# Build a question-answering chain that pairs the LLM with the Pinecone-backed
# retriever. The chain is called with a question and its "result" entry is the
# answer text used later in the notebook.
# NOTE(review): chain_type='stuff' presumably places all retrieved documents
# into a single prompt — confirm against the pinned LangChain version.
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=vectorstore.as_retriever(),
    return_source_documents=True
)

### Run Experiment

In [None]:
def get_llm_response(question, llm=llm):
    """Ask the LLM the question directly, without retrieval.

    Args:
        question: Prompt text passed to the LLM as-is.
        llm: Callable LLM to query. Defaults to the notebook-level `llm`
            as it was bound when this function was defined.

    Returns:
        Whatever `llm(question)` returns.
    """
    return llm(question)

def get_rag_response(question):
    """Answer the question through the notebook-level `rag_pipeline`.

    Args:
        question: Question text passed to the retrieval chain.

    Returns:
        The "result" entry of the chain's output.
    """
    return rag_pipeline(question)["result"]

In [None]:
# Question 1
llm_response = get_llm_response(q1, llm)
rag_response = get_rag_response(q1)

# Create the output directory first so the append below cannot fail with
# FileNotFoundError after the generations have already been computed.
os.makedirs("results", exist_ok=True)

# Explicit UTF-8 keeps non-ASCII characters in the model output writable
# regardless of the platform's default encoding.
with open("results/q1.txt", "a", encoding="utf-8") as file:
    results = [
        "\n",
        # Log `model_id` (the checkpoint name): by this point `model` has been
        # rebound to the loaded model object, not the name string.
        f"Model: {model_id} \n",
        f"Temperature: {temperature} \n",
        f"LLM: {llm_response} \n",
        f"RAG: {rag_response} \n",
        "\n"
    ]
    file.writelines(results)

In [None]:
# Question 2
llm_response = get_llm_response(q2, llm)
rag_response = get_rag_response(q2)

# Create the output directory first so the append below cannot fail with
# FileNotFoundError after the generations have already been computed.
os.makedirs("results", exist_ok=True)

# Explicit UTF-8 keeps non-ASCII characters in the model output writable
# regardless of the platform's default encoding.
with open("results/q2.txt", "a", encoding="utf-8") as file:
    results = [
        "\n",
        # Log `model_id` (the checkpoint name): by this point `model` has been
        # rebound to the loaded model object, not the name string.
        f"Model: {model_id} \n",
        f"Temperature: {temperature} \n",
        f"LLM: {llm_response} \n",
        f"RAG: {rag_response} \n",
        "\n"
    ]
    file.writelines(results)

In [None]:
# Question 3
llm_response = get_llm_response(q3, llm)
rag_response = get_rag_response(q3)

# Create the output directory first so the append below cannot fail with
# FileNotFoundError after the generations have already been computed.
os.makedirs("results", exist_ok=True)

# Explicit UTF-8 keeps non-ASCII characters in the model output writable
# regardless of the platform's default encoding.
with open("results/q3.txt", "a", encoding="utf-8") as file:
    results = [
        "\n",
        # Log `model_id` (the checkpoint name): by this point `model` has been
        # rebound to the loaded model object, not the name string.
        f"Model: {model_id} \n",
        f"Temperature: {temperature} \n",
        f"LLM: {llm_response} \n",
        f"RAG: {rag_response} \n",
        "\n"
    ]
    file.writelines(results)