In [None]:
import os
from typing import List, Dict

import torch
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores.pgvector import PGVector
from peft import AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# NOTE: the first run may take a while.
# Base checkpoint id, and the directory of the PEFT (LoRA) adapter. Set
# `lora_path` to a falsy value ("" or None) to load the plain base model.
model_path_or_id = "mistralai/Mistral-7B-v0.1"
lora_path = "./mistral-7b-int4-dolly"

# Loading options that are the same whether or not an adapter is used:
# 4-bit loading with float16 compute, and flash attention 2 requested.
shared_load_kwargs = dict(
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    bnb_4bit_compute_dtype=torch.float16,
    use_flash_attention_2=True,
    load_in_4bit=True,
)

if not lora_path:
    # No adapter configured: load the base checkpoint and its tokenizer.
    model = AutoModelForCausalLM.from_pretrained(model_path_or_id, **shared_load_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(model_path_or_id)
else:
    # Adapter configured: load the model through the PEFT auto class, and take
    # the tokenizer from the adapter directory as well.
    model = AutoPeftModelForCausalLM.from_pretrained(lora_path, **shared_load_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(lora_path)

def generate(prompt, max_new_tokens = 100, temperature = 0.3):
    """Generate a completion for ``prompt`` with the module-level model.

    Relies on the module-level ``model`` and ``tokenizer`` objects.

    Args:
        prompt: Text the generation is conditioned on.
        max_new_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature. ``None`` or a value <= 0 switches
            to greedy decoding, because sampling needs a strictly positive
            temperature.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        not included), with special tokens stripped.
    """
    # Tokenize the input and move every tensor (ids and attention mask) to
    # the device the model lives on instead of assuming the default GPU.
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
    ).to(model.device)
    prompt_token_count = encoded["input_ids"].shape[1]

    # Nucleus sampling is only valid with a positive temperature; otherwise
    # decode greedily and leave the sampling parameters out entirely.
    use_sampling = temperature is not None and temperature > 0
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": use_sampling,
        "use_cache": True,
    }
    if use_sampling:
        generation_kwargs["top_p"] = 0.9
        generation_kwargs["temperature"] = temperature

    # Generate new tokens based on the prompt, up to max_new_tokens.
    with torch.inference_mode():
        outputs = model.generate(**encoded, **generation_kwargs)

    # The returned sequence starts with the prompt tokens. Drop them at token
    # level: cutting the decoded string by len(prompt) is wrong whenever the
    # decoded prompt differs from the original text (normalisation, stripped
    # special tokens, truncation).
    new_token_ids = outputs[0, prompt_token_count:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True)

In [None]:
# The connection to the database.
# Every parameter can be overridden with a PGVECTOR_* environment variable so
# that real credentials never have to be written into (or committed with) the
# notebook. The fallbacks are the original placeholder values.
CONNECTION_STRING = PGVector.connection_string_from_db_params(
    driver = os.environ.get("PGVECTOR_DRIVER", "psycopg2"),
    host = os.environ.get("PGVECTOR_HOST", "localhost"),
    port = int(os.environ.get("PGVECTOR_PORT", "5432")),
    database = os.environ.get("PGVECTOR_DATABASE", "postgres"),
    user = os.environ.get("PGVECTOR_USER", "username"),
    password = os.environ.get("PGVECTOR_PASSWORD", "password"),
)

# The embedding function used for the vector store. It is passed to PGVector
# below; the embeddings are computed on the GPU and normalized.
embedding_function = SentenceTransformerEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs = {'device': 'cuda'},
    encode_kwargs = {'normalize_embeddings': True}
)

# Creates the database connection to our existing DB
db = PGVector(
    connection_string = CONNECTION_STRING,
    collection_name = "embeddings",
    embedding_function = embedding_function
)

In [None]:
# Prompt layout for retrieval-augmented generation: a system instruction, the
# retrieved context, the question, and an open "Response" section for the
# model to complete.
RAG_PROMPT_TEMPLATE = """### System:
You are an information extraction system.  Use only the Context provide below to answer the Question.

### Context:
{context}

### Question:
{question}

### Response:
"""

empty_context = ""
question = "What did Dr. James Harper say about NeuroGlyde?"

# Retrieve the single most similar document for the question.
docs_with_scores = db.similarity_search_with_score(question, k = 1)

# The search can return no rows (e.g. an empty collection). Fall back to an
# empty context instead of failing with an IndexError, and say so, because
# the answer is then not grounded in any retrieved document.
if docs_with_scores:
    context = docs_with_scores[0][0].page_content
else:
    print("Warning: no documents retrieved; generating with an empty context.\n")
    context = empty_context

context_prompt = RAG_PROMPT_TEMPLATE.format(
    context = context,
    question = question
)

res = generate(context_prompt, max_new_tokens = 100, temperature = 0.3)

print(f"Question:\n{question}\n")
print(f"Generated Response:\n{res}")