<h1>Implementing a RAG process</h1>

Importing libraries:

In [38]:
import warnings
warnings.filterwarnings("ignore")

In [39]:
from tqdm import tqdm
import torch

<h2>Creating a vector database</h2>

Defining embedding model:

In [7]:
!pip install -qU langchain-huggingface

In [8]:
!pip install sentence-transformers



In [9]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

Defining a vector database:

In [None]:
from langchain_chroma import Chroma

database_creation_flag = False

if database_creation_flag:
    vector_store = Chroma(
        collection_name="example_collection",
        embedding_function=embeddings,
        persist_directory="./chroma_langchain_db"
    )

Processing input files:

In [11]:
docs = []
for file_name in ['test.txt', 'train.txt', 'val.txt']:
    with open(file_name, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f if line.strip()]


    for i in range(0, len(lines), 2):
        if i+1 < len(lines):
            prompt = lines[i].replace("Prompt:", "").strip().strip('"')
            response = lines[i+1].replace("Response:", "").strip().strip('"')
            docs.append(prompt + '\n' + response)

In [12]:
docs[:3]

['Tell me the common excipient combinations for a Chewable tablet drug containing the active ingredient calcium carbonate.\nA Chewable tablet drug containing calcium carbonate typically uses excipients such as acacia, magnesium stearate, maltodextrin, sucrose.',
 'Tell me the common excipient combinations for a Solution drug containing the active ingredient levetiracetam.\nA Solution drug containing levetiracetam typically uses excipients such as acesulfame potassium, citric acid monohydrate, glycerin, glycyrrhizin, maltitol, methylparaben, propylparaben, water, sodium citrate.',
 'Tell me the common excipient combinations for a Chewable tablet drug containing the active ingredient sodium fluoride, vitamin a, ascorbic acid, sodium ascorbate, cholecalciferol, .alpha.-tocopherol acetate, dl-, thiamine mononitrate, riboflavin, niacinamide, pyridoxine hydrochloride, folic acid, cyanocobalamin.\nA Chewable tablet drug containing sodium fluoride, vitamin a, ascorbic acid, sodium ascorbate, c

In [13]:
len(docs)

9076

Placing documents into a vector database:

In [14]:
from langchain_core.documents import Document

docs = [Document(page_content=d) for d in docs]

In [None]:
#This cell is to be run if you want to delete the existing database and create a new one
delete_database_flag = False
if delete_database_flag:
    name = vector_store._collection.name
    vector_store._client.delete_collection(name)
    vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db")

In [None]:
update_database_flag = False
if update_database_flag:
    for doc in tqdm(docs):
        vector_store.add_documents([doc])

100%|██████████| 9076/9076 [10:35<00:00, 14.27it/s]


<h2>Combined code for a model with a vector-database given</h2>

A code below can be used in a final project:

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma   # pip install langchain-chroma

#Defining a path to a database
PERSIST_DIR = "chroma_langchain_db"

#Defining the embedding model used for a database creation
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

#Opening the database with the defined parameters
vectordb = Chroma(
    collection_name="example_collection",
    persist_directory=PERSIST_DIR,
    embedding_function=embeddings,
)

#Defining a function to generate context from the database based on a question
def generate_context(vector_storage, question):
    retrieved_docs = vector_storage.similarity_search(question)
    docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
    return docs_content

  from .autonotebook import tqdm as notebook_tqdm


Examle run:

In [2]:
generate_context(vectordb, "Tell me the common excipient combinations for a Chewable tablet drug containing the active ingredient vitamin a, ascorbic acid, vitamin d, .alpha.-tocopherol acetate, dl-, thiamine mononitrate, riboflavin, niacin, pyridoxine, folic acid, cyanocobalamin, sodium fluoride.")

'Tell me the common excipient combinations for a Chewable tablet drug containing the active ingredient vitamin a, ascorbic acid, vitamin d, .alpha.-tocopherol acetate, dl-, thiamine mononitrate, riboflavin, niacin, pyridoxine, folic acid, cyanocobalamin, sodium fluoride.\nA Chewable tablet drug containing vitamin a, ascorbic acid, vitamin d, .alpha.-tocopherol acetate, dl-, thiamine mononitrate, riboflavin, niacin, pyridoxine, folic acid, cyanocobalamin, sodium fluoride typically uses excipients such as citric acid monohydrate, magnesium stearate, mannitol, cellulose, microcrystalline, sucralose, talc, xylitol.\n\nTell me the common excipient combinations for a Chewable tablet drug containing the active ingredient vitamin a, ascorbic acid, vitamin d, .alpha.-tocopherol acetate, dl-, thiamine mononitrate, riboflavin, niacin, pyridoxine, folic acid, cyanocobalamin, sodium fluoride.\nA Chewable tablet drug containing vitamin a, ascorbic acid, vitamin d, .alpha.-tocopherol acetate, dl-, th