In [1]:
!pip install \
    transformers==4.30.2 \
    einops==0.6.1 \
    accelerate==0.20.3 \
    datasets==2.14.5 \
    chromadb \
    sentence-transformers==2.2.2
!pip install -U datasets
!pip install xformers
!pip install torch==2.0.1

Collecting transformers==4.30.2
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops==0.6.1
  Downloading einops-0.6.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate==0.20.3
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.14.5
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting chromadb
  Downloading chromadb-0.5.0-py3-none-any.whl (526 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.8/526.8

In [2]:
# NOTE(review): this force-reinstalls numpy and a pre-release torch build from
# the PyTorch nightly cu117 index, replacing the torch==2.0.1 pin from the
# install cell above. Packages compiled against a different torch (xformers,
# for example) can then report a version mismatch on import — confirm this
# cell is still required before keeping it.
!pip3 install numpy --pre torch --force-reinstall --index-url https://download.pytorch.org/whl/nightly/cu117


Looking in indexes: https://download.pytorch.org/whl/nightly/cu117
Collecting numpy
  Downloading https://download.pytorch.org/whl/nightly/numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch
  Downloading https://download.pytorch.org/whl/nightly/cu117/torch-2.1.0.dev20230621%2Bcu117-cp310-cp310-linux_x86_64.whl (1886.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 GB[0m [31m566.0 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting filelock (from torch)
  Downloading https://download.pytorch.org/whl/nightly/filelock-3.13.1-py3-none-any.whl (11 kB)
Collecting typing-extensions (from torch)
  Downloading https://download.pytorch.org/whl/nightly/typing_extensions-4.8.0-py3-none-any.whl (31 kB)
Collecting sympy (from torch)
  Downloading https://download.pytorch.org/whl/nightly/sympy-1.12-py3-

In [1]:
# Show which chromadb release is installed, looked up from the package
# metadata of the running environment.
from importlib.metadata import version
version("chromadb")

'0.5.0'

In [9]:
import csv

# Placeholder dictionary; nothing in this cell writes to it.
data_dict = {}
# One record per CSV row, filled in below.
closed_qa_dataset = []

# Read the question/answer CSV and remap each row onto the
# instruction / context / source / category layout used by later cells.
with open('/content/medquad_qa_pairs.csv', encoding="utf-8",newline='') as qa_file:
    closed_qa_dataset.extend(
        {
            'instruction': record['question'],
            'context': record['answer'],
            'source': record['source'],
            'category': record['focus_area'],
        }
        for record in csv.DictReader(qa_file)
    )

# Display the first converted record as a quick sanity check.
closed_qa_dataset[0]

{'instruction': 'What is (are) Glaucoma ?',
 'context': "Glaucoma is a group of diseases that can damage the eye's optic nerve and result in vision loss and blindness. While glaucoma can strike anyone, the risk is much greater for people over 60. How Glaucoma Develops  There are several different types of glaucoma. Most of these involve the drainage system within the eye. At the front of the eye there is a small space called the anterior chamber. A clear fluid flows through this chamber and bathes and nourishes the nearby tissues. (Watch the video to learn more about glaucoma. To enlarge the video, click the brackets in the lower right-hand corner. To reduce the video, press the Escape (Esc) button on your keyboard.) In glaucoma, for still unknown reasons, the fluid drains too slowly out of the eye. As the fluid builds up, the pressure inside the eye rises. Unless this pressure is controlled, it may cause damage to the optic nerve and other parts of the eye and result in loss of vision

In [15]:
import chromadb
from sentence_transformers import SentenceTransformer

class VectorStore:
    """Sentence-embedding index over question/answer records, backed by ChromaDB.

    Each record is embedded as "<instruction>. <context>", while only the
    context text is stored as the retrievable document.
    """

    def __init__(self, collection_name):
        """Load the embedding model and open the named Chroma collection.

        Args:
            collection_name: Name of the Chroma collection to use.
        """
        # Initialize the embedding model
        self.embedding_model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
        self.chroma_client = chromadb.Client()
        # get_or_create (instead of create) so re-running the cell in the same
        # kernel reuses the collection rather than raising because it exists.
        self.collection = self.chroma_client.get_or_create_collection(name=collection_name)

    def populate_vectors(self, dataset, batch_size=256):
        """Embed every record of ``dataset`` and store it in the collection.

        Records are encoded and written in batches rather than one at a time.
        Ids are positional (``id_0``, ``id_1``, ...), and records are upserted,
        so populating again with the same dataset overwrites the earlier
        entries instead of duplicating or rejecting them.

        Args:
            dataset: Iterable of mappings with 'instruction' and 'context' keys.
            batch_size: Number of records embedded and written per call.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        ids, texts, documents = [], [], []
        for i, item in enumerate(dataset):
            ids.append(f"id_{i}")
            texts.append(f"{item['instruction']}. {item['context']}")
            documents.append(item['context'])
            if len(ids) == batch_size:
                self._store_batch(ids, texts, documents)
                # Start fresh lists; the previous ones were handed to the collection.
                ids, texts, documents = [], [], []

        # Write whatever is left over after the last full batch.
        if ids:
            self._store_batch(ids, texts, documents)

    def _store_batch(self, ids, texts, documents):
        """Encode ``texts`` in one model call and upsert them with their ids."""
        embeddings = self.embedding_model.encode(texts).tolist()
        self.collection.upsert(embeddings=embeddings, documents=documents, ids=ids)

    def search_context(self, query, n_results=1):
        """Return the raw Chroma query result for the records nearest to ``query``.

        Args:
            query: Free-text question to embed and look up.
            n_results: Number of nearest documents to request.

        Returns:
            Whatever ``collection.query`` returns for a single-query batch.
        """
        query_embedding = self.embedding_model.encode(query).tolist()
        # Pass the embedding explicitly as a one-item batch.
        return self.collection.query(query_embeddings=[query_embedding], n_results=n_results)


# Create the vector store, naming the Chroma collection it will use.
vector_store = VectorStore(collection_name="knowledge-base2")

# Embed and index every record held in closed_qa_dataset (built in an earlier cell).
vector_store.populate_vectors(dataset=closed_qa_dataset)

In [2]:
import transformers
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

class Falcon7BInstructModel:
    """Thin wrapper around a Hugging Face text-generation pipeline for
    ``tiiuae/falcon-7b-instruct``."""

    def __init__(self):
        """Build the tokenizer and generation pipeline for the Falcon model."""
        # Model name
        model_name = "tiiuae/falcon-7b-instruct"
        self.pipeline, self.tokenizer = self.initialize_model(model_name)

    def initialize_model(self, model_name):
        """Create the tokenizer and text-generation pipeline for ``model_name``.

        Args:
            model_name: Hugging Face Hub model identifier.

        Returns:
            A ``(pipeline, tokenizer)`` tuple.
        """
        # Tokenizer initialization
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Pipeline setup for text generation.
        # NOTE(review): trust_remote_code=True runs Python code shipped with the
        # model repository; only keep it for repositories you trust.
        generator = transformers.pipeline(
            "text-generation",
            model=model_name,
            tokenizer=tokenizer,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            device_map="auto",
        )

        return generator, tokenizer

    def generate_answer(self, question, context=None, max_new_tokens=256):
        """Generate a sampled completion for ``question``.

        Args:
            question: The user question.
            context: Optional background text placed before the question,
                separated by a blank line. ``None`` or an empty string means
                the question is sent on its own.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The pipeline's raw output, unchanged: a list with one dict per
            returned sequence, each holding a 'generated_text' entry (see the
            printed results in this notebook).
        """
        # Skip the context block entirely when there is nothing to prepend,
        # so the prompt never starts with stray blank lines.
        prompt = f"{context}\n\n{question}" if context else question

        # Bound the *generated* length rather than the total length: a total
        # cap (max_length) also counts the prompt, so a long retrieved context
        # would leave little or no room for the answer.
        sequences = self.pipeline(
            prompt,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_k=10,
            num_return_sequences=1,
            eos_token_id=self.tokenizer.eos_token_id,
            # Same value transformers falls back to on its own; passing it
            # explicitly avoids the per-call "Setting pad_token_id" notice.
            pad_token_id=self.tokenizer.eos_token_id,
        )

        return sequences

In [3]:
# Load Falcon-7B-Instruct once; the retrieval cell further down reuses this instance.
falcon_model = Falcon7BInstructModel()

# Baseline run: send the question to the model with no retrieved context.
user_question = "What are the signs and symptoms of Kaposi sarcoma?"
answer = falcon_model.generate_answer(question=user_question)

print(f"Result: {answer}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

    PyTorch 2.3.0+cu121 with CUDA 1201 (you have 2.1.0.dev20230621+cu117)
    Python  3.10.14 (you have 3.10.12)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details
The model 'FalconForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM'

Result: [{'generated_text': 'What are the signs and symptoms of Kaposi sarcoma?\nThe common signs and symptoms of Kaposi sarcoma include skin lesions, skin ulcers, lymph node swelling, and chest pain. However, it can also affect internal organs and cause severe complications.'}]


In [16]:

# Requires `vector_store`, `falcon_model` and `user_question` from earlier cells.

# Look up the stored document(s) closest to the question.
context_response = vector_store.search_context(user_question)

# The result's 'documents' entry is indexed per query; only one query was sent,
# so take the first entry. Join with a blank line so that several retrieved
# documents do not run into each other (a single match is unchanged).
retrieved_documents = context_response['documents'][0]
context = "\n\n".join(retrieved_documents)

# Generate an answer with the retrieved text placed before the question.
# If nothing was retrieved, fall back to asking the question on its own.
enriched_answer = falcon_model.generate_answer(user_question, context=context or None)

print(f"Result: {enriched_answer}")

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Result: [{'generated_text': 'A sign of adult soft tissue sarcoma is a lump or swelling in soft tissue of the body. A sarcoma may appear as a painless lump under the skin, often on an arm or a leg. Sarcomas that begin in the abdomen may not cause signs or symptoms until they get very big. As the sarcoma grows bigger and presses on nearby organs, nerves, muscles, or blood vessels, signs and symptoms may include:          - Pain.    - Trouble breathing.        Other conditions may cause the same signs and symptoms. Check with your doctor if you have any of these problems.\n\nWhat are the signs and symptoms of Kaposi sarcoma?\nKaposi sarcoma is a type of cancer that causes tumors to form in the lymph or blood vessels of the body. The most common sign and symptom of Kaposi sarcoma is the appearance of dark, velvety, purple spots on the skin, often on the scalp. These spots may grow to form a larger patch or lump. Other symptoms of Kaposi sarcoma may include:            - A rash that is not 

In [17]:
# Display the retrieved context text that was passed to the model alongside the question.
context

'A sign of adult soft tissue sarcoma is a lump or swelling in soft tissue of the body. A sarcoma may appear as a painless lump under the skin, often on an arm or a leg. Sarcomas that begin in the abdomen may not cause signs or symptoms until they get very big. As the sarcoma grows bigger and presses on nearby organs, nerves, muscles, or blood vessels, signs and symptoms may include:          - Pain.    - Trouble breathing.        Other conditions may cause the same signs and symptoms. Check with your doctor if you have any of these problems.'