In [1]:
# Import the required libraries
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from datasets import Dataset, load_dataset
from peft import LoraConfig, PeftModel

from langchain.embeddings.huggingface import HuggingFaceEmbeddings

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain

In [2]:
# Checkpoint shared by the tokenizer and the model loaded at the end of this cell
model_name = 'mistralai/Mistral-7B-Instruct-v0.1'

# Tokenizer
# The model below is loaded from this same checkpoint without trust_remote_code,
# so the tokenizer is not given permission to execute repository code either.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding and pad on the right-hand side
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


# bitsandbytes parameters
# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models (name of a torch dtype attribute)
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

# Set up quantization config: resolve the dtype name to the actual torch dtype
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
# Only query the device when CUDA is actually available:
# torch.cuda.get_device_capability() raises if no GPU is visible.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)


# Load the pre-trained model with the 4-bit quantization config defined above
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)



tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

Your GPU supports bfloat16: accelerate training with bf16=True


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [3]:
# Text-generation pipeline around the quantized model and its tokenizer.
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    # Sampling has to be switched on for `temperature` to have any effect;
    # without it generation is greedy and the temperature setting is ignored.
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    # Return the prompt together with the generated continuation
    return_full_text=True,
    # Upper bound on the number of newly generated tokens per call
    max_new_tokens=1000,
)

In [4]:
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline so it can be
# passed as the `llm` argument when the QA chain is built.
mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

In [5]:

import json 
def extract_data(data):
    """Flatten nested row/paragraph/Q&A records into four parallel lists.

    Args:
        data: Iterable of rows, each with a 'paragraphs' list. Every paragraph
            has a 'context' and a 'qas' list; every Q&A entry has 'question',
            'id' and an 'answers' list whose items carry a 'text' field.

    Returns:
        A tuple (contexts, questions, answers, ids) of equal-length lists with
        one entry per Q&A pair. The paragraph context is repeated for each of
        its questions, and each element of `answers` is the list of answer
        texts for that question (empty when the question has no answers).
    """
    # One tuple per Q&A pair; fields are read in the order context, question,
    # answers, id.
    records = [
        (
            paragraph['context'],
            qa['question'],
            [candidate['text'] for candidate in qa['answers']],
            qa['id'],
        )
        for row in data
        for paragraph in row['paragraphs']
        for qa in paragraph['qas']
    ]

    # zip(*[]) yields nothing to unpack, so handle the empty case explicitly
    if not records:
        return [], [], [], []

    # Transpose the list of tuples into four column lists
    contexts, questions, answers, ids = (list(column) for column in zip(*records))
    return contexts, questions, answers, ids


In [6]:

path = "data/train_webmd_squad_v2_full.json"
def load_data(path):
    """Read a JSON file and return its Q&A records as a Dataset.

    Args:
        path: Path to a JSON file whose top-level object has a 'data' field
            in the nested layout expected by extract_data.

    Returns:
        A Dataset with the columns 'context', 'question', 'answer' (list of
        answer texts per question) and 'id', one row per Q&A pair.
    """
    # Use a context manager so the file handle is closed even if parsing fails
    with open(path, encoding="utf-8") as json_file:
        obj = json.load(json_file)

    # Extract the 'data' field from the loaded JSON object
    obj = obj['data']

    # Process the extracted data to get context, question, answers, and IDs
    context, question, answers, ids = extract_data(obj)

    # Create a Dataset object using the processed data
    data = Dataset.from_dict({
        'context': context,
        'question': question,
        'answer': answers,
        'id': ids
    })

    # Return the constructed Dataset object
    return data


In [7]:
import json
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.text_splitter import Language

# Load data using the load_data function
# NOTE(review): `data` is not referenced again in the visible cells — confirm it is still needed.
data = load_data(path)

# Create a loader instance
# NOTE(review): the same JSON file is read a second time here as a raw file, and the
# recorded output of this cell shows it loading as a single document. Confirm that
# indexing the raw JSON text (rather than the parsed contexts in `data`) is intended.
loader = GenericLoader.from_filesystem(
    path,
    parser=LanguageParser( parser_threshold=500),  # Adjust parser settings as needed
)

# Load the documents; the trailing expression displays how many were produced
docs = loader.load()
len(docs)

1

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping chunks (chunk_size=512, chunk_overlap=64)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
texts = text_splitter.split_documents(docs)
# Display the number of chunks produced
len(texts)

140741

In [10]:
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model used to vectorize both the document chunks and the queries.
# Fall back to CPU when no GPU is visible instead of failing on a hard-coded "cuda".
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

embeddings = HuggingFaceEmbeddings(
    model_name="thenlper/gte-large",
    model_kwargs={"device": embedding_device},
    # Normalize the embedding vectors at encode time
    encode_kwargs={"normalize_embeddings": True},
)

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

onnx/config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

onnx/special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

onnx/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

onnx/tokenizer_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

onnx/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/670M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

In [11]:
# Sanity check: embed the text of the first chunk and print the length of the resulting vector
query_result = embeddings.embed_query(texts[0].page_content)
print(len(query_result))

1024


In [13]:
%%time
from langchain.vectorstores import Chroma

# Building the index embeds every chunk (see the recorded timing for this cell), and
# calling from_documents again on an already-populated directory would add the same
# chunks a second time. Reuse the persisted store when it exists; delete the
# directory to force a rebuild.
persist_directory = "db"

if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    # Re-open the existing store with the same embedding function used to build it
    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
else:
    db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)

CPU times: user 1h 10min 46s, sys: 42.8 s, total: 1h 11min 29s
Wall time: 1h 5min 26s


In [14]:
# Prompt text with two placeholders, {context} and {question}, wrapped in [INST] ... [/INST].
# NOTE(review): the bare `<>` lines look like stripped system-prompt markers
# (e.g. `<<SYS>>` / `<</SYS>>`) — confirm against the prompt format intended for this model.
template = """
<s>[INST] <>
Act as a Multiple Answer Spans Healthcare Question Answering helpful assistant and answer the user's questions in details with reasoning. Do not give any false information. In case you don't have answer, specify why the question can't be answered.
<>

{context}

{question} [/INST]
"""

In [15]:
# Turn the raw template string into a PromptTemplate that expects `context` and `question`
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

In [17]:
from langchain.chains import RetrievalQA

# Retrieval QA chain: the vector store supplies context and the wrapped Mistral
# pipeline generates the answer using the custom prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=mistral_llm,
    chain_type="stuff",
    # Retriever over the Chroma store, configured with k=2
    retriever=db.as_retriever(search_kwargs={"k": 2}),
    # Ask the chain to return the retrieved source documents along with the answer
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt},
)

In [18]:
%%time
# Run the QA chain on a single question; the bare `result` on the last line
# displays the whole returned dict.
result = qa_chain(
    "What types of exercise are best for people with asthma?"
)
result

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


CPU times: user 18.4 s, sys: 37.5 ms, total: 18.5 s
Wall time: 18.6 s


{'query': 'What types of exercise are best for people with asthma?',
 'result': "\nPeople with asthma can generally participate in most types of exercise, but it's important to choose activities that are appropriate for their individual needs and limitations. Short, intermittent periods of exertion, such as volleyball, gymnastics, baseball, walking, and wrestling, are generally well tolerated by people with exercise-induced asthma. Long periods of exertion, like soccer, distance running, basketball, and field hockey, may be less well tolerated, as are cold weather sports like ice hockey, cross-country skiing, and ice skating. However, many people with asthma are able to fully participate in these activities.\n\nSwimming is another good option for people with asthma, as it is a low-impact activity that can help improve cardiovascular fitness and lung function. It's also a great way to cool off during hot summer days.\n\nIt's important to note that people with asthma should always consul

In [19]:
# Print only the generated answer, with leading/trailing whitespace removed
print(result["result"].strip())

People with asthma can generally participate in most types of exercise, but it's important to choose activities that are appropriate for their individual needs and limitations. Short, intermittent periods of exertion, such as volleyball, gymnastics, baseball, walking, and wrestling, are generally well tolerated by people with exercise-induced asthma. Long periods of exertion, like soccer, distance running, basketball, and field hockey, may be less well tolerated, as are cold weather sports like ice hockey, cross-country skiing, and ice skating. However, many people with asthma are able to fully participate in these activities.

Swimming is another good option for people with asthma, as it is a low-impact activity that can help improve cardiovascular fitness and lung function. It's also a great way to cool off during hot summer days.

It's important to note that people with asthma should always consult with their healthcare provider before starting any new exercise routine. They may nee

In [20]:
%%time
# Ask the QA chain a question and print only the generated answer, stripped of surrounding whitespace
result = qa_chain(
    "How is obsessive-compulsive disorder diagnosed?"
)
print(result["result"].strip())

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Obsessive-compulsive disorder (OCD) is typically diagnosed based on a combination of self-reported symptoms and clinical evaluation by a mental health professional. The Diagnostic and Statistical Manual of Mental Disorders (DSM-5) provides specific criteria for diagnosing OCD, which include the presence of obsessions and compulsions that are intrusive, unwanted, and cause significant distress or impairment in daily life.

To diagnose OCD, a mental health professional will typically conduct a thorough interview with the individual to assess their symptoms and gather information about their medical history, family history, and other relevant factors. They may also administer standardized questionnaires or tests to help evaluate the severity and duration of the individual's symptoms.

It is important to note that a diagnosis of OCD requires a comprehensive evaluation by a qualified mental health professional, and self-diagnosis should not be relied upon. Additionally, it is possible for i

In [21]:
%%time
# Ask the QA chain a question and print only the generated answer, stripped of surrounding whitespace
result = qa_chain(
    "When are you more likely to get a blood clot?"
)
print(result["result"].strip())

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


A blood clot is more likely to occur when an individual is immobile for extended periods of time, such as during bed rest or sitting for long periods of time. Additionally, being pregnant or having recently given birth, being overweight, smoking, and using birth control pills or hormone replacement therapy can also increase the risk of developing a blood clot. It's important to note that blood clots can occur in anyone, regardless of age or health status, although they are less common in younger, healthy individuals.
CPU times: user 6.13 s, sys: 19.4 ms, total: 6.15 s
Wall time: 6.14 s


In [22]:
%%time
# Ask the QA chain a question and print only the generated answer, stripped of surrounding whitespace
result = qa_chain(
    "How should you lift objects to prevent back pain?"
)
print(result["result"].strip())

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


To prevent back pain when lifting objects, it is important to follow these steps:

1. Bend your knees and squat, pulling in your stomach muscles.
2. Keep your back straight and avoid bending forward at the waist.
3. Use your leg muscles to lift the object, rather than your back.
4. If necessary, carry several small loads instead of one large one, or use a cart.
5. Avoid reaching over your head and try to keep your arms close to your body.
6. Use long-handled tools to reach high shelves or objects.
7. Store items lower to reduce strain on your back.
8. Take breaks frequently and stretch your back muscles regularly.
9. Maintain good posture throughout the day, even when sitting or standing.
10. Consider using ergonomic equipment or furniture to reduce stress on your back.
CPU times: user 10.6 s, sys: 2.46 ms, total: 10.7 s
Wall time: 10.6 s


In [23]:
%%time
# Ask the QA chain a question and print only the generated answer, stripped of surrounding whitespace
result = qa_chain(
    "How can you be smart with antibiotics?"
)
print(result["result"].strip())

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


To be smart with antibiotics, it is important to understand their proper use and follow guidelines issued by healthcare professionals. This includes taking antibiotics only when necessary for a bacterial infection, using them as prescribed, taking all of the antibiotics as directed, and not sharing antibiotics with others. Additionally, preventing the spread of germs through good hygiene practices such as washing hands regularly, covering your mouth and nose when coughing or sneezing, and avoiding close contact with sick individuals can also help reduce the need for antibiotics. It is also important to note that antibiotics should not be used to treat viral infections such as the common cold or flu.
CPU times: user 8.19 s, sys: 10.9 ms, total: 8.2 s
Wall time: 8.19 s
