In [1]:
!pip install langchain
!pip install torch
!pip install transformers
!pip install sentence-transformers
!pip install datasets
!pip install faiss-cpu
!pip install -U langchain-community
!pip install tqdm
!pip install bitsandbytes
!pip install huggingface_hub
!pip install --upgrade accelerate
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from langchain_community.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from datasets import load_dataset
from tqdm import tqdm
import multiprocessing as mp


Collecting langchain
  Downloading langchain-0.2.14-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.3.0,>=0.2.32 (from langchain)
  Downloading langchain_core-0.2.35-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.105-py3-none-any.whl.metadata (13 kB)
Collecting packaging<25,>=23.2 (from langchain-core<0.3.0,>=0.2.32->langchain)
  Downloading packaging-24.1-py3-none-any.whl.metadata (3.2 kB)
Downloading langchain-0.2.14-py3-none-any.whl (997 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m997.8/997.8 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading langchain_core-0.2.35-py3-none-any.whl (394 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m394.9/394.9 kB[0m [31m17.1 MB/s[0m eta [36m0:00:0

NameError: name '_name_' is not defined

In [None]:
def load_and_process_dataset(args):
    """Load one Hugging Face dataset and attach a combined Q&A string to each document.

    Args:
        args: A tuple ``(dataset_name, input_col, output_col)`` or
            ``(dataset_name, input_col, output_col, config)``. A single tuple is
            used (rather than separate parameters) so the function can be mapped
            directly with ``Pool.imap``.

    Returns:
        list: The documents produced by ``HuggingFaceDatasetLoader`` with
        ``output_col`` as ``page_content``; each document additionally gets a
        ``metadata['full_qa']`` entry of the form
        ``"Input: <input_col value>\\nOutput: <page_content>"``.

    Raises:
        ValueError: If ``args`` has fewer than 3 or more than 4 items.
        KeyError: If ``input_col`` is not among the row's remaining columns.
    """
    if len(args) == 3:
        dataset_name, input_col, output_col = args
        config = None
    else:
        dataset_name, input_col, output_col, config = args

    # A dataset configuration is selected through the loader's `name` argument;
    # appending it to the repo id ("repo/config") does not address a config.
    loader = HuggingFaceDatasetLoader(dataset_name, output_col, name=config or None)
    label = f"{dataset_name}/{config}" if config else dataset_name

    processed_data = []
    # The loader keeps every column other than `output_col` in `doc.metadata`,
    # so the question text is read from the same row the answer came from.
    # This avoids downloading the dataset a second time and avoids pairing
    # documents with rows by a positional index into the 'train' split only.
    for doc in tqdm(loader.load(), desc=f"Processing {label}", leave=False):
        input_text = doc.metadata[input_col]
        doc.metadata['full_qa'] = f"Input: {input_text}\nOutput: {doc.page_content}"
        processed_data.append(doc)

    return processed_data

def process_in_batches(docs, batch_size=1000):
    """Lazily yield consecutive slices of ``docs``, each at most ``batch_size`` long.

    The final slice is shorter when ``len(docs)`` is not a multiple of
    ``batch_size``; an empty ``docs`` yields nothing.
    """
    offsets = range(0, len(docs), batch_size)
    yield from (docs[lo:lo + batch_size] for lo in offsets)

if __name__ == "__main__":
    # Each entry: (dataset repo id, question column, answer column[, config]).
    datasets = [
        ("ruslanmv/ai-medical-chatbot", "Patient", "Doctor"),
        ("lavita/ChatDoctor-iCliniq", "input", "answer_chatdoctor"),
        ("xDAN-datasets/medical_meadow_wikidoc_patient_information_6k", "input", "output"),
        ("mlabonne/medical_meadow_medqa", "instruction","output"),
        ("medalpaca/medical_meadow_medical_flashcards", "input", "output"),
        ("lavita/ChatDoctor-HealthCareMagic-100k","input","output")
    ]

    # Single source of truth for where the index is written and read back.
    INDEX_DIR = "RAG_DB_Multiple_Answer_Based"

    # Parallel processing of datasets
    print("Loading and processing datasets:")
    # One worker per dataset is the most that can ever be busy; spawning
    # cpu_count() workers for six tasks only costs memory.
    n_workers = max(1, min(len(datasets), mp.cpu_count()))
    with mp.Pool(processes=n_workers) as pool:
        all_data = list(tqdm(pool.imap(load_and_process_dataset, datasets), total=len(datasets), desc="Datasets"))
    # Flatten the per-dataset lists into one list of documents.
    all_data = [item for sublist in all_data for item in sublist]

    print(f"Total documents loaded: {len(all_data)}")

    print("Splitting documents:")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = []
    # Split one document at a time so the progress bar advances per document.
    for doc in tqdm(all_data, desc="Splitting"):
        docs.extend(text_splitter.split_documents([doc]))

    print(f"Total chunks after splitting: {len(docs)}")

    # Fail early with a clear message: with no chunks the index would never be
    # created and the save step below would crash on `None`.
    if not docs:
        raise RuntimeError("No document chunks were produced; nothing to index.")

    embeddings = HuggingFaceEmbeddings()

    print("Creating FAISS index...")
    db = None
    batch_size = 1000
    total_batches = (len(docs) + batch_size - 1) // batch_size  # ceil division

    for batch in tqdm(process_in_batches(docs, batch_size), total=total_batches, desc="Vectorizing"):
        if db is None:
            # The first batch creates the index; later batches are appended.
            db = FAISS.from_documents(batch, embeddings)
        else:
            db.add_documents(batch)

    print("Saving FAISS index...")
    db.save_local(INDEX_DIR)
    print("FAISS index created and saved.")

    print("Loading FAISS index...")
    # SECURITY: loading the index deserializes pickled data. This is acceptable
    # here only because the files were written by this notebook just above;
    # never enable this flag for an index obtained from an untrusted source.
    loaded_db = FAISS.load_local(INDEX_DIR, embeddings, allow_dangerous_deserialization=True)
    print("FAISS index loaded.")

    def query_database(query, k=6):
        """Print the ``k`` most similar chunks for ``query`` with their full Q&A text."""
        results = loaded_db.similarity_search(query, k=k)
        for doc in results:
            print("Retrieved answer:", doc.page_content[:100] + "...")
            print("Full Q&A:")
            print(doc.metadata['full_qa'])
            print("\n---\n")

    query_database("What are the symptoms of diabetes?")

Loading and processing datasets:




Downloading readme:   0%|          | 0.00/863 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/317 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/142M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.40M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5850 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/10178 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/256916 [00:00<?, ? examples/s]



Downloading readme:   0%|          | 0.00/581 [00:00<?, ?B/s]



Downloading readme:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/542 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.37M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/33955 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/70.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7321 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/112165 [00:00<?, ? examples/s]

Datasets: 100%|██████████| 6/6 [01:12<00:00, 12.13s/it]                             


Total documents loaded: 426385
Splitting documents:


Splitting: 100%|██████████| 426385/426385 [01:52<00:00, 3777.32it/s] 
  warn_deprecated(


Total chunks after splitting: 456445


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Creating FAISS index...


Vectorizing:  10%|▉         | 45/457 [3:24:34<34:44:42, 303.60s/it]

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the read-only Kaggle input directory and
# print its full path, one per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session