In [None]:
!pip install chromadb langchain-community langchain-text-splitters sentence-transformers tqdm





Collecting chromadb
  Downloading chromadb-1.4.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-text-splitters
  Downloading langchain_text_splitters-1.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.4.0-py3-none-any.whl.metadata (5.8 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.3-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentele

In [None]:
import os
from tqdm import tqdm
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_community.document_loaders import TextLoader


# --- Configuration ------------------------------------------------------------
# Root folder that holds one sub-folder of .txt textbooks per language (EN, ZH).
DATA_DIR = "/content/"
# Directory in which Chroma keeps its on-disk index.
DB_DIR = "/content/vectordb"


# --- Load raw documents -------------------------------------------------------
documents = []

print("Loading textbook files...")

for lang_folder in ["EN", "ZH"]:
    folder_path = os.path.join(DATA_DIR, lang_folder)

    # os.listdir() returns entries in arbitrary order; sorting makes the load
    # order (and therefore the chunk order) identical on every run.
    for file in sorted(os.listdir(folder_path)):
        if file.endswith(".txt"):
            file_path = os.path.join(folder_path, file)

            print("Loading:", file_path)

            loader = TextLoader(file_path, encoding="utf-8")
            docs = loader.load()

            for d in docs:
                # Tag every document with its language and originating file
                # name so the values are stored alongside each chunk.
                d.metadata["language"] = "en" if lang_folder == "EN" else "zh"
                d.metadata["source"] = file

            documents.extend(docs)


print("Total raw documents loaded:", len(documents))

# Fail early instead of building (and reporting success for) an empty database.
if not documents:
    raise FileNotFoundError("No .txt textbook files found under " + DATA_DIR)


# --- Split into chunks --------------------------------------------------------
print("Splitting into chunks...")

# NOTE(review): chunk_size is measured in characters. The embedding model used
# below is believed to cap its input at 128 tokens, so 512-character chunks
# (Chinese text in particular) may be truncated when embedded -- confirm
# against the model card and tune chunk_size if so.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=100
)

chunks = text_splitter.split_documents(documents)

print("Total chunks created:", len(chunks))


# --- Embedding model ----------------------------------------------------------
print("Loading embedding model...")

# No device is specified here, so the library's default device is used.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)


# --- Build the vector store ---------------------------------------------------
print("Creating Chroma vector database...")

# Chroma.from_documents() adds to whatever collection already lives in DB_DIR
# and generates fresh IDs, so rebuilding after an interrupted or earlier run
# would store chunks twice. Drop the existing collection first so this cell
# can be re-run safely.
Chroma(
    persist_directory=DB_DIR,
    embedding_function=embedding_model,
).delete_collection()

# With persist_directory set, Chroma (>= 0.4) writes to disk automatically;
# the deprecated vectordb.persist() call is no longer needed.
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory=DB_DIR
)

print("✅ Vector database successfully created and saved!")


Loading textbook files...
Loading: /content/EN/Obstentrics_Williams.txt
Loading: /content/EN/Immunology_Janeway.txt
Loading: /content/EN/Histology_Ross.txt
Loading: /content/EN/Anatomy_Gray.txt
Loading: /content/EN/Pathology_Robbins.txt
Loading: /content/EN/InternalMed_Harrison.txt
Loading: /content/EN/Pediatrics_Nelson.txt
Loading: /content/EN/Psichiatry_DSM-5.txt
Loading: /content/EN/First_Aid_Step1.txt
Loading: /content/EN/Gynecology_Novak.txt
Loading: /content/EN/Cell_Biology_Alberts.txt
Loading: /content/EN/Pathoma_Husain.txt
Loading: /content/EN/Pharmacology_Katzung.txt
Loading: /content/EN/Surgery_Schwartz.txt
Loading: /content/EN/Biochemistry_Lippincott.txt
Loading: /content/EN/First_Aid_Step2.txt
Loading: /content/EN/Neurology_Adams.txt
Loading: /content/EN/Physiology_Levy.txt
Loading: /content/ZH/all_books.txt
Total raw documents loaded: 19
Splitting into chunks...
Total chunks created: 285189
Loading embedding model...


  embedding_model = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]



sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/526 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Creating Chroma vector database...


KeyboardInterrupt: 

In [None]:
import os
from tqdm import tqdm
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader


# --- Configuration ------------------------------------------------------------
# Root folder that holds one sub-folder of .txt textbooks per language (EN, ZH).
DATA_DIR = "/content/"
# Directory in which Chroma keeps its on-disk index.
DB_DIR = "/content/vectordb"

# --- Load raw documents -------------------------------------------------------
documents = []

print("Loading textbook files...")

for lang_folder in ["EN", "ZH"]:
    folder_path = os.path.join(DATA_DIR, lang_folder)

    # os.listdir() returns entries in arbitrary order; sorting makes the load
    # order (and therefore the chunk order) identical on every run.
    for file in sorted(os.listdir(folder_path)):
        if file.endswith(".txt"):
            file_path = os.path.join(folder_path, file)

            print("Loading:", file_path)

            loader = TextLoader(file_path, encoding="utf-8")
            docs = loader.load()

            for d in docs:
                # Tag every document with its language and originating file
                # name so the values are stored alongside each chunk.
                d.metadata["language"] = "en" if lang_folder == "EN" else "zh"
                d.metadata["source"] = file

            documents.extend(docs)

print("Total raw documents loaded:", len(documents))

# Fail early instead of building (and reporting success for) an empty database.
if not documents:
    raise FileNotFoundError("No .txt textbook files found under " + DATA_DIR)


# --- Split into chunks --------------------------------------------------------
print("Splitting into chunks...")

# NOTE(review): chunk_size is measured in characters. The embedding model used
# below is believed to cap its input at 128 tokens, so 512-character chunks
# (Chinese text in particular) may be truncated when embedded -- confirm
# against the model card and tune chunk_size if so.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=100
)

chunks = text_splitter.split_documents(documents)

print("Total chunks created:", len(chunks))


# --- Embedding model ----------------------------------------------------------
print("Loading embedding model on GPU...")

# The device is hard-coded to "cuda", so this cell needs a GPU runtime.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    model_kwargs={"device": "cuda"},   # run the model on the GPU
    encode_kwargs={"batch_size": 64}   # number of chunks encoded per batch
)

# --- Build the vector store ---------------------------------------------------
print("Creating Chroma vector database...")

# Chroma.from_documents() adds to whatever collection already lives in DB_DIR
# and generates fresh IDs, so rebuilding after an interrupted or earlier run
# would store chunks twice. Drop the existing collection first so this cell
# can be re-run safely.
Chroma(
    persist_directory=DB_DIR,
    embedding_function=embedding_model,
).delete_collection()

# With persist_directory set, Chroma (>= 0.4) writes to disk automatically;
# the deprecated vectordb.persist() call is no longer needed.
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory=DB_DIR
)

print("✅ Vector database successfully created and saved using GPU!")


Loading textbook files...
Loading: /content/EN/Obstentrics_Williams.txt
Loading: /content/EN/Immunology_Janeway.txt
Loading: /content/EN/Histology_Ross.txt
Loading: /content/EN/Anatomy_Gray.txt
Loading: /content/EN/Pathology_Robbins.txt
Loading: /content/EN/InternalMed_Harrison.txt
Loading: /content/EN/Pediatrics_Nelson.txt
Loading: /content/EN/Psichiatry_DSM-5.txt
Loading: /content/EN/First_Aid_Step1.txt
Loading: /content/EN/Gynecology_Novak.txt
Loading: /content/EN/Cell_Biology_Alberts.txt
Loading: /content/EN/Pathoma_Husain.txt
Loading: /content/EN/Pharmacology_Katzung.txt
Loading: /content/EN/Surgery_Schwartz.txt
Loading: /content/EN/Biochemistry_Lippincott.txt
Loading: /content/EN/First_Aid_Step2.txt
Loading: /content/EN/Neurology_Adams.txt
Loading: /content/EN/Physiology_Levy.txt
Loading: /content/ZH/all_books.txt
Total raw documents loaded: 19
Splitting into chunks...
Total chunks created: 285189
Loading embedding model on GPU...


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Creating Chroma vector database...
✅ Vector database successfully created and saved using GPU!


  vectordb.persist()


In [None]:
!zip -r vectordb.zip /content/vectordb


  adding: content/vectordb/ (stored 0%)
  adding: content/vectordb/chroma.sqlite3 (deflated 51%)
  adding: content/vectordb/b107d99e-e3ff-4f3c-9adb-bf7ecf48ca37/ (stored 0%)
  adding: content/vectordb/b107d99e-e3ff-4f3c-9adb-bf7ecf48ca37/index_metadata.pickle (deflated 43%)
  adding: content/vectordb/b107d99e-e3ff-4f3c-9adb-bf7ecf48ca37/length.bin (deflated 81%)
  adding: content/vectordb/b107d99e-e3ff-4f3c-9adb-bf7ecf48ca37/header.bin (deflated 55%)
  adding: content/vectordb/b107d99e-e3ff-4f3c-9adb-bf7ecf48ca37/link_lists.bin (deflated 73%)
  adding: content/vectordb/b107d99e-e3ff-4f3c-9adb-bf7ecf48ca37/data_level0.bin (deflated 11%)


In [None]:
# Hand the zipped vector store to google.colab's files.download helper.
# This only works inside a Colab runtime, where the google.colab package exists.
from google.colab import files
files.download("/content/vectordb.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
pip install langchain

