In [1]:
!pip install langchain
# install nessary dependencies for building LLM apps in Python



In [2]:
!pip install chromadb
# for making vector database

Collecting chromadb
  Downloading chromadb-0.5.17-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.4-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.32.0-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.0-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.27.0-py3

In [3]:
import chromadb
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [4]:
# Start a Chroma client, then open the "llama_chunks" collection
# (it is created on first use if it does not exist yet).
client = chromadb.Client()
collection = client.get_or_create_collection(name="llama_chunks")

In [5]:
# Create a text splitter using LangChain.
# Separators are tried in the order given, so the paragraph break ('\n\n') must
# come before the single line break ('\n'); in the opposite order the text is
# already cut at every '\n' and '\n\n' would never be used as a split point.
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', " ", '.', ','],
    chunk_size=600,    # maximum chunk length, in characters
    chunk_overlap=50   # characters shared between neighbouring chunks
)

In [6]:
# Read the file Llama_Wikipedia_Cleaned.txt into memory as a single string.
# Mode 'r' opens the file read-only, so it is never modified.
# The encoding is passed explicitly because the article contains non-ASCII
# characters (e.g. IPA pronunciation symbols); without it, open() falls back
# to the platform's locale encoding, which is not UTF-8 on every system.
with open('Llama_Wikipedia_Cleaned.txt', 'r', encoding='utf-8') as file:
  content = file.read()  # .read() returns the whole file content as one string

In [7]:
# Split the article text into chunks; create_documents takes a list of texts,
# so the single string is wrapped in a one-element list.
langchain_chunks = text_splitter.create_documents(texts=[content])

In [8]:
# Store all chunks in the collection with one batched add() instead of one
# call per chunk, so documents are embedded and written together.
# Each chunk keeps the same id ("chunk #<position>") and metadata as before.
# The guard keeps empty input a no-op, as the per-chunk loop was.
if langchain_chunks:
  collection.add(
      documents=[chunk.page_content for chunk in langchain_chunks],
      ids=[f'chunk #{idx}' for idx in range(len(langchain_chunks))],
      metadatas=[
          {'source': 'https://en.wikipedia.org/wiki/Llama', 'chunk_idx': idx}
          for idx in range(len(langchain_chunks))
      ],
  )

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:01<00:00, 76.7MiB/s]


In [9]:
# Sanity check: preview the first stored records (ids, embeddings, documents).
collection.peek(limit=10)

{'ids': ['chunk #0',
  'chunk #1',
  'chunk #2',
  'chunk #3',
  'chunk #4',
  'chunk #5',
  'chunk #6',
  'chunk #7',
  'chunk #8',
  'chunk #9'],
 'embeddings': array([[-0.03380476,  0.07549164, -0.03189154, ..., -0.01461167,
         -0.00717111,  0.02333828],
        [ 0.05158975,  0.02474356, -0.01942642, ..., -0.00689341,
         -0.04974074,  0.04560149],
        [ 0.0331398 ,  0.00545078, -0.02212834, ..., -0.01394328,
         -0.03128806,  0.02386065],
        ...,
        [ 0.04420555,  0.05205919,  0.0481475 , ..., -0.03531361,
          0.0606828 , -0.00510189],
        [-0.0260729 ,  0.04097349, -0.00920881, ..., -0.02787998,
          0.00496551, -0.02016043],
        [ 0.03322602,  0.05902057,  0.00946919, ..., -0.05597772,
         -0.04901107,  0.07196109]]),
 'documents': ['Llama\n\n=====\n\n\n\nCamelus glama Linnaeus, 1758\n\nThe llama (/ˈlɑːmə/; Spanish pronunciation: [ˈʎama] or [ˈʝama]) (Lama glama) is a domesticated South American camelid, widely used as a meat 

In [10]:
# Ask the collection for the single stored chunk closest to the question.
result_query = collection.query(
    query_texts=['What is a llama'],
    n_results=1,
)

In [11]:
# Take the first document returned for the first (and only) query text and print it.
best_match = result_query['documents'][0][0]
print(best_match)

Llama

=====



Camelus glama Linnaeus, 1758

The llama (/ˈlɑːmə/; Spanish pronunciation: [ˈʎama] or [ˈʝama]) (Lama glama) is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era.
