In [14]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings,SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from pathlib import Path

### Load

In [17]:
loader=TextLoader(file_path=Path("Data").joinpath("Sample.txt"))
loader

<langchain_community.document_loaders.text.TextLoader at 0x197553adfd0>

In [18]:
documents=loader.load()
documents

[Document(page_content="India, officially known as the Republic of India, is a diverse and vibrant country located in South Asia. With a rich history spanning thousands of years, India is known for its cultural heritage, religious diversity, and vast landscapes. From the majestic Himalayas in the north to the serene backwaters of Kerala in the south, India encompasses a wide range of geographical features, including deserts, plains, mountains, and coastlines, making it a land of incredible natural beauty.\n\nIndia is the seventh-largest country by land area and the second-most populous country in the world, with a population exceeding 1.3 billion people. It is a federal parliamentary democratic republic, with a president as the head of state and a prime minister as the head of government. The country follows a multi-tiered administrative structure, with 28 states and 9 union territories, each having its own elected government.\n\nIndia has a rich cultural heritage that has evolved over

In [19]:
len(documents)

1

### Documents Transformers

In [20]:
textSplitter=CharacterTextSplitter(chunk_size=200,chunk_overlap=0)
texts=textSplitter.split_documents(documents=documents)

Created a chunk of size 486, which is longer than the specified 200
Created a chunk of size 426, which is longer than the specified 200
Created a chunk of size 419, which is longer than the specified 200
Created a chunk of size 498, which is longer than the specified 200
Created a chunk of size 319, which is longer than the specified 200
Created a chunk of size 436, which is longer than the specified 200
Created a chunk of size 309, which is longer than the specified 200
Created a chunk of size 365, which is longer than the specified 200
Created a chunk of size 288, which is longer than the specified 200
Created a chunk of size 320, which is longer than the specified 200
Created a chunk of size 354, which is longer than the specified 200
Created a chunk of size 263, which is longer than the specified 200
Created a chunk of size 339, which is longer than the specified 200
Created a chunk of size 338, which is longer than the specified 200
Created a chunk of size 366, which is longer tha

In [21]:
len(texts)

19

In [24]:
texts[0]

Document(page_content='India, officially known as the Republic of India, is a diverse and vibrant country located in South Asia. With a rich history spanning thousands of years, India is known for its cultural heritage, religious diversity, and vast landscapes. From the majestic Himalayas in the north to the serene backwaters of Kerala in the south, India encompasses a wide range of geographical features, including deserts, plains, mountains, and coastlines, making it a land of incredible natural beauty.', metadata={'source': 'Data\\Sample.txt'})

### Text Embedded Models

In [23]:
embeddings=SentenceTransformerEmbeddings(
    model_name="all-MiniLM-L6-v2"
)

  from tqdm.autonotebook import tqdm, trange


In [25]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

### VectorStores

In [26]:
db=Chroma.from_documents(documents=texts,embedding=embeddings)

In [29]:
db._collection.get(include=['embeddings'])

{'ids': ['0fc6cd64-8aa7-449f-9879-c4fca0181977',
  '1103e23c-7dbc-4d28-a6f5-a1179d0fb6ae',
  '13ce7308-bb07-4b14-acb8-a20e7fedba5a',
  '14fa11f7-2ec1-409b-a0c1-78b2b971d5a4',
  '23e2a838-9d7e-45e2-bedb-79b384b90b0c',
  '242fbc77-fbf3-456d-b629-dea84c59fd4e',
  '37b79bb6-c910-4ccd-b605-13cf56eb7505',
  '70de88ab-e407-441e-bff7-529c19224c64',
  '71b34285-c4a4-4b32-a7fd-12891520226d',
  '801f9a69-2fd2-4697-bfd3-308ddebada94',
  '8c37f0dc-e2da-4181-96db-886a846cde87',
  '9322c45b-159e-4f65-b00d-49bd25f0e38a',
  'b8e6e94b-efa1-4112-ac79-78d193556775',
  'c04f8607-6847-47d5-942c-051a04bb24ff',
  'c285e132-1ed9-481e-aad0-64519d66eaa9',
  'c6bb6195-b643-43d4-b0ff-03c90875c01e',
  'daeed1af-922a-4322-8aef-566a696f0cca',
  'e6f8fcc5-2cda-479e-92eb-2627e763f306',
  'e7c0ff86-0b4d-4ac9-bf95-9c1b1b6cca2a'],
 'embeddings': [[0.04023279994726181,
   -0.050923898816108704,
   -0.0731794685125351,
   -0.004093305207788944,
   -0.023085076361894608,
   -0.05878104269504547,
   -0.023700207471847534,
   

### Retrievers

In [35]:
retriever=db.as_retriever(search_kwargs={"k":2})

In [36]:
retriever

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x00000197559E5950>, search_kwargs={'k': 2})

In [37]:
docs=retriever.get_relevant_documents(query="What is the Capital of India?")

In [38]:
docs

[Document(page_content='Delhi is the capital of India.', metadata={'source': 'Data\\Sample.txt'}),
 Document(page_content='India, officially known as the Republic of India, is a diverse and vibrant country located in South Asia. With a rich history spanning thousands of years, India is known for its cultural heritage, religious diversity, and vast landscapes. From the majestic Himalayas in the north to the serene backwaters of Kerala in the south, India encompasses a wide range of geographical features, including deserts, plains, mountains, and coastlines, making it a land of incredible natural beauty.', metadata={'source': 'Data\\Sample.txt'})]

In [39]:
docs=retriever.get_relevant_documents(query="What is the Currency of India?")
docs

[Document(page_content='The Indian rupee is the official currency in the Republic of India. The rupee is subdivided into 100 paise. The issuance of the currency is controlled by the Reserve Bank of India.', metadata={'source': 'Data\\Sample.txt'}),
 Document(page_content='The Indian rupee sign (₹) is the currency symbol for the Indian rupee the official currency of India', metadata={'source': 'Data\\Sample.txt'})]