In [5]:
import os

# Re-export the LangChain tracing settings into this process's environment.
# os.getenv() returns None for a variable that is not defined, and assigning
# None into os.environ raises TypeError, so unset variables are skipped
# instead of crashing the first cell of the notebook.
for _var in ("LANGCHAIN_API_KEY", "LANGCHAIN_TRACING_V2"):
    _value = os.getenv(_var)
    if _value is not None:
        os.environ[_var] = _value

## Data Ingestion

In [2]:
from langchain_community.document_loaders import TextLoader

In [4]:
# Read the local text file into Document objects.
# The encoding is pinned so decoding does not depend on the platform's
# default locale encoding (assumes speech.txt is UTF-8 - TODO confirm).
loader = TextLoader('speech.txt', encoding="utf-8")
text_documents = loader.load()

[Document(page_content='They shall be My Hammer, the sword in My hand, the gauntlet about My fist, the bane of My foes and woes of the treacherous. When no others may stand beside them, they shall fight. Only the greatest shall enter their ranks, for unto them do I entrust stewardship over the Gates of Hell.\n\n"Statistically, you will almost certainly die when assaulting a well-maintained fortress with a competent commander. You must strive to make your death useful. "\n\n"Enemies of the Imperium, hear me. You have come here to die. The Immortal Emperor is with us and we are invincible. His soldiers will strike you down. His war machines will crush you under their treads. His mighty guns will bring the very sky crashing down upon you. You cannot win. The Emperor has given us his greatest weapon to wield. So make yourselves ready. We are the First Kronus Regiment, and today is our Victory Day. "\n\n"We hold them here or we fight them on the hallowed ground of Terra itself. I for one wo

In [6]:
# Reading from the web
from langchain_community.document_loaders import WebBaseLoader
import bs4

In [14]:
# Build (but do not yet run) a loader for the blog post, restricting HTML
# parsing to elements carrying the post-title, post-content or post-header class.
post_sections = bs4.SoupStrainer(
    class_=("post-title", "post-content", "post-header")
)
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": post_sections},
)

In [15]:
# Run the loader currently bound to `loader`; the result replaces whatever
# text_documents held before this cell.
text_documents = loader.load()

In [18]:
# Read from pdf
from langchain_community.document_loaders import PyPDFLoader

In [20]:
# Load the PDF from the working directory; text_documents is rebound to the
# result of this load.
pdf_path = "attention.pdf"
loader = PyPDFLoader(pdf_path)
text_documents = loader.load()

## Transform text (chunking)

In [22]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [25]:
# Break the loaded documents into chunks (chunk_size=1000, chunk_overlap=200).
splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
documents = text_splitter.split_documents(text_documents)

## Vector embeddings and Vector store

In [28]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

In [33]:
# Index only the first five chunks in Chroma, embedding them with the
# "tinydolphin" Ollama model.
chroma_embedder = OllamaEmbeddings(model="tinydolphin")
db = Chroma.from_documents(documents=documents[:5], embedding=chroma_embedder)

In [34]:
# Query the Chroma store and display the text of the first returned chunk.
query = "Who are the authors of the attention is all you need paper"

result = db.similarity_search(query=query)
top_match = result[0]
top_match.page_content

'efficient inference and visualizations. Lukasz and Aidan spent countless long days designing various parts of and\nimplementing tensor2tensor, replacing our earlier codebase, greatly improving results and massively accelerating\nour research.\n†Work performed while at Google Brain.\n‡Work performed while at Google Research.\n31st Conference on Neural Information Processing Systems (NIPS 2017), Long Beach, CA, USA.arXiv:1706.03762v7  [cs.CL]  2 Aug 2023'

In [36]:
# FAISS Vector database
from langchain_community.vectorstores import FAISS

In [38]:
# Index the same first five chunks in FAISS, embedding them with the
# "tinyllama" Ollama model.
faiss_embedder = OllamaEmbeddings(model="tinyllama")
db1 = FAISS.from_documents(documents=documents[:5], embedding=faiss_embedder)

In [39]:

# Run the existing `query` against the FAISS store and display the text of
# the first returned chunk.
result = db1.similarity_search(query=query)
top_match = result[0]
top_match.page_content

'best models from the literature. We show that the Transformer generalizes well to\nother tasks by applying it successfully to English constituency parsing both with\nlarge and limited training data.\n∗Equal contribution. Listing order is random. Jakob proposed replacing RNNs with self-attention and started\nthe effort to evaluate this idea. Ashish, with Illia, designed and implemented the first Transformer models and\nhas been crucially involved in every aspect of this work. Noam proposed scaled dot-product attention, multi-head\nattention and the parameter-free position representation and became the other person involved in nearly every\ndetail. Niki designed, implemented, tuned and evaluated countless model variants in our original codebase and\ntensor2tensor. Llion also experimented with novel model variants, was responsible for our initial codebase, and\nefficient inference and visualizations. Lukasz and Aidan spent countless long days designing various parts of and'