In [18]:
# Data Load
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

loader = TextLoader('llm.txt')

text_docs = loader.load()

text_docs = text_splitter.split_documents(text_docs)

text_docs

[Document(metadata={'source': 'llm.txt'}, page_content='A large language model (LLM) is a type of artificial intelligence (AI) program that can recognize and generate text, among other tasks. LLMs are trained on huge sets of data â€” hence the name "large." LLMs are built on machine learning: specifically, a type of neural network called a transformer model.\n\nIn simpler terms, an LLM is a computer program that has been fed enough examples to be able to recognize and interpret human language or other types of complex data. Many LLMs are trained on data that has been gathered from the Internet â€” thousands or millions of gigabytes\' worth of text. But the quality of the samples impacts how well LLMs will learn natural language, so an LLM\'s programmers may use a more curated data set.'),
 Document(metadata={'source': 'llm.txt'}, page_content='LLMs use a type of machine learning called deep learning in order to understand how characters, words, and sentences function together. Deep lea

In [19]:
import os
from dotenv import load_dotenv

load_dotenv()
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_TRACING_V2"] = "true"

In [None]:
from langchain_community.document_loaders import WebBaseLoader
import bs4


loader=WebBaseLoader(web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
                     bs_kwargs=dict(parse_only=bs4.SoupStrainer(
                         class_=("post-title","post-content","post-header")

                     )))

docs = loader.load()


In [20]:
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
embeddings = OllamaEmbeddings(model="llama3.1")

db = Chroma.from_documents(text_docs, embeddings)

In [None]:
query = "what is dog?"
retreived_results = db.similarity_search(query)
print(retreived_results)

In [None]:
from langchain_community.vectorstores import LanceDB
import lancedb

lanceDB = lancedb.connect("tmp/lancedb")

table = lanceDB.create_table("my_table",data=[
    {
        "vector":embeddings.embed_query("text_docs"),
        "text":"test text",
        "id":"1"
    }
], mode="overwrite")

lanceDB = LanceDB.from_documents(text_docs, embeddings)



In [None]:

query = "llm?"
retreived_results = lanceDB.similarity_search(query)
print(retreived_results[0].page_content)

In [21]:
from langchain_community.llms import Ollama

llm = Ollama(model="llama3.1")

llm

Ollama(model='llama3.1')

In [22]:
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template(""" 
                                          Answer the question correcty on the base of context  <context>{context}</context> on the following text: <question>{input}</question>
                                          """)

In [23]:
# create document chain
from langchain.chains.combine_documents import  create_stuff_documents_chain

document_chain = create_stuff_documents_chain(llm, prompt)

In [24]:
 #retriver
 
retreiver = db.as_retriever()

retreiver

VectorStoreRetriever(tags=['Chroma', 'OllamaEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000002443F8BB430>)

In [25]:
# retreival chain

from langchain.chains import create_retrieval_chain

retrieval_chain = create_retrieval_chain(retreiver, document_chain)

In [27]:
response = retrieval_chain.invoke({"input":"llm in simpler term"})

response["answer"]

'An LLM (Large Language Model) is a computer program that has been fed enough examples to be able to recognize and interpret human language or other types of complex data.'