In [1]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
from langchain_openai import OpenAI
from langchain_openai import ChatOpenAI

import os

api_key = os.getenv("API_KEY")
chat_completion_url = os.getenv("BASE_URL")

llm = ChatOpenAI(
    model="gcp/gemini-1.5-flash-002",
    api_key=api_key,
    base_url=chat_completion_url
)

In [None]:
# Load, chunk and index the contents of the blog.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()
embedding_function = OpenAIEmbeddings(model="openai/text-embedding-3-large",
                                      api_key=api_key,
                                      base_url=chat_completion_url)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=embedding_function)

# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
rag_chain.invoke("What is Task Decomposition?")

In [13]:
from langchain_community.document_loaders.csv_loader import CSVLoader

embedding_function = OpenAIEmbeddings(model="openai/text-embedding-3-large",
                                      api_key=api_key,
                                      base_url=chat_completion_url)

loader = CSVLoader(
    file_path="school_guide_qna_russia.csv",
    csv_args={
        "delimiter": ",", 
        "quotechar": '"',
        "fieldnames": ["id", "question", "answer"]
    }
)
docs = loader.load()

print(docs)
print(len(docs))

[Document(metadata={'source': 'school_guide_qna_russia.csv', 'row': 0}, page_content='id: 1\nquestion: Во сколько начинается школьное занятие?\nanswer: Занятие начинается с 8:20 до 8:40 утра.'), Document(metadata={'source': 'school_guide_qna_russia.csv', 'row': 1}, page_content='id: 2\nquestion: Как можно узнать меню школьной столовой?\nanswer: Ежедневное меню можно узнать на школьном сайте или в мобильном приложении.'), Document(metadata={'source': 'school_guide_qna_russia.csv', 'row': 2}, page_content='id: 3\nquestion: Как проверить успеваемость ребенка?\nanswer: Успеваемость и учебный прогресс ребенка можно проверить через онлайн-портал для родителей.'), Document(metadata={'source': 'school_guide_qna_russia.csv', 'row': 3}, page_content='id: 4\nquestion: Какое время работы медицинского кабинета в школе?\nanswer: Медицинский кабинет работает в будние дни с 8:00 до 16:30.'), Document(metadata={'source': 'school_guide_qna_russia.csv', 'row': 4}, page_content='id: 5\nquestion: Что делат

In [14]:
vectorstore = Chroma.from_documents(documents=docs, 
                                    embedding=embedding_function)

# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")

print(prompt)
print(prompt.input_variables)
print(prompt.messages)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)



input_variables=['context', 'question'] input_types={} partial_variables={} metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]
['context', 'question']
[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the a

In [6]:
question = "Какие услуги предоставляются для студентов с ограниченными возможностями?"
rag_chain.invoke(question)

'校服可在校内商店或指定的线上购物平台购买。  具体销售点请咨询学校或查看学校官网。\n'