# 1. langchain的Chroma类

来自langchain_chroma包！

参数：

- collection_name: collection 名称，默认值为 "langchain"
- embedding_function: 嵌入（embedding）函数，默认为 None
- persist_directory: 持久化目录
- client_settings: 客户端设置
- collection_metadata: collection 的 metadata 信息
- client: 客户端
- create_collection_if_not_exists: bool，collection 不存在时是否自动创建，默认为 True



In [1]:
# 1. 添加文档到chromadb当中
# 2. 根据问题进行检索并输出答案

import chromadb
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama
from langchain_core.output_parsers import StrOutputParser

class RagLangChain:
    """Small retrieval-augmented generation (RAG) helper on Chroma + Ollama.

    Documents are split into chunks, embedded with an Ollama embedding model
    and stored in a collection on a Chroma server reached over HTTP.
    Questions are answered by retrieving chunks from that collection and
    passing them, together with the question, to an Ollama chat model.

    Args:
        collection_name: Name of the Chroma collection to read and write.
        host: Host name of the Chroma HTTP server.
        port: Port of the Chroma HTTP server.
        embedding_model: Ollama model used to embed chunks and queries.
        llm_model: Ollama chat model used to generate the answer.
        chunk_size: ``chunk_size`` passed to the text splitter.
        chunk_overlap: ``chunk_overlap`` passed to the text splitter.
    """

    # Prompt sent to the chat model. ``{context}`` receives the retrieved
    # chunks and ``{query}`` the user question. The text instructs the model
    # (in Chinese) to answer only from the given context.
    _PROMPT_TEMPLATE = """
你是一个问答机器人。你的任务是根据下述给定的已知信息回答用户问题。

已知信息:
{context}
用户问题：
{query}
如果已知信息不包含用户问题的答案，或者已知信息不足以回答用户的问题，请直接回复"我无法回答您的问题"。
请不要输出已知信息中不包含的信息或答案。
请用中文回答用户问题。
"""

    def __init__(self,
                 collection_name: str = "rag-video-collection",
                 host: str = "localhost",
                 port: int = 8000,
                 *,
                 embedding_model: str = "nomic-embed-text:latest",
                 llm_model: str = "qwen2.5:latest",
                 chunk_size: int = 300,
                 chunk_overlap: int = 100):
        self.collection_name = collection_name
        self.host = host
        self.port = port
        self.embedding_model = embedding_model
        self.llm_model = llm_model
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def __get_vector_store(self):
        """Return a ``Chroma`` vector store bound to the configured collection.

        A new HTTP client and embedding object are created on every call, so
        no connection state is kept on the instance.
        """
        chromadb_client = chromadb.HttpClient(host=self.host, port=self.port)
        embedding_fn = OllamaEmbeddings(model=self.embedding_model)

        return Chroma(
            collection_name=self.collection_name,
            client=chromadb_client,
            embedding_function=embedding_fn,
        )

    # def delete_collection(self):
    #     vector_store = self.__get_vector_store()
    #     vector_store.delete_collection()

    def add_file(self, file_path: str, encoding: "str | None" = None) -> list[str]:
        """Load a text file, split it into chunks and add them to the collection.

        Args:
            file_path: Path of the text file to index.
            encoding: Encoding used to read the file. ``None`` (the default)
                leaves the choice to ``TextLoader``; pass e.g. ``"utf-8"``
                when the platform default cannot decode the file.

        Returns:
            The ids returned by the vector store for the added chunks.
        """
        docs = TextLoader(file_path, encoding=encoding).load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        all_splits = text_splitter.split_documents(docs)

        vector_store = self.__get_vector_store()
        return vector_store.add_documents(documents=all_splits)

    def __query_vector(self, info) -> str:
        """Retrieve chunks for ``info["query"]`` and join them into one string.

        Used as the ``context`` step of the chain in :meth:`query`; the
        retriever is created with its default search settings.
        """
        retriever = self.__get_vector_store().as_retriever()
        docs = retriever.invoke(info["query"])
        # Blank line between chunks so they stay separated in the prompt.
        return "\n\n".join(doc.page_content for doc in docs)

    def query(self, question: str) -> str:
        """Answer ``question`` using chunks retrieved from the collection.

        Args:
            question: The user question.

        Returns:
            The chat model's reply as plain text.
        """
        prompt = ChatPromptTemplate.from_template(self._PROMPT_TEMPLATE)
        llm = ChatOllama(model=self.llm_model)
        output_parser = StrOutputParser()

        # The first step turns {"query": ...} into
        # {"context": <retrieved text>, "query": <question>} for the prompt.
        chain = (
            {"context": self.__query_vector, "query": lambda x: x["query"]}
            | prompt
            | llm
            | output_parser
        )

        return chain.invoke({"query": question})


In [2]:
# Create the RAG helper with its default collection name, host and port.
rag_chain = RagLangChain()

In [3]:
# Load the text file, split it into chunks and add them to the collection;
# the call returns the ids of the added chunks (shown in the output below).
rag_chain.add_file(file_path="科技行业 2025 年展望.txt")

['fef6c059-de38-4344-9ee5-0db3440c5538',
 '1e8a97f6-c596-46be-86c0-6c840fdd01c3',
 '5925c4e3-be58-4300-95b2-58c27ed333c9',
 'a5380403-abaa-47ba-bf5f-bda6e20e336c',
 '5fec7acd-46c2-44e2-8558-b641cb88f738',
 'e430c832-f090-4e66-aac6-bd0a2b610071',
 '3dcdf7e4-1765-49a9-933e-630d57a0bebc',
 '12df78e2-5714-4607-bc69-78c4323fb6e4',
 'bb54a544-06b1-4db1-a4d1-4cb6d2ccff7a',
 '97d2a120-c957-45df-a559-bfd197951c43',
 '6b3e3f7c-5f0a-4a3f-b52c-8f10566d70cf',
 'aeab3935-2e9d-4412-8b0c-bee38c685804',
 '65264adc-1dce-4f4c-b459-947c902e9716',
 '5c0447aa-1fe0-43fc-9cc6-86506f5888e9',
 '3b86a417-7bf1-434f-a568-671a68d1ac08',
 '2bc33779-1d52-45b8-a094-842d182b991b',
 '3f02c9c2-79c6-4773-9758-1033888a07f9',
 '7d2dbc16-8770-4aa2-8727-931685e0c060',
 'c1b2c8b0-698f-471b-a205-efe9c0f981bc',
 '8c6eb5b8-eb54-4804-9b09-faeaa96c195d',
 '915c4edb-2a02-44c1-8fd3-cf5892dc4c0c',
 '7b5d3f0a-47ac-494e-892f-c2694b0415b7',
 '38c53534-a495-4e5c-a209-2edf1c2963fb']

In [4]:
# Ask a question; the answer is generated from chunks retrieved from the
# collection and is returned as a plain string.
rag_chain.query(question="2024年三季度全球智能手机出货量 多少部?")

'2024年三季度全球智能手机出货量的信息在给定的已知信息中没有直接给出具体数字，仅提到“今年三季度全球智能手机出货量 3.15 亿部”，这里的“今年”指的是2023年三季度的数据。对于2024年三季度的具体出货量数据，已知信息中并未提供。\n\n因此，我无法回答您的问题。'