## 1. 选择模型

In [1]:
import getpass
import os

# LangSmith tracing configuration (non-secret settings).
os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGSMITH_PROJECT_ID"] = "project-rag-1.0.0"
os.environ["LANGSMITH_PROJECT_NAME"] = "rag-sample"
os.environ["LANGSMITH_TRACING"] = "true"

# Credentials must never be committed with the notebook: reuse a key that is
# already present in the environment, otherwise prompt for it interactively.
# NOTE(review): the key that used to be hard-coded here should be treated as
# leaked and revoked.
if not os.environ.get("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_API_KEY"] = getpass.getpass("LangSmith API key: ")

from langchain_deepseek.chat_models import ChatDeepSeek  # 导入 DeepSeek 的聊天模型
from langchain_huggingface import HuggingFaceEmbeddings # 导入 HuggingFace 的嵌入模型
from langchain.chains import RetrievalQA # 导入检索问答链
from langchain.prompts import PromptTemplate # 导入提示模板
from langchain.chains import LLMChain # 导入 LLM 链
from langchain.memory import ConversationBufferMemory # 导入对话缓冲区内存
from langchain_chroma import Chroma # 导入 Chroma 向量存储

### 1. 聊天模型

In [None]:
# Create the DeepSeek chat model instance.
# The API key is taken from the DEEPSEEK_API_KEY environment variable, or
# prompted for, so that no credential is stored in the notebook.
# NOTE(review): the key that used to be hard-coded here should be treated as
# leaked and revoked.
deepseek_api_key = os.environ.get("DEEPSEEK_API_KEY") or getpass.getpass("DeepSeek API key: ")
# DeepSeek-V3 alternative:
# llm = ChatDeepSeek(model="deepseek-chat", api_key=deepseek_api_key, temperature=0, base_url="https://api.deepseek.com")
# DeepSeek-R1: the API model id is "deepseek-reasoner" ("deepseek-reason" is
# not a model id the API knows).
llm = ChatDeepSeek(
    model="deepseek-reasoner",
    api_key=deepseek_api_key,
    temperature=0,
    base_url="https://api.deepseek.com",
)

### 2. 嵌入模型

In [3]:
# Use the maintained langchain_huggingface integration. Importing the class of
# the same name from langchain_community would shadow it with the deprecated
# implementation.
from langchain_huggingface import HuggingFaceEmbeddings

# Model settings (the model is downloaded automatically on first use).
# NOTE(review): this is the English BGE checkpoint while the sample question in
# this notebook is Chinese; consider "BAAI/bge-large-zh-v1.5" after checking
# the corpus language.
model_name = "BAAI/bge-large-en-v1.5"  # BGE model
# NOTE(review): 'cuda' is hard-coded; switch to 'cpu' on machines without a GPU.
model_kwargs = {'device': 'cuda'}  # run inference on the GPU
encode_kwargs = {'normalize_embeddings': True}  # normalize the returned embeddings

# Create the HuggingFace embedding model instance.
# A BAAI BGE model is used here; another model can be substituted as needed.
embeddings_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

  embeddings_model = HuggingFaceEmbeddings(


## 2. 构建索引

### 加载数据

#### JSONL文件

In [4]:
from langchain_community.document_loaders import JSONLoader


file_path = "database/json/sample_data.jsonl"  # path of the JSONL data file


def metadata_func(record: dict, metadata: dict) -> dict:
    """Enrich a document's metadata with fields taken from its source record.

    Args:
        record: The parsed record; its "Title", "Label" and "ViewCount"
            entries are read (missing entries become ``None``).
        metadata: The metadata dict to extend; it is modified in place.

    Returns:
        The same ``metadata`` dict, with "title", "label", "viewcount" and
        "source" set ("source" is always the module-level ``file_path``).
    """
    metadata.update(
        title=record.get("Title"),
        label=record.get("Label"),
        viewcount=record.get("ViewCount"),
        source=file_path,
    )
    return metadata


# Build the loader for the JSONL file, then load its records as documents.
# The "Content" field of each record supplies the document text and
# metadata_func supplies the extra metadata.
# NOTE(review): presumably the jq schema is applied to each line when
# json_lines=True; confirm '.[]' matches the per-line structure of the data.
json_loader = JSONLoader(
    file_path=file_path,
    json_lines=True,
    jq_schema='.[]',
    content_key="Content",
    text_content=True,
    metadata_func=metadata_func,
)

json_data = json_loader.load()

In [5]:
# print(json_data)

#### PDF文件

In [6]:
from langchain_community.document_loaders import PyPDFDirectoryLoader

# Load the PDF documents from the database/pdf directory.
pdf_loader = PyPDFDirectoryLoader(path="database/pdf")
pdf_data = pdf_loader.load()

In [7]:
# print(pdf_data)

In [8]:
# Combine the JSONL documents and the PDF documents into a single list.
all_data = [*json_data, *pdf_data]

### 切分数据

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Create the text splitter. Besides the ASCII separators, the list contains
# the zero-width space and full-width / ideographic punctuation so that text
# without spaces between words can still be split.
text_splitter = RecursiveCharacterTextSplitter(
    separators=[
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
        "\u200b",  # zero-width space
        "\uff0c",  # full-width comma
        "\u3001",  # ideographic comma
        "\uff0e",  # full-width full stop
        "\u3002",  # ideographic full stop
        "",
    ],
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)

# Split every loaded document into chunks.
data_splits = text_splitter.split_documents(all_data)

### 存储数据

In [16]:
# Build the Chroma vector store from the chunks and persist it locally.
# `from_documents` is a classmethod: when it is called on an already
# constructed `Chroma(...)` instance, that instance's collection name and
# persist directory are not carried over to the store it returns. They are
# therefore passed to `from_documents` directly.
# NOTE(review): re-running this cell against an existing persist directory
# presumably adds the same documents again; clear the directory or pass ids.
vector_store = Chroma.from_documents(
    documents=data_splits,
    embedding=embeddings_model,
    collection_name="example_collection",
    persist_directory="./sample_chroma",  # where to save data locally, remove if not necessary
)

# vector_db = vector_store.add_documents(data_splits)  # 将数据添加到向量数据库中

## 3. 检索增强

### 查询内容优化-MultiQueryRetriever（多角度问题召回器）

In [None]:
# Set logging for the queries
import logging
from typing import List
from langchain_core.output_parsers import BaseOutputParser
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, Field
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.documents import Document
from langchain_core.runnables import chain

# Install the default root handler, then raise the multi-query retriever's
# logger to INFO so its messages are shown.
logging.basicConfig()
_multi_query_logger = logging.getLogger("langchain.retrievers.multi_query")
_multi_query_logger.setLevel(logging.INFO)

# Output parser will split the LLM result into a list of queries
class LineListOutputParser(BaseOutputParser[List[str]]):
    """Output parser that turns text into a list of non-blank lines."""

    def parse(self, text: str) -> List[str]:
        """Split ``text`` into lines, trimming each and dropping blank ones.

        Args:
            text: The raw text to split.

        Returns:
            The lines of ``text`` with surrounding whitespace removed; lines
            that are empty or contain only whitespace are omitted.
        """
        # splitlines() also handles "\r\n" endings. Stripping each line
        # removes trailing spaces and lets whitespace-only lines be filtered
        # out instead of being returned as (effectively empty) queries.
        trimmed = (line.strip() for line in text.splitlines())
        return [line for line in trimmed if line]


# Parser instance for the model output.
output_parser = LineListOutputParser()

# Prompt asking the model for five alternative phrasings of the user's
# question, separated by newlines.
QUERY_PROMPT = PromptTemplate(
    template="""你的任务是生成五个不同的问题版本，以便从向量数据库中检索相关文档。
    通过从多个角度重新表述用户的问题，你的目标是帮助用户克服基于距离的相似性搜索的一些局限性。
    请将这些替代问题用换行符分隔。
    原始问题：{question}""",
    input_variables=["question"],
)

# Sample question.
question = "京剧旦角主要分为哪几个流派？"

def get_multiquery_docs(question: str) -> List[Document]:
    """Retrieve documents for ``question`` through a multi-query retriever.

    A ``MultiQueryRetriever`` is built over ``vector_store`` with the
    ``QUERY_PROMPT | llm | output_parser`` chain, then invoked with the
    question.

    Args:
        question: The user's original question.

    Returns:
        The documents returned by the retriever for the question.
    """
    # Chain that produces the alternative queries.
    llm_chain = QUERY_PROMPT | llm | output_parser
    # `parser_key` is deprecated and no longer used by MultiQueryRetriever,
    # so it is intentionally not passed.
    retriever = MultiQueryRetriever(
        retriever=vector_store.as_retriever(),
        llm_chain=llm_chain,
    )
    return retriever.invoke(question)

In [None]:
question = "京剧旦角主要分为哪几个流派？"

unique_docs = get_multiquery_docs(question)
# Report how many documents came back, then show the first and last ones.
print(f"Number of unique documents: {len(unique_docs)}")
# Guard against an empty result: indexing an empty list raises IndexError.
if unique_docs:
    print(unique_docs[0].page_content)
    print(unique_docs[-1].page_content)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. 京剧中的旦角有哪些著名的表演流派？  ', '2. 京剧旦角艺术的主要派别及其特点是什么？  ', '3. 请列举京剧旦角的代表性流派及其创始人。  ', '4. 京剧旦角在表演风格上分为哪几大类别？  ', '5. 不同京剧旦角流派在唱腔和表演上有何区别？']


12

## *4. 训练/评估

## 5. 可视化界面