In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_BASE = os.environ.get("OPENAI_API_BASE")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [2]:
pdf_path = "../data/Understanding_Climate_Change.pdf"

# 加载文档

In [3]:
from langchain_community.document_loaders import PyMuPDFLoader

loader = PyMuPDFLoader(
    pdf_path,
    mode="single",
)
docs = loader.load()

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, length_function=len
)

texts = text_splitter.split_documents(docs)
len(texts)

92

In [5]:
for doc in texts:
    doc.page_content = doc.page_content.replace("\t", " ")

# 嵌入文档

In [6]:
from langchain_huggingface import HuggingFaceEmbeddings


model_name = os.environ.get("EMBEDDING_MODEL")
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
embedding_model = HuggingFaceEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

In [7]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

client = QdrantClient(host="localhost", port=6333)

client.create_collection(
    collection_name="Understanding_Climate_Change",
    vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
)

vector_store = QdrantVectorStore(
    client=client,
    collection_name="Understanding_Climate_Change",
    embedding=embedding_model,
)

In [8]:
from uuid import uuid4

ids = [str(uuid4()) for _ in range(len(texts))]
vector_store.add_documents(documents=texts, ids=ids)

['f90aa586-ca59-4b8d-8709-752a3830bfaf',
 'bac036a0-2610-4efe-8ed8-98126d79e4eb',
 'a7335903-ebbf-426f-aaef-f0a1d4b59e4f',
 '1422cda2-a9d0-45bb-bdea-aadcceda7eb1',
 'cbeed763-3884-45d9-a360-04be4cf5aebe',
 'e39020d1-3074-44ac-96aa-09c79a2261fb',
 'cbffc16f-735f-4043-9ec6-9f2a10c5f3ac',
 'c6cb957c-9c7c-4978-874b-181e5669d829',
 '2afeadf9-f1a9-40cb-a8ef-ecf068cd7d51',
 'c56c70c6-039f-4539-b39f-4273f18c1008',
 '004f5f6b-39b4-4751-9c58-d4c5fe5e37f9',
 '80c68b89-1470-4bcf-8ded-5442d291ce0c',
 'f6081413-40fa-4fdb-b8df-444dcbf8040f',
 '859926ca-3aee-46bc-b508-03c50f3bec2c',
 '9e4fb1d2-ee3a-40d5-b804-f1a6da83cb21',
 'eacf5259-5a8a-4728-baf0-7b10310febfa',
 '89b5e0ae-457a-4dff-a0b2-799f2bd42f1e',
 '504501c1-d2e1-4e5b-9876-2077d083755b',
 '88d2575a-dce2-419a-b4f0-cfeb7dd0b893',
 'd4b3fbab-e50f-4388-bbe0-eb0a816a8b9f',
 '8908ef92-bb0f-4e00-b86c-180736e14d17',
 '6255e2b6-8bdd-4f3e-9a74-af86571c4589',
 '521dd64b-a3af-490b-85c6-d45e3b84abc9',
 'cf6b7046-616c-4c87-8284-41552b33f428',
 '7c4207a2-c8f4-

# 检索

In [9]:
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

retrieved_docs = retriever.invoke("What is the main cause of climate change?")

retrieved_docs

[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2024-07-13T20:17:34+03:00', 'source': '../data/Understanding_Climate_Change.pdf', 'file_path': '../data/Understanding_Climate_Change.pdf', 'total_pages': 33, 'format': 'PDF 1.7', 'title': '', 'author': 'Nir', 'subject': '', 'keywords': '', 'moddate': '2024-07-13T20:17:34+03:00', 'trapped': '', 'modDate': "D:20240713201734+03'00'", 'creationDate': "D:20240713201734+03'00'", '_id': 'a7335903-ebbf-426f-aaef-f0a1d4b59e4f', '_collection_name': 'Understanding_Climate_Change'}, page_content='Chapter 2: Causes of Climate Change \nGreenhouse Gases \nThe primary cause of recent climate change is the increase in greenhouse gases in the \natmosphere. Greenhouse gases, such as carbon dioxide (CO2), methane (CH4), and nitrous \noxide (N2O), trap heat from the sun, creating a "greenhouse effect." This effect is essential \nfor life on Earth, as it keeps the planet warm enough to support life. H

In [10]:
retrieved_docs = retriever.invoke("气候变化的主要原因是什么？")

retrieved_docs

[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2024-07-13T20:17:34+03:00', 'source': '../data/Understanding_Climate_Change.pdf', 'file_path': '../data/Understanding_Climate_Change.pdf', 'total_pages': 33, 'format': 'PDF 1.7', 'title': '', 'author': 'Nir', 'subject': '', 'keywords': '', 'moddate': '2024-07-13T20:17:34+03:00', 'trapped': '', 'modDate': "D:20240713201734+03'00'", 'creationDate': "D:20240713201734+03'00'", '_id': 'a7335903-ebbf-426f-aaef-f0a1d4b59e4f', '_collection_name': 'Understanding_Climate_Change'}, page_content='Chapter 2: Causes of Climate Change \nGreenhouse Gases \nThe primary cause of recent climate change is the increase in greenhouse gases in the \natmosphere. Greenhouse gases, such as carbon dioxide (CO2), methane (CH4), and nitrous \noxide (N2O), trap heat from the sun, creating a "greenhouse effect." This effect is essential \nfor life on Earth, as it keeps the planet warm enough to support life. H

# 增强生成

In [11]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="qwen-max", temperature=0)

In [12]:
from langchain.prompts import ChatPromptTemplate

# Prompt
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Chain
chain = prompt | llm

In [13]:
result = chain.invoke(
    {"context": retrieved_docs, "question": "What is the main cause of climate change?"}
)

result

AIMessage(content='The main cause of recent climate change is the increase in greenhouse gases in the atmosphere, primarily due to human activities such as the burning of fossil fuels (coal, oil, and natural gas) for energy, which releases large amounts of carbon dioxide (CO2).', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 52, 'prompt_tokens': 1462, 'total_tokens': 1514, 'completion_tokens_details': None, 'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}}, 'model_name': 'qwen-max', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-2a4f6cea-85ff-4541-9c91-cd329410882a-0', usage_metadata={'input_tokens': 1462, 'output_tokens': 52, 'total_tokens': 1514, 'input_token_details': {'cache_read': 0}, 'output_token_details': {}})

In [14]:
# Run
result = chain.invoke(
    {"context": retrieved_docs, "question": "气候变化的主要原因是什么？"}
)

result

AIMessage(content='气候变化的主要原因是大气中温室气体的增加。温室气体，如二氧化碳（CO2）、甲烷（CH4）和一氧化二氮（N2O），会捕获来自太阳的热量，产生“温室效应”。这种效应对于地球上的生命至关重要，因为它使地球保持足够的温暖以支持生命。然而，人类活动加剧了这一自然过程，导致气候变暖。燃烧化石燃料（如煤、石油和天然气）用于发电、供暖和运输，释放大量二氧化碳，这是温室气体增加的一个主要原因。', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 110, 'prompt_tokens': 1462, 'total_tokens': 1572, 'completion_tokens_details': None, 'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}}, 'model_name': 'qwen-max', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-3b9cc880-5f41-410f-a527-66907b458371-0', usage_metadata={'input_tokens': 1462, 'output_tokens': 110, 'total_tokens': 1572, 'input_token_details': {'cache_read': 0}, 'output_token_details': {}})

# 检索增强生成链

In [15]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

result = rag_chain.invoke("气候变化的主要原因是什么？")

In [16]:
import textwrap

textwrap.fill(result, width=120)

'气候变化的主要原因是大气中温室气体的增加。这些温室气体，如二氧化碳（CO2）、甲烷（CH4）和一氧化二氮（N2O），会捕获太阳的热量，产生“温室效应”。虽然这种效应对于维持地球温暖、支持生命是必不可少的，但人类活动加剧了这一自然过程，导致气\n候变暖。特别是燃烧化石燃料（煤、石油和天然气）用于发电、供暖和交通等活动释放了大量的二氧化碳，从而增加了温室气体的浓度。自工业革命以来，化石燃料的消耗显著增加，并且至今仍在增长。'