<a href="https://colab.research.google.com/github/shi991027/RAG/blob/main/gaos_auto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers  torch


In [None]:
pip install scikit-learn




In [None]:
pip install deepseek_tokenizer

In [None]:
pip install openai


In [None]:
import requests
import time
from typing import List, Dict, Union, Any
import logging

# Configure logging: INFO level, with timestamp, logger name and level
# prefixed to every message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger used by the BGE-M3 client below.
logger = logging.getLogger('bge_m3_client')


class BGEM3Client:
    """
    Client for the BGE-M3 embedding model.

    Wraps calls to the Silicon Flow embeddings HTTP endpoint. Accepts either a
    single text or a list of texts and splits larger inputs into batches of at
    most ``MAX_BATCH_SIZE`` items.
    """

    # API configuration
    DEFAULT_API_URL = "https://api.siliconflow.cn/v1/embeddings"
    DEFAULT_MODEL = "Pro/BAAI/bge-m3"
    MAX_BATCH_SIZE = 64  # largest number of texts sent in one request
    MAX_TOKEN_LENGTH = 8192  # maximum number of tokens for a single text
    APPROX_CHAR_PER_TOKEN = 1.2  # rough estimate of characters per token
    # tokens * (characters per token) = characters, i.e. roughly 10000 characters
    MAX_CHAR_LENGTH = int(MAX_TOKEN_LENGTH * APPROX_CHAR_PER_TOKEN)
    VECTOR_DIMENSION = 1024  # dimensionality reported for the embedding vectors

    def __init__(self, api_key: str, api_url: str = DEFAULT_API_URL, model: str = DEFAULT_MODEL):
        """
        Initialise the client.

        Args:
            api_key: Silicon Flow API key, sent as a Bearer token.
            api_url: Endpoint URL; defaults to ``DEFAULT_API_URL``.
            model: Model name sent with every request; defaults to ``DEFAULT_MODEL``.
        """
        self.api_key = api_key
        self.api_url = api_url
        self.model = model
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
        logger.info(f"已初始化 BGE-M3 客户端，模型: {model}")

    def _check_text_length(self, text: str) -> bool:
        """
        Return True when ``text`` has at most ``MAX_CHAR_LENGTH`` characters.
        """
        return len(text) <= self.MAX_CHAR_LENGTH

    def _log_response_details(self, batch_result: Any, input_count: int) -> None:
        """
        Log diagnostics about a successful (HTTP 200) response body.

        Only logs; it never raises on a short or malformed payload so that the
        caller's lenient handling of such responses is preserved.
        """
        content_size = len(str(batch_result))
        logger.info(f"API响应内容大小: {content_size} 字符")

        if "data" not in batch_result:
            logger.warning(f"API响应中不包含'data'字段: {list(batch_result.keys())}")
            return

        data_count = len(batch_result["data"])
        logger.info(f"API处理结果: 输入 {input_count} 个文本，返回 {data_count} 个向量")

        if data_count < input_count:
            logger.warning(f"API返回的向量数量({data_count})少于输入文本数量({input_count})")

            # Log only the top-level keys to keep the log small.
            sample_keys = list(batch_result.keys())
            logger.info(f"API响应包含以下字段: {sample_keys}")

            if "error" in batch_result:
                logger.error(f"API报告错误: {batch_result['error']}")
            elif "errors" in batch_result:
                logger.error(f"API报告错误: {batch_result['errors']}")

    def get_embeddings(self, texts: Union[str, List[str]],
                       batch_size: int = MAX_BATCH_SIZE,
                       retry_count: int = 3,
                       retry_delay: float = 1.0,
                       encoding_format: str = "float") -> Dict[str, Any]:
        """
        Fetch embedding vectors for one or more texts.

        All texts are lower-cased before being sent.

        Args:
            texts: A single string or a list of strings.
            batch_size: Texts per request; values above ``MAX_BATCH_SIZE`` are
                clamped to it.
            retry_count: Maximum number of attempts per batch. Every failed
                attempt counts, including rate-limited (HTTP 429) ones.
            retry_delay: Base delay in seconds between attempts. Rate-limit and
                timeout failures back off exponentially from this value.
            encoding_format: Value of the ``encoding_format`` request field.

        Returns:
            A dict with the keys:
                "embeddings": list of embedding vectors, in input order,
                "total_tokens": sum of ``usage.total_tokens`` over all
                    responses that carry it; when no response reports usage
                    this falls back to the number of texts,
                "model": model name,
                "dimensions": ``VECTOR_DIMENSION``,
                "batch_stats": {"total_batches", "total_time",
                    "avg_time_per_batch", "avg_time_per_text"}.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1 for non-empty input.
            Exception: if a batch still fails after ``retry_count`` attempts.
        """
        # Normalise the input to a list.
        if isinstance(texts, str):
            texts = [texts]

        # Force every text to lower case.
        texts = [text.lower() for text in texts]

        # Empty input: return the same shape as a normal result.
        if not texts:
            return {
                "embeddings": [],
                "total_tokens": 0,
                "model": self.model,
                "dimensions": self.VECTOR_DIMENSION,
                "batch_stats": {
                    "total_batches": 0,
                    "total_time": 0,
                    "avg_time_per_batch": 0,
                    "avg_time_per_text": 0
                }
            }

        # A non-positive batch size would either break range() or silently
        # produce no batches at all.
        if batch_size < 1:
            raise ValueError(f"batch_size 必须为正整数，当前值: {batch_size}")

        # Warn about over-long texts; the request is still attempted.
        for i, text in enumerate(texts):
            if not self._check_text_length(text):
                logger.warning(f"第 {i+1} 条文本长度超过限制 (字符数: {len(text)}, 限制: {self.MAX_CHAR_LENGTH})。该文本可能会被API拒绝。")

        # Clamp the batch size to the maximum.
        actual_batch_size = min(batch_size, self.MAX_BATCH_SIZE)
        if batch_size > self.MAX_BATCH_SIZE:
            logger.warning(f"请求的批大小 {batch_size} 超过最大限制 {self.MAX_BATCH_SIZE}，已自动调整")

        # Split the texts into batches.
        batches = [texts[i:i + actual_batch_size] for i in range(0, len(texts), actual_batch_size)]
        all_embeddings = []
        total_time = 0
        reported_tokens = 0
        usage_reported = False

        logger.info(f"开始处理 {len(texts)} 条文本，分为 {len(batches)} 个批次，每批最大 {actual_batch_size} 条")

        for i, batch in enumerate(batches):
            success = False
            attempts = 0
            batch_result = None

            while not success and attempts < retry_count:
                try:
                    start_time = time.time()

                    payload = {
                        "model": self.model,
                        "input": batch,
                        "encoding_format": encoding_format
                    }

                    response = requests.post(
                        self.api_url,
                        headers=self.headers,
                        json=payload,
                        timeout=30  # 30 second timeout
                    )

                    batch_time = time.time() - start_time
                    total_time += batch_time

                    if response.status_code == 200:
                        batch_result = response.json()
                        self._log_response_details(batch_result, len(batch))
                        success = True
                        logger.info(f"批次 {i+1}/{len(batches)} 处理成功，{len(batch)} 条文本，耗时 {batch_time:.4f} 秒")
                    elif response.status_code == 429:
                        # Rate limited: back off exponentially. The attempt is
                        # counted so a persistent 429 cannot loop forever.
                        wait_time = retry_delay * (2 ** attempts)
                        logger.warning(f"批次 {i+1}/{len(batches)} 请求被限制，等待 {wait_time:.2f} 秒后重试")
                        attempts += 1
                        time.sleep(wait_time)
                    else:
                        logger.warning(f"批次 {i+1}/{len(batches)} 请求失败，状态码: {response.status_code}，响应: {response.text}，尝试 {attempts+1}/{retry_count}")
                        attempts += 1
                        time.sleep(retry_delay)
                except requests.exceptions.Timeout:
                    # Growing delay after a timeout, to avoid triggering 429s.
                    wait_time = retry_delay * (1.5 ** attempts)
                    logger.warning(f"批次 {i+1}/{len(batches)} 请求超时，等待 {wait_time:.2f} 秒后重试")
                    time.sleep(wait_time)
                    attempts += 1
                except Exception as e:
                    logger.error(f"批次 {i+1}/{len(batches)} 发生异常: {str(e)}，尝试 {attempts+1}/{retry_count}")
                    attempts += 1
                    time.sleep(retry_delay)

            if not success:
                logger.error(f"批次 {i+1}/{len(batches)} 在 {retry_count} 次尝试后失败")
                raise Exception(f"无法处理批次 {i+1}/{len(batches)}，所有重试都失败")

            # Collect the embedding vectors of this batch.
            if "data" in batch_result:
                all_embeddings.extend(item["embedding"] for item in batch_result["data"])

            # Accumulate the token usage when the response reports it.
            if isinstance(batch_result, dict):
                usage = batch_result.get("usage")
                tokens = usage.get("total_tokens") if isinstance(usage, dict) else None
                if isinstance(tokens, int):
                    reported_tokens += tokens
                    usage_reported = True

        # Timing statistics.
        avg_time_per_batch = total_time / len(batches) if batches else 0
        avg_time_per_text = total_time / len(texts) if texts else 0

        result = {
            "embeddings": all_embeddings,
            # Fall back to the text count when no response reported usage.
            "total_tokens": reported_tokens if usage_reported else len(texts),
            "model": self.model,
            "dimensions": self.VECTOR_DIMENSION,
            "batch_stats": {
                "total_batches": len(batches),
                "total_time": total_time,
                "avg_time_per_batch": avg_time_per_batch,
                "avg_time_per_text": avg_time_per_text
            }
        }

        return result

    def embed_query(self, query: str):
        """
        Convert a query text into its embedding vector.

        Args:
            query: The query text; must not be empty or whitespace-only.

        Returns:
            The first embedding returned by ``get_embeddings`` for the query.

        Raises:
            ValueError: if ``query`` is empty or whitespace-only.
            RuntimeError: if fetching the embedding fails or yields no vector
                (the underlying error is chained as the cause).
        """
        if not query or not query.strip():
            error_msg = "查询文本不能为空"
            logger.error(error_msg)
            raise ValueError(error_msg)

        try:
            embedding_result = self.get_embeddings(query)

            if "embeddings" in embedding_result and len(embedding_result["embeddings"]) > 0:
                return embedding_result["embeddings"][0]
            else:
                # No embedding came back.
                error_msg = f"无法获取嵌入向量，API返回结果: {embedding_result}"
                logger.error(error_msg)
                raise ValueError(error_msg)

        except Exception as e:
            error_msg = f"嵌入模型处理异常: {str(e)}"
            logger.error(error_msg)
            # Re-raise as RuntimeError, keeping the original as the cause.
            raise RuntimeError(error_msg) from e
def read_file(file_path):
    """
    Return the full text of ``file_path``.

    The file is decoded as UTF-8 first; if that raises UnicodeDecodeError it
    is re-read as GBK.
    """
    def _load(encoding):
        with open(file_path, 'r', encoding=encoding) as handle:
            return handle.read()

    try:
        return _load('utf-8')
    except UnicodeDecodeError:
        return _load('gbk')
def split_text(text, chunk_size=512):
    """
    Cut ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        text: The complete input string.
        chunk_size: Maximum number of characters per piece.

    Returns:
        A list of strings; each piece has leading and trailing whitespace
        stripped after slicing.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        window = text[start:start + chunk_size]
        pieces.append(window.strip())
    return pieces
# 示例用法
def example_usage():
    """
    Demonstrate the BGE-M3 client: embed a small batch of texts, then a query.

    The API key is read from the ``SILICONFLOW_API_KEY`` environment variable
    so that no credential is stored in the source.

    Raises:
        RuntimeError: if ``SILICONFLOW_API_KEY`` is not set.
    """
    # Imported locally so the example does not depend on a module-level import.
    import os

    api_key = os.environ.get("SILICONFLOW_API_KEY")
    if not api_key:
        raise RuntimeError("请先设置环境变量 SILICONFLOW_API_KEY")

    # Create the client.
    client = BGEM3Client(api_key=api_key)

    # Batch example.
    texts = [
        "政府工作报告 ——2025年3月5日在第十四届全国人民代表大会第世界",
        "自然语言处理是人工智能的重要分支",
        "向量数据库可以高效存储和检索嵌入向量",
        "大型语言模型具有强大的文本生成能力",
        "机器学习算法可以从数据中学习模式"
    ]

    batch_result = client.get_embeddings(texts)
    print(batch_result)
    print("批量处理结果:")
    print(f"嵌入向量数量: {len(batch_result['embeddings'])}")
    print(f"处理时间: {batch_result['batch_stats']['total_time']:.4f} 秒")
    print(f"每文本平均时间: {batch_result['batch_stats']['avg_time_per_text']:.4f} 秒")

    # Single query example.
    query_text = "人工智能技术"
    query_embedding = client.embed_query(query_text)
    print(f"查询文本嵌入向量: {query_embedding}")


# Run the example only when this code executes as the main module.
if __name__ == "__main__":
    example_usage()

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.mixture import GaussianMixture
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from deepseek_tokenizer import ds_token

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib  # 用于保存模型和聚类中心
import os
import time


# 读取文件
def read_file(file_path):
    """
    Read a text file and return its contents as one string.

    Decoding is attempted with UTF-8; on UnicodeDecodeError the file is read
    again using GBK.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as source:
            content = source.read()
    except UnicodeDecodeError:
        with open(file_path, 'r', encoding='gbk') as source:
            content = source.read()
    return content


def split_text(text, chunk_size=256):
    """
    Split ``text`` by token count and return the pieces as decoded strings.

    The text is encoded with ``ds_token``, the token list is cut into
    consecutive windows of at most ``chunk_size`` tokens, and each window is
    decoded back to text. The number of chunks and the character length of
    every chunk are printed.

    Args:
        text: The complete input string.
        chunk_size: Maximum number of tokens per chunk.

    Returns:
        A list of strings, one per token window.
    """
    token_ids = ds_token.encode(text)

    chunks = []
    for start in range(0, len(token_ids), chunk_size):
        window = token_ids[start:start + chunk_size]
        chunks.append(ds_token.decode(window))

    print("分块数量:", len(chunks))
    for piece in chunks:
        print(len(piece))

    return chunks


# 用于确定最优聚类数的 MeanShift（此处暂不启用）
from sklearn.cluster import MeanShift


def determine_optimal_clusters(embeddings, max_bandwidth=10, num_steps=10):
    """
    Estimate a cluster count by sweeping the MeanShift bandwidth.

    MeanShift is fitted once per bandwidth in
    ``np.linspace(0.1, max_bandwidth, num_steps)`` and the number of distinct
    labels is recorded. The bandwidth-vs-count curve is saved to
    ``txt_output/output.png``. The returned count is the one found right after
    the largest drop between two consecutive bandwidths.

    Args:
        embeddings: Samples to cluster, as accepted by ``MeanShift.fit``.
        max_bandwidth: Largest bandwidth in the sweep.
        num_steps: Number of bandwidths to try; must be at least 1.

    Returns:
        The selected number of clusters.

    Raises:
        ValueError: if ``num_steps`` is smaller than 1.
    """
    if num_steps < 1:
        raise ValueError("num_steps must be at least 1")

    bandwidths = np.linspace(0.1, max_bandwidth, num_steps)

    cluster_counts = []  # cluster count observed for each bandwidth
    for bandwidth in bandwidths:
        mean_shift = MeanShift(bandwidth=bandwidth)
        mean_shift.fit(embeddings)
        cluster_counts.append(len(np.unique(mean_shift.labels_)))

    plot_path = 'txt_output/output.png'
    # savefig does not create missing directories.
    os.makedirs(os.path.dirname(plot_path), exist_ok=True)

    plt.plot(bandwidths, cluster_counts, marker='o')
    plt.xlabel('Bandwidth')
    plt.ylabel('Number of Clusters')
    plt.title('Mean Shift Clustering for Optimal Bandwidth')
    plt.savefig(plot_path)

    # With a single bandwidth there is no difference to compare.
    if len(cluster_counts) == 1:
        return cluster_counts[0]

    # np.diff is one element shorter than its input; +1 selects the count
    # observed after the steepest drop.
    optimal_bandwidth_index = int(np.argmin(np.diff(cluster_counts))) + 1
    return cluster_counts[optimal_bandwidth_index]


# 获取嵌入表示
def get_embedding(text):
    """
    Embed ``text`` by mean-pooling the model's last hidden state.

    The text is tokenised (truncated to at most 512 tokens), run through the
    model without gradient tracking, averaged over dimension 1 and returned
    as a squeezed NumPy array.

    NOTE(review): this relies on module-level ``tokenizer`` and ``model``
    objects that are not defined in the visible source - confirm they are
    created before this function is called.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)  # mean pooling as the document embedding
    return pooled.squeeze().numpy()


def kmeans_clustering(embeddings, chunks, n_clusters=3, random_state=0):
    """
    Cluster embeddings with KMeans and pick one representative chunk per cluster.

    For every cluster, the member whose embedding has the highest cosine
    similarity to the cluster centre is selected as its representative.

    Args:
        embeddings: 2-D array-like of embedding vectors, one row per chunk.
        chunks: Sequence of text chunks, aligned with ``embeddings`` by index.
        n_clusters: Number of clusters requested from KMeans.
        random_state: Seed passed to KMeans.

    Returns:
        A tuple ``(labels, n_clusters, cluster_centers, closest_chunks)`` where
        ``closest_chunks`` is a list of dicts with the keys "Cluster Label",
        "Closest Chunk" and "Similarity". A cluster that ends up with no
        members contributes no entry.
    """
    embeddings = np.asarray(embeddings)

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(embeddings)
    labels = kmeans.predict(embeddings)
    cluster_centers = kmeans.cluster_centers_

    closest_chunks = []
    for cluster_idx in range(n_clusters):
        member_indices = np.flatnonzero(labels == cluster_idx)
        if member_indices.size == 0:
            # No sample was assigned to this centre, so there is no chunk to
            # report (indexing chunks with None would raise TypeError).
            continue

        # One vectorised call instead of one call per member.
        similarities = cosine_similarity(
            embeddings[member_indices],
            cluster_centers[cluster_idx].reshape(1, -1),
        ).ravel()
        best_position = int(np.argmax(similarities))

        closest_chunks.append({
            "Cluster Label": cluster_idx,
            "Closest Chunk": chunks[int(member_indices[best_position])],
            "Similarity": similarities[best_position]
        })

    return labels, n_clusters, cluster_centers, closest_chunks


# 使用高斯混合模型（GMM）进行聚类
def gmm_clustering(embeddings, n_clusters=3, random_state=0):
    """
    Cluster embeddings with a Gaussian mixture model.

    Returns:
        A tuple ``(labels, n_clusters, means)``: the predicted component of
        every sample, the requested component count, and the fitted
        component means.
    """
    mixture = GaussianMixture(n_components=n_clusters, random_state=random_state)
    mixture = mixture.fit(embeddings)
    assignments = mixture.predict(embeddings)
    return assignments, n_clusters, mixture.means_


# 保存聚类结果、聚类中心和文本片段
def save_clustering_results(chunks, labels, cluster_centers, closest_chunks,
                            file_name="txt_output/clustering_results.csv",
                            cluster_centers_file="txt_output/cluster_centers.pkl",
                            closest_chunks_file="txt_output/closest_chunks.csv"):
    """
    Persist the clustering output to disk.

    Writes three files: a CSV mapping every text chunk to its cluster label,
    a joblib dump of the cluster centres, and a CSV of the representative
    chunk per cluster. Missing parent directories are created first.

    Args:
        chunks: Text chunks, aligned with ``labels``.
        labels: Cluster label of every chunk.
        cluster_centers: Cluster centres to dump with joblib.
        closest_chunks: Records (e.g. list of dicts) describing the
            representative chunk of each cluster.
        file_name: Destination of the chunk/label CSV.
        cluster_centers_file: Destination of the cluster-centre dump.
        closest_chunks_file: Destination of the representative-chunk CSV.
    """
    # The writers below do not create directories themselves.
    for target in (file_name, cluster_centers_file, closest_chunks_file):
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

    # Chunk -> label table.
    df = pd.DataFrame({
        'Text Chunk': chunks,
        'Cluster Label': labels
    })
    df.to_csv(file_name, index=False, encoding='utf-8-sig')
    print(f"Clustering results saved to {file_name}")

    # Cluster centres.
    joblib.dump(cluster_centers, cluster_centers_file)
    print(f"Cluster centers saved to {cluster_centers_file}")

    # Representative chunk of each cluster, with its similarity.
    closest_chunks_df = pd.DataFrame(closest_chunks)
    closest_chunks_df.to_csv(closest_chunks_file, index=False, encoding='utf-8-sig')
    print(f"Closest chunks saved to {closest_chunks_file}")


# 保存并加载聚类中心（保持原有功能不变）
def load_cluster_centers(cluster_centers_file="cluster_centers.pkl"):
    """
    Load and return the object stored in ``cluster_centers_file`` via joblib.

    NOTE(review): joblib.load unpickles the file; only load files you trust.
    """
    return joblib.load(cluster_centers_file)


def load_clustering_results(file_name="clustering_results.csv"):
    """
    Read a clustering-results CSV and return its two columns.

    Returns:
        A tuple ``(labels, chunks)`` holding the values of the
        'Cluster Label' and 'Text Chunk' columns as arrays.
    """
    frame = pd.read_csv(file_name, encoding='utf-8-sig')
    return frame['Cluster Label'].values, frame['Text Chunk'].values


# 查找最相关的聚类并返回该聚类的所有片段
def find_most_relevant_cluster(new_text, chunks, cluster_centers, labels):
    """
    Find the cluster whose centre is most similar to ``new_text``.

    The text is embedded with ``get_embedding`` and compared to every cluster
    centre by cosine similarity.

    Returns:
        A tuple ``(most_relevant_cluster, relevant_chunks, similarities)``:
        the index of the best-matching centre, every chunk labelled with that
        cluster, and the similarity matrix between the text and all centres.

    NOTE(review): the centres must live in the same embedding space as the
    vectors produced by ``get_embedding`` - confirm against how the centres
    were computed.
    """
    query_vector = get_embedding(new_text)
    similarities = cosine_similarity([query_vector], cluster_centers)
    most_relevant_cluster = np.argmax(similarities)

    relevant_chunks = []
    for position, _ in enumerate(chunks):
        if labels[position] == most_relevant_cluster:
            relevant_chunks.append(chunks[position])

    return most_relevant_cluster, relevant_chunks, similarities


#########################
# 第一部分：文本嵌入并保存
#########################
def embed_and_save_text(file_path, chunk_size=512,
                        embedding_save_path="txt_output/embeddings.npy",
                        chunks_save_path="txt_output/chunks.pkl"):
    """
    Split a text file into chunks, embed them and save both to disk.

    The Silicon Flow API key is read from the ``SILICONFLOW_API_KEY``
    environment variable so that no credential is stored in the source.

    Args:
        file_path: Path of the text file to process.
        chunk_size: Accepted for interface compatibility.
            NOTE(review): the split below uses a fixed 256 tokens and ignores
            this value; kept as-is so existing outputs do not change - confirm
            whether the parameter should be honoured.
        embedding_save_path: Destination of the embeddings (``np.save``).
        chunks_save_path: Destination of the chunk list (``joblib.dump``).

    Returns:
        A tuple ``(embeddings, chunks)``.

    Raises:
        RuntimeError: if ``SILICONFLOW_API_KEY`` is not set.
        ValueError: if the number of returned embeddings differs from the
            number of chunks.
    """
    # Read the file and split it into chunks.
    text = read_file(file_path)
    chunks = split_text(text, 256)

    api_key = os.environ.get("SILICONFLOW_API_KEY")
    if not api_key:
        raise RuntimeError("请先设置环境变量 SILICONFLOW_API_KEY")

    # Embed every chunk.
    client = BGEM3Client(api_key=api_key)
    batch_result = client.get_embeddings(chunks)
    embeddings = batch_result['embeddings']

    # Embeddings and chunks are matched by index later on, so a short result
    # must not be saved.
    if len(embeddings) != len(chunks):
        raise ValueError(
            f"嵌入向量数量({len(embeddings)})与文本块数量({len(chunks)})不一致"
        )

    # np.save / joblib.dump do not create missing directories.
    for target in (embedding_save_path, chunks_save_path):
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

    np.save(embedding_save_path, np.array(embeddings))
    joblib.dump(chunks, chunks_save_path)
    print(f"Text embeddings saved to {embedding_save_path}")
    print(f"Text chunks saved to {chunks_save_path}")

    return embeddings, chunks


#########################
# 第二部分：从文件加载嵌入并进行聚类
#########################
def load_embeddings_and_chunks(embedding_save_path="txt_output/embeddings.npy",
                               chunks_save_path="txt_output/chunks.pkl"):
    """
    Load previously saved embeddings and text chunks.

    Returns:
        A tuple ``(embeddings, chunks)``: the array read with ``np.load`` and
        the object read with ``joblib.load``.

    NOTE(review): both loaders can unpickle data (``allow_pickle=True`` and
    joblib); only load files you trust.
    """
    vectors = np.load(embedding_save_path, allow_pickle=True)
    texts = joblib.load(chunks_save_path)
    return vectors, texts


def cluster_embeddings_from_file(embedding_save_path="txt_output/embeddings.npy",
                                 chunks_save_path="txt_output/chunks.pkl",
                                 n_clusters=3):
    """
    Load saved embeddings and chunks, cluster them with KMeans and save the result.

    Returns:
        A tuple ``(labels, cluster_centers, closest_chunks)`` as produced by
        ``kmeans_clustering``.
    """
    # Load the inputs from disk.
    vectors, texts = load_embeddings_and_chunks(embedding_save_path, chunks_save_path)

    # Cluster; the echoed cluster count is not needed here.
    clustering = kmeans_clustering(vectors, texts, n_clusters)
    labels, _, cluster_centers, closest_chunks = clustering

    # Persist labels, centres and representative chunks.
    save_clustering_results(texts, labels, cluster_centers, closest_chunks)

    return labels, cluster_centers, closest_chunks


#########################
# 修改后的主流程：先执行第一部分，再执行第二部分
#########################
def main(file_path, chunk_size=100,
         embedding_save_path="txt_output/embeddings.npy",
         chunks_save_path="txt_output/chunks.pkl"):
    """
    Run the pipeline: embed the file if needed, then cluster into 10 groups.

    When both the embeddings file and the chunks file already exist they are
    reused and the clustering time is printed; otherwise the embeddings are
    generated and saved first.

    Returns:
        A tuple ``(labels, cluster_centers, closest_chunks)``.
    """
    cached = os.path.exists(embedding_save_path) and os.path.exists(chunks_save_path)

    if not cached:
        print("Embeddings or chunks not found. Generating embeddings and performing clustering...")
        # Step 1: create and save the embeddings.
        embed_and_save_text(file_path, chunk_size, embedding_save_path, chunks_save_path)
        # Step 2: cluster the saved embeddings.
        labels, cluster_centers, closest_chunks = cluster_embeddings_from_file(
            embedding_save_path, chunks_save_path, n_clusters=10)
        return labels, cluster_centers, closest_chunks

    print("Found existing embeddings and chunks. Loading and clustering...")
    started = time.time()
    labels, cluster_centers, closest_chunks = cluster_embeddings_from_file(
        embedding_save_path, chunks_save_path, n_clusters=10)
    finished = time.time()
    print(f"Clustering executed in {finished - started:.2f} seconds")
    return labels, cluster_centers, closest_chunks


# Run the pipeline on the input document; the results stay available as
# module-level names.
file_path = "2.txt"
labels, cluster_centers, closest_chunks = main(file_path)











Embeddings or chunks not found. Generating embeddings and performing clustering...
分块数量: 39
508
451
476
489
480
500
490
484
491
489
468
448
459
455
471
458
497
510
495
484
480
475
468
456
462
429
447
481
468
485
460
480
488
518
487
484
481
463
299
5.210302829742432
Text embeddings saved to txt_output/embeddings.npy
Text chunks saved to txt_output/chunks.pkl
Clustering results saved to txt_output/clustering_results.csv
Cluster centers saved to txt_output/cluster_centers.pkl
Closest chunks saved to txt_output/closest_chunks.csv


# 新段落

In [None]:
import os

import pandas as pd
from openai import OpenAI

# Initialise the DeepSeek API client. The key is read from the
# DEEPSEEK_API_KEY environment variable so that no credential is stored in
# the source; a missing variable raises KeyError naming it.
client = OpenAI(api_key=os.environ["DEEPSEEK_API_KEY"], base_url="https://api.deepseek.com")
# Placeholder for a new question.
question= ""


def concat_clusters(closest_chunks_file="txt_output/closest_chunks.csv",file_path = "2.txt"):
    """
    Build the text sent to the summariser from selected document fragments.

    The result consists of the first two chunks of the document, the
    representative chunk of every cluster (first row per 'Cluster Label' in
    the CSV), and the last two chunks. All fragments are joined with the
    single separator " -- -- "; fragments that occur more than once are kept
    only at their first position.

    Args:
        closest_chunks_file: CSV with the columns 'Cluster Label' and
            'Closest Chunk'.
        file_path: Path of the source document.

    Returns:
        The joined text.
    """
    separator = " -- -- "

    text = read_file(file_path)
    df = pd.read_csv(closest_chunks_file, encoding='utf-8-sig')
    chunks = split_text(text, 256)

    # Opening and closing fragments; the tail never re-uses a head chunk when
    # the document has fewer than four chunks.
    head = list(chunks[:2])
    tail = list(chunks[max(2, len(chunks) - 2):])

    # First 'Closest Chunk' of every cluster, in order of appearance. Rows
    # whose chunk is missing (NaN) cannot be joined and are skipped.
    representatives = df.drop_duplicates(subset='Cluster Label')['Closest Chunk'].dropna()
    cluster_texts = [str(chunk) for chunk in representatives]

    # dict.fromkeys removes repeated fragments while preserving order.
    segments = list(dict.fromkeys(head + cluster_texts + tail))
    return separator.join(segments)


# Build the text that is sent to the model.
final_text = concat_clusters("txt_output/closest_chunks.csv","2.txt")



response = client.chat.completions.create(
    model="deepseek-chat",  # DeepSeek chat model
    messages=[
        {"role": "system", "content": "You are an AI assistant who provides accurate and concise answers based strictly on the given text. You should not make assumptions or provide external information outside of the provided content."},  # system message: sets the context of the conversation
        {"role": "user", "content": f"请根据以下提供的文档，生成一个合适的标题和一段 200-300 字的摘要。文档中的片段以 -- -- 划分。标题应简洁明了，能够准确概括文档的核心内容。摘要应完整呈现文档的主要论点、关键信息和结论，要确保一定提到文档中的每个片段，适用于后续检索和快速理解原文内容。请按照 JSON 格式输出，确保格式规范且可解析，格式如下：title:生成的标题,summary:生成的摘要，请确保 title 反映文档核心内容，summary 精炼但包括所有段落的内容。输入内容如下：{final_text}现在，不要给出任何解释性文本，请直接输出:"}  # user message: final_text is substituted into the prompt
    ],
    stream=False  # False returns the complete response instead of a stream
)

# Print the content of the message returned by the model.
print(response.choices[0].message.content)


分块数量: 39
508
451
476
489
480
500
490
484
491
489
468
448
459
455
471
458
497
510
495
484
480
475
468
456
462
429
447
481
468
485
460
480
488
518
487
484
481
463
299
```json
{
  "title": "2025年政府工作报告：回顾2024年成就与展望2025年发展目标",
  "summary": "2025年政府工作报告由国务院总理李强在第十四届全国人民代表大会第三次会议上提交。报告回顾了2024年我国在复杂严峻形势下取得的显著成就，包括经济稳步增长（GDP达134.9万亿元，增长5%）、就业物价稳定（新增就业1256万人）、产业升级（粮食产量1.4万亿斤，新能源汽车产量1300万辆）、创新能力提升（嫦娥六号月球采样）及生态环境改善（PM2.5浓度下降2.7%）。报告同时指出2025年重点工作：深化改革开放（推进财税金融改革、扩大制度型开放）、强化科技创新（发展新质生产力）、保障民生（提高医保补助、促进就业）、推动绿色低碳经济（碳达峰碳中和试点）及维护国家安全。报告强调坚持党的全面领导，统筹高质量发展，确保“十四五”规划圆满收官，全面推进中国式现代化。"
}
```
