In [None]:
# Prompt template for paragraph cleaning and hypothetical-question generation.
# It is a str.format template with three positional fields, in order:
# the article abstract, the current paragraph, and the number of questions.
# The doubled braces around the example are str.format escapes for literal
# braces. The example must itself be valid JSON (no trailing comma), since the
# model is asked to imitate it.
prompt = """
ABSTRACT:{},
================
CURRENT paragraph:{},
================
Based on the abstract of the article, 
merge and clean the current paragraph, removing format characters, quotations, and other contents irrelevant to the article. 
[DO NOT OMIT Original expression !!!]
Then, based on the theme of the article, 
generate {} hypothetical questions for this paragraph, centering around the terpene theme.
EXAMPLE JSON OUTPUT:
{{
    "cleaned_paragraph": "",
    "hypothetical_questions": []
}}
"""

In [None]:
import os

from volcenginesdkarkruntime import Ark

# Shared Ark client used by the helper functions in this notebook.
# The API key is read from the ARK_API_KEY environment variable instead of
# being committed in the notebook; export it before starting the kernel.
# NOTE(review): a key was previously hard-coded here and should be rotated.
# What the SDK does when the variable is unset (api_key=None) is not visible
# from this file - confirm against the SDK documentation.
client = Ark(
    base_url="https://ark.cn-beijing.volces.com/api/v3",
    api_key=os.environ.get("ARK_API_KEY"),
)
def get_v3_response(query):
    """Send ``query`` as a single user message and return the reply text.

    The request goes through the module-level ``client`` to the endpoint id
    below, which the original author labelled as deepseek-v3-0324.

    Args:
        query: Text placed in the ``content`` of the only (user) message.

    Returns:
        The ``message.content`` of the first choice in the completion.
    """
    user_turn = {"role": "user", "content": query}
    completion = client.chat.completions.create(
        model="ep-20250327150949-k85zh",  # deepseek-v3-0324
        messages=[user_turn],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def get_embedding(question_list, return_all=False):
    """Request embeddings for ``question_list`` from the embedding endpoint.

    By default only the embedding of the FIRST item in the response is
    returned, which is the historical behaviour of this helper; any further
    inputs are embedded by the service but discarded here. Pass
    ``return_all=True`` to get one embedding per response item instead.

    Args:
        question_list: Value forwarded unchanged as ``input`` to
            ``client.embeddings.create``.
        return_all: When True, return a list with the embedding of every
            item in ``resp.data``. Presumably these are in input order -
            confirm against the API documentation.

    Returns:
        ``resp.data[0].embedding`` when ``return_all`` is False, otherwise a
        list of ``item.embedding`` for every item in ``resp.data``.
    """
    resp = client.embeddings.create(
        model="ep-20250418144025-nmvsv",
        input=question_list
    )
    if return_all:
        return [item.embedding for item in resp.data]
    return resp.data[0].embedding


In [None]:
# Minimal sample batch: each record pairs a caller-chosen id ("custom_id")
# with the user text to send to the model ("llm_input").
input_data = [
    {
        "custom_id": "123",
        "llm_input": "hello",
    },
]

def data_to_jsonl_volcengine(datas:list, save_file_path=".tmp.jsonl",
                             max_tokens=10000, temperature=0, system_prompt=""):
    """Write one JSON request per record to ``save_file_path`` (JSONL).

    Each output line has the shape
    ``{"custom_id": ..., "body": {"messages": [system, user], "max_tokens": ...,
    "temperature": ...}}``, which the original author describes as the JSONL
    format required by Volcengine.

    Args:
        datas: Iterable of dicts, each providing ``"custom_id"`` and
            ``"llm_input"`` keys.
        save_file_path: Destination file; it is overwritten if it exists.
        max_tokens: Value written to ``body.max_tokens`` of every request.
        temperature: Value written to ``body.temperature`` of every request.
        system_prompt: Content of the system message of every request.

    Returns:
        None.

    Raises:
        KeyError: If a record lacks ``"custom_id"`` or ``"llm_input"``.
    """
    import json

    with open(save_file_path, "w", encoding="utf-8") as f:
        for _data in datas:
            request = {
                "custom_id": _data["custom_id"],
                "body": {
                    "messages": [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": _data["llm_input"]},
                    ],
                    "max_tokens": max_tokens,
                    "temperature": temperature,
                },
            }
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            f.write(json.dumps(request, ensure_ascii=False) + "\n")
    return

# Serialise the sample batch into the JSONL file used as the merge input.
data_to_jsonl_volcengine(
    datas=input_data,
    save_file_path="merge_input.jsonl",
)

In [None]:
import os
import numpy as np
from typing import List

from volcenginesdkarkruntime import Ark

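# Earlier list-returning variant, kept for reference (superseded by the
# array-returning sliced_norm_l2 defined below):
# def sliced_norm_l2(vec, dim=2048):
#     import numpy as np
#     # dim may be 512, 1024 or 2048
#     norm = float(np.linalg.norm(vec[:dim]))
#     return [v / norm for v in vec[:dim]]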

def sliced_norm_l2(vec, dim=2048):
    """Truncate ``vec`` to its first ``dim`` entries and scale to unit L2 norm.

    Args:
        vec: Sliceable sequence of numbers (e.g. an embedding as a list).
        dim: Number of leading components to keep; the original comment
            lists 512, 1024 and 2048 as the values to use.

    Returns:
        A float32 NumPy array: the kept components divided by their L2 norm.
    """
    head = np.asarray(vec[:dim], dtype=np.float32)
    # Keep the norm in float32 so the division result stays float32.
    length = np.float32(np.linalg.norm(head))
    return head / length

# The API key is read from the ARK_API_KEY environment variable rather than
# being committed in the notebook; export it before starting the kernel.
# NOTE(review): a key was previously hard-coded here and should be rotated.
client = Ark(api_key=os.environ.get("ARK_API_KEY"))


print("----- embeddings request -----")
# When the doubao-embedding model is used for retrieval, the query should be
# prefixed with the recommended instruction to preserve retrieval quality.
# NOTE(review): the prefix is currently an empty string - confirm the intended
# instruction text.
query_instruction = ""
query = "天是什么颜色？"
# Documents that go into the vector index do not get the instruction prefix.
document = "天空呈现颜色主要与“瑞利散射”现象有关，具体形成过程如下：太阳光是由红、橙、黄、绿、蓝、靛、紫等多种颜色的光混合而成的。大气中存在着无数的气体分子和其他微粒。当太阳光进入地球大气层时，波长较长的红光、橙光、黄光能穿透大气层，直接射到地面，而波长较短的蓝、紫、靛等色光，很容易被悬浮在空气中的微粒阻挡，从而使光线散射向四方。其中蓝光波长较短，散射作用更强，因此我们眼睛看到的天空主要呈现蓝色。在一些特殊情况下，如傍晚或早晨，阳光斜射角度大，通过大气层的路径较长，蓝光等短波长光被散射得更多，而红光等长波长光散射损失较少，这时天空可能会呈现橙红色等其他颜色。"

resp = client.embeddings.create(
    model="ep-20250418144025-nmvsv",
    input=[
        query_instruction + query,  # query text
        document,  # document text
    ]
)

embeddings = [item.embedding for item in resp.data]
# The request sends the query first and the document second.
query_embedding, document_embedding = embeddings[0], embeddings[1]

# Dot product of the two unit-normalised, truncated vectors (cosine similarity)
# at the full dimension and at two reduced dimensions.
query_doc_relevance_score_2048d = np.matmul(
    sliced_norm_l2(query_embedding, 2048),
    sliced_norm_l2(document_embedding, 2048)
)
query_doc_relevance_score_1024d = np.matmul(
    sliced_norm_l2(query_embedding, 1024),  # reduced dimension
    sliced_norm_l2(document_embedding, 1024)  # reduced dimension
)
query_doc_relevance_score_512d = np.matmul(
    sliced_norm_l2(query_embedding, 512),  # reduced dimension
    sliced_norm_l2(document_embedding, 512)  # reduced dimension
)
# ".6f" (six decimals) is the intended spec; the former "6f" was a minimum
# width that printed the same text.
print(f"2048 dim relevance score: {query_doc_relevance_score_2048d:.6f}")
print(f"1024 dim relevance score: {query_doc_relevance_score_1024d:.6f}")
print(f" 512 dim relevance score: {query_doc_relevance_score_512d:.6f}")
# The lower the truncated dimension, the less distinguishable different
# content becomes; retrieval precision, recall and NDCG all degrade.