In [None]:
import pandas as pd
import os, json
# Load the article table and keep only the rows whose "relevant" value is 2.
df_all = pd.read_csv(".data/available_TA_all.csv").loc[
    lambda frame: frame["relevant"] == 2
]


In [None]:
# Spot check: draw one random article, print its title, then display the row.
row = df_all.sample(n=1)
print(row["title"].iloc[0])
row

In [None]:
# Only articles that actually have an introduction can be parsed.
df_introduction = df_all[df_all["introduction"].notna()]

# Prompt template for the introduction-parsing task. The three positional "{}"
# slots receive title, abstract and introduction (in that order); the braces of
# the example JSON are doubled so that str.format leaves them as literal braces.
# The example is kept strictly valid JSON (ASCII punctuation, no trailing
# commas, consistent "original_text" key) because the model tends to mirror it.
prompt = """
Title:{}
Abstract:{}
=================
Introduction:{}
First, summarize the theme of the article in one sentence.
Based on the background information, extract all terpenoids involved in the article and their biological sources, and summarize the key words. There is no need for abbreviations or acronyms.
Then, divide the Introduction part. For each paragraph, summarize in one sentence which methods were used in which tasks and which operations were carried out on which substances. 
Follow the original text without omission, but text cleaning can be performed.
It is recommended to merge related paragraphs. Each output paragraph should not be less than 500 words.
EXAMPLE JSON OUTPUT:
{{
    "topic":"",
    "source":[],
    "terpenoid":[],
    "keywords":[],
    "0001": {{
        "summary":"",
        "original_text":""
    }},
    "0002": {{
        "summary":"",
        "original_text":""
    }}
}}
"""


def _prompt_field(value):
    """Return a table cell as prompt text; missing values become an empty string.

    Without this, a NaN title/abstract would be rendered as the literal "nan".
    """
    return "" if pd.isna(value) else str(value)


# One request per article: the article id is carried along as custom_id so the
# batch output can be matched back to its source row.
llm_data = [
    {
        "custom_id": t_id,
        "llm_input": prompt.format(
            _prompt_field(title),
            _prompt_field(abstract),
            _prompt_field(introduction),
        ),
    }
    for t_id, title, abstract, introduction in zip(
        df_introduction["T_id"],
        df_introduction["title"],
        df_introduction["abstract"],
        df_introduction["introduction"],
    )
]
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(f"{len(llm_data)} prompts prepared")


def data_to_jsonl_volcengine(datas: list, save_file_path=".tmp.jsonl",
                             max_tokens=10000, temperature=0, system_prompt=""):
    """Write chat requests to a JSONL file, one request object per line.

    Each line has the shape
    ``{"custom_id": ..., "body": {"messages": [system, user], "max_tokens": ..., "temperature": ...}}``,
    which is the JSONL layout this notebook uses for Volcengine batch jobs.

    Args:
        datas: Iterable of dicts, each with the keys ``"custom_id"`` and
            ``"llm_input"`` (the user message text).
        save_file_path: Destination file; it is overwritten if it exists.
        max_tokens: Value written to ``body.max_tokens`` of every request.
        temperature: Value written to ``body.temperature`` of every request.
        system_prompt: Content of the system message of every request.

    Returns:
        None. The result is the file written to ``save_file_path``.

    Raises:
        KeyError: If an item lacks ``"custom_id"`` or ``"llm_input"``.
    """
    import json

    with open(save_file_path, "w", encoding="utf-8") as f:
        for item in datas:
            request = {
                "custom_id": item["custom_id"],
                "body": {
                    "messages": [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": item["llm_input"]},
                    ],
                    "max_tokens": max_tokens,
                    "temperature": temperature,
                },
            }
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            f.write(json.dumps(request, ensure_ascii=False) + "\n")

# Export the prepared prompts as the batch-job input file.
data_to_jsonl_volcengine(
    datas=llm_data,
    save_file_path="h_introduction_input_err.jsonl",
)

In [None]:
import ast
import json
import re

import pandas as pd


def _parse_llm_reply(reply):
    """Parse one model reply into a dict.

    A Markdown code fence around the payload (with or without a ``json`` tag)
    is removed first; text inside the payload is left untouched. The payload is
    then parsed as JSON. Replies that are Python-dict-like rather than strict
    JSON (for example with trailing commas) are parsed with
    ``ast.literal_eval``, which accepts literals only and therefore cannot
    execute code the way ``eval`` can.

    Raises:
        ValueError, SyntaxError, TypeError: If the reply is not a string or
            cannot be parsed into a dict.
    """
    if not isinstance(reply, str):
        raise TypeError("reply is not a string")
    text = reply.strip()
    # Greedy match: take everything between the first and the last fence.
    fenced = re.search(r"```(?:json)?(.*)```", text, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        text = fenced.group(1).strip()
    try:
        # strict=False tolerates raw line breaks inside string values.
        parsed = json.loads(text, strict=False)
    except json.JSONDecodeError:
        # Raw line breaks are illegal inside Python string literals, so they
        # are flattened to spaces before the literal fallback.
        parsed = ast.literal_eval(text.replace("\r", " ").replace("\n", " "))
    if not isinstance(parsed, dict):
        raise TypeError("reply does not contain a JSON object")
    return parsed


# Successfully parsed replies, one flat dict per article (custom_id + parsed keys).
introduction_datas = []
# Raw replies that could not be parsed, keyed by custom_id, kept for inspection/retry.
err_data = {}
with open("h_introduction_output_err.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines in the batch output
        record = json.loads(line)
        custom_id = record["custom_id"]
        reply = record["response"]["body"]["choices"][0]["message"]["content"]
        try:
            parsed = _parse_llm_reply(reply)
        except (ValueError, SyntaxError, TypeError):
            err_data[custom_id] = reply
            continue
        introduction_datas.append({"custom_id": custom_id, **parsed})
len(err_data)

# Parse the batch inference results

In [None]:
import pandas as pd
import os, json

# Reload the article table (rows with relevant == 2 only) and the exported
# introduction split.
df_all = pd.read_csv(".data/available_TA_all.csv").loc[
    lambda table: table["relevant"].eq(2)
]
df_0 = pd.read_csv("introduce_split.csv")


In [57]:
# Independent two-column table: article id and title.
df_intro = df_all.loc[:, ["T_id", "title"]].copy()
df_intro

Unnamed: 0,T_id,title
0,TA000000,Yeast Particle Encapsulation of Scaffolded Ter...
1,TA000001,Volatile Terpenes and Terpenoids from Workers ...
2,TA000002,Yeast Particles Hyper-Loaded with Terpenes for...
3,TA000003,Olfactory Impact of Terpene Alcohol on Terpene...
4,TA000004,Using volatile organic compounds to enhance at...
...,...,...
34700,TA050916,Combined acetyl-11-keto-β-boswellic acid and r...
34701,TA050917,Monotropein promotes angiogenesis and inhibits...
34702,TA050918,Ginsenoside Rb1 protects against ischemia/repe...
34703,TA050919,Ginsenoside Rg1 protects against neuronal dege...


In [8]:
import pandas as pd

# Load the per-article introduction split and preview its first five rows.
df = pd.read_csv("introduce_split.csv")
df.head(n=5)


Unnamed: 0,custom_id,topic,source,terpenoid,keywords,0001,0002,0003,0004,0005,0006,Introduction,0007,0008,0009,0010,0011,0012,0013
0,TA000590,Functional and allelic diversity of terpene sy...,"['Hops (Humulus lupulus L.)', 'Female inflores...","['β-myrcene', 'α-humulene', 'β-caryophyllene',...","['Aroma', 'α-Farnesene', 'GC–MS', 'Genome anal...",{'summary': 'Hops (Humulus lupulus L.) are dio...,{'summary': 'Hops belong to the Cannabaceae fa...,{'summary': 'Terpenoid biosynthesis in hops in...,{'summary': 'New Zealand-grown hop cultivars e...,,,,,,,,,,
1,TA000718,The influence of terpene content on the flamma...,"['Pinus halepensis', 'Cupressus sempervirens',...","['monoterpenes', 'sesquiterpenes', 'diterpenes']","['flammability', 'wildland-urban interface', '...","{'summary': ""The introduction highlights the i...",,,,,,,,,,,,,
2,TA000629,Terpene profiles in Cantal and Saint-Nectaire-...,"['natural grassland', 'dairy cows', 'Cantal ch...","['beta-caryophyllene', 'alpha-pinene', 'beta-p...","['terpene profiles', 'cheese', 'raw milk', 'pa...","{'summary': 'Terpenes, as plant secondary meta...",,,,,,,,,,,,,
3,TA004953,The study investigates the bioactive and aroma...,"['Kaffir (Citrus hystrix)', 'Key lime (Citrus ...","['limonene', 'citral', 'terpinen-4-ol']","['polyphenols', 'antioxidant', 'human serum al...",{'summary': 'The introduction highlights the i...,,,,,,,,,,,,,
4,TA001205,Developing forest therapy programmes based on ...,"['fir (Abies alba)', 'spruce (Picea abies)']","['α-cadinol', 'spathulenol']","['forest therapy', 'terpenes', 'health benefit...",{'summary': 'The introduction highlights the p...,,,,,,,,,,,,,


In [19]:
import ast

from tqdm import tqdm

# Chunk columns of the split table: "0001" ... "0013".
CHUNK_IDS = [f"{i:04d}" for i in range(1, 14)]


def _parse_chunk(cell):
    """Turn one stored chunk cell into ``(summary, original_text)``.

    The cell holds the text form of a dict. It is parsed once with
    ``ast.literal_eval`` (literals only, no code execution) instead of calling
    ``eval`` repeatedly. The misspelt keys ``"summery"`` and
    ``"original+text"`` are accepted as fallbacks for the correct ones.

    Raises:
        KeyError: If neither spelling of a required key is present.
    """
    chunk = ast.literal_eval(cell)
    summary = chunk["summary"] if "summary" in chunk else chunk["summery"]
    if "original_text" in chunk:
        original_text = chunk["original_text"]
    else:
        original_text = chunk["original+text"]
    return summary, original_text


introduction_chunks_data = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    T_id = row["custom_id"]
    for sub_id in CHUNK_IDS:
        cell = row[sub_id]
        if pd.isna(cell):
            # Stops at the first empty chunk column, i.e. chunks are assumed to
            # be numbered without gaps; anything after a gap is not read.
            break
        summary, original_text = _parse_chunk(cell)
        introduction_chunks_data.append({
            "introduction_id": T_id + "_I_" + sub_id,
            "summary": summary,
            "original_text": original_text,
        })

# Explicit columns keep the schema stable even when no chunk was found.
introduction_df = pd.DataFrame(
    introduction_chunks_data,
    columns=["introduction_id", "summary", "original_text"],
)



32284it [00:05, 5799.97it/s]


In [21]:
# Persist the flattened chunks; the positional index is not written.
introduction_df.to_csv(
    path_or_buf=".data/chunks_introduction.csv",
    index=False,
)

In [24]:
# Display the chunk table: one row per introduction chunk with its id, summary and original text.
introduction_df

Unnamed: 0,introduction_id,summary,original_text
0,TA000590_I_0001,Hops (Humulus lupulus L.) are dioecious climbi...,**Background** Hops (*Humulus lupulus* L.) are...
1,TA000590_I_0002,Hops belong to the Cannabaceae family and prod...,**Background** Hops (*Humulus lupulus* L.) are...
2,TA000590_I_0003,Terpenoid biosynthesis in hops involves the me...,Terpenoids are derived either from the mevalon...
3,TA000590_I_0004,New Zealand-grown hop cultivars exhibit unique...,New Zealand grows many locally bred hop cultiv...
4,TA000718_I_0001,The introduction highlights the increasing wil...,Mediterranean regions are characterised by cli...
...,...,...,...
64464,TA019435_I_0001,The introduction outlines the two pathways for...,In plants two different pathways are utilized ...
64465,TA019435_I_0002,The study utilized isotope-labeled intermediat...,The availability of both early intermediates i...
64466,TA029724_I_0001,The introduction highlights lupeol as a bioact...,"**1. introduction** Triterpenoid, lupeol (3ˇ-h..."
64467,TA001094_I_0001,The introduction highlights terpenes as versat...,1. introduction Terpenes and terpenoids consti...


In [None]:
import os

from volcenginesdkarkruntime import Ark

# SECURITY: credentials must not live in the notebook. The key is read from the
# ARK_API_KEY environment variable (a missing variable fails loudly with
# KeyError). The key that used to be hard-coded here should be treated as
# leaked and rotated.
client = Ark(
    base_url="https://ark.cn-beijing.volces.com/api/v3",
    api_key=os.environ["ARK_API_KEY"],
)

def get_embedding(question_list):
    """Request embeddings for the input and return the first vector.

    The input is forwarded unchanged as ``input`` to
    ``client.embeddings.create`` for the endpoint "ep-20250418144025-nmvsv".
    Only the embedding of the first returned item (``data[0]``) is returned,
    whatever the input contained.
    """
    response = client.embeddings.create(
        input=question_list,
        model="ep-20250418144025-nmvsv",
    )
    first_item = response.data[0]
    return first_item.embedding


# Evaluation sample of up to 100 chunks. The fixed seed makes the sample (and
# every result derived from it) reproducible across kernel restarts; the size
# is capped so that a table with fewer than 100 rows does not raise.
df_test = introduction_df.sample(
    n=min(100, len(introduction_df)),
    random_state=42,
)
df_test


Unnamed: 0,introduction_id,summary,original_text
31545,TA000534_I_0002,The introduction continues by describing the a...,"In order to confirm this hypothesis, lipid con..."
6940,TA012835_I_0001,The study focuses on cloning and analyzing the...,Increasing demand of artemisinin in the treatm...
14822,TA025960_I_0002,The introduction discusses the significance of...,Since 1970 several studies have shown that a w...
51390,TA046986_I_0001,The study focuses on developing an efficient m...,"Glycyrrhizic acid (GL), the major component in..."
40395,TA000354_I_0002,The text discusses the enzymatic strategies fo...,1.1 extended substrate space Prenylation of ph...
...,...,...,...
57177,TA048556_I_0002,The study describes the characterization of GA...,GA20ox proteins have been identified in a larg...
32798,TA039343_I_0002,The introduction discusses the role of biofilm...,It is worth mentioning that infectious bacteri...
1541,TA000575_I_0001,Silsesquioxanes (SQ) are organosilicon compoun...,"Silsesquioxanes (SQ), organosilicon compounds ..."
45513,TA044713_I_0002,The introduction describes the pharmacological...,Berberine and berberrubine are isoquinoline al...


In [33]:
# Relies on `tqdm` imported in an earlier cell.
tqdm.pandas()  # enable progress-bar support for pandas


# Embed every sampled summary: get_embedding is applied to each value of the
# "summary" column, i.e. one embeddings request per row.
# Note: this adds the "embedding" column to df_test in place.
df_test["embedding"] = df_test["summary"].progress_apply(get_embedding)
df_test

100%|██████████| 100/100 [00:15<00:00,  6.37it/s]


Unnamed: 0,introduction_id,summary,original_text,embedding
31545,TA000534_I_0002,The introduction continues by describing the a...,"In order to confirm this hypothesis, lipid con...","[4.0625, -0.93359375, -1.6484375, -1.78125, -4..."
6940,TA012835_I_0001,The study focuses on cloning and analyzing the...,Increasing demand of artemisinin in the treatm...,"[3.09375, -1.4921875, 0.0869140625, -3.1875, -..."
14822,TA025960_I_0002,The introduction discusses the significance of...,Since 1970 several studies have shown that a w...,"[2.75, -1.1640625, -0.74609375, -0.953125, -1...."
51390,TA046986_I_0001,The study focuses on developing an efficient m...,"Glycyrrhizic acid (GL), the major component in...","[1.09375, 3.78125, -0.4453125, -1.1328125, -0...."
40395,TA000354_I_0002,The text discusses the enzymatic strategies fo...,1.1 extended substrate space Prenylation of ph...,"[0.462890625, -0.361328125, -0.5703125, -0.402..."
...,...,...,...,...
57177,TA048556_I_0002,The study describes the characterization of GA...,GA20ox proteins have been identified in a larg...,"[2.15625, 2.75, 2.296875, -0.609375, -0.863281..."
32798,TA039343_I_0002,The introduction discusses the role of biofilm...,It is worth mentioning that infectious bacteri...,"[0.5859375, -0.004913330078125, -1.1796875, -2..."
1541,TA000575_I_0001,Silsesquioxanes (SQ) are organosilicon compoun...,"Silsesquioxanes (SQ), organosilicon compounds ...","[0.0703125, 3.4375, -1.8515625, 0.890625, -1.8..."
45513,TA044713_I_0002,The introduction describes the pharmacological...,Berberine and berberrubine are isoquinoline al...,"[0.89453125, 0.98046875, -0.6953125, -1.375, -..."
