In [1]:
import pandas as pd
import os, json
# Per-article table; a later cell reads its "T_id" and "topic" columns.
df_topic = pd.read_csv("df_introduction.csv")
# Article table, narrowed in one chained step to the rows whose "relevant" value is 2.
df_all = pd.read_csv(".data/available_TA_all.csv").loc[
    lambda frame: frame["relevant"].eq(2)
]


In [4]:
# Two counts, one per line: all rows of df_all, then the rows with a non-null "methods" value.
print(len(df_all), len(df_all[df_all["methods"].notna()]), sep="\n")

34070
33651


In [13]:
# Preview two randomly drawn rows that have a non-null "methods" value.
# random_state pins the draw so the same rows are shown after Restart & Run All.
rows = df_all[df_all["methods"].notna()].sample(2, random_state=0)
rows

Unnamed: 0,T_id,doi,title,abstract,authors,year,publisher,relevant,wos,scopus,pubmed,introduction,methods,results,discuession,conclustion
12640,TA022753,10.1016/j.fitote.2012.03.024,"Kujigamberol, a new dinorlabdane diterpenoid i...","A new compound, 15,20-dinor-5,7,9-labdatriene-...","Kimura, Ken-ichi; Minamikawa, Yuki; Ogasawara,...",2012,FITOTERAPIA,2,True,False,False,['1. introduction Amber is a fossilized tree r...,"['2. experimental ', '2.1. chemicals and strai...",To investigate the toxicity of M. alternifolia...,['3. results and discussion The major bioactiv...,
31181,TA047380,10.1111/jcmm.15484,Nomilin targets the Keap1-Nrf2 signalling and ...,Osteoarthritis (OA) is a long-term and inflamm...,"Xue, Xing-He, Xue, Ji-Xin, Hu, Wei, Shi, Fang-...",2020,Journal of cellular and molecular medicine,2,False,False,True,Osteoarthritis (OA) is a prevalent senile diso...,Nomilin (purity > 98%) was purchased from Shan...,Figure 1A showed the chemical formula of NOM. ...,Osteoarthritis is one of the most widespread a...,


In [None]:

# Prompt template filled with str.format: the first "{}" receives the article
# topic, the second the raw "methods" text. Braces that belong to the JSON example
# are doubled so that str.format emits them literally.
# The example object is kept valid JSON (no full-width comma, no trailing commas)
# so the model is not shown a malformed pattern to imitate.
# NOTE(review): the sentence "Translate it into English as a prompt." looks like a
# leftover from drafting this prompt — confirm before removing it.
prompt = """
The topic is {}
=================
Method:{}
Based on the above information:
 - First, summarize the methods and techniques used in the article, without any abbreviations or shorthand.
 - Then Divide the text reasonably, each paragraph should summarize which tasks the methods used in the text were applied to, what operations were performed on those substances, followed by the original text. Do not omit any information, but you can clean the text.
 - It is recommended to merge related paragraphs. Each output paragraph should be no less than 500 words.
 - No output in any format other than JSON is required. Translate it into English as a prompt.
EXAMPLE JSON OUTPUT:
{{
    "0001": {{
        "technology":[],
        "summary":"",
        "original_text":""
    }},
    "0002": {{
        "technology":[],
        "summary":"",
        "original_text":""
    }}
}}
The response is:
"""

# Build the T_id -> topic lookup once instead of re-filtering df_topic for every
# article. keep="first" mirrors the earlier `.values[0]` choice when a T_id repeats.
topic_by_id = (
    df_topic.drop_duplicates(subset="T_id", keep="first")
    .set_index("T_id")["topic"]
    .to_dict()
)

# Only articles with a non-null "methods" value are sent to the model.
df_methods = df_all[df_all["methods"].notna()]

# Fail early, naming the offending ids, rather than part-way through the loop.
missing_ids = [t_id for t_id in df_methods["T_id"] if t_id not in topic_by_id]
if missing_ids:
    raise KeyError(
        f"{len(missing_ids)} T_id value(s) have no row in df_topic, e.g. {missing_ids[:5]}"
    )

# One request record per article: the id plus the fully rendered prompt.
llm_data = [
    {"custom_id": T_id, "llm_input": prompt.format(topic_by_id[T_id], content)}
    for T_id, content in zip(df_methods["T_id"], df_methods["methods"])
]
len(llm_data)

33651

In [17]:
def data_to_jsonl_volcengine(datas:list, save_file_path=".tmp.jsonl",
                             max_tokens=10000, temperature=0, system_prompt=""):
    """Write one chat-request JSON object per line to ``save_file_path``.

    This is the JSONL layout required by Volcengine: every line holds a
    ``custom_id`` and a ``body`` with a system message, a user message,
    ``max_tokens`` and ``temperature``.

    Args:
        datas: Iterable of dicts, each providing ``"custom_id"`` and
            ``"llm_input"`` (the user message text).
        save_file_path: Destination file; it is overwritten if it exists.
        max_tokens: Value written to ``body.max_tokens`` of every request.
        temperature: Value written to ``body.temperature`` of every request.
        system_prompt: Content of the system message of every request.

    Returns:
        None. The function is called for its file-writing side effect.

    Raises:
        KeyError: If an item lacks ``"custom_id"`` or ``"llm_input"``.
    """
    import json

    with open(save_file_path, "w", encoding="utf-8") as f:
        for _data in datas:
            request = {
                "custom_id": _data["custom_id"],
                "body": {
                    "messages": [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": _data["llm_input"]},
                    ],
                    "max_tokens": max_tokens,
                    "temperature": temperature,
                },
            }
            # ensure_ascii=False keeps non-ASCII text readable in the file
            # instead of turning it into \uXXXX escapes.
            f.write(json.dumps(request, ensure_ascii=False) + "\n")
    return

# Serialize every prepared request into the batch input file.
data_to_jsonl_volcengine(
    datas=llm_data,
    save_file_path="h_method_input.jsonl",
)

In [8]:
import ast
import json
from tqdm import tqdm


def _parse_llm_message(raw_msg):
    """Extract the object a model reply contains and return it as a dict.

    Only the span from the first "{" to the last "}" is parsed, so a surrounding
    markdown fence or language tag is ignored without touching the text inside
    the object. Strict JSON is tried first; Python-literal syntax (single quotes,
    trailing commas) is accepted as a fallback via ``ast.literal_eval``, which
    never executes code.

    Raises:
        ValueError / SyntaxError: If no object is found or it cannot be parsed.
    """
    start, end = raw_msg.find("{"), raw_msg.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in model reply")
    payload = raw_msg[start:end + 1]
    try:
        # strict=False tolerates raw control characters (e.g. newlines) inside strings.
        parsed = json.loads(payload, strict=False)
    except json.JSONDecodeError:
        # Newlines are dropped first because a Python string literal cannot span lines.
        parsed = ast.literal_eval(payload.replace("\n", ""))
    if not isinstance(parsed, dict):
        raise ValueError("model reply is not a JSON object")
    return parsed


# Everything that can go wrong for a single reply: unexpected response layout
# (KeyError / IndexError / TypeError / AttributeError) or unparseable content.
_REPLY_ERRORS = (
    KeyError, IndexError, TypeError, AttributeError,
    ValueError, SyntaxError, RecursionError, MemoryError,
)

method_datas = []  # one dict per parsed reply: {"T_id": ..., <segment key>: {...}, ...}
err_ids = []       # custom_id of every reply that could not be used

with open("h_method_output.jsonl", "r", encoding="utf-8") as r:
    for line in tqdm(r.readlines()):
        if not line.strip():
            continue  # tolerate blank lines such as a trailing newline
        # Named response_record so the `llm_data` request list built earlier is not overwritten.
        response_record = json.loads(line)
        T_id = response_record["custom_id"]
        try:
            llm_msg = response_record["response"]["body"]["choices"][0]["message"]["content"]
            msg_data = _parse_llm_message(llm_msg)
        except _REPLY_ERRORS:
            err_ids.append(T_id)
            continue
        method_datas.append({"T_id": T_id, **msg_data})

100%|██████████| 33651/33651 [00:04<00:00, 6782.86it/s]


In [32]:
import pandas as pd

# One row per article; besides "T_id" there is one column per segment key seen in
# any reply, holding that segment's dict or NaN where the article has no such key.
method_df = pd.DataFrame(method_datas)

# Flatten to one record per (article, segment).
method_split_data = []
for _, row in tqdm(method_df.iterrows(), total=len(method_df)):
    T_id = row["T_id"]
    for column_name, value in row.items():  # iterate the Series' key/value pairs directly
        if column_name == "T_id":
            continue
        # Skip instead of stopping at the first gap: an article's segment keys need
        # not be contiguous in column order. The isinstance test also covers NaN,
        # None, and list/str values, none of which can hold a segment.
        if not isinstance(value, dict):
            continue
        _data = {
            "method_id": f"{T_id}_M_{column_name}",
            "technology": value.get("technology"),
        }
        # Accept the misspelled "summery" key as a fallback for "summary";
        # when neither is present the field is left out.
        for summary_key in ("summary", "summery"):
            if summary_key in value:
                _data["summary"] = value[summary_key]
                break
        _data["original_text"] = value.get("original_text")
        method_split_data.append(_data)

# method_df
method_split_df = pd.DataFrame(method_split_data)

33352it [00:01, 28801.39it/s]


In [33]:
# Persist the per-segment table; index=False leaves the row index out of the file.
method_split_df.to_csv(
    path_or_buf="method_split_id.csv",
    index=False,
)

In [34]:
# Show the per-segment table as this cell's output.
method_split_df

Unnamed: 0,method_id,technology,summary,original_text
0,TA000013_M_0001,[Nuclear Magnetic Resonance (NMR) Spectroscopy...,The methods and techniques used in the article...,The supporting information includes Hydrogen-1...
1,TA000013_M_0002,[Internet-based data sharing],The article mentions that the supplementary da...,This material is available free of charge via ...
2,TA000034_M_0001,[Bacterial diterpene synthase characterization...,The study characterized bacterial diterpene sy...,The methods for bacterial diterpene synthase c...
3,TA000034_M_0002,"[Protein purification, Kinetic parameter deter...",Proteins were purified to homogeneity for func...,"Protein purification, kinetic parameter determ..."
4,TA002027_M_0001,"[Protein expression and purification, Crystall...",The study employed a combination of protein ex...,The 10-epi-cubebol synthase from Sorangium cel...
...,...,...,...,...
83894,TA047435_M_0002,"[Cell proliferation assays (MTT and WST-1), Br...",Cell proliferation was assessed using MTT and ...,Cell proliferation was measured using the MTT ...
83895,TA047435_M_0003,"[Zebrafish model, EdU staining, Confocal micro...",Zebrafish models were used to study the effect...,Wild type zebrafish were purchased from Lotte ...
83896,TA047435_M_0004,"[Animal models (rats and mice), Myocardial inf...","Animal models, including rats and mice, were u...",Animal studies were carried out in accordance ...
83897,TA047547_M_0001,"[Thin-layer chromatography (TLC), Flash chroma...","The methods section describes the synthesis, p...",Reagents and solvents were purchased from Sigm...
