In [None]:
from openai import OpenAI
# Build the API client from the key stored in the local file 'openai-key.txt'.
with open('openai-key.txt') as key_file:
    # strip() drops surrounding whitespace (e.g. a trailing newline) from the file content.
    client = OpenAI(api_key=key_file.read().strip())

In [None]:
# Instruction text (in Chinese) that asks the model whether a post's central topic is
# an internet freight platform and to answer as JSON with a "reason" string and a
# 0/1 "result". The sentences are concatenated without any separator, so the final
# prompt is a single line.
GPT_PROMPT = ''.join([
    '请判断以下文字的中心主题是否围绕互联网货运平台。',
    '常见的货运平台包括运满满、货车帮、货拉拉、满帮等。',
    '文字中也可能为了反审查，而使用谐音、符号等代替平台名称，请注意识别。',
    '如果文字中没有直接出现平台名称，但内容与对互联网货运平台高度相关，也需要标记为相关。',
    '对于招聘、广告等主题，标记为不相关。',
    '结果请用 json 表示，包括两个 key：',
    '"reason" 为一个字符串，是做出此判断的理由；',
    '"result" 为一个 0 或 1 的整数，表示判断结果。',
])

import json
import re
import traceback

# Characters that may legally follow a backslash inside a JSON string.
_JSON_ESCAPE_CHARS = '"\\/bfnrtu'


def _strip_code_fence(reply):
    """Return ``reply`` without a surrounding Markdown code fence, if it has one."""
    body = reply.strip()
    if len(body) >= 6 and body.startswith('```') and body.endswith('```'):
        body = body[3:-3]
        # Drop the optional language tag that directly follows the opening fence.
        if body[:4].lower() == 'json':
            body = body[4:]
    return body.strip()


def _drop_invalid_json_escapes(payload):
    """Remove backslashes that do not start a valid JSON escape (e.g. ``\\陕`` -> ``陕``)."""
    def _fix(match):
        # Backslash pairs are consumed together, so an escaped backslash ("\\\\") is
        # never mistaken for the start of a broken escape.
        return match.group(0) if match.group(1) in _JSON_ESCAPE_CHARS else match.group(1)
    return re.sub(r'\\(.)', _fix, payload, flags=re.DOTALL)


def query_gpt(text):
    """Ask the chat model whether ``text`` is about an internet freight platform.

    The request is ``GPT_PROMPT`` followed by a blank line and ``text``. The reply is
    expected to be a JSON object, either bare or wrapped in a Markdown code fence
    (with or without a ``json`` tag).

    Args:
        text: The post to classify.

    Returns:
        The dict parsed from the model's JSON reply, with ``'good': True`` added.

    Raises:
        ValueError: If the reply is not a plain assistant text message or is not a
            JSON object (``json.JSONDecodeError`` is a ``ValueError`` subclass).

    On any parsing failure the input text and the raw message are printed before the
    exception is re-raised, so the offending reply can be inspected.
    """
    completion = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[
            {"role": "system",
             "content": "You are an expert in the Chinese language and in the modern "
             "Chinese freight industry. Accurately answer all the questions from the user."},
            {"role": "user", "content": f"{GPT_PROMPT}\n\n{text}"},
        ],
    )
    msg = completion.choices[0].message
    try:
        # Explicit checks instead of `assert`, which is stripped when Python runs with -O.
        if msg.role != 'assistant' or msg.function_call is not None or msg.tool_calls is not None:
            raise ValueError('expected a plain assistant message without function/tool calls')
        if not msg.content:
            raise ValueError('assistant message has no text content')
        payload = _drop_invalid_json_escapes(_strip_code_fence(msg.content))
        ret = json.loads(payload)
        if not isinstance(ret, dict):
            raise ValueError(f'expected a JSON object, got {type(ret).__name__}')
        ret['good'] = True
    except Exception:
        print(text)
        print(msg)
        raise
    return ret


In [None]:
import pandas as pd
# Load the posts to be labelled; the labelling loop below reads the
# 'tid', 'title' and 'description' columns of this frame.
df = pd.read_parquet('uncertain_data.parquet')
# Show the first rows as the cell output for a quick look at the data.
df.head()

In [None]:
from tqdm.notebook import tqdm
from pathlib import Path
output_file = Path('data-cleaning-uncertain-by-chatgpt.json')


def _save_result(result, path):
    """Checkpoint ``result`` to ``path`` as UTF-8 JSON, keeping the previous file as ``.json.bak``."""
    tmp_path = path.with_suffix('.json.tmp')
    # Dump to a scratch file first so an interruption mid-write never leaves a
    # truncated checkpoint at ``path``.
    with tmp_path.open('w', encoding='utf-8') as fout:
        json.dump(result, fout, ensure_ascii=False, indent=2)
    if path.exists():
        # replace() overwrites an existing backup on every platform, whereas
        # rename() raises FileExistsError on Windows when the target exists.
        path.replace(path.with_suffix('.json.bak'))
    tmp_path.replace(path)


# Resume from an earlier run if a checkpoint exists. The encoding is explicit because
# the file holds non-ASCII text (ensure_ascii=False) and the locale default may differ.
if output_file.exists():
    with output_file.open(encoding='utf-8') as fin:
        result = json.load(fin)
    print(f'loaded {len(result)} results')
else:
    result = {}

# True while `result` holds entries that have not been written to disk yet.
unsaved = False
for _, row in tqdm(df.iterrows(), total=len(df)):
    tid = str(row['tid'])
    if tid in result:
        continue  # already labelled in a previous run
    description = row['description']
    # A missing (non-string) or blank description is labelled 0 without calling the API.
    if not isinstance(description, str) or not description.strip():
        result[tid] = {'result': 0}
        unsaved = True
        continue
    text = f'标题：{row["title"]}\n内容：{description}'
    result[tid] = query_gpt(text)
    tqdm.write(f'{tid}:\n{json.dumps(result[tid], indent=2, ensure_ascii=False)}\n{text}')
    # Checkpoint after every API call so an interrupted run loses at most one answer.
    _save_result(result, output_file)
    unsaved = False

# Rows skipped after the last API call would otherwise never reach the file.
if unsaved:
    _save_result(result, output_file)