# 使用 OpenAI 接口完成缺失值插补任务

In [1]:
import os
import pandas as pd
import openai
from openai import OpenAI
from rich import print as pp
from tqdm import tqdm
from pandarallel import pandarallel


# 1. 准备环境

## 1.1. 初始化 OpenAI 客户端

In [2]:
# Read the API key from the environment and report whether it was found.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if OPENAI_API_KEY is not None:
    pp("[green]环境变量 OPENAI_API_KEY 已成功读取。")
else:
    # The key stays None here; the client below is still constructed with it.
    pp("[red]未找到环境变量 OPENAI_API_KEY。请确保已正确设置。")

# Shared client used by every later cell (substitute your own API key if needed).
client = OpenAI(api_key=OPENAI_API_KEY)

# 2. 基于单行的插补


## 2.1. 定义方法

In [3]:

def predict_label(features: dict) -> str:
    """Build a prompt from the given features and ask GPT-4 for the missing value.

    Args:
        features: Mapping of feature name to feature value. Every entry is
            rendered as one ``- name: value`` bullet line in the prompt.

    Returns:
        The text of the first choice returned by the model, stripped of
        surrounding whitespace, or an empty string when the reply carries
        no text content.
    """
    # One bullet line per feature, joined in a single pass.
    feature_lines = "".join(
        f"- {key}: {value}\n" for key, value in features.items()
    )
    prompt = f"给定以下特征，请预测缺失值:\n\n{feature_lines}\n预测的标签:"

    # Ask GPT-4 for the prediction.
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "你是一个基于提供的特征预测标签的智能助手。"},
            {"role": "user", "content": prompt},
        ],
    )

    # The SDK types message.content as optional, so guard before stripping
    # instead of letting a None reply raise AttributeError.
    content = response.choices[0].message.content
    return content.strip() if content else ""

## 2.2. 运行

In [4]:
# Example feature columns: the sentence "The capital of China is " is the
# known input and the city name is the missing value to be predicted.
sample_features = dict(
    sentence="The capital of China is ",
    city="",  # deliberately left empty: this is the value the model should fill in
)

label = predict_label(sample_features)
pp(f"[green]补充的缺失值:[/green] {label}")

# 3. 基于微调的插补

## 3.1. 提交微调任务

In [None]:
import json
from openai import OpenAI
from pathlib import Path

# Step 0: create the training data file (training_data.jsonl).
# Each line is one chat-format example: a system prompt, a user prompt and
# the expected assistant answer. The lines are generated from a small table
# and serialized with json.dumps so quoting/escaping is always valid.
training_path = Path("training_data.jsonl")
country_capitals = [
    ("France", "Paris"),
    ("China", "Beijing"),
    ("Russia", "Moscow"),
    ("Japan", "Tokyo"),
    ("Germany", "Berlin"),
    ("Italy", "Rome"),
    ("Spain", "Madrid"),
    ("Canada", "Ottawa"),
    ("Brazil", "Brasilia"),
    ("India", "New Delhi"),
]

with open(training_path, "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(
            {
                "messages": [
                    {"role": "system", "content": "You are an assistant."},
                    {"role": "user", "content": f"The capital of {country} is"},
                    {"role": "assistant", "content": capital},
                ]
            },
            ensure_ascii=False,
        )
        + "\n"
        for country, capital in country_capitals
    )

# Step 1: upload the training data file (JSONL format).
try:
    # A Path object can be passed directly as the file argument.
    training_file = client.files.create(file=training_path, purpose="fine-tune")
    print("上传训练数据文件成功，文件ID:", training_file.id)
except Exception as e:
    print("上传训练数据文件失败:", e)
    # Stop the cell/script here. `exit()` is an interactive-shell helper whose
    # behavior differs between plain Python, IPython and Jupyter kernels, so
    # SystemExit is raised directly instead.
    raise SystemExit(1)

# Step 2: create the fine-tuning job. The endpoint lives under
# fine_tuning.jobs and the epoch count is passed as hyperparameters["n_epochs"].
try:
    fine_tune_job = client.fine_tuning.jobs.create(
        training_file=training_file.id,
        model="gpt-4o-mini-2024-07-18",  # any other model that supports fine-tuning also works
        hyperparameters={
            "n_epochs": 1,  # number of training epochs
        },
    )
    print("创建微调任务成功，任务ID:", fine_tune_job.id)
except Exception as e:
    print("创建微调任务失败:", e)
    raise SystemExit(1)

因为模型较大，这个过程需要一些时间（本例小于十分钟）。在此过程中，可以运行下面的代码来检查微调任务的状态：

## 3.2. 检查运行状态

In [None]:
# Fetch the latest state of the fine-tuning job and report it; any failure
# while retrieving or reading the status is printed instead of raised.
try:
    fine_tune_status = client.fine_tuning.jobs.retrieve(fine_tune_job.id)
    current_state = fine_tune_status.status
except Exception as e:
    print("获取微调任务状态失败:", e)
else:
    print("微调任务当前状态:", current_state)


## 3.3. 使用微调完成的模型进行数据插补

当微调任务完成后，我们使用微调得到的模型尝试进行插补：

In [None]:
# Test case: run inference on a prompt with the fine-tuned model.
test_input = "The capital of Singapore is"
try:
    # Look the job up again; fine_tuned_model is only populated once the
    # job has finished successfully.
    finished_job = client.fine_tuning.jobs.retrieve(fine_tune_job.id)
    fine_tuned_model_name = finished_job.fine_tuned_model
    if fine_tuned_model_name is None:
        raise RuntimeError(f"微调任务尚未完成，当前状态: {finished_job.status}")

    # The base model (gpt-4o-mini) is a chat model, so the fine-tuned model
    # must be called through the chat completions endpoint, using the same
    # message layout (system + user) as the training examples.
    result = client.chat.completions.create(
        model=fine_tuned_model_name,
        messages=[
            {"role": "system", "content": "You are an assistant."},
            {"role": "user", "content": test_input},
        ],
        max_tokens=10,
        temperature=0.0,  # 0.0 for (near-)deterministic output
    )
    # message.content may be None; fall back to an empty string before stripping.
    answer = result.choices[0].message.content or ""
    print("Annotation result:", answer.strip())
except Exception as e:
    print("注释任务执行失败:", e)