In [0]:
# Register the Azure OpenAI connection settings in the Spark session conf so
# that later cells can read them back with spark.conf.get().
# NOTE(review): the endpoint / api_key values look like placeholders; when real
# values are used, consider loading the key from a secret store instead of
# hardcoding it in the notebook.
for conf_key, conf_value in {
    "aoai.endpoint": "AOAI_Endpoint",
    "aoai.api_key": "AOAI_API_KEY",
    "aoai.gpt_deployment": "gpt-5-mini",
    "aoai.embed_deployment": "text-embedding-3-small",
}.items():
    spark.conf.set(conf_key, conf_value)

In [0]:
# Read the Azure OpenAI settings back from the Spark session conf.
AOAI_ENDPOINT, AOAI_API_KEY, AOAI_GPT_DEPLOYMENT = (
    spark.conf.get(conf_key)
    for conf_key in ("aoai.endpoint", "aoai.api_key", "aoai.gpt_deployment")
)

# Sanity check: show the endpoint and deployment only (the API key is not printed).
print(AOAI_ENDPOINT, AOAI_GPT_DEPLOYMENT)


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Delta locations: chunked text (input) and per-chunk summaries (output).
CHUNK_TEXT_DIR    = "dbfs:/silver/jsai2025/chunks_text"
CHUNK_SUMMARY_DIR = "dbfs:/silver/jsai2025/chunks_summary"

# Input: the chunk table stored in Delta format.
chunk_df = spark.read.format("delta").load(CHUNK_TEXT_DIR)

# Read the conf values on the driver (the Spark session is usable here) so they
# are available as plain Python values.
AOAI_ENDPOINT, AOAI_API_KEY, AOAI_GPT_DEPLOYMENT = (
    spark.conf.get(conf_key)
    for conf_key in ("aoai.endpoint", "aoai.api_key", "aoai.gpt_deployment")
)

# Output schema: every column is declared non-nullable.
summary_schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in (
        ("pdf_path", StringType()),
        ("chunk_id", IntegerType()),
        ("text",     StringType()),
        ("summary",  StringType()),
    )
])


In [0]:
from pyspark.sql import Row

def gpt_partition(
    rows_iter,
    endpoint=AOAI_ENDPOINT,
    api_key=AOAI_API_KEY,
    deployment=AOAI_GPT_DEPLOYMENT,
    api_version="2025-03-01-preview",
    max_chars=6000,
):
    """
    Summarize the chunks of one partition with an Azure OpenAI deployment.

    A single AzureOpenAI client is created per call (i.e. once per partition
    when used with ``rdd.mapPartitions``) and reused for every row.

    Args:
        rows_iter: Iterator of rows exposing ``pdf_path``, ``chunk_id`` and
            ``text`` attributes.
        endpoint: Azure OpenAI endpoint. Bound at definition time from the
            driver-side value so the function carries it with it.
        api_key: Azure OpenAI API key (bound the same way).
        deployment: Deployment name passed as ``model`` to the Responses API.
        api_version: Azure OpenAI API version. The default is a version that
            exposes the Responses API; adjust it to what your resource supports.
        max_chars: Rows whose text is longer than this many characters are
            skipped (guard against exceeding the context length).

    Yields:
        Row(pdf_path, chunk_id, text, summary) for every row that was not
        skipped. A ``None`` text is treated as the empty string.
    """
    # Imported inside the function so the import runs where the partition runs.
    from openai import AzureOpenAI

    client = AzureOpenAI(
        api_key=api_key,
        azure_endpoint=endpoint,
        api_version=api_version,
    )

    for row in rows_iter:
        text = row.text or ""
        if len(text) > max_chars:
            # Over-long chunks are dropped from the output, not truncated.
            print(
                f"[WARN] skip long text: pdf_path={row.pdf_path}, "
                f"chunk_id={row.chunk_id}, len={len(text)}"
            )
            continue

        # Prompt (Japanese): "summarize this paper excerpt in at most 3 lines".
        prompt = (
            "以下は学会論文の一部です。日本語で3行以内に要約してください。\n\n"
            f"{text}"
        )

        # Azure OpenAI Responses API call against the configured deployment.
        resp = client.responses.create(
            model=deployment,
            input=[{"role": "user", "content": prompt}],
        )

        # Use the SDK's aggregated text helper instead of output[0].content[0]:
        # with reasoning models the first output item is not necessarily the
        # assistant message, so positional indexing can fail.
        summary_text = resp.output_text

        yield Row(
            pdf_path=row.pdf_path,
            chunk_id=int(row.chunk_id),
            text=text,
            summary=summary_text,
        )


In [0]:
%pip install openai==1.55.0


In [0]:
# Restart the Python process so the freshly installed package is picked up.
# NOTE(review): per the Databricks docs a restart discards Python state; this
# cell comes after the cells that define chunk_df, summary_schema and
# gpt_partition, so those names would be gone when the next cell runs —
# consider moving the install + restart cells to the top of the notebook.
dbutils.library.restartPython()


In [0]:
# Repartition by pdf_path into 32 partitions and run the GPT summarization
# once per partition.
summary_rdd = (
    chunk_df
    .repartition(32, "pdf_path")
    .rdd
    .mapPartitions(gpt_partition)
)

summary_df = spark.createDataFrame(summary_rdd, schema=summary_schema)

# Persist first: summary_df is lazy and not cached, so every action on it
# re-runs gpt_partition (and therefore the paid API calls). Writing before
# previewing means the GPT calls are made only once.
(
    summary_df
    .write
    .mode("overwrite")
    .format("delta")
    .save(CHUNK_SUMMARY_DIR)
)

# Preview a few rows from the saved table instead of re-evaluating summary_df.
display(spark.read.format("delta").load(CHUNK_SUMMARY_DIR).limit(5))
