In [None]:
import os

import gokart
import json
import pandas as pd
from google.cloud import storage

from qrelllm.llm.vertex import GenerateTestCollectionWithVertexAIBatch
from qrelllm.queries import LoadQueries
from qrelllm.format import clean_json
from qrelllm.upload.gcs import UploadBatchForGCS
from qrelllm.upload.openai import UploadOpenAI


In [None]:

# Google Cloud settings are read from the environment; each is None when the
# corresponding variable is not set.
project = os.environ.get("GOOGLE_CLOUD_PROJECT_ID")
location = os.environ.get("GOOGLE_CLOUD_LOCATION")


def main(bucket_name="llm-testcollection", queries_csv_path="../data/queries.csv"):
    """Assemble the gokart task chain and run it.

    The chain is LoadQueries -> UploadBatchForGCS ->
    GenerateTestCollectionWithVertexAIBatch; the last task is handed to
    ``gokart.build``. The module-level ``project`` and ``location`` values
    are passed to the Vertex AI task as-is.

    Args:
        bucket_name: GCS bucket used both for the uploaded prompt file
            ("prompt/prompts.jsonl") and for the "result" output prefix.
        queries_csv_path: Path of the CSV file given to ``LoadQueries``.
    """
    queries = LoadQueries(csv_file_path=queries_csv_path)
    upload_task = UploadBatchForGCS(
        queries=queries,
        bucket_name=bucket_name,
        destination_blob_name="prompt/prompts.jsonl",
    )
    generate_search_dataset_task = GenerateTestCollectionWithVertexAIBatch(
        project=project,
        location=location,
        # Results are written under the same bucket that holds the prompts.
        destination_uri_prefix=f"gs://{bucket_name}/result",
        upload_task=upload_task,
    )
    gokart.build(generate_search_dataset_task)


if __name__ == "__main__":
    main()


## GCSから結果をロード

In [None]:
def list_json_to_series(d):
    """Flatten a list of dicts and keep only the "content" entries.

    Args:
        d: Iterable of dicts. All key/value pairs are merged, in order,
            into one Series indexed by key.

    Returns:
        pandas.Series restricted to the "content" label.

    Raises:
        ValueError: if ``d`` yields no key/value pairs at all.
        KeyError: if none of the dicts has a "content" key.
    """
    pairs = []
    for mapping in d:
        pairs.extend(mapping.items())
    labels, data = zip(*pairs)
    flattened = pd.Series(data, index=labels)
    # List-style selection keeps the result a Series rather than a scalar.
    return flattened[["content"]]


def json_to_series(d):
    """Turn a JSON array of label/value objects into a Series.

    Args:
        d: JSON text that decodes to a list of objects, each having a
            "label" key and a "value" key.

    Returns:
        pandas.Series whose index holds the labels and whose data holds the
        matching values, in input order.

    Raises:
        ValueError: if the decoded list is empty.
        KeyError: if an object lacks "label" or "value".
    """
    records = json.loads(d)
    labels, data = zip(*((rec["label"], rec["value"]) for rec in records))
    return pd.Series(data, index=labels)


def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Download a JSON Lines blob from GCS and expand its predictions.

    Args:
        bucket_name: ID of the GCS bucket, e.g. "your-bucket-name".
        source_blob_name: ID of the GCS object, e.g. "storage-object-name".
        destination_file_name: Local path the object is downloaded to,
            e.g. "local/path/to/file".

    Returns:
        pandas.DataFrame holding the rows of the downloaded file, the
        "content" entry extracted from each row's "predictions" (after
        ``clean_json``), and one extra column per label parsed out of that
        content by ``json_to_series``.
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # Construct a client side representation of a blob.
    # Note `Bucket.blob` differs from `Bucket.get_blob` as it doesn't retrieve
    # any content from Google Cloud Storage. As we don't need additional data,
    # using `Bucket.blob` is preferred here.
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)

    print(
        "Downloaded storage object {} from bucket {} to local file {}.".format(
            source_blob_name, bucket_name, destination_file_name
        )
    )

    raw = pd.read_json(destination_file_name, orient="records", lines=True)
    # Pull the "content" entry out of each row's list of prediction dicts.
    with_content = pd.concat(
        [raw, raw["predictions"].apply(list_json_to_series)], axis=1
    )
    with_content["content"] = with_content["content"].apply(clean_json)
    print(with_content["content"])

    # Expand the cleaned JSON content into one column per label.
    result = pd.concat(
        [with_content, with_content["content"].apply(json_to_series)], axis=1
    )
    # The expanded frame is the product of this function: show it and hand it
    # back, instead of printing the intermediate frame and discarding it.
    print(result)
    return result


# Batch prediction output to fetch and inspect.
# NOTE(review): "XXXX" looks like a placeholder for the actual prediction
# directory name — confirm before running.
bucket_name = "llm-testcollection"
source_blob_name = "result/prediction-model-XXXX/000000000000.jsonl"
download_blob(
    bucket_name=bucket_name,
    source_blob_name=source_blob_name,
    destination_file_name="../data/batch-result.jsonl",
)
