In [None]:
!pip install openai
!pip install tiktoken

In [None]:
from openai import OpenAI
import pandas as pd
import tiktoken
import os
import json
import time

In [None]:
# Read the three dataset splits (train, test, val - in that order) into DataFrames.
train, test, val = (
    pd.read_csv(split_csv)
    for split_csv in (
        '/content/drive/MyDrive/Perspectivism/Dataset/train.csv',
        '/content/drive/MyDrive/Perspectivism/Dataset/test.csv',
        '/content/drive/MyDrive/Perspectivism/Dataset/val.csv',
    )
)

In [None]:
# Stack the three splits row-wise into one frame and report its dimensions.
df = pd.concat((train, test, val), axis='index')
print(df.shape)

In [None]:
# Load the eight previously saved batch result files; each keeps its own b_<n> name.
b_1, b_2, b_3, b_4, b_5, b_6, b_7, b_8 = (
    pd.read_csv(batch_csv)
    for batch_csv in (
        '/content/drive/MyDrive/Perspectivism/GPT/old_batches/batch_1.csv',
        '/content/drive/MyDrive/Perspectivism/GPT/old_batches/batch_2.csv',
        '/content/drive/MyDrive/Perspectivism/GPT/old_batches/batch_3.csv',
        '/content/drive/MyDrive/Perspectivism/GPT/old_batches/batch_4.csv',
        '/content/drive/MyDrive/Perspectivism/GPT/old_batches/batch_5.csv',
        '/content/drive/MyDrive/Perspectivism/GPT/old_batches/batch_6.csv',
        '/content/drive/MyDrive/Perspectivism/GPT/old_batches/batch_7.csv',
        '/content/drive/MyDrive/Perspectivism/GPT/old_batches/batch_8.csv',
    )
)

# Combine the batch files and rename their columns so they line up with the
# main frame: 'custom_id' becomes the join key 'docID', 'content' becomes
# 'gpt_summaries'.
all_batches = pd.concat(
    [b_1, b_2, b_3, b_4, b_5, b_6, b_7, b_8]
).rename(columns={'custom_id': 'docID', 'content': 'gpt_summaries'})

# Left join: every row of df is kept; rows without a matching docID in the
# batch files get NaN in 'gpt_summaries'.
df = df.merge(all_batches, how='left', on='docID')

In [None]:
# Keep only the rows whose 'gpt_summaries' value is missing after the merge,
# then show how many remain.
df = df.loc[df['gpt_summaries'].isna()]
df.shape

In [None]:
from openai import OpenAI

# Take the API key from the OPENAI_API_KEY environment variable instead of
# embedding a literal in the notebook, so the secret is never saved or shared
# with the file. Set it before running this cell, e.g.
#     os.environ["OPENAI_API_KEY"] = getpass.getpass()
# If the variable is unset, None is passed and the client constructor is left
# to report the missing key.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [None]:
def _extract_message_content(record):
    """Return the first choice's message content from one parsed output record.

    Walks record['response']['body']['choices'][0]['message']['content'].
    Any level that is missing, null, or (for 'choices') empty yields None
    instead of raising.
    """
    response = record.get('response') or {}
    body = response.get('body') or {}
    choices = body.get('choices') or [{}]
    message = (choices[0] or {}).get('message') or {}
    return message.get('content')


def process_response_to_dataframe(response_text):
    """Convert JSON-lines batch output text into a DataFrame.

    Args:
        response_text: Text with one JSON object per line. Blank lines are
            ignored.

    Returns:
        A DataFrame with columns 'custom_id' and 'content', one row per
        non-blank input line. 'content' is None when the record carries no
        message content. An empty input gives an empty frame that still has
        both columns.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    rows = []
    # Split on "\n" only (not str.splitlines), so other Unicode line
    # separators that may occur unescaped inside a JSON string do not cut a
    # record in half.
    for line in response_text.split("\n"):
        if not line.strip():
            continue
        record = json.loads(line)
        rows.append({
            "custom_id": record.get('custom_id'),
            "content": _extract_message_content(record),
        })
    # Explicit columns keep the schema stable even when there are no rows.
    return pd.DataFrame(rows, columns=["custom_id", "content"])

In [None]:
def submit_batch(file_path, description):
    """Upload a batch input file and create a chat-completions batch job for it.

    Args:
        file_path: Path of the input file to upload (opened in binary mode).
        description: Text stored under the 'description' key of the batch
            metadata.

    Returns:
        The id of the newly created batch.
    """
    # Use a context manager so the file handle is closed once the upload
    # returns, and also if the upload raises.
    with open(file_path, "rb") as input_handle:
        batch_input_file = client.files.create(
            file=input_handle,
            purpose="batch"
        )

    batch = client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": description
        }
    )

    batch_id = batch.id
    print(f"Batch submitted successfully. Batch ID: {batch_id}")
    return batch_id

In [None]:
def check_batch_status(batch_id, delay=120):
    """Poll a batch until it reaches a terminal status.

    Args:
        batch_id: Id of the batch to poll.
        delay: Seconds to sleep between polls.

    Returns:
        A dict with keys 'status' and 'output_file_id':
        - 'completed': 'output_file_id' is the batch's output file id.
        - 'failed': 'output_file_id' is None.
        - 'expired' / 'cancelled': 'output_file_id' is whatever the batch
          object reports (possibly None).
    """
    while True:
        batch_status = client.batches.retrieve(batch_id)
        status = batch_status.status
        print(f"Batch ID: {batch_id} Status: {status}")

        if status == 'completed':
            return {'status': status, 'output_file_id': batch_status.output_file_id}
        if status == 'failed':
            return {'status': status, 'output_file_id': None}
        if status in ('expired', 'cancelled'):
            # These statuses are final as well: the batch will never become
            # 'completed', so continuing to poll would loop forever.
            return {
                'status': status,
                'output_file_id': getattr(batch_status, 'output_file_id', None),
            }

        print(f"Waiting for {delay} seconds before checking again...")
        time.sleep(delay)

In [None]:
def process_and_save_batch(output_file_id, batch_number, save_directory):
    """Fetch a batch output file, tabulate it, and write it out as a CSV.

    Args:
        output_file_id: Id of the output file to download through the client.
        batch_number: Number used in the CSV name (batch_<number>.csv).
        save_directory: Directory the CSV is written into.
    """
    csv_name = f"batch_{batch_number}.csv"

    # Download the raw output text and parse it into a DataFrame.
    raw_output = client.files.content(output_file_id).text
    parsed = process_response_to_dataframe(raw_output)

    parsed.to_csv(f'{save_directory}/{csv_name}', index=False)
    print(f"Batch {batch_number} saved as {csv_name}")


In [None]:
def process_batches(batch_files, save_directory, start=1):
    """Submit each batch file in turn, wait for it, and save its results.

    Batches are handled sequentially: one is submitted and polled until
    check_batch_status returns before the next one is submitted.

    Args:
        batch_files: Iterable of (file_path, description) pairs.
        save_directory: Directory the per-batch CSV files are written into.
        start: Number given to the first entry of batch_files; later entries
            count up from it. It determines the batch_<number>.csv file name,
            so set it to the number of the first input file (for example 11
            when resuming with batchinput_11.jsonl) to keep output names
            aligned with input names. Defaults to 1.
    """
    for batch_number, (file_path, description) in enumerate(batch_files, start=start):
        batch_id = submit_batch(file_path, description)

        status_info = check_batch_status(batch_id)

        if status_info['status'] == 'completed':
            process_and_save_batch(status_info['output_file_id'], batch_number, save_directory)
        else:
            # Nothing is saved for this batch; continue with the next one.
            print(f"Batch {batch_number} failed or is still in progress. Skipping file processing.")

# List the batch input files to run as (input file path, description) pairs.
# Output files are numbered by the `start` value that process_batches uses for
# its counter; if a batch fails, set that value to the failed batch's number
# before continuing so the saved file names stay in step with the inputs.
save_directory = '/content/drive/MyDrive/Perspectivism/GPT'

batch_files = [
    (
        "/content/drive/MyDrive/Perspectivism/GPT/batchinput_11.jsonl",
        "batch_11 summarization",
    ),
    # Batches 12-25 follow the same pattern and are currently disabled:
    #   ("/content/drive/MyDrive/Perspectivism/GPT/batchinput_<N>.jsonl", "batch_<N> summarization")
]

process_batches(batch_files=batch_files, save_directory=save_directory)