In [1]:
from openai import OpenAI
from dotenv import load_dotenv 
load_dotenv()
import os
import ast
import gc
import numpy as np
gc.disable()
from copy import deepcopy as cc
import pandas as pd
from utils import (get_data, process_facts_df, process_posts_df, get_openai_batch_ip)

In [2]:
# Input CSV locations, passed to the project helper `get_data` by keyword.
# Currently pointed at the trial sample; the full-data call was:
#   get_data("in_data/fact_checks.csv", "in_data/posts.csv", "in_data/fact_check_post_mapping.csv")
sample_paths = dict(
    fact_path="sample_data/trial_fact_checks.csv",
    posts_path="sample_data/trial_posts.csv",
    post2fact_mapping_path="sample_data/trial_data_mapping.csv",
)
facts_df, posts_df, mapping_df = get_data(**sample_paths)

# Run each table through its project preprocessing helper (rebinding the same names).
facts_df = process_facts_df(facts_df)
posts_df = process_posts_df(posts_df)

In [12]:
model_size = 'small'

# Transform the facts table into a form that can be batch-encoded (to get embeddings)
# by the OpenAI embedding model. Both calls share everything except the text column.
facts_request_kwargs = dict(
    df=facts_df,
    model=f"text-embedding-3-{model_size}",
    key_col="fact_check_id",
)
orig_facts_dfs = get_openai_batch_ip(encoding_col="facts_orig", **facts_request_kwargs)
eng_facts_dfs = get_openai_batch_ip(encoding_col="facts_eng", **facts_request_kwargs)

# Optional chunking step (disabled): split into smaller pieces because of the batch size limit.
# NOTE(review): while this stays commented out, the `*_dfs` names presumably each hold
# a single frame rather than a list of chunks -- confirm against get_openai_batch_ip.
# orig_facts_dfs = np.array_split(orig_facts_dfs, 5)
# eng_facts_dfs = np.array_split(eng_facts_dfs, 5)

In [13]:
# only for eval data
# Dump each (unchunked) facts request frame as JSON Lines, one record per line.
for frame, out_path in (
    (orig_facts_dfs, 'openai-ip/eval_orig_fact.jsonl'),
    (eng_facts_dfs, 'openai-ip/eval_eng_fact.jsonl'),
):
    frame.to_json(out_path, lines=True, orient="records")

In [4]:
# only for training data
# Write one JSON Lines request file per chunk of the facts frames.
#
# The np.array_split chunking in the cell that builds `eng_facts_dfs` / `orig_facts_dfs`
# is commented out, so these names may each hold one frame instead of a list of chunks.
# Zipping two DataFrames pairs up their column labels (plain strings), and `.to_json`
# on a string raises AttributeError. Wrapping a lone frame in a one-element list makes
# the loop work for both the chunked and the unchunked case.
# NOTE(review): the output names carry an `eval_` prefix although this cell is labelled
# "training data" -- confirm the intended file names.
eng_fact_chunks = [eng_facts_dfs] if isinstance(eng_facts_dfs, pd.DataFrame) else eng_facts_dfs
orig_fact_chunks = [orig_facts_dfs] if isinstance(orig_facts_dfs, pd.DataFrame) else orig_facts_dfs

for i, (e_fact, o_fact) in enumerate(zip(eng_fact_chunks, orig_fact_chunks)):
    o_fact.to_json(f'openai-ip/eval_orig_fact_{model_size}_{i}.jsonl', lines=True, orient="records")
    e_fact.to_json(f'openai-ip/eval_eng_fact_{model_size}_{i}.jsonl', lines=True, orient="records")

In [15]:
model_size = 'small'

# Build the batch-embedding request frames for the two post text columns;
# the calls differ only in which column is encoded.
post_request_kwargs = dict(
    df=posts_df,
    model=f"text-embedding-3-{model_size}",
    key_col="post_id",
)
l1_post_df = get_openai_batch_ip(encoding_col="post_l1", **post_request_kwargs)
l2_post_df = get_openai_batch_ip(encoding_col="post_l2", **post_request_kwargs)

# Persist both frames as JSON Lines (one record per line).
jsonl_options = dict(lines=True, orient="records")
l1_post_df.to_json('openai-ip/eval_l1_post_df.jsonl', **jsonl_options)
l2_post_df.to_json('openai-ip/eval_l2_post_df.jsonl', **jsonl_options)

In [16]:
# Create the OpenAI client with the key taken from the environment
# (load_dotenv() runs in the first cell; the lookup yields None when the variable is unset).
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

In [11]:
# Earlier naming scheme (labelled "small" by the author): no model size in the file name.
# files_to_be_processed = [f'openai-ip/eng_fact_{i}.jsonl' for i in range(len(eng_facts_dfs))]
# files_to_be_processed += [f'openai-ip/orig_fact_{i}.jsonl' for i in range(len(orig_facts_dfs))]
# files_to_be_processed += ['openai-ip/l1_post_df.jsonl', 'openai-ip/l2_post_df.jsonl']

# Current naming scheme: the model size is embedded in every file name.
# NOTE(review): the saved output below lists '..._large_...' names, i.e. it was produced
# with model_size == 'large', whereas the cells above set 'small'.
# NOTE(review): these names lack the 'eval_' prefix used by the cells that write the
# JSONL files, and len(...) is a chunk count only when the `*_dfs` names hold lists
# of chunks -- confirm before relying on this list.
eng_chunk_files = [
    f'openai-ip/eng_fact_{model_size}_{idx}.jsonl' for idx in range(len(eng_facts_dfs))
]
orig_chunk_files = [
    f'openai-ip/orig_fact_{model_size}_{idx}.jsonl' for idx in range(len(orig_facts_dfs))
]
post_files = [
    f'openai-ip/l1_post_{model_size}_df.jsonl',
    f'openai-ip/l2_post_{model_size}_df.jsonl',
]
files_to_be_processed = eng_chunk_files + orig_chunk_files + post_files

files_to_be_processed

['openai-ip/eng_fact_large_0.jsonl',
 'openai-ip/eng_fact_large_1.jsonl',
 'openai-ip/eng_fact_large_2.jsonl',
 'openai-ip/eng_fact_large_3.jsonl',
 'openai-ip/eng_fact_large_4.jsonl',
 'openai-ip/orig_fact_large_0.jsonl',
 'openai-ip/orig_fact_large_1.jsonl',
 'openai-ip/orig_fact_large_2.jsonl',
 'openai-ip/orig_fact_large_3.jsonl',
 'openai-ip/orig_fact_large_4.jsonl',
 'openai-ip/l1_post_large_df.jsonl',
 'openai-ip/l2_post_large_df.jsonl']

In [21]:
# Collect every file in the request directory whose name starts with 'eval'.
# This replaces any previously built `files_to_be_processed`; the order is whatever
# os.listdir returns.
input_dir = 'openai-ip'
files_to_be_processed = [
    os.path.join(input_dir, name)
    for name in os.listdir(input_dir)
    if name.startswith('eval')
]
files_to_be_processed

['openai-ip/eval_eng_fact_large.jsonl',
 'openai-ip/eval_l1_post_df.jsonl',
 'openai-ip/eval_eng_fact.jsonl',
 'openai-ip/eval_orig_fact.jsonl',
 'openai-ip/eval_l1_post_large_df.jsonl',
 'openai-ip/eval_l2_post_df.jsonl',
 'openai-ip/eval_l2_post_large_df.jsonl',
 'openai-ip/eval_orig_fact_large.jsonl']

In [22]:
# Maps each local request file path -> (uploaded input file id, Batch object returned on creation).
batch_file2ids = {}

for file in files_to_be_processed:
    # Upload inside a context manager so the local file handle is closed as soon as the
    # upload call returns, instead of leaking one open handle per file.
    with open(file, "rb") as input_handle:
        batch_input_file = client.files.create(
            file=input_handle,
            purpose="batch"
        )
    batch_input_file_id = batch_input_file.id

    # Create an embeddings batch job for the uploaded file; the description records
    # both the local path and the uploaded file id so the job can be traced later.
    batch_response = client.batches.create(input_file_id=batch_input_file_id,
                                        endpoint="/v1/embeddings",
                                        completion_window="24h",
                                        metadata={"description": f"job for file at input: {file}; batch_input_file_id : {batch_input_file_id}"})
    batch_file2ids[file] = (batch_input_file_id, batch_response)

In [22]:
# Look up one previously submitted batch by its hard-coded id (in the saved output
# further down, this id belongs to 'openai-ip/orig_fact_3.jsonl') and download the
# raw bytes of its output file.
# NOTE(review): output_file_id is None in the saved Batch reprs (status 'validating');
# presumably this cell is only meaningful once the batch has completed -- confirm.
batch_job = client.batches.retrieve('batch_w9ssRkTQYzHPT9wWtlLue4tZ')
result_file_id = batch_job.output_file_id
result_response = client.files.content(result_file_id)
result = result_response.content

In [13]:
# Display the mapping built by the submission loop:
# request file path -> (uploaded input file id, Batch object returned at creation time).
batch_file2ids

{'openai-ip/eng_fact_large_0.jsonl': ('file-Rmzgzso3hTYBebk3eZ6e1R8H',
  Batch(id='batch_jHWZVxQJuvNRTY39Z78ed5cW', completion_window='24h', created_at=1723492838, endpoint='/v1/embeddings', input_file_id='file-Rmzgzso3hTYBebk3eZ6e1R8H', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1723579238, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'job for file at input: openai-ip/eng_fact_large_0.jsonl; batch_input_file_id : file-Rmzgzso3hTYBebk3eZ6e1R8H'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))),
 'openai-ip/eng_fact_large_1.jsonl': ('file-mqJfhbQVRoNqldLejFI1q8wq',
  Batch(id='batch_YaUd5r0Js4pBVw7NeEelZPF7', completion_window='24h', created_at=1723492840, endpoint='/v1/embeddings', input_file_id='file-mqJfhbQVRoNqldLejFI1q8wq', object='batch', status='validating', cancelled_at=None, cancelling_at=N

### BATCH TRACKERS
- facts-original: batch_Vqz57UlMWEcdvx2LgkASpI5Y

{'openai-ip/eng_fact_0.jsonl': ('file-1DdhKF6v6sCbQ3DXj83GGt7o',
  Batch(id='batch_uwqGdPPck2HkSzC5L7hFbRt1', completion_window='24h', created_at=1723444161, endpoint='/v1/embeddings', input_file_id='file-1DdhKF6v6sCbQ3DXj83GGt7o', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1723530561, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'job for file at input: openai-ip/eng_fact_0.jsonl; batch_input_file_id : file-1DdhKF6v6sCbQ3DXj83GGt7o'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))),
  
 'openai-ip/eng_fact_1.jsonl': ('file-0g0zL9AEbvIjVzbSxA040b1V',
  Batch(id='batch_LSNUI0NXb1Iyc9ExOIsG5Njt', completion_window='24h', created_at=1723444162, endpoint='/v1/embeddings', input_file_id='file-0g0zL9AEbvIjVzbSxA040b1V', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1723530562, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'job for file at input: openai-ip/eng_fact_1.jsonl; batch_input_file_id : file-0g0zL9AEbvIjVzbSxA040b1V'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))),

 'openai-ip/eng_fact_2.jsonl': ('file-pyTTLAcv4jThRaTnrKrPyVZb',
  Batch(id='batch_NWLU4GEjVifH8TIx8C7E7mSO', completion_window='24h', created_at=1723444164, endpoint='/v1/embeddings', input_file_id='file-pyTTLAcv4jThRaTnrKrPyVZb', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1723530564, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'job for file at input: openai-ip/eng_fact_2.jsonl; batch_input_file_id : file-pyTTLAcv4jThRaTnrKrPyVZb'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))),

 'openai-ip/eng_fact_3.jsonl': ('file-h8IRAgsvHNgaUFhyKXs8Iiyn',
  Batch(id='batch_Zv5IpylKcvFTRHvOxmj1QAkC', completion_window='24h', created_at=1723444165, endpoint='/v1/embeddings', input_file_id='file-h8IRAgsvHNgaUFhyKXs8Iiyn', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1723530565, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'job for file at input: openai-ip/eng_fact_3.jsonl; batch_input_file_id : file-h8IRAgsvHNgaUFhyKXs8Iiyn'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))),

 'openai-ip/eng_fact_4.jsonl': ('file-fyNMatocBghOShgR2FAaHoWE',
  Batch(id='batch_CbQjeyDo20dtrJGxFEzgFUM2', completion_window='24h', created_at=1723444167, endpoint='/v1/embeddings', input_file_id='file-fyNMatocBghOShgR2FAaHoWE', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1723530567, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'job for file at input: openai-ip/eng_fact_4.jsonl; batch_input_file_id : file-fyNMatocBghOShgR2FAaHoWE'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))),

 'openai-ip/orig_fact_0.jsonl': ('file-S2hiO1DVrJEuiRiUwIXGwPsg',
  Batch(id='batch_NMtze5sSAsyQ5tD17TRhC6XI', completion_window='24h', created_at=1723444168, endpoint='/v1/embeddings', input_file_id='file-S2hiO1DVrJEuiRiUwIXGwPsg', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1723530568, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'job for file at input: openai-ip/orig_fact_0.jsonl; batch_input_file_id : file-S2hiO1DVrJEuiRiUwIXGwPsg'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))),

 'openai-ip/orig_fact_1.jsonl': ('file-B3rHD9tDZzWSfPRpb3wMdOxg',
  Batch(id='batch_PDeZVbNUTUSSiKBkBiTZJXyB', completion_window='24h', created_at=1723444169, endpoint='/v1/embeddings', input_file_id='file-B3rHD9tDZzWSfPRpb3wMdOxg', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1723530569, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'job for file at input: openai-ip/orig_fact_1.jsonl; batch_input_file_id : file-B3rHD9tDZzWSfPRpb3wMdOxg'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))),

 'openai-ip/orig_fact_2.jsonl': ('file-hXHoyE3EdNcV12vN8AhCTdmq',
  Batch(id='batch_c63E5C020GMwtLX7Cwku1kt8', completion_window='24h', created_at=1723444170, endpoint='/v1/embeddings', input_file_id='file-hXHoyE3EdNcV12vN8AhCTdmq', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1723530570, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'job for file at input: openai-ip/orig_fact_2.jsonl; batch_input_file_id : file-hXHoyE3EdNcV12vN8AhCTdmq'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))),

 'openai-ip/orig_fact_3.jsonl': ('file-tIRBK5VyYDMG9zqBG2dBwgRc',
  Batch(id='batch_w9ssRkTQYzHPT9wWtlLue4tZ', completion_window='24h', created_at=1723444172, endpoint='/v1/embeddings', input_file_id='file-tIRBK5VyYDMG9zqBG2dBwgRc', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1723530572, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'job for file at input: openai-ip/orig_fact_3.jsonl; batch_input_file_id : file-tIRBK5VyYDMG9zqBG2dBwgRc'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))),

 'openai-ip/orig_fact_4.jsonl': ('file-FLGahSDvEXcTWVAgQh2Tpw0f',
  Batch(id='batch_NDKmtUNyVIaUNFYxPMCI4hSZ', completion_window='24h', created_at=1723444173, endpoint='/v1/embeddings', input_file_id='file-FLGahSDvEXcTWVAgQh2Tpw0f', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1723530573, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'job for file at input: openai-ip/orig_fact_4.jsonl; batch_input_file_id : file-FLGahSDvEXcTWVAgQh2Tpw0f'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))),

 'openai-ip/l1_post_df.jsonl': ('file-83JDXEoNSsK4zD0vpIyylg83',
  Batch(id='batch_pzGbVEHvtuW6K6WU9R2tmGEf', completion_window='24h', created_at=1723444175, endpoint='/v1/embeddings', input_file_id='file-83JDXEoNSsK4zD0vpIyylg83', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1723530575, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'job for file at input: openai-ip/l1_post_df.jsonl; batch_input_file_id : file-83JDXEoNSsK4zD0vpIyylg83'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))),
  
 'openai-ip/l2_post_df.jsonl': ('file-df3nFQ0RuFsgJLZNt1c7OXzM',
  Batch(id='batch_XFg9ofIMjjFx3zk2hxxoJTPe', completion_window='24h', created_at=1723444176, endpoint='/v1/embeddings', input_file_id='file-df3nFQ0RuFsgJLZNt1c7OXzM', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1723530576, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'job for file at input: openai-ip/l2_post_df.jsonl; batch_input_file_id : file-df3nFQ0RuFsgJLZNt1c7OXzM'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0)))}

In [13]:
# Download the raw bytes of one file by its hard-coded id.
# NOTE(review): in the saved output above, this id appears as the *input* file id of
# 'openai-ip/orig_fact_2.jsonl', not as a batch output file -- confirm this is the
# file that was meant to be fetched.
target_file_id = 'file-hXHoyE3EdNcV12vN8AhCTdmq'
file_response = client.files.content(target_file_id)
res = file_response.content

In [19]:
# Save the downloaded bytes (`res`) to a scratch JSONL file, unchanged.
result_file_name = "openai-op/temp.jsonl"

with open(result_file_name, 'wb') as out_file:
    out_file.write(res)

In [None]:
# Re-fetch the batch referenced by `batch_response`, i.e. the last batch created by
# the submission loop, to get its current state.
batch_status = client.batches.retrieve(str(batch_response.id))

In [None]:

# Fetch the output file of the batch held in `batch_status` and keep its raw bytes.
file_response = client.files.content(batch_status.output_file_id)
result = file_response.content

# Write those bytes to disk as-is.
result_file_name = "openai-op/eval_facts_encode_fact_orig.jsonl"

with open(result_file_name, 'wb') as out_file:
    out_file.write(result)

In [None]:
import json

# Read the saved JSONL file back: every line is stripped and parsed into a dict,
# and the dicts are collected in file order.
results = []
with open(result_file_name, 'r') as jsonl_file:
    results.extend(json.loads(raw_line.strip()) for raw_line in jsonl_file)

In [None]:
# Peek at the embedding stored in the first parsed record:
# row 0 -> 'response' -> 'body' -> 'data'[0] -> 'embedding'.
first_record = pd.DataFrame(results).iloc[0]
first_data_item = first_record['response']['body']['data'][0]
first_data_item['embedding']