In [2]:
from openai import OpenAI
from dotenv import load_dotenv 
load_dotenv()
import os
import ast
import gc
from tqdm import tqdm
import numpy as np
gc.disable()
from copy import deepcopy as cc
import pandas as pd
from utils import (get_data, process_facts_df, process_posts_df, 
                   get_openai_embedding, get_sbert_embeddings, get_openai_batch_ip)

In [3]:
# Shared OpenAI client used by the batch/file calls in the cells below.
# `OpenAI` and `os` come from the imports cell (where load_dotenv() is also
# called), so nothing is re-imported here; the key is read from the
# environment instead of being hardcoded.
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

In [5]:
# Retrieve the OpenAI batch-job outputs for embedding the post and fact data.
# For each of the 8 most recently listed batches whose description contains
# 'job for file', download its output file to the path named in the
# description, with the 'openai-ip' prefix swapped for 'openai-op'.
batches = []

# List once: the original issued the same API call twice (iterable + `total`).
recent_batches = client.batches.list(limit=8).data

for batch in tqdm(recent_batches):
    # `metadata` is optional on a batch object; fall back to an empty description
    # instead of raising on batches created without one.
    description = (batch.metadata or {}).get('description', '')
    print(description)
    if 'job for file' not in description:
        continue

    batches.append(batch)
    latest = client.batches.retrieve(batch.id)
    op_file_id = latest.output_file_id
    if op_file_id is None:
        # No output file is attached (e.g. the batch has not completed or failed).
        print(f"skipping batch {batch.id}: no output file (status: {latest.status})")
        continue

    file_response = client.files.content(op_file_id)
    # Description looks like 'job for file at input: <path>; batch_input_file_id : <id>';
    # take the text between the last ':' of the first ';'-separated part and its end.
    src_path = description.split(';')[0].split(':')[-1].strip()
    dst_path = src_path.replace('openai-ip', 'openai-op')
    # write_to_file does not create parent directories.
    os.makedirs(os.path.dirname(dst_path) or '.', exist_ok=True)
    file_response.write_to_file(dst_path)
    # Automatic GC is disabled in the imports cell, so collect explicitly per file.
    gc.collect()

  0%|          | 0/8 [00:00<?, ?it/s]

job for file at input: openai-ip/eval_orig_fact_large.jsonl; batch_input_file_id : file-xICI12hBFg6dzX3UAt3ETho5


 12%|█▎        | 1/8 [00:00<00:06,  1.05it/s]

job for file at input: openai-ip/eval_l2_post_large_df.jsonl; batch_input_file_id : file-hlbiycVtn15O0acamZavQfTi


 25%|██▌       | 2/8 [00:01<00:05,  1.09it/s]

job for file at input: openai-ip/eval_l2_post_df.jsonl; batch_input_file_id : file-yRPQttjDsfNdp7rzwOUfwJHP


 38%|███▊      | 3/8 [00:02<00:04,  1.11it/s]

job for file at input: openai-ip/eval_l1_post_large_df.jsonl; batch_input_file_id : file-bebSPgvlyXbAwR7tYaFl5m5e


 50%|█████     | 4/8 [00:03<00:03,  1.09it/s]

job for file at input: openai-ip/eval_orig_fact.jsonl; batch_input_file_id : file-vehlLcFIHhEwviazr8GZduTH


 62%|██████▎   | 5/8 [00:04<00:02,  1.12it/s]

job for file at input: openai-ip/eval_eng_fact.jsonl; batch_input_file_id : file-eC3CL8vHYr3NHvssDlrDMDEZ


 75%|███████▌  | 6/8 [00:05<00:01,  1.15it/s]

job for file at input: openai-ip/eval_l1_post_df.jsonl; batch_input_file_id : file-nxaOPVKJfH5Ycg4q9koBcLjl


 88%|████████▊ | 7/8 [00:06<00:00,  1.21it/s]

job for file at input: openai-ip/eval_eng_fact_large.jsonl; batch_input_file_id : file-9hut5s7xVXyLJQPg4sRinFVZ


100%|██████████| 8/8 [00:06<00:00,  1.15it/s]


In [None]:
# Combine the `gpt_small_emb` and `gpt_large_emb` batch outputs for the fact
# data into one pickle per language variant: openai-op/{lang}-fact.pkl.
# Each variant is read from 5 shard pairs ({lang}_fact_{ix}.jsonl and
# {lang}_fact_large_{ix}.jsonl), joined per shard on fact_check_id, then stacked.
raw_columns = ['response', 'custom_id', 'error', 'id']  # batch-output fields not kept

for lang in ['orig', 'eng']:
    shard_frames = []
    for ix in tqdm(range(5)):
        # Pull the embedding out of the nested response payload with Series.map
        # (no need for a row-wise DataFrame.apply), expose custom_id as
        # fact_check_id, and drop the raw columns without mutating in place.
        small = (
            pd.read_json(os.path.join('openai-op', f"{lang}_fact_{ix}.jsonl"), lines=True)
            .assign(
                gpt_small_emb=lambda d: d['response'].map(lambda r: r['body']['data'][0]['embedding']),
                fact_check_id=lambda d: d['custom_id'],
            )
            .drop(columns=raw_columns)
        )
        large = (
            pd.read_json(os.path.join('openai-op', f"{lang}_fact_large_{ix}.jsonl"), lines=True)
            .assign(
                gpt_large_emb=lambda d: d['response'].map(lambda r: r['body']['data'][0]['embedding']),
                fact_check_id=lambda d: d['custom_id'],
            )
            .drop(columns=raw_columns)
        )

        # Name the join key explicitly rather than relying on "all shared columns".
        shard_frames.append(large.merge(small, on='fact_check_id'))
        del small, large

    # ignore_index gives the combined frame a unique 0..N-1 index; without it every
    # shard contributes its own 0..k-1 labels and the index contains duplicates.
    df_final = pd.concat(shard_frames, ignore_index=True)
    df_final.to_pickle(f'openai-op/{lang}-fact.pkl', protocol=5)

In [6]:
# Combine the `gpt_small_emb` and `gpt_large_emb` batch outputs for the eval
# fact data into one pickle per language variant: openai-op/eval_{lang}-fact.pkl.
raw_columns = ['response', 'custom_id', 'error', 'id']  # batch-output fields not kept

for lang in ['orig', 'eng']:
    # Pull the embedding out of the nested response payload with Series.map
    # (no need for a row-wise DataFrame.apply), expose custom_id as
    # fact_check_id, and drop the raw columns without mutating in place.
    small = (
        pd.read_json(os.path.join('openai-op', f"eval_{lang}_fact.jsonl"), lines=True)
        .assign(
            gpt_small_emb=lambda d: d['response'].map(lambda r: r['body']['data'][0]['embedding']),
            fact_check_id=lambda d: d['custom_id'],
        )
        .drop(columns=raw_columns)
    )
    large = (
        pd.read_json(os.path.join('openai-op', f"eval_{lang}_fact_large.jsonl"), lines=True)
        .assign(
            gpt_large_emb=lambda d: d['response'].map(lambda r: r['body']['data'][0]['embedding']),
            fact_check_id=lambda d: d['custom_id'],
        )
        .drop(columns=raw_columns)
    )

    # Name the join key explicitly rather than relying on "all shared columns".
    merged = large.merge(small, on='fact_check_id')
    merged.to_pickle(f'openai-op/eval_{lang}-fact.pkl', protocol=5)

In [24]:
%%time
# Reload the combined 'orig' fact embeddings from the pickle at
# openai-op/orig-fact.pkl (the path the sharded fact cell writes for lang='orig');
# %%time reports how long the load takes.
df_final = pd.read_pickle('openai-op/orig-fact.pkl')

CPU times: user 26.3 s, sys: 14.8 s, total: 41.1 s
Wall time: 41.1 s


In [7]:
# Combine the `gpt_small_emb` and `gpt_large_emb` batch outputs for the eval
# post data into one pickle per split: openai-op/eval_{lang}-post.pkl for
# 'l1' and 'l2'.
# Unlike the original cell, this does not assign `df_final = []`: that list was
# never used and overwrote the `df_final` frame loaded from orig-fact.pkl.
raw_columns = ['response', 'custom_id', 'error', 'id']  # batch-output fields not kept

# Iterate the list directly so tqdm knows the total; the enumerate index was unused.
for lang in tqdm(['l1', 'l2']):
    # Pull the embedding out of the nested response payload with Series.map
    # (no need for a row-wise DataFrame.apply), expose custom_id as post_id,
    # and drop the raw columns without mutating in place.
    small = (
        pd.read_json(os.path.join('openai-op', f"eval_{lang}_post_df.jsonl"), lines=True)
        .assign(
            gpt_small_emb=lambda d: d['response'].map(lambda r: r['body']['data'][0]['embedding']),
            post_id=lambda d: d['custom_id'],
        )
        .drop(columns=raw_columns)
    )
    large = (
        pd.read_json(os.path.join('openai-op', f"eval_{lang}_post_large_df.jsonl"), lines=True)
        .assign(
            gpt_large_emb=lambda d: d['response'].map(lambda r: r['body']['data'][0]['embedding']),
            post_id=lambda d: d['custom_id'],
        )
        .drop(columns=raw_columns)
    )

    # Name the join key explicitly rather than relying on "all shared columns".
    merged = large.merge(small, on='post_id')
    del small, large
    merged.to_pickle(f'openai-op/eval_{lang}-post.pkl', protocol=5)

2it [00:00, 30.80it/s]
