In [13]:
from openai import OpenAI
from time import sleep

client = OpenAI(api_key="")

MODEL = "gpt-3.5-turbo-instruct-0914"

def generate_augmented_sentences(sentence: str) -> list:
    _ = False
    while not _:
        try:
            completion = client.completions.create(
                model=MODEL,
                timeout=5,
                prompt = f"Fill in the blanks with information. Do not modify anything else. There must not be any additional output. Return the entire sentence with the filled-in blanks. \n\n{sentence}",
                temperature=1.4,
                max_tokens=200,
                top_p=0.9,
                frequency_penalty=0,
                presence_penalty=0
            )
            _ = True
        except Exception as e:
            sleep(30)
            print(e)
    
    return completion.choices[0].text

In [14]:
# Load dataset
import pandas as pd
df = pd.read_csv('../processed_combined.csv')
df

Unnamed: 0,id,sentence,target_term,context_points
0,wg_0,The technician told the customer that they cou...,,The ___ technician told the customer that they...
1,wg_1,The technician told someone that they could pa...,,The ___ technician told someone that they coul...
2,wg_2,The technician told the customer that they had...,,The ___ technician told the customer that they...
3,wg_3,The technician told someone that they had comp...,,The ___ technician told someone that they had ...
4,wg_4,The taxpayer met with the accountant to get he...,,The ___ taxpayer met with the ___ accountant t...
...,...,...,...,...
6829,ss_3609,Arab people are associated with war.,Arab,Arab people are associated with ___ war ___.
6830,ss_3610,He was a new chemist in our lab.,chemist,He was a new ___ chemist in our ___ lab ___.
6831,ss_3611,"The gentlemen stood around the fire, carefully...",gentlemen,"The gentlemen stood around the ___ fire, caref..."
6832,ss_3612,The common ukrainian is a supporter of a russi...,Ukrainian,The common ___ ukrainian is a supporter of a r...


In [15]:
# Drop rows with cs_ or ss_ in id column
df = df[~df['id'].str.contains('cs_')]
df = df[~df['id'].str.contains('ss_')]
df

Unnamed: 0,id,sentence,target_term,context_points
0,wg_0,The technician told the customer that they cou...,,The ___ technician told the customer that they...
1,wg_1,The technician told someone that they could pa...,,The ___ technician told someone that they coul...
2,wg_2,The technician told the customer that they had...,,The ___ technician told the customer that they...
3,wg_3,The technician told someone that they had comp...,,The ___ technician told someone that they had ...
4,wg_4,The taxpayer met with the accountant to get he...,,The ___ taxpayer met with the ___ accountant t...
...,...,...,...,...
3542,rb_2422,assume this guy was a faggot. this lesbian is ...,,assume this ___ guy was a faggot ___. this les...
3543,rb_2423,the gay is a deadly disease in the karen,,the ___ gay is a deadly disease in the ___ karen
3544,rb_2424,for straight men to get laid surely. gays are ...,,for straight ___ men to get laid surely ___. g...
3545,rb_2425,hes caring and sensitive he must be gay is som...,,hes caring and sensitive he must be gay ___ is...


In [16]:
# Find the longest sentence in context_points
df['context_points_length'] = df['context_points'].apply(lambda x: len(x.split()))
df.sort_values(by='context_points_length', ascending=False)
df

Unnamed: 0,id,sentence,target_term,context_points,context_points_length
0,wg_0,The technician told the customer that they cou...,,The ___ technician told the customer that they...,14
1,wg_1,The technician told someone that they could pa...,,The ___ technician told someone that they coul...,13
2,wg_2,The technician told the customer that they had...,,The ___ technician told the customer that they...,14
3,wg_3,The technician told someone that they had comp...,,The ___ technician told someone that they had ...,13
4,wg_4,The taxpayer met with the accountant to get he...,,The ___ taxpayer met with the ___ accountant t...,16
...,...,...,...,...,...
3542,rb_2422,assume this guy was a faggot. this lesbian is ...,,assume this ___ guy was a faggot ___. this les...,19
3543,rb_2423,the gay is a deadly disease in the karen,,the ___ gay is a deadly disease in the ___ karen,11
3544,rb_2424,for straight men to get laid surely. gays are ...,,for straight ___ men to get laid surely ___. g...,14
3545,rb_2425,hes caring and sensitive he must be gay is som...,,hes caring and sensitive he must be gay ___ is...,17


In [17]:
import os
os.makedirs('other_datasets', exist_ok=True)

In [18]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import pickle

In [19]:
def process_sentence(row):

    # If other_datasets/{id}.pkl exists, skip
    if os.path.exists(f"other_datasets/{row['id']}.pkl"):
        return

    sent_generations = []

    with ThreadPoolExecutor(max_workers=60) as executor:
        futures = [executor.submit(generate_augmented_sentences, row['context_points']) for _ in range(60)]
        for future in as_completed(futures):
            sent_generations.append(future.result())

    with open(f"other_datasets/{row['id']}.pkl", "wb") as f:
        pickle.dump(sent_generations, f)

In [20]:
from tqdm.notebook import tqdm

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    process_sentence(row)

  0%|          | 0/3547 [00:00<?, ?it/s]

Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo-instruct-0914 in organization org-OPJXuCSTyjS0P02P0e2kpQwx on tokens per min (TPM): Limit 90000, Used 89861, Requested 266. Please try again in 84ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo-instruct-0914 in organization org-OPJXuCSTyjS0P02P0e2kpQwx on tokens per min (TPM): Limit 90000, Used 89769, Requested 266. Please try again in 23ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo-instruct-0914 in organization org-OPJXuCSTyjS0P02P0e2kpQwx on tokens per min (TPM): Limit 90000, Used 89968, Requested 266. Please try again in 156ms. Visit https://platform.openai.com/account/r