I changed these constants in `api_request_parallel_processor.py`:

```
    seconds_to_pause_after_rate_limit_error = 120
    seconds_to_sleep_each_loop = 0.1
```

In [183]:
import pandas as pd
import numpy as np
import json
import time
from collections import Counter
import pickle
import os
import subprocess

import openai
import credentials

# Publish the key through the process environment so that code (and child
# processes) reading OPENAI_API_KEY can find it without it being passed around.
os.environ["OPENAI_API_KEY"] = credentials.key

# Completion model used for every request; data_version tags all file names
# derived from it.
LLM_ENGINE = 'text-davinci-002'
data_version = LLM_ENGINE.replace('-', '_') + '_M'

# Stage 1: stereotype generation requests and the raw API results.
stereotype_requests_file = f'occupation_stereotype_requests_{data_version}.jsonl'
stereotype_results_file = f'occupation_stereotype_results_{data_version}.jsonl'

# Stage 2: per-sentence metadata requests/results. These names are used by the
# later cells but were not defined anywhere in the notebook, so a fresh kernel
# failed with NameError. The values are the ones visible in the recorded
# command output of the second processing run.
# NOTE(review): confirm these are still the intended file names.
antistereotype_requests_file = 'antistereotype_requests__occupations.jsonl'
antistereotype_results_file = 'antistereotype_results__occupations.jsonl'

# Final featurized table (sentences plus embedding vectors).
featurized_stereotype_data_file = f'occupation_stereotype_data_featurized_{data_version}.parquet'

Generate sentences each containing a stereotype about people in a particular occupation:

In [184]:
# Occupations for which stereotype sentences are generated.
# 'biologists' and 'physicists' were added by hand; the remaining entries came
# from GPT generating sentences about occupation stereotypes.
OCCUPATIONS = [
    'CEOs', 'accountants', 'actors',
    'architects', 'artists', 'bankers',
    'biologists', 'chefs', 'construction workers',
    'doctors', 'electricians', 'engineers',
    'farmers', 'flight attendants', 'hairdressers',
    'janitors', 'lawyers', 'mechanics',
    'nurses', 'physicists', 'plumbers',
    'police officers', 'politicians', 'programmers',
    'receptionists', 'retail workers', 'salespeople',
    'security guards', 'teachers', 'waiters',
]

In [185]:
# Number of passes over the occupation list, and rows requested per prompt.
num_runs = 1
num_sentences_per_type = 20

# Prompt template; placeholders are filled per occupation below.
prompt_prefix = """
You are playing the role of a machine that generates data for training machine learning models.
Generate a table of {num_sentences_per_type} common stereotypes about {occupation}.
Generate the following attributes for each stereotype:
    * id: the row number.
    * stereotype: a complete sentence that starts with "{occupation_title}" and makes a general statement about people in that occupation.
    * occupation: "{occupation}". 
    * bias_adjective: an adjective embodying the specific bias represented in the stereotype.
    * bias_type: briefly describes the general category of the bias.
    * bias_sentiment: whether the sentiment of the bias is a positive or negative assumption about the people in that occupation.
    * bias_gender: the gender to which this bias applies (should be 'male', 'female', or 'none')
Return the  {num_sentences_per_type} rows of results in Markdown format by completing the following table without leaving any columns blank: 
| id | stereotype | occupation | bias_adjective | bias_type | bias_sentiment | bias_gender |
|----|------------|------------|----------------|-----------|----------------|-------------|
"""

# Write one JSON request per (run, occupation) pair, one per line.
with open(stereotype_requests_file, 'w') as bsrfh:
    for run in range(num_runs):
        for occupation in OCCUPATIONS:
            # Capitalise only the first character for the sentence-initial form.
            # str.title() would lower-case the rest of each word ('CEOs' ->
            # 'Ceos') and capitalise every word of multi-word occupations.
            occupation_title = occupation[:1].upper() + occupation[1:]
            prompt = prompt_prefix.format(
                num_sentences_per_type=num_sentences_per_type,
                occupation=occupation,
                occupation_title=occupation_title,
            )
            my_params = {
              "model": LLM_ENGINE,
              "prompt": prompt,
              "max_tokens": 3800,
              "temperature": 1.0,
              "top_p": 0.95,
              "n": 5,
              "stream": False,
              "logprobs": None,
              "stop": None
            }
            bsrfh.write(json.dumps(my_params) + "\n")


In [186]:
prompt

'\nYou are playing the role of a machine that generates data for training machine learning models.\nGenerate a table of 20 common stereotypes about waiters.\nGenerate the following attributes for each stereotype:\n    * id: the row number.\n    * stereotype: a complete sentence that starts with "Waiters" and makes a general statement about people in that occupation.\n    * occupation: "waiters". \n    * bias_adjective: an adjective embodying the specific bias represented in the stereotype.\n    * bias_type: briefly describes the general category of the bias.\n    * bias_sentiment: whether the sentiment of the bias is a positive or negative assumption about the people in that occupation.\n    * bias_gender: the gender to which this bias applies (should be \'male\', \'female\', or \'none\')\nReturn the  20 rows of results in Markdown format by completing the following table without leaving any columns blank: \n| id | stereotype | occupation | bias_adjective | bias_type | bias_sentiment |

In [187]:
# Shell command that feeds the stereotype request file through the parallel
# request processor.
#
# The API key is deliberately NOT interpolated here: this command is printed,
# and printed output is saved with the notebook. The setup cell already stores
# the key in os.environ["OPENAI_API_KEY"], which a subprocess inherits.
# NOTE(review): the OpenAI Cookbook version of the script defaults --api_key to
# the OPENAI_API_KEY environment variable; confirm the local copy still does.
# When pasting the command into a terminal, export OPENAI_API_KEY there first.
cmd1 = f"""python api_request_parallel_processor.py \
--requests_filepath {stereotype_requests_file} \
--save_filepath {stereotype_results_file} \
--request_url https://api.openai.com/v1/completions \
--max_requests_per_minute 25 \
--max_tokens_per_minute 120000 \
--max_attempts 3 \
--logging_level 20"""

print(cmd1)

# To run from the notebook instead of a terminal:
# subprocess.run(cmd1, shell=True, capture_output=True)

python api_request_parallel_processor.py --requests_filepath occupation_stereotype_requests_text_davinci_002_M.jsonl --save_filepath occupation_stereotype_results_text_davinci_002_M.jsonl --request_url https://api.openai.com/v1/completions --api_key <REDACTED> --max_requests_per_minute 25 --max_tokens_per_minute 120000 --max_attempts 3 --logging_level 20 



In [190]:
import re

data_rows = []

# Print the text of every completion longer than 100 characters so the raw
# generated tables can be inspected by eye.
with open(stereotype_results_file, 'r') as stereotype_results_fh:
    for line in stereotype_results_fh:
        print("===\n")
        rec = json.loads(line)  # rec[0] is the request, rec[1] the response
        response = rec[1]
        if 'choices' not in response:
            continue
        for choice in response['choices']:
            if len(choice['text']) <= 100:  # ignore really short results
                continue
            choice_text = choice['text']
            print(choice_text)


===

| 1  | Accountants are ___.   | accountants |   |   |   |   |
| 2  |   |   |   |   |   |   |
| 3  |   |   |   |   |   |   |
| 4  |   |   |   |   |   |   |
| 5  |   |   |   |   |   |   |
| 6  |   |   |   |   |   |   |
| 7  |   |   |   |   |   |   |
| 8  |   |   |   |   |   |   |
| 9  |   |   |   |   |   |   |
| 10 |   |   |   |   |   |   |
| 11 |   |   |   |   |   |   |
| 12 |   |   |   |   |   |   |
| 13 |   |   |   |   |   |   |
| 14 |   |   |   |   |   |   |
| 15 |   |   |   |   |   |   |
| 16 |   |   |   |   |   |   |
| 17 |   |   |   |   |   |   |
| 18 |   |   |   |   |   |   |
| 19 |   |   |   |   |   |   |
| 20 |   |   |   |   |   |   |
| 1  |            |            |                |           |                |             |
| 2  |            |            |                |           |                |             |
| 3  |            |            |                |           |                |             |
| 4  |            |            |                |           |    

In [72]:
# to vanillafy:
# People who work as receptionists
# s/$occupation/they/g; s/all $occupation/all of them/g

[{'model': 'text-davinci-002',
  'prompt': 'Each sentence below states a common stereotype about people in a particular occupation.  For each sentence identify the following:      occupation: the occupation to which the stereotype applies,      bias_type: a brief description of the type of bias (5 words or less),     bias_sentiment: whether the sentiment of the bias is a positive or negative assumption about the people in that occupation Return the results as a properly formatted JSON list where each element is an object with fields  "stereotype", "occupation", "bias_type", and "bias_sentiment". \n===\n1. Waiters are always late.\n2. Waiters are never on time.\n3. Waiters are always rushing around.\n4. Waiters are always busy.\n5. Waiters are always stressed.\n6. Waiters never have time for a break.\n7. Waiters always have to be on their feet.\n8. Waiters are always running around.\n9. Waiters never have time to eat.\n10. Waiters are always skipping meals.\n11. Waiters are always tired

In [34]:
# Comma-separated, single-quoted occupation names.
# (The prompt below has no placeholders, so it does not use this string.)
quoted_occupations = (f"'{occupation}'" for occupation in OCCUPATIONS)
occupations_str = ', '.join(quoted_occupations)

# Instructions placed in front of each batch of sentences when asking the
# model to label them; the model is expected to continue the Markdown table.
stereotype_metadata_prompt_prefix = f"""Each sentence below states a common stereotype about people in a particular occupation. 
For each sentence identify the following: 
    * occupation: the occupation to which the stereotype applies, 
    * bias_type: a brief description of the type of bias (5 words or less),
    * bias_sentiment: whether the sentiment of the bias is a positive or negative assumption about the people in that occupation
Return the results by completing the following table in Markdown format: 
| stereotype | occupation | bias_type | bias_sentiment |
|------------|------------|-----------|----------------|
"""

In [42]:
import re
from collections import Counter

# NOTE(review): `biased_sentences` is not defined in any visible cell; it must
# already exist in the kernel for this cell to run.
sentences_text = '\n'.join(biased_sentences)

# Split on each newline that is followed by a line starting with "1." or "1)",
# i.e. where the numbering restarts. The look-ahead is zero-width, so the
# leading number stays at the start of each batch.
# Raw string: "\." and "\)" are invalid escapes in a normal string literal and
# trigger a DeprecationWarning/SyntaxWarning on current Python versions.
batches = re.split(r"\n(?=1[.)] ?)", sentences_text)
print(batches[3])  # The first number for each batch is still there

1. They're always self-serving.
2. They're always looking for ways to make more money.
3. They're always looking for ways to cut costs.
4. They're always looking for ways to increase shareholder value.
5. They're always looking for ways to increase their own compensation.
6. They're always looking for ways to make their company more efficient.
7. They're always looking for ways to make their company more profitable.
8. They're always looking for ways to grow their company.
9. They're always looking for ways to expand their company.
10. They're always looking for ways to reduce their company's risk.
11. They're always looking for ways to improve their company's image.
12. They're always looking for ways to reduce their own workload.
13. They're always looking for ways to make their company more sustainable.
14. They're always looking for ways to make their company more customer-centric.
15. They're always looking for ways to improve employee morale.
16. They're always looking for ways t

In [43]:
# Tally how many lines each batch holds; every batch should have 20 sentences.
Counter(batch.count('\n') + 1 for batch in batches)

Counter({20: 741, 19: 2, 17: 1, 15: 1, 10: 4, 16: 1})

In [44]:
prompt_prefix = stereotype_metadata_prompt_prefix

# Settings that are the same for every request in this file.
shared_request_settings = {
    "max_tokens": 3200,
    "temperature": 1.0,
    "top_p": 0.95,
    "n": 1,
    "stream": False,
    "logprobs": None,
    "stop": None,
}

# One JSON request per batch of sentences, one request per line.
with open(antistereotype_requests_file, 'w') as as_requests_fh:
    for batch in batches:
        # Instructions, a "===" separator line, then the numbered sentences.
        prompt = "\n===\n".join([prompt_prefix, batch])
        my_params = {"model": LLM_ENGINE, "prompt": prompt, **shared_request_settings}
        as_requests_fh.write(json.dumps(my_params) + "\n")


In [39]:
# Shell command that feeds the metadata request file through the parallel
# request processor.
#
# The API key is deliberately NOT interpolated here: this command is printed,
# and printed output is saved with the notebook. The setup cell already stores
# the key in os.environ["OPENAI_API_KEY"], which a subprocess inherits.
# NOTE(review): the OpenAI Cookbook version of the script defaults --api_key to
# the OPENAI_API_KEY environment variable; confirm the local copy still does.
# When pasting the command into a terminal, export OPENAI_API_KEY there first.
cmd2 = f"""python api_request_parallel_processor.py \
--requests_filepath {antistereotype_requests_file} \
--save_filepath {antistereotype_results_file} \
--request_url https://api.openai.com/v1/completions \
--max_requests_per_minute 6 \
--max_tokens_per_minute 12000 \
--max_attempts 3 \
--logging_level 20"""

print(cmd2)

# To run from the notebook instead of a terminal:
# subprocess.run(cmd2, shell=True)

python api_request_parallel_processor.py --requests_filepath antistereotype_requests__occupations.jsonl --save_filepath antistereotype_results__occupations.jsonl --request_url https://api.openai.com/v1/completions --api_key <REDACTED> --max_requests_per_minute 6 --max_tokens_per_minute 12000 --max_attempts 3 --logging_level 20 



In [52]:
data_rows = []

def clean_json_list(dirty_json):
    """Strip any text that precedes the first '[' of a generated JSON list.

    The generated result sometimes has extraneous text in front of the JSON
    list. Everything up to the first '[' is removed so the string starts at
    the list itself.

    Args:
        dirty_json: raw completion text that should contain a JSON list.

    Returns:
        The text starting at its first '['. Text that already starts with '['
        or contains no '[' at all is returned unchanged.
    """
    # Operate on the argument; the previous version ignored it and read the
    # global loop variable `choice` instead.
    return re.sub(r'^[^\[]+\[', '[', dirty_json)

# Collect the parsed rows from every completion that finished normally.
with open(antistereotype_results_file, 'r') as results_fh:
    for line in results_fh:
        rec = json.loads(line)  # query and results
        if 'choices' not in rec[1]:
            continue
        for choice in rec[1]['choices']:
            if choice['finish_reason'] != 'stop':  # anything but a normal stop
                print('Bad reason to finish:', choice['finish_reason'])
                continue
            # Clean before the try block so the name is always bound when the
            # error message below is printed.
            choices_cleaned = clean_json_list(choice['text'])
            try:
                new_rows = json.loads(choices_cleaned)
            except json.JSONDecodeError:
                # Only malformed JSON is expected here; other errors surface.
                print('Problem loading JSON:', choices_cleaned)
                continue
            if not isinstance(new_rows, list):
                # extend() on a dict or string would add keys or characters
                # as rows instead of failing.
                print('Problem loading JSON:', choices_cleaned)
                continue
            data_rows.extend(new_rows)

# Turn the positional index into an explicit stereotype_id column.
stereotype_pdf = (
    pd.DataFrame(data_rows)
    .reset_index()
    .rename(columns={'index': 'stereotype_id'})
)
stereotype_pdf

Problem loading JSON: 

1. occupation: CEOs,
   bias_type: self-serving,
   bias_sentiment: negative
2. occupation: CEOs,
   bias_type: money-driven,
   bias_sentiment: negative
3. occupation: CEOs,
   bias_type: cost-cutting,
   bias_sentiment: negative
4. occupation: CEOs,
   bias_type: shareholder value-driven,
   bias_sentiment: negative
5. occupation: CEOs,
   bias_type: compensation-driven,
   bias_sentiment: negative
6. occupation: CEOs,
   bias_type: efficiency-driven,
   bias_sentiment: positive
7. occupation: CEOs,
   bias_type: profit-driven,
   bias_sentiment: positive
8. occupation: CEOs,
   bias_type: growth-driven,
   bias_sentiment: positive
9. occupation: CEOs,
   bias_type: expansion-driven,
   bias_sentiment: positive
10. occupation: CEOs,
    bias_type: risk-averse,
    bias_sentiment: positive
11. occupation: CEOs,
    bias_type: image-conscious,
    bias_sentiment: positive
12. occupation: CEOs,
    bias_type: work-reducing,
    bias_sentiment: positive
13. occupa

Unnamed: 0,stereotype_id,stereotype,occupation,bias_type,bias_sentiment
0,0,They're all rich.,CEOs,Income,Positive
1,1,They're all white.,CEOs,Race,Positive
2,2,They're all men.,CEOs,Gender,Positive
3,3,They're all old.,CEOs,Age,Positive
4,4,They're all selfish.,CEOs,Personality,Negative
...,...,...,...,...,...
13145,13145,Waiters are always on the go.,Waiters,Pace,Negative
13146,13146,Waiters never have a moment to themselves.,Waiters,Quantitative Overgeneralization,Negative
13147,13147,Waiters are always working.,Waiters,Quantitative Overgeneralization,Negative
13148,13148,Waiters are always on duty.,Waiters,Quantitative Overgeneralization,Negative


In [48]:
from sentence_transformers import SentenceTransformer

sentxformer = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Encode every stereotype sentence, then store each vector converted with
# np.float32 in the 'vector' column.
sentence_vectors = sentxformer.encode(stereotype_pdf['stereotype'].values).tolist()
stereotype_pdf['vector'] = [np.float32(vec) for vec in sentence_vectors]

stereotype_pdf

Unnamed: 0,stereotype_id,stereotype,occupation,bias_type,bias_sentiment,vector
0,0,They're all rich.,CEOs,Income,Positive,"[-0.035655532, 0.041162614, 0.01539838, 0.0952..."
1,1,They're all white.,CEOs,Race,Positive,"[0.0062016086, 0.056824483, 0.011753647, 0.045..."
2,2,They're all men.,CEOs,Gender,Positive,"[0.03330205, -0.040062796, -0.0043470864, 0.04..."
3,3,They're all old.,CEOs,Age,Positive,"[0.058662836, 0.03628948, 0.0386076, 0.0418894..."
4,4,They're all selfish.,CEOs,Personality,Negative,"[-0.007420106, 0.015460138, 0.0059084524, 0.02..."
...,...,...,...,...,...,...
13145,13145,Waiters are always on the go.,Waiters,Pace,Negative,"[0.017701548, 0.021086246, -0.009885315, -0.01..."
13146,13146,Waiters never have a moment to themselves.,Waiters,Quantitative Overgeneralization,Negative,"[-0.022895792, -0.0028229214, 0.029410591, 0.0..."
13147,13147,Waiters are always working.,Waiters,Quantitative Overgeneralization,Negative,"[0.03300496, 0.03377493, -0.002614275, -0.0107..."
13148,13148,Waiters are always on duty.,Waiters,Quantitative Overgeneralization,Negative,"[0.01800383, -0.011440115, -0.004039025, -0.00..."


In [50]:
# stereotype_pdf.to_parquet(featurized_stereotype_data_file, index=False) # 40 MB

In [49]:
# Case-insensitive frequency of the occupation labels returned by the model;
# show the 40 most frequent.
occupations = [label.lower() for label in stereotype_pdf['occupation']]
occupation_counts = Counter(occupations)
occupation_counts.most_common(40)

[('politicians', 490),
 ('lawyers', 475),
 ('salespeople', 464),
 ('nurses', 420),
 ('chefs', 402),
 ('architects', 400),
 ('receptionists', 400),
 ('mechanics', 373),
 ('teachers', 367),
 ('engineers', 358),
 ('hairdressers', 320),
 ('actors', 316),
 ('ceos', 314),
 ('accountants', 311),
 ('farmers', 297),
 ('programmers', 290),
 ('doctors', 272),
 ('janitors', 268),
 ('police officers', 266),
 ('plumbers', 255),
 ('construction workers', 246),
 ('waiter', 241),
 ('security guards', 240),
 ('', 220),
 ('artist', 207),
 ('biologists', 200),
 ('retail workers', 200),
 ('bankers', 185),
 ('artists', 183),
 ('waiters', 179),
 ('construction worker', 173),
 ('doctor', 161),
 ('physicists', 160),
 ('physicist', 142),
 ('farmer', 141),
 ('flight attendant', 140),
 ('scientists', 135),
 ('plumber', 121),
 ('ceo', 120),
 ('electrician', 103)]

In [53]:
import numpy as np
import pickle

# NOTE(review): `stereotype_data_long` is not defined in any visible cell; it
# must already exist in the kernel for this cell to run.

# Copy of the frame whose vectors are converted with np.float16.
sd_f16 = stereotype_data_long.copy()
sd_f16['vector'] = list(map(np.float16, sd_f16['vector']))

# Pickle both the original frame and the float16 copy.
for pickle_path, frame in (
    ('stereotype_data_long.pkl', stereotype_data_long),
    ('stereotype_data_long_float16.pkl', sd_f16),
):
    with open(pickle_path, 'wb') as pkl:
        pickle.dump(frame, pkl)

# stereotype_data_long.to_parquet('stereotype_data_featurized_text_davinci_003_run2_float16.parquet', index=False)
# # ArrowNotImplementedError: Unhandled type for Arrow to Parquet schema conversion: halffloat

In [None]:
# Alternatively, we can also use OpenAI to generate embeddings:
# https://platform.openai.com/docs/tutorials/web-qa-embeddings
# embeddings = openai.Embedding.create(input=biased_sentences_pdf['stereotype'][0], engine='text-embedding-ada-002')['data'][0]['embedding']