In [1]:
import pandas as pd
import os
import openai
from dotenv import load_dotenv
from llm import call_openai_api
import concurrent.futures
import time

# Read variables from a local .env file into the process environment.
load_dotenv()
# os.getenv returns None when OPENAI_API_KEY is not set; nothing here checks for that.
openai.api_key = os.getenv("OPENAI_API_KEY")
# Model identifier handed to call_openai_api by later cells.
model_name = "gpt-4"
# model_name = "gpt-4-32k"

# Tab-separated table of extracted answers and the list of interview questions.
STRUCTURED_DATA_FILE = "outputs/20231209/answers.tsv"
QUESTIONS_FILE = "prompts/questions_8.txt"
# Index into `questions`; later cells rebind this name.
question_idx = 34  # for testing

# NOTE(review): the displayed header starts with an "Unnamed: 0" column, so the
# frame carries one extra leading column before the first question column.
merged = pd.read_csv(STRUCTURED_DATA_FILE, sep="\t", header=0)
print(merged.shape)
merged.head()

(93, 44)


Unnamed: 0,How old are you?,Where do you live?,What is your marital status?,Do you have kids?,"If you do have kids, provide details","Are you a caretaker otherwise? (if not own kids, eg elderly parents, adopted family member, etc)",What type of healthcare professional or student/trainee are you?,"If student or trainee, what year are you in?",What institution did you complete your (or are currently) training at?,"If you are a physician, did you train in the US at any point?",...,Would you seek help if you felt burned out? How?,Would you change jobs or career trajectories?,Has this crisis affected your specialty decision or career plans in any way?,Would you get (professional or other) help/care if you felt mentally overwhelmed? How? When?,Any obstacles you foresee in getting help if you needed to?,"If student or trainee, how closely do you feel that you are adhering to the Hippocratic oath during this time?","If student or trainee, do you agree with your school's policies regarding medical students' roles at this time?",What other questions was the subject asked that were not covered in this template?,file_name,subject_id
0,32,"Houston, Texas, United States of America",Single,No,,No,MD - Cardiology critical care,,University of Tennessee,Yes,...,"Yes, would seek help from Baylor or hospital r...","-Unknown Speaker 30:17 Oh, no, I love my job ...",,"Yes, would seek help from Baylor or hospital r...",No,,Not applicable,Unknown Speaker 31:02 Thank you for your time...,C001.gpt-4-32k.txt,C001
1,48,"San Benito, Texas, United States",Married,Yes,"Has three children aged 23, 20 and 18",No,Respiratory therapist,,"Texas Southmost College in Brownsville, Texas",,...,"Yes, would seek help from chapel services",No,No,"Yes, he's open to seeking help if needed",No,,,,C002.gpt-4-32k.txt,C002
2,62,"Randallstown, Maryland",Divorced,Yes,Two adult children,No,"Register Nurse, specializing in neurocritical ...",,University of Michigan and Houston,,...,"Yes, she would seek help from her primary care...","No, she does not see herself changing jobs or ...","No, she does not see the crisis affecting her ...","Yes, she would consult with a healthcare profe...",Yes. Access to healthcare is currently a chall...,,,"How have the steps taken to protect yourself, ...",C003.gpt-4-32k.txt,C003
3,27,"Orlando, Florida, United States",Single,No,,No,Medical Student,4th Year,Nova Southeastern University (Fort Lauderdale ...,,...,Yes,No,No,Yes,No,Closely,"Yes, but with concern","What is your your marital status?, Why does th...",C004.gpt-4-32k.txt,C004
4,24,"Houston, Texas, USA",Married,No,,No,Physician Assistant in neck surgery at a cance...,,"LSU Health Sciences Center Shreveport, Louisiana",Yes,...,"Yes, she would first reach out to supportive f...",No,,"Yes, she would reach out to supportive friends...",,,,She was asked if she thought her burnout level...,C005.gpt-4-32k.txt,C005


In [2]:
# Load the "question" column of the questions file as a plain list of strings.
questions = pd.read_csv(
    QUESTIONS_FILE,
    sep="\t",
    header=0,
    index_col=0,
)["question"].to_list()
print(len(questions))

# Show every question next to its list index.
for q_pos, q_text in enumerate(questions):
    print((q_pos, q_text))

# Display the question currently selected by question_idx.
questions[question_idx]

42
(0, 'How old are you?')
(1, 'Where do you live?')
(2, 'What is your marital status?')
(3, 'Do you have kids?')
(4, 'If you do have kids, provide details')
(5, 'Are you a caretaker otherwise? (if not own kids, eg elderly parents, adopted family member, etc)')
(6, 'What type of healthcare professional or student/trainee are you?')
(7, 'If student or trainee, what year are you in?')
(8, 'What institution did you complete your (or are currently) training at?')
(9, 'If you are a physician, did you train in the US at any point?')
(10, 'What is your specialty (if student, what specialty are you thinking of)?')
(11, 'How long have you been practicing?')
(12, 'Over the past two months, have you practiced clinically in areas where you could be in touch with patients who have covid-19?')
(13, 'Are you concerned about your safety, and how?')
(14, 'Are you concerned about safety of loved ones, and how?')
(15, 'Have you modified your routine to protect yourself or others, and how?')
(16, 'Has thi

'Would you seek help if you felt burned out? How?'

## Build prompt


In [3]:
# V1
# prompt = """
# Can you cluster the data in the table below highlighting any common themes per cluster.
# Exclude subjects that provide uncertain or no responses.
# Subjects can belong to multiple clusters. \n\n
# """ + tabular_data
# print(prompt)

# V2
# prompts = {
#     'default' :
# """Cluster the responses in the table below highlighting the common theme per cluster.
# Group subjects that provide unclear, irrelevant or no responses into a separate "excluded" cluster.
# Subjects can belong to multiple clusters. \n\n""",
#     16: # (16, 'Has this crisis taken a toll on you physically?'),
# """Cluster the responses in the table below at two levels.
# Top level clusters must be (1) Yes, (2) No, and (3) Unclear/irrelevant/no response, with mutually exclusive cluster membership.
# For the next level, cluster the responses from subjects belonging to each top-level cluster highlighting the common theme per cluster.
# Subjects can belong to multiple clusters at this level.   \n\n
# """,
#     18: # How has your working schedule and logistics changed?
# """Cluster the responses in the table below at two levels.
# Top level clusters must be (1) Increased hours, (2) Decreased hours, (3) No change, (4) Other and (5) Unclear/irrelevant/no response, with mutually exclusive cluster membership.
# For the next level, cluster the responses from subjects belonging to each top-level cluster highlighting the common theme per cluster.
# Subjects can belong to multiple clusters at this level.   \n\n
# """,
#     34:
# """Cluster the responses in the table below at two levels.
# Top level clusters must be (1) Yes, (2) No, (3) Mixed, and (4) Unclear/irrelevant/no response, with mutually exclusive cluster membership.
# For the next level, cluster the responses from subjects belonging to each top-level cluster highlighting the common theme per cluster.
# Subjects can belong to multiple clusters at this level.   \n\n
# """,

# }
# question_idx = 18
# tabular_data = merged.iloc[:, [-1, question_idx]].to_csv(sep='\t', index=False)
# prompt = prompts[question_idx] if question_idx in prompts.keys() else prompts['default']
# prompt += tabular_data
# print(question_idx, prompt)


# V3
# TEMPLATE = """Cluster the responses in the table below at two levels.
# Top level clusters must be {clusters} with mutually exclusive cluster membership.
# For the next level, cluster the responses from subjects belonging to each top-level cluster highlighting the common theme per cluster.
# Subjects can belong to multiple clusters at this level.
# Do not quote subject-responses in the clustering.
# Please make sure that all subject_ids are present in your response.
# \n"""

# prompts = {
#     'default': """Cluster the responses in the table below highlighting the common theme per cluster.
# Group subjects that provide unclear, irrelevant, or no responses into a separate "excluded" cluster.
# Subjects can belong to multiple clusters.
# Do not quote subject-responses in the clustering.
# Please make sure that all subject_ids present in the prompt (and no others) are present in your response.
# \n""",
#     14: TEMPLATE.format(clusters="(1) Yes, (2) No, and (3) Unclear/irrelevant/no response"),
#     16: TEMPLATE.format(clusters="(1) Yes, (2) No, and (3) Unclear/irrelevant/no response"),
#     # 17: Numeric: How many hours are you working on average (per week)?
#     18: TEMPLATE.format(clusters="(1) Increased hours, (2) Decreased hours, (3) No change, (4) Other, and (5) Unclear/irrelevant/no response"),
#     # 19: How does this compare to pre-covid-19 crisis?
#     20: TEMPLATE.format(clusters="(1) Better, (2) Worse, (3) No-change, (4) Other and (5) Unclear/irrelevant/no response"),
#     21: TEMPLATE.format(clusters="(1) Changed, (2) No change, (3) Fluctuating/uncertain change, and (4) Unclear/irrelevant/no response"),
#     22: TEMPLATE.format(clusters="(1) Yes, (2) No, and (3) Unclear/irrelevant/no response"),
#     23: TEMPLATE.format(clusters="(1) Prepared, (2) Unprepared, and (3) Unclear/irrelevant/no response"),
#     24: TEMPLATE.format(clusters="(1) Prepared, (2) Unprepared, and (3) Unclear/irrelevant/no response"),
#     25: TEMPLATE.format(clusters="(1) Positively (e.g. excitement), (2) Negatively, (3) Neutral and (4) Unclear/irrelevant/no response"),
#     26: TEMPLATE.format(clusters="(1) Yes, (2) No, (3) Mixed, (4) Fluctuating over time and (5) Unclear/irrelevant/no response"),
#     27: TEMPLATE.format(clusters="(1) Yes, (2) No, and (3) Unclear/irrelevant/no response"),
#     28: TEMPLATE.format(clusters="(1) No/Mild (e.g. 1, 2 or 3 out of 10), (2) Moderate (e.g. 4, 5 or 6 out of 10), (3) Severe (e.g. 7, 8, 9 or 10 out of 10), and (4) Unclear/irrelevant/no response"),
#     29: TEMPLATE.format(clusters="(1) No/Mild (e.g. 1, 2 or 3 out of 10), (2) Moderate (e.g. 4, 5 or 6 out of 10), (3) Severe (e.g. 7, 8, 9 or 10 out of 10), and (4) Unclear/irrelevant/no response"),
#     30: TEMPLATE.format(clusters="(1) Positively (e.g. excitement), (2) Negatively, (3) Neutral/Mixed and (4) Unclear/irrelevant/no response"),
#     31: TEMPLATE.format(clusters="(1) Yes, (2) No, (3) Mixed, and (4) Unclear/irrelevant/no response"),
#     32: TEMPLATE.format(clusters="(1) Positive, (2) Negative, (3) Neutral/Mixed and (4) Unclear/irrelevant/no response"),
#     33: TEMPLATE.format(clusters="(1) Positive, (2) Negative, (3) Neutral/Mixed and (4) Unclear/irrelevant/no response"),
#     34: TEMPLATE.format(clusters="(1) Yes, (2) No, and (3) Unclear/irrelevant/no response"),
#     35: TEMPLATE.format(clusters="(1) Yes, (2) No, and (3) Unclear/irrelevant/no response"),
#     36: TEMPLATE.format(clusters="(1) Yes will get professional help, (1) Yes but not professional help, (3) Mixed, (4) Will not seek/get help and (5) Unclear/irrelevant/no response"),
#     37: TEMPLATE.format(clusters="(1) Yes, (2) No, and (3) Unclear/irrelevant/no response"),
# }


# V4
# Two-level clustering prompt. `{clusters}` is replaced with the numbered list of
# top-level clusters for a question. Column separators in the header and example
# lines are written as explicit "\t" escapes so they are real tab characters,
# matching the tab-separated-values format the prompt asks for.
TEMPLATE = """Cluster the responses in the table below at two levels.
Top level clusters must be {clusters}.
Top level clusters have mutually-exclusive cluster membership.
For the next level, cluster the responses from subjects belonging to each top-level cluster highlighting the common theme per cluster.
Subjects can belong to multiple clusters at this level. 

Your response should be in tab-separated-values format, with the following columns:
subject_id\ttop_level_cluster_id\tsecondary_cluster_ids

Example output line: 
C-002\tC1\t"C1.1,C1.2,C1.4"

Start your response by defining each top and secondary cluster in tab-separated-values format, with columns: 
cluster_id\tcluster_name\tcluster_description

Note that some subject_ids may not be present in the prompt, and so should also not be present in your response.
Provide both the (tab-separated) cluster-definitions table and the (tab-separated) cluster-assignments table in your response.
\n"""

# Prompt per question index (position in `questions`). Indices without an entry
# use the single-level "default" prompt. Top-level cluster numbers must be unique
# and sequential within each prompt.
prompts = {
    "default": """Cluster the responses in the table below highlighting the common theme per cluster.
Group subjects that provide unclear, irrelevant, or no responses into a separate "excluded" cluster.
Subjects can belong to multiple clusters. Your response should be in tab-separated-values format, 
with the following columns: subject_id, cluster_ids

Example output line: 
subject_id\tcluster_ids
C-002\t"C2,C3"

Start your response by defining each cluster in tab-separated-values format, with columns: 
cluster_id, cluster_name, cluster_description

Note that some subject_ids may not be present in the prompt, and so should also not be present in your response.
Provide both the (tab-separated) cluster-definitions table and the (tab-separated) cluster-assignments table in your response.
\n""",
    # 0: How old are you?
    0: TEMPLATE.format(
        clusters="(1) Young Adults (22 to 33), (2) Middle-aged Adults (34 to 45), (3) Older Adults (46 to 60), (4) Seniors (61 and above), and (5) Unclear/irrelevant/no response"
    ),
    # 1: Where do you live?
    1: TEMPLATE.format(
        clusters="(1) Houston, Texas, (2) San Antonio, Texas, (3) Texas (Other), (4) Florida, (5) Mid-West US, (6) US (Other) and (7) Unclear/Excluded/No response"
    ),
    # 2: What is your marital status?
    2: TEMPLATE.format(
        clusters="(1) Not currently married, (2) Married currently, and (3) Unclear/Excluded/No response"
    ),
    14: TEMPLATE.format(
        clusters="(1) Yes, (2) No, and (3) Unclear/irrelevant/no response"
    ),
    16: TEMPLATE.format(
        clusters="(1) Yes, (2) No, and (3) Unclear/irrelevant/no response"
    ),
    # 17: How many hours are you working on average (per week) nowadays?
    17: TEMPLATE.format(
        clusters="(1) Full-time (40 hr/week), (2) Less than Full-time (under 40 hr/week), (3) More than Full-time (over 40 hr/week), and (4) Unclear/Excluded/No response"
    ),
    18: TEMPLATE.format(
        clusters="(1) Increased hours, (2) Decreased hours, (3) No change, (4) Other, and (5) Unclear/irrelevant/no response"
    ),
    # 19: How does this compare to pre-covid-19 crisis?
    19: TEMPLATE.format(
        clusters="(1) Increased hours, (2) Decreased hours, (3) No change, (4) Other, and (5) Unclear/irrelevant/no response"
    ),
    20: TEMPLATE.format(
        clusters="(1) Better, (2) Worse, (3) No-change, (4) Other and (5) Unclear/irrelevant/no response"
    ),
    21: TEMPLATE.format(
        clusters="(1) Changed, (2) No change, (3) Fluctuating/uncertain change, and (4) Unclear/irrelevant/no response"
    ),
    22: TEMPLATE.format(
        clusters="(1) Yes, (2) No, and (3) Unclear/irrelevant/no response"
    ),
    23: TEMPLATE.format(
        clusters="(1) Prepared, (2) Unprepared, and (3) Unclear/irrelevant/no response"
    ),
    24: TEMPLATE.format(
        clusters="(1) Prepared, (2) Unprepared, and (3) Unclear/irrelevant/no response"
    ),
    25: TEMPLATE.format(
        clusters="(1) Positively (e.g. excitement), (2) Negatively, (3) Mix of Positively and Negatively, (4) Neutral, and (5) Unclear/irrelevant/no response"
    ),
    26: TEMPLATE.format(
        clusters="(1) Yes, (2) No, (3) Mixed, (4) Fluctuating over time and (5) Unclear/irrelevant/no response"
    ),
    27: TEMPLATE.format(
        clusters="(1) Yes, (2) No, and (3) Unclear/irrelevant/no response"
    ),
    28: TEMPLATE.format(
        clusters="(1) No/Mild (e.g. 1, 2 or 3 out of 10), (2) Moderate (e.g. 4, 5 or 6 out of 10), (3) Severe (e.g. 7, 8, 9 or 10 out of 10), and (4) Unclear/irrelevant/no response"
    ),
    29: TEMPLATE.format(
        clusters="(1) No/Mild (e.g. 1, 2 or 3 out of 10), (2) Moderate (e.g. 4, 5 or 6 out of 10), (3) Severe (e.g. 7, 8, 9 or 10 out of 10), and (4) Unclear/irrelevant/no response"
    ),
    30: TEMPLATE.format(
        clusters="(1) Positively (e.g. excitement), (2) Negatively, (3) Neutral/Mixed and (4) Unclear/irrelevant/no response"
    ),
    31: TEMPLATE.format(
        clusters="(1) Yes, (2) No, (3) Mixed, and (4) Unclear/irrelevant/no response"
    ),
    32: TEMPLATE.format(
        clusters="(1) Positive, (2) Negative, (3) Neutral/Mixed and (4) Unclear/irrelevant/no response"
    ),
    33: TEMPLATE.format(
        clusters="(1) Positive, (2) Negative, (3) Neutral/Mixed and (4) Unclear/irrelevant/no response"
    ),
    34: TEMPLATE.format(
        clusters="(1) Yes, (2) No, and (3) Unclear/irrelevant/no response"
    ),
    35: TEMPLATE.format(
        clusters="(1) Yes, (2) No, and (3) Unclear/irrelevant/no response"
    ),
    36: TEMPLATE.format(
        clusters="(1) Yes, (2) No, and (3) Unclear/irrelevant/no response"
    ),
    # The second cluster is numbered (2); a repeated "(1)" would make the
    # top-level cluster ids ambiguous.
    37: TEMPLATE.format(
        clusters="(1) Yes will get professional help, (2) Yes but not professional help, (3) Mixed, (4) Will not seek/get help and (5) Unclear/irrelevant/no response"
    ),
    38: TEMPLATE.format(
        clusters="(1) Yes, (2) No, and (3) Unclear/irrelevant/no response"
    ),
    39: TEMPLATE.format(
        clusters="(1) Adhering Closely, (2) Not adhering closely OR Adhering conditionally, and (3) Unclear/irrelevant/no response"
    ),
    # The last cluster is numbered (4); a repeated "(3)" would collide with
    # "Mixed/Conditionally".
    40: TEMPLATE.format(
        clusters="(1) Yes, (2) No, (3) Mixed/Conditionally, and (4) Unclear/irrelevant/no response"
    ),
}


def get_prompt_with_data(question_idx, merged, prompts, questions=None):
    """Return the prompt for one question followed by its answers as TSV text.

    The table has two columns: the last column of ``merged`` (the subject
    identifier in this notebook's data) and the answer column for the question.

    The answer column is resolved by question text whenever possible, because
    the position of a column in ``merged`` is not guaranteed to equal the
    question's index (for example, a leading "Unnamed: 0" index column shifts
    every position by one). Positional lookup is kept only as a fallback.

    Args:
        question_idx: Index of the question; also the key looked up in ``prompts``.
        merged: DataFrame with one column per question and the subject
            identifier as its last column.
        prompts: Mapping of question index to prompt text; must contain a
            "default" entry, used when ``question_idx`` has no entry of its own.
        questions: Optional list of question texts indexed by ``question_idx``.
            When omitted, a module-level ``questions`` list is used if one exists.

    Returns:
        The prompt text concatenated with the tab-separated table (header included).
    """
    import warnings  # local import: not part of the notebook's import cell

    if questions is None:
        # Pick up the notebook-level question list when its cell has been run.
        questions = globals().get("questions")

    answer_pos = question_idx  # fallback: treat the index as a column position
    if questions is not None and 0 <= question_idx < len(questions):
        wanted = questions[question_idx]
        hits = [pos for pos, name in enumerate(merged.columns) if name == wanted]
        if len(hits) == 1:
            answer_pos = hits[0]
        else:
            # Missing or duplicated column name: keep the positional behaviour,
            # but make the risk of sending the wrong column visible.
            warnings.warn(
                f"Question {question_idx} ({wanted!r}) matches {len(hits)} columns; "
                f"falling back to column position {question_idx}."
            )

    tabular_data = merged.iloc[:, [-1, answer_pos]].to_csv(sep="\t", index=False)
    prompt = prompts.get(question_idx, prompts["default"])
    return prompt + tabular_data


# Preview the full prompt (instructions + answer table) for one question.
# This rebinds the module-level question_idx.
question_idx = 17
prompt = get_prompt_with_data(question_idx, merged, prompts)
print(question_idx, questions[question_idx])
print(prompt)

17 How many hours are you working on average (per week) nowadays?
Cluster the responses in the table below at two levels.
Top level clusters must be (1) Full-time (40 hr/week), (2) Less than Full-time (under 40 hr/week), (3) More than Full-time (over 40 hr/week), and (4) Unclear/Excluded/No response.
Top level clusters have mutually-exclusive cluster membership.
For the next level, cluster the responses from subjects belonging to each top-level cluster highlighting the common theme per cluster.
Subjects can belong to multiple clusters at this level. 

Your response should be in tab-separated-values format, with the following columns:
subject_id  top_level_cluster_id    secondary_cluster_ids

Example output line: 
C-002   C1  "C1.1,C1.2,C1.4"

Start your response by defining each top and secondary cluster in tab-separated-values format, with columns: 
cluster_id  cluster_name    cluster_description

Note that some subject_ids may not be present in the prompt, and so should also not be p

In [4]:
## Test OpenAI call
# response = call_openai_api(prompt, model_name, [])
# response_message = response["choices"][0]["message"]["content"]
# response_message = "Test"

# print(response_message)

In [5]:
def save_to_file(question_idx, prompt, response_message):
    # Save just the response
    filename = f"outputs/Q_{question_idx}.txt"
    with open(filename, "w") as file:
        file.write("PROMPT:\n")
        file.write(prompt)
        print(f"Saved: {filename}")

    # Save the full QnA
    filename = f"outputs/QnA_{question_idx}.txt"
    with open(filename, "w") as file:
        file.write("PROMPT:\n")
        file.write(prompt)
        file.write("\n\nRESPONSE:\n")
        file.write(response_message)
        file.write("\n")
        print(f"Saved: {filename}")

    # Save just the response
    response_filename = f"outputs/A_{question_idx}.txt"
    with open(response_filename, "w") as response_file:
        response_file.write(response_message)
        print(f"Saved: {response_filename}")


# save_to_file(question_idx, prompt, response_message)

In [6]:
# Loop over all questions
# q_idxs = list(range(13, len(questions)))  # All relevant questions
# q_idxs = list(range(13, len(questions)))  # All relevant questions
# q_idxs = list(range(19, len(questions)))  # All relevant questions
# q_idxs = list(range(32, len(questions))) # Subset
# q_idxs = [18, 20, 22, 24, 26, 27, 30, 31, 33, 35, 36]  # Test/Debug/Re-Do
# q_idxs = [22,33] # Test/Debug/Re-Do
# q_idxs = [25]  # Test/Debug/Re-Do

# q_idxs = list(range(0, 13))  # Structured data
# q_idxs = [18, 21, 38, 7]  # Test/Debug/Re-Do
# q_idxs = [11]  # Test/Debug/Re-Do
# q_idxs = [0, 1, 2, 17, 19, 36, 37, 38, 39, 40]
# q_idxs = [38, 39]
# Question indices to process in this run; each one is passed to process_and_save.
q_idxs = [17]


def process_and_save(question_idx):
    """Build the prompt for one question, send it, and save the exchange.

    Uses the notebook-level ``questions``, ``merged``, ``prompts`` and
    ``model_name``. The reply text is read from
    ``response["choices"][0]["message"]["content"]`` and handed to
    ``save_to_file`` together with the prompt.

    Args:
        question_idx: Index of the question to process.
    """
    question_text = questions[question_idx]
    print(f"Question {question_idx}: {question_text}")

    full_prompt = get_prompt_with_data(question_idx, merged, prompts)

    # The third argument is an empty list; its meaning is defined by
    # call_openai_api, which is not visible here.
    completion = call_openai_api(full_prompt, model_name, [])
    first_choice = completion["choices"][0]
    reply_text = first_choice["message"]["content"]

    save_to_file(question_idx, full_prompt, reply_text)
    print(f"Done {question_idx}")


# Sequential
# for question_idx in q_idxs:
#     print(f"Question {question_idx}: {questions[question_idx]}")
#     process_and_save(question_idx)

# Parallel
# executor.map() only re-raises a worker's exception when its result is read from
# the returned iterator, so an unconsumed iterator hides every failure. Submitting
# each task and calling result() reports failures per question while the
# remaining questions still run.
with concurrent.futures.ThreadPoolExecutor(max_workers=12) as executor:
    pending = {executor.submit(process_and_save, q_idx): q_idx for q_idx in q_idxs}
    failed_idxs = []
    for future in concurrent.futures.as_completed(pending):
        q_idx = pending[future]
        try:
            future.result()
        except Exception as exc:
            failed_idxs.append(q_idx)
            print(f"FAILED {q_idx}: {exc!r}")

if failed_idxs:
    print(f"Failed questions: {sorted(failed_idxs)}")
print("All done!")

Question 17: How many hours are you working on average (per week) nowadays?
Using OpenRouter to call gpt-4 [Streaming: False]...
Attempt 1 of 5
<class 'openai.types.chat.chat_completion.ChatCompletion'>
Saved: outputs/Q_17.txt
Saved: outputs/QnA_17.txt
Saved: outputs/A_17.txt
Done 17
All done!


In [7]:
# Number of question indices from 13 up to the end of `questions`.
len(range(13, len(questions)))

29