In [None]:
%%capture
!pip install openai
!pip install scikit-learn
!pip install matplotlib
!pip install rouge
!pip install rouge_score

In [None]:
import os
import openai
import pandas as pd
import json
import math
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from metrics import Rouge, AutomaticNgramEval, AutomaticFactEval
from tqdm import tqdm
from tqdm.notebook import tqdm
import time

In [None]:
# openai.organization = 
# OPENAI_API_KEY = 
# os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
train_df = pd.read_csv("TaskA-TrainingSet.csv")
eval_df = pd.read_csv("TaskA-ValidationSet.csv")
train_df.rename(columns={'dialogue': 'Conv_snippet'}, inplace=True)
eval_df.rename(columns={'dialogue': 'Conv_snippet'}, inplace=True)
train_df.rename(columns={'section_text': 'summary'}, inplace=True)
eval_df.rename(columns={'section_text': 'summary'}, inplace=True)
train_df.rename(columns={'section_header': 'section'}, inplace=True)
eval_df.rename(columns={'section_header': 'section'}, inplace=True)
#what about test_df? it was in your code

In [None]:
target_section = 'EXAM'
#'OTHER_HISTORY',
# 'CC',
# 'DIAGNOSIS',
# 'FAM/SOCHX',
# 'MEDICATIONS',
# 'PROCEDURES',
# 'ALLERGY',
# 'GENHX',
# 'ROS', '
# PASTMEDICALHX',
#  'PASTSURGICAL',
#  'DISPOSITION',
# 'EDCOURSE', '
# PLAN',
#  'LABS',
# 'ASSESSMENT',
# 'GYNHX',
# 'IMAGING',
# 'IMMUNIZATIONS'

train_df = train_df[train_df['section']==target_section]
eval_df = eval_df[eval_df['section']==target_section]
# test_df = test_df[test_df['section']==target_section]

In [None]:
logging = {}

In [None]:
summarize_raw_instruction = """[target_trainable_instruction]
[target_trainable_few_shot_examples]

SOAP note section:
[section]
Conversation snippet:
[Conv_snippet]

Output your summary.
Return the output as a dictionary object, adhering to the following structure:
{"summary": ...}
Please provide your response solely in the dictionary format without including any additional text.
"""

In [None]:
#p0
target_trainable_instruction = """In this task, we ask for your expertise in writing SOAP notes from the doctor-patient conversation.
Mainly we provide the target section in the SOAP note and the conversation snippet.
We need you to generate a summary for the respective snippet.

"""

#gradient
training_prompt_forward = """In this task, you need to provide suggestions to modify the instruction in our SOAP notes writing system, which uses a model to generate SOAP notes from the doctor-patient conversation according to manually created instructions.
Specifically, we feed the AI a conversation snippet and the target section in the SOAP note and ask it to generate the corresponding summary.
But we found that the instruction in the current system is not perfect, so we need you to modify the instruction for this model to improve our system.

The instruction now in our rating system:
[target_trainable_instruction]
SOAP note section for summary:
[section]
Conversation snippet for the model:
[Conv_snippet]
Current AI summary:
[AI_summary]
Reference summary:
[label_summary]

Here are some of the requirements you need to be aware of when suggesting the instruction modification in our system:
1) For better generalization, what you suggest should be abstracted as high-level criteria as much as possible instead of only describing the details
2) We will improve the instructions based on your suggestions. If I re-provide the system with the conversation snippet and the target section in the SOAP note, it needs to be able to generate the reference summary using your new suggested instructions.
3) The instruction now in our system is for the zero-shot setting, don't try to add any examples to the instruction.
4) We are currently only focusing on this target section, so you don't need to consider the situation of other sections in the SOAP note, just optimize the instructions completely for this section.

Let's think step by step. First, output your reasons for why the current instruction in the system cannot generate the correct reference summary, then output your suggestions to modify the instruction for our system.
Return the output as a dictionary object, adhering to the following structure:
{"reasons": ..., "suggestions": ...}
Ensure the 'suggestions' only includes text but not a list. Please provide your response solely in the dictionary format without including any additional text.
"""

#delta
training_prompt_backward_prefix = """In this task, you need to provide suggestions to modify the instruction in our SOAP notes writing system, which uses a model to generate SOAP notes from the doctor-patient conversation according to manually created instructions.
Specifically, we feed the AI a conversation snippet and the target section in the SOAP note and ask it to generate the corresponding summary.
But we found that the instruction in the current system is not perfect, so we need you to modify the instruction for this model to improve our system.

The instruction now in our system:
[target_trainable_instruction]
"""

#delta
training_prompt_backward_suggestions = """Suggestions from summary [i]:
[suggestions]
"""

#delta
training_prompt_backward_suffix = """
Here are some of the requirements you need to be aware of when modifying the instruction in our system:
1) For better generalization, what you suggest should be abstracted as high-level criteria as much as possible instead of only describing the details
2) We will improve the instructions based on your suggestions. If I re-provide the system with the conversation snippet and the target section in the SOAP note, it needs to be able to generate the reference summary using your new suggested instructions.
3) The instruction now in our system is for the zero-shot setting, don't try to add any examples to the instruction.
4) We are currently only focusing on this target section, so you don't need to consider the situation of other sections in the SOAP note, just optimize the instructions completely for this section.

Let's think step by step. First, briefly summarize the suggestions of all the data to get a final suggestion containing only the highest priority requirement, then output your modified instruction for our system based on the final suggestion.
Return the output as a dictionary object, adhering to the following structure:
{"final suggestion": ..., "new instruction": ...}
Please provide your response solely in the dictionary format without including any additional text.
"""

In [None]:
def dataloader(train_df, bsz,
               target_trainable_instruction, rating_raw_instruction,
               target_trainable_few_shot_examples, do_few_shot,
               ngram_eval, factev,
               sample_mode='random'):

    #samples at random bsz amount of rows
    if sample_mode == 'random':
        sampled_data = train_df.sample(n=bsz)

    if sample_mode == 'hard_negative':
        print("----------------------------------------------------------------------------------------------------------------")
        print("eval results on all TRAIN DATA because of hard_negative sampling")
        print("----------------------------------------------------------------------------------------------------------------")

        eval_dict = eval_loop(train_df, target_trainable_instruction, rating_raw_instruction,
                              target_trainable_few_shot_examples, do_few_shot,
                              ngram_eval, factev, eval_training_step=True)

        # Find indices with different values
        different_indices = [i for i in range(len(eval_dict['labels'])) if eval_dict['labels'][i] != eval_dict['preds'][i]]
        print('hard_negative target datapoints:', different_indices)

        sampled_data = train_df.sample(n=bsz, weights=weights)

    return sampled_data

In [None]:
# input will be p0, x --> y^
def do_summarize(target_trainable_instruction, rating_raw_instruction, section, Conv_snippet,
              target_trainable_few_shot_examples='', do_few_shot=False):
    instruction = rating_raw_instruction.replace('[target_trainable_instruction]', target_trainable_instruction) #p0
    if do_few_shot:
        instruction = instruction.replace('[target_trainable_few_shot_examples]', target_trainable_few_shot_examples)
    else:
        instruction = instruction.replace('[target_trainable_few_shot_examples]', '')
    instruction = instruction.replace('[section]', section) #x
    instruction = instruction.replace('[Conv_snippet]', Conv_snippet) #x
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k",#gpt-3.5-turbo-16k
        messages=[
            {"role": "system", "content": instruction}
        ],
        temperature=0
    )

    summary = json.loads(response["choices"][0]["message"]["content"], strict=False)['summary'] #y^
    return summary

In [None]:
#logs training
def eval_log(logging, epoch, eval_dict):
    print('epoch:', epoch)

    # Check if the epoch exists in the logging dictionary
    if 'epoch'+str(epoch) not in logging:
        logging['epoch'+str(epoch)]= {}  # Create a new dictionary for the epoch

    for k, v in eval_dict.items():
        logging['epoch'+str(epoch)][k] = v
        if k != 'labels' and k != 'preds':
            print('\t', k, v)

def eval_one_step(eval_data, target_trainable_instruction, summarize_raw_instruction, target_trainable_few_shot_examples='', do_few_shot=False):
    section = eval_data['section']
    Conv_snippet = eval_data['Conv_snippet']

    # get the model's rating on training data before training
    curr_summary = do_summarize(target_trainable_instruction,
                                summarize_raw_instruction,
                                section, Conv_snippet,
                                target_trainable_few_shot_examples, do_few_shot)

    return curr_summary

def eval_loop(eval_df, target_trainable_instruction, summarize_raw_instruction, target_trainable_few_shot_examples, do_few_shot, ngram_eval, factev, eval_training_step=False):
    summary_gpt = []
    summary_doctor = []
    for eval_step in tqdm(range(eval_df.shape[0]), desc="Evaluation"):
        eval_data = eval_df.iloc[eval_step]
        try:
            curr_summary = eval_one_step(eval_data, target_trainable_instruction, summarize_raw_instruction, target_trainable_few_shot_examples, do_few_shot)
            summary_gpt.append(curr_summary)
            summary_doctor.append(eval_df.iloc[eval_step]['summary'])
        except:
            continue

    # eval generated critique
    eval_dict = ngram_eval.run_all_evaluation(summary_doctor, summary_gpt)
    UMLS_dict = factev.run_source_concept_faithfulness(ref_sums = summary_doctor, gen_sums = summary_gpt)
    del UMLS_dict['pred_concepts_term']
    del UMLS_dict['pred_concepts_cuis']
    eval_dict.update(UMLS_dict)

    eval_dict = {'summary_'+k: round(v, 4) for k, v in eval_dict.items()}

    eval_dict['labels'] = summary_doctor
    eval_dict['preds'] = summary_gpt

    return eval_dict

In [None]:
def training_forward_step(training_prompt_forward, target_trainable_instruction,
                          section, Conv_snippet,
                          AI_summary,
                          label_summary,
                          learning_temperature_rate=0):
    instruction = training_prompt_forward.replace('[target_trainable_instruction]', target_trainable_instruction)
    instruction = instruction.replace('[section]', section)
    instruction = instruction.replace('[Conv_snippet]', Conv_snippet)
    instruction = instruction.replace('[AI_summary]', AI_summary)
    instruction = instruction.replace('[label_summary]', label_summary)

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k", #gpt-4
        messages=[
            {"role": "system", "content": instruction}
        ],
        temperature=learning_temperature_rate
    )

    suggestions = json.loads(response["choices"][0]["message"]["content"], strict=False)['suggestions']

    return suggestions

def training_backward_step(training_prompt_backward_prefix,
                           training_prompt_backward_suggestions,
                           training_prompt_backward_suffix,
                           target_trainable_instruction,
                           bsz, bsz_suggestion,
                           learning_temperature_rate=0):
    # make backward instruction with prefix, suggestions, and suffix
    instruction = training_prompt_backward_prefix.replace('[target_trainable_instruction]', target_trainable_instruction) #p0
    for i in range(bsz): #g
        suggestions_instruction = training_prompt_backward_suggestions.replace('[i]', str(i+1))
        suggestions_instruction = suggestions_instruction.replace('[suggestions]', bsz_suggestion[i]['suggestions'])
        instruction = instruction + suggestions_instruction
    instruction = instruction + training_prompt_backward_suffix

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k", #gpt-4
        messages=[
            {"role": "system", "content": instruction}
        ],
        temperature=learning_temperature_rate
    )

    response = json.loads(response["choices"][0]["message"]["content"], strict=False)

    new_target_trainable_instruction = response['new instruction']
    final_suggestion = response['final suggestion']
    print("final suggestion in this step: ", final_suggestion)

    return new_target_trainable_instruction

def train_one_step(epoch, step, training_data, bsz, target_trainable_instruction,
                   summarize_raw_instruction,
                   logging, learning_temperature_rate=0):

    # get forward suggestions (cal loss)
    bsz_suggestions = []
    for i in tqdm(range(bsz), desc="batch cal loss"):
        section = training_data.iloc[i]['section'] #x
        Conv_snippet = training_data.iloc[i]['Conv_snippet'] #x
        label_summary = training_data.iloc[i]['summary'] #y

        # get the model's summary on training data before training --> y^
        AI_summary = do_summarize(target_trainable_instruction,
                                  summarize_raw_instruction,
                                  section, Conv_snippet)

        # get suggestions for every data in batch --> input: p0, x, y^, y --> g
        suggestions = training_forward_step(training_prompt_forward, target_trainable_instruction, section,
                                     Conv_snippet, AI_summary, label_summary,
                                     learning_temperature_rate)

        bsz_suggestions.append({'label_summary': label_summary,
                                'AI_summary': AI_summary,
                                'suggestions': suggestions})

    # make backward update
    new_target_trainable_instruction = training_backward_step(training_prompt_backward_prefix,
                                                              training_prompt_backward_suggestions,
                                                              training_prompt_backward_suffix,
                                                              target_trainable_instruction,
                                                              bsz,
                                                              bsz_suggestions,
                                                              learning_temperature_rate)
    #update to get new target_trainable_instruction
    target_trainable_instruction = new_target_trainable_instruction
    is_updated = True

    return target_trainable_instruction, is_updated

In [None]:
def train_loop(train_df, eval_df, target_trainable_instruction, summarize_raw_instruction,
               logging, ngram_eval, factev,
               target_trainable_few_shot_examples='', do_few_shot=False,
               EPOCH=1, steps_per_epoch=5, bsz=10,
               eval_at_beginning=False, dataloader_sample_mode='random', learning_temperature_rate=0):

    #does evaluatoin on the original prompt before any training
    if eval_at_beginning:
        print("the init target_trainable_instruction is:")
        print(target_trainable_instruction)
        if do_few_shot:
            print("the init target_trainable_few_shot_examples is:")
            print(target_trainable_few_shot_examples)

        print("----------------------------------------------------------------------------------------------------------------")
        print("eval results on all TRAINING DATA in the beginning")
        print("----------------------------------------------------------------------------------------------------------------")
        eval_dict = eval_loop(train_df, target_trainable_instruction, summarize_raw_instruction,
                              target_trainable_few_shot_examples, do_few_shot,
                              ngram_eval, factev, eval_training_step=False)
        eval_log(logging, -1, eval_dict)

        print("----------------------------------------------------------------------------------------------------------------")
        print("eval results on all EVAL DATA in the beginning")
        print("----------------------------------------------------------------------------------------------------------------")
        eval_dict = eval_loop(eval_df, target_trainable_instruction, summarize_raw_instruction,
                              target_trainable_few_shot_examples, do_few_shot,
                              ngram_eval, factev, eval_training_step=False)
        eval_log(logging, -1, eval_dict)

    for epoch in range(EPOCH):
        print(str(epoch), "EPOCH BEGIN--------------------------------------------------------------------------------------------------")
        print(str(epoch), "EPOCH BEGIN--------------------------------------------------------------------------------------------------")
        print(str(epoch), "EPOCH BEGIN--------------------------------------------------------------------------------------------------")

        # Check if the epoch exists in the logging dictionary
        if 'epoch'+str(epoch) not in logging:
            logging['epoch'+str(epoch)] = {}  # Create a new dictionary for the epoch

        any_change_in_this_epoch = False

        for train_step in tqdm(range(steps_per_epoch), desc="Training"):
            print("------------------------------------------------------------------")
            print("START NEW TRAINING STEP")
            print("------------------------------------------------------------------")

            # try:
            # load training data for the epoch
            training_data = dataloader(train_df, bsz,
                                       target_trainable_instruction, summarize_raw_instruction,
                                       target_trainable_few_shot_examples, do_few_shot,
                                       ngram_eval, factev,
                                       dataloader_sample_mode)

            #-----------
            print("training metrics: before training step")
            eval_dict = eval_loop(training_data, target_trainable_instruction, summarize_raw_instruction,
                                  target_trainable_few_shot_examples, do_few_shot,
                                  ngram_eval, factev, eval_training_step=True)
            eval_log(logging, epoch, eval_dict)
            #-----------

            print('training section:', training_data.index.tolist())

            #train the "instruction"
            if not do_few_shot:
                target_trainable_instruction, is_updated = train_one_step(epoch, train_step, training_data, bsz,
                                                                          target_trainable_instruction,
                                                                          summarize_raw_instruction,
                                                                          logging, learning_temperature_rate=learning_temperature_rate)

            #train the few-shot-examples
            if do_few_shot:
                target_trainable_few_shot_examples = train_one_step_for_few_shot_example(epoch, train_step,
                                                                                         training_data,
                                                                                         bsz,
                                                                                         target_trainable_instruction,
                                                                                         target_trainable_few_shot_examples,
                                                                                         summarize_raw_instruction,
                                                                                         logging,
                                                                                         learning_temperature_rate=0)
                is_updated = True

            if is_updated:
                any_change_in_this_epoch = True

            #-----------
            print("training metrics: after training step")
            eval_dict = eval_loop(training_data, target_trainable_instruction, summarize_raw_instruction,
                                  target_trainable_few_shot_examples, do_few_shot,
                                  ngram_eval, factev, eval_training_step=True)
            eval_log(logging, epoch, eval_dict)
            #-----------
            # except:
            #     print('Encounter some errors from OpenAI API')
            #     # print('Encounter some errors from OpenAI API, start to sleep 60s...')
            #     # time.sleep(60)
            #     # print('End sleep, resume the training')
            #     continue


            print("------------------------------------------------------------------")
            print("END THIS TRAINING STEP")
            print("------------------------------------------------------------------")

        print(str(epoch), "EPOCH END----------------------------------------------------------------------------------------------------")
        print(str(epoch), "EPOCH END----------------------------------------------------------------------------------------------------")
        print(str(epoch), "EPOCH END----------------------------------------------------------------------------------------------------")

        # do eval
        if any_change_in_this_epoch:
            print("----------------------------------------------------------------------------------------------------------------")
            print("eval results on all TRAINING DATA for EPOCH", str(epoch))
            print("----------------------------------------------------------------------------------------------------------------")
            eval_dict = eval_loop(train_df, target_trainable_instruction, summarize_raw_instruction,
                                  target_trainable_few_shot_examples, do_few_shot,
                                  ngram_eval, factev, eval_training_step=False)
            eval_log(logging, epoch, eval_dict)

            print("----------------------------------------------------------------------------------------------------------------")
            print("eval results on all EVAL DATA for EPOCH", str(epoch))
            print("----------------------------------------------------------------------------------------------------------------")
            eval_dict = eval_loop(eval_df, target_trainable_instruction, summarize_raw_instruction,
                                  target_trainable_few_shot_examples, do_few_shot,
                                  ngram_eval, factev, eval_training_step=False)
            eval_log(logging, epoch, eval_dict)

            print("after curr epoch, the target_trainable_instruction is:")
            print(target_trainable_instruction)
            if do_few_shot:
                print("after curr epoch, the target_trainable_few_shot_examples is:")
                print(target_trainable_few_shot_examples)

    return target_trainable_instruction, target_trainable_few_shot_examples

In [20]:
ngram_eval = AutomaticNgramEval()
# factev = AutomaticFactEval()
factev=None

target_trainable_instruction, target_trainable_few_shot_examples = train_loop(train_df, eval_df,
                                                                              target_trainable_instruction,
                                                                              summarize_raw_instruction, logging,
                                                                              ngram_eval, factev,
                                                                              target_trainable_few_shot_examples='',
                                                                              do_few_shot=False,
                                                                              EPOCH=1, steps_per_epoch=5, bsz=10,
                                                                              eval_at_beginning=True,
                                                                              dataloader_sample_mode='random',
                                                                              learning_temperature_rate=0)

the init target_trainable_instruction is:
In this task, we ask for your expertise in writing SOAP notes from the doctor-patient conversation.
Mainly we provide the target section in the SOAP note and the conversation snippet.
We need you to generate a summary for the respective snippet.


----------------------------------------------------------------------------------------------------------------
eval results on all TRAINING DATA in the beginning
----------------------------------------------------------------------------------------------------------------


Evaluation:   0%|          | 0/23 [00:00<?, ?it/s]

ZeroDivisionError: ignored