# Generate Experiment Summaries

This notebook generates structured summaries and PDF reports for the transcripts produced by the `chatbot_patient_scenarios` notebook.

### Import and Setup

In [1]:
# Import necessary libraries
import os
import random
import copy
import datetime
import json
import typing
from dotenv import load_dotenv
from langchain.prompts import ChatPromptTemplate
from langchain_core.messages import BaseMessage
from langchain_openai import ChatOpenAI
from PIL import Image as PILImage
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.pdfgen import canvas
from reportlab.platypus import Paragraph


from reco_analysis.data_model import data_models
from reco_analysis.summarizer_app import post_office, report_maker, summarizer_engine, data_type
from reco_analysis.summarizer_app.prompts import system_message_summarize_json



[32m2024-07-25 19:46:57.682[0m | [1mINFO    [0m | [36mreco_analysis.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/anniefriar/Desktop/Berkeley/DATASCI_210/reco/reco_analysis[0m


In [2]:
# Load environment variables from the .env file one directory above the notebook;
# the call's boolean result is displayed as the cell output.
load_dotenv(dotenv_path="../.env")

True

In [3]:
# Default chat model for summarization: GPT-3.5 Turbo at temperature 0.0
default_model = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.0,
)

In [4]:
# Path (relative to the notebook) of the logo image drawn in the PDF report header
logo_path = "../docs/reco_logo.jpeg"

### Set Up and Test Summary & PDF Generators

#### Set Up Functions

In [5]:
def summarize(
    patient_transcript: list[str],
    model: ChatOpenAI = default_model,
    system_prompt: str = system_message_summarize_json,
) -> typing.Tuple[data_type.TranscriptSummary, BaseMessage]:
    """Summarize a patient transcript into a structured ``TranscriptSummary``.

    The transcript lines are joined with newlines and sent to ``model`` as the
    user message, preceded by ``system_prompt``. The reply is expected to be a
    JSON object, optionally wrapped in a Markdown ``json`` code fence.

    Args:
        patient_transcript (list[str]): The patient transcript to summarize.
        model (ChatOpenAI, optional): The model to use for summarization.
            Defaults to default_model.
        system_prompt (str, optional): The system prompt to use. It is still
            treated as a prompt template, so literal braces in it must be
            escaped as ``{{`` / ``}}``.

    Returns:
        A ``(summary, response)`` tuple: the parsed ``TranscriptSummary`` and
        the raw message returned by the model.

    Raises:
        ValueError: If the model output cannot be decoded as JSON.
        KeyError: If the decoded JSON lacks ``patient_overview``,
            ``current_symptoms``, ``current_medications`` or ``summary``.
    """
    # Pass the transcript as a template *variable* instead of baking it into the
    # template text, so literal "{" / "}" characters in the conversation are not
    # misread as placeholders when the prompt is formatted.
    prompt_template = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("user", "{transcript}"),
        ]
    )
    messages = prompt_template.format_messages(transcript="\n".join(patient_transcript))

    response = model.invoke(messages)
    result_summary = response.content

    try:
        # Process the result, remove markdown and convert to JSON
        processed_result = json.loads(
            result_summary.replace("```json", "").replace("```", "").replace("\n", "")
        )
    except json.JSONDecodeError as e:
        raise ValueError("Failed to decode JSON from model output") from e

    # A missing or null "vital_signs" entry is treated as "no vitals reported"
    # rather than crashing; every vital then falls back to None.
    raw_vitals = processed_result.get("vital_signs")
    vitals: typing.Dict[str, typing.Any] = raw_vitals if isinstance(raw_vitals, dict) else {}

    def get_vital(vital_name: str) -> typing.Any:
        """Return the named vital if it is a real number, otherwise None."""
        ret = vitals.get(vital_name, None)
        # bool is a subclass of int, so exclude it explicitly.
        if isinstance(ret, bool) or not isinstance(ret, (int, float)):
            return None
        return ret

    return (
        data_type.TranscriptSummary(
            patient_overview=processed_result["patient_overview"],
            current_symptoms=processed_result["current_symptoms"],
            vital_signs=data_type.VitalSigns(
                temperature=get_vital("temperature"),
                heart_rate=get_vital("heart_rate"),
                respiratory_rate=get_vital("respiratory_rate"),
                oxygen_saturation=get_vital("oxygen_saturation"),
                blood_pressure_systolic=get_vital("blood_pressure_systolic"),
                blood_pressure_diastolic=get_vital("blood_pressure_diastolic"),
                weight=get_vital("weight"),
            ),
            current_medications=processed_result["current_medications"],
            summary=processed_result["summary"],
        ),
        response,
    )

In [6]:
def create_patient_report(
    summary_data: data_type.TranscriptSummary,
    transcript: list[str],
    output_filename: str | None = None,
) -> bytes:
    """Create a PDF report summarizing the patient's conversation with the virtual doctor.

    The report starts with a logo/title header and the patient name, followed by
    one section each for patient overview, current symptoms, vital signs, current
    medications and summary. The full transcript is appended on a fresh page.

    Args:
        summary_data (data_type.TranscriptSummary): The summary data of the patient's conversation.
        transcript (list[str]): The transcript of the patient's conversation. The
            patient's name is read from the first line: the text from character
            offset 14 up to the first comma, split on whitespace (assumes a
            fixed-length greeting prefix -- TODO confirm against the transcript
            generator).
        output_filename (str | None): The output filename for the PDF report. When
            None or empty, nothing is saved to disk.

    Returns:
        The rendered PDF document as bytes.
    """
    # Local import: only this function needs XML escaping.
    from xml.sax.saxutils import escape

    c = canvas.Canvas(output_filename, pagesize=letter)

    # Only the pixel dimensions are needed, so release the image file right away.
    with PILImage.open(logo_path) as logo:
        img_width, img_height = logo.size

    # Parse the patient's name defensively: tolerate an empty transcript, a
    # missing comma and a single-word name instead of raising IndexError.
    first_line = transcript[0] if transcript else ""
    first_comma_pos = first_line.find(",")
    name_end = first_comma_pos if first_comma_pos != -1 else len(first_line)
    name_parts = first_line[14:name_end].split()
    patient_first_name = name_parts[0] if name_parts else "Unknown"
    patient_last_name = name_parts[1] if len(name_parts) > 1 else ""

    # Set font styles
    title_style = "Helvetica-Bold"
    section_title_style = "Helvetica-Bold"
    section_content_style = "Helvetica"

    # Draw logo and title
    title = "RECO Patient Report"
    c.setFont(title_style, 18)
    c.drawImage(logo_path, 50, 720, width=img_width / 4, height=img_height / 4)
    c.drawString(88, 727, title)
    c.line(50, 710, 550, 710)  # Draw a line under the title

    # Draw patient name; the conversation-date line (y=680) is intentionally
    # left blank for batch jobs.
    c.setFont(section_content_style, 11)
    if patient_last_name:
        patient_label = f"{patient_first_name}, {patient_last_name.upper()}"
    else:
        patient_label = patient_first_name
    c.drawString(50, 695, f"Patient: {patient_label}")

    # Vertical position for content
    y_position = 665

    # Define paragraph styles
    styles = getSampleStyleSheet()
    body_style = styles["Normal"]
    body_style.fontName = "Helvetica"
    body_style.fontSize = 11
    body_style.leading = 14

    bulleted_body_style = copy.deepcopy(body_style)
    bulleted_body_style.leftIndent = 10

    def start_new_page_if_needed(new_height):
        """Check if the content will fit on the current page, if not, start a new page"""
        nonlocal y_position
        if y_position - new_height < 50:
            c.showPage()
            y_position = letter[1] - 50

    def to_markup(text) -> str:
        """Escape XML-special characters, then turn newlines into <br/> tags.

        Paragraph parses its text as XML-like markup, so a raw "<", ">" or "&"
        coming from the model or the patient must be escaped first.
        """
        return escape(str(text)).replace("\n", "<br/>")

    vital_signs = summary_data.vital_signs

    # Falsy readings (None or 0) are rendered as "N/A".
    if vital_signs.blood_pressure_systolic and vital_signs.blood_pressure_diastolic:
        blood_pressure = (
            f"{vital_signs.blood_pressure_systolic}/{vital_signs.blood_pressure_diastolic}"
        )
    else:
        blood_pressure = "N/A"

    vitals_lines = "\n".join(
        [
            f"Temperature: {vital_signs.temperature or 'N/A'} °F",
            f"Heart Rate: {vital_signs.heart_rate or 'N/A'} bpm",
            f"Respiratory Rate: {vital_signs.respiratory_rate or 'N/A'} bpm",
            f"Oxygen Saturation: {vital_signs.oxygen_saturation or 'N/A'} %",
            "Blood Pressure: " + blood_pressure,
            f"Weight: {vital_signs.weight or 'N/A'} lbs",
        ]
    )

    # Iterate through the sections and draw each section
    for key, value in [
        ("Patient Overview", summary_data.patient_overview),
        ("Current Symptoms", summary_data.current_symptoms),
        ("Vital Signs", vitals_lines),
        ("Current Medications", summary_data.current_medications),
        ("Summary", summary_data.summary),
    ]:
        # Keep the heading with its content: break the page first if the
        # 40 units used by the heading block would cross the bottom margin.
        start_new_page_if_needed(40)

        # Section title
        c.setFont(section_title_style, 12)
        y_position -= 20  # Move down 20 units
        c.drawString(50, y_position, key.upper())

        # Section content
        c.setFont(section_content_style, 11)
        y_position -= 20  # Move down another 20 units for content

        if isinstance(value, str):  # patient overview, vital signs, summary
            summary_text = to_markup(value)

            if not summary_text:
                # An empty section used to raise IndexError on summary_text[-1].
                summary_text = "N/A"
            elif not summary_text.endswith("."):
                summary_text = summary_text + "."

            summary_paragraph = Paragraph(summary_text, body_style)

            width, height = summary_paragraph.wrap(500, 800)
            start_new_page_if_needed(height)
            summary_paragraph.drawOn(c, 50, y_position - height + 10)
            y_position -= height

        elif isinstance(value, list):  # current symptoms, current medications
            for line in value:
                bulleted_paragraph = Paragraph(
                    escape(str(line)), bulleted_body_style, bulletText="•"
                )
                width, height = bulleted_paragraph.wrap(500, 800)
                start_new_page_if_needed(height)
                bulleted_paragraph.drawOn(c, 50, y_position - height + 10)
                y_position -= height

    # Add the transcript to the end of the file
    c.showPage()  # Start a new page
    y_position = letter[1] - 50  # Reset y position for new page

    section_title = "Transcript"
    c.setFont(section_title_style, 12)
    y_position -= 20  # Move down 20 units
    c.drawString(50, y_position, section_title.upper())

    # Section content
    c.setFont(section_content_style, 11)
    y_position -= 20  # Move down another 20 units for content

    for transcript_line in transcript:
        summary_text = to_markup(transcript_line)

        # Upper-case the speaker label at the start of the line
        if summary_text.startswith("Doctor"):
            summary_text = "DOCTOR" + summary_text[6:]
        if summary_text.startswith("Patient"):
            summary_text = "PATIENT" + summary_text[7:]

        summary_paragraph = Paragraph(summary_text, body_style)
        width, height = summary_paragraph.wrap(500, 800)

        start_new_page_if_needed(height)
        summary_paragraph.drawOn(c, 50, y_position - height + 10)
        y_position -= height + 10  # Extra 10 units of spacing between turns

    # Save the PDF file
    if output_filename:
        c.save()

    return c.getpdfdata()

In [7]:
def path_maker(
        type: typing.Literal['transcripts', 'transcripts_eval', 'transcripts_eval_improvements', 'summaries', 'summaries_eval', 'summaries_eval_improvements', 'pdfs'],
        transcript_type: typing.Literal['full', 'short'],
        model_name: typing.Literal['3.5', '4o-mini', '4o'],
        patient_prompt: typing.Literal['base', 'reluctant', 'distracted'],
        doctor_prompt: typing.Literal['base', 'improved']
) -> str:
    """
    Creates a path for saving or loading files based on the type of file and the model and prompt used.

    Args:
        type: The type of file to create the path for (one of 'transcripts', 'transcripts_eval', 'transcripts_eval_improvements', 'summaries', 'summaries_eval', 'summaries_eval_improvements', 'pdfs').
        transcript_type: The type of transcript to create the path for (either 'full' or 'short'). Full is the full chat transcript, while short is the extracted chat transcript.
        model_name: The name of the model used in the transcript.
        patient_prompt: The type of patient prompt used in the transcript.
        doctor_prompt: The type of doctor prompt used in the transcript.

    Returns:
        A relative path string. For 'pdfs' this is a directory path ending in '/';
        for 'transcripts_eval' and 'summaries_eval' it is a '.csv' file path;
        for every other type it is a '.json' file path.

    Raises:
        ValueError: If ``type`` is not one of the supported file types.
    """
    # Folder for each supported file type
    folder_by_type = {
        'transcripts': '../data/transcripts',
        'transcripts_eval': '../data/evaluations/transcripts',
        'transcripts_eval_improvements': '../data/evaluations/transcripts',
        'summaries': '../data/summaries',
        'summaries_eval': '../data/evaluations/summaries',
        'summaries_eval_improvements': '../data/evaluations/summaries',
        'pdfs': '../data/pdfs',
    }

    # Fail with a clear message instead of an accidental UnboundLocalError
    if type not in folder_by_type:
        raise ValueError(
            f"Unknown file type {type!r}; expected one of {sorted(folder_by_type)}"
        )
    folder_path = folder_by_type[type]

    # Reformat fields into the short tokens used in file names
    model_name = 'gpt' + model_name.replace('4o-mini', '4o-m')
    patient_prompt = patient_prompt[:4] + 'pat'
    doctor_prompt = doctor_prompt[:4] + 'doc'

    stem = f"{folder_path}/{type}_{transcript_type}_{model_name}_{patient_prompt}_{doctor_prompt}"

    # Pdfs are unique: the path is a directory, not a file
    if type == 'pdfs':
        return f"{stem}/"

    # Set to csv if this is a eval file
    extension = 'csv' if type in ('transcripts_eval', 'summaries_eval') else 'json'

    return f"{stem}.{extension}"

#### Test Using a Random Patient

In [8]:
# Full-transcript / GPT-3.5 / baseline prompts: input transcripts file and PDF output folder
TRANSCRIPTS_1_PATH = path_maker(
    type='transcripts',
    transcript_type='full',
    model_name='3.5',
    patient_prompt='base',
    doctor_prompt='base',
)
TRANSCRIPTS_1_PDF_PATH = path_maker(
    type='pdfs',
    transcript_type='full',
    model_name='3.5',
    patient_prompt='base',
    doctor_prompt='base',
)

In [9]:
# Show the PDF output directory that path_maker produced for this experiment
print(TRANSCRIPTS_1_PDF_PATH)

../data/pdfs/pdfs_full_gpt3.5_basepat_basedoc/


In [10]:
# Make sure the PDF output directory exists and report which case applied
if os.path.exists(TRANSCRIPTS_1_PDF_PATH):
    print(f"Directory '{TRANSCRIPTS_1_PDF_PATH}' already exists.")
else:
    # Create the directory (including missing parents)
    os.makedirs(TRANSCRIPTS_1_PDF_PATH)
    print(f"Directory '{TRANSCRIPTS_1_PDF_PATH}' created successfully.")

Directory '../data/pdfs/pdfs_full_gpt3.5_basepat_basedoc/' already exists.


In [11]:
# Open and read the transcripts JSON file.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open(TRANSCRIPTS_1_PATH, 'r', encoding='utf-8') as json_file:
    patients = json.load(json_file)

In [12]:
# Pick one patient id at random (no seed is set in the visible cells, so the
# choice can differ between runs)
random_key = random.choice(list(patients))

In [13]:
# Conversation lines of the selected patient, stored under the 'chat_transcript' key
patient_transcript = patients[random_key]['chat_transcript']

In [14]:
# summarize returns a (TranscriptSummary, raw model response) tuple
patient_summary = summarize(patient_transcript=patient_transcript)

In [15]:
# Destination of the test PDF, placed inside the experiment's PDF folder
pdf_filename = "{}TEST_{}_pdf.pdf".format(TRANSCRIPTS_1_PDF_PATH, random_key)

In [16]:
# Uncomment to render the test PDF for the randomly selected patient:
# create_patient_report(patient_summary[0], patient_transcript, pdf_filename)

## Generate PDFs & Summaries

#### Generator Function

In [17]:
def generate_everything(transcripts_path, summaries_path, pdfs_path, model=default_model, n_transcripts=20):
    """Summarize transcripts and write one PDF report per patient plus one summaries file.

    Args:
        transcripts_path: JSON file mapping patient id -> record; each record
            must contain a 'chat_transcript' list.
        summaries_path: JSON file the collected summaries are written to
            (overwritten if it exists).
        pdfs_path: Directory that receives the per-patient PDF reports; created
            if missing.
        model: Chat model passed to ``summarize``. Defaults to default_model.
        n_transcripts: Maximum number of patients to process, taken in file order.
    """
    if not os.path.exists(pdfs_path):
        # Create the directory if it doesn't exist
        os.makedirs(pdfs_path)
        print(f"Directory '{pdfs_path}' created successfully.")
    else:
        print(f"Directory '{pdfs_path}' already exists.")

    # Open and read the JSON file (UTF-8 pinned so the platform default is not used)
    with open(transcripts_path, 'r', encoding='utf-8') as json_file:
        patients = json.load(json_file)

    # Experiment name = PDF directory name without its leading "<type>_" token, e.g.
    # ".../pdfs_short_gpt3.5_basepat_basedoc/" -> "short_gpt3.5_basepat_basedoc".
    # Working on the directory's base name keeps this correct when a parent
    # folder contains "_" or when pdfs_path has no trailing slash.
    pdf_dir_name = os.path.basename(os.path.normpath(pdfs_path))
    experiment_name = pdf_dir_name.partition("_")[2] or pdf_dir_name

    summaries = {}

    for index, (p_id, patient_record) in enumerate(patients.items()):
        if index >= n_transcripts:
            break

        p_transcript = patient_record['chat_transcript']

        print(f"Generating summary for patient {p_id}")

        # summarize returns (TranscriptSummary, raw model response); only the summary is used
        p_summary_output, _response = summarize(patient_transcript=p_transcript, model=model)

        summaries[str(p_id)] = {
            'id': p_id,
            'summary': p_summary_output.to_dict(),
        }

        print("Summary generation successful")

        print(f"Generating PDF for patient {p_id}")

        pdf_filename = os.path.join(pdfs_path, f"patient_{p_id}_{experiment_name}.pdf")

        create_patient_report(p_summary_output, p_transcript, pdf_filename)

    print("Summary and PDF generation complete")
    print()
    print("Saving all patient summaries in summary file")

    # Make sure the target folder exists before writing the summaries file
    summaries_dir = os.path.dirname(summaries_path)
    if summaries_dir:
        os.makedirs(summaries_dir, exist_ok=True)

    with open(summaries_path, 'w', encoding='utf-8') as json_file:
        json.dump(summaries, json_file)

#### Transcripts 2
- Naturally terminated transcripts
- GPT3.5-turbo
- Baseline patient prompt
- Baseline doctor prompt

In [18]:
# Short transcripts / GPT-3.5 / baseline patient / baseline doctor:
# input transcripts file, output summaries file and output PDF folder
TRANSCRIPTS_2_PATH = path_maker(
    type='transcripts', transcript_type='short', model_name='3.5',
    patient_prompt='base', doctor_prompt='base',
)
TRANSCRIPTS_2_SUMMARIES_PATH = path_maker(
    type='summaries', transcript_type='short', model_name='3.5',
    patient_prompt='base', doctor_prompt='base',
)
TRANSCRIPTS_2_PDF_PATH = path_maker(
    type='pdfs', transcript_type='short', model_name='3.5',
    patient_prompt='base', doctor_prompt='base',
)

In [19]:
# Summarize up to 20 transcripts of experiment 2 with the default model
generate_everything(
    transcripts_path=TRANSCRIPTS_2_PATH,
    summaries_path=TRANSCRIPTS_2_SUMMARIES_PATH,
    pdfs_path=TRANSCRIPTS_2_PDF_PATH,
    model=default_model,
    n_transcripts=20,
)

Directory '../data/pdfs/pdfs_short_gpt3.5_basepat_basedoc/' already exists.
Generating summary for patient 19597377
Summary generation successful
Generating PDF for patient 19597377
Generating summary for patient 14206800
Summary generation successful
Generating PDF for patient 14206800
Generating summary for patient 17072793
Summary generation successful
Generating PDF for patient 17072793
Generating summary for patient 14584705
Summary generation successful
Generating PDF for patient 14584705
Generating summary for patient 16521649
Summary generation successful
Generating PDF for patient 16521649
Generating summary for patient 14717765
Summary generation successful
Generating PDF for patient 14717765
Generating summary for patient 15343100
Summary generation successful
Generating PDF for patient 15343100
Generating summary for patient 13228928
Summary generation successful
Generating PDF for patient 13228928
Generating summary for patient 11922236
Summary generation successful
Genera

#### Transcripts 3
- Naturally terminated transcripts
- GPT4o-mini
- Baseline patient prompt
- Baseline doctor prompt

In [20]:
# Short transcripts / GPT-4o-mini / baseline patient / baseline doctor:
# input transcripts file, output summaries file and output PDF folder
TRANSCRIPTS_3_PATH = path_maker(
    type='transcripts', transcript_type='short', model_name='4o-mini',
    patient_prompt='base', doctor_prompt='base',
)
TRANSCRIPTS_3_SUMMARIES_PATH = path_maker(
    type='summaries', transcript_type='short', model_name='4o-mini',
    patient_prompt='base', doctor_prompt='base',
)
TRANSCRIPTS_3_PDF_PATH = path_maker(
    type='pdfs', transcript_type='short', model_name='4o-mini',
    patient_prompt='base', doctor_prompt='base',
)

In [21]:
# Summarizer model for this experiment: GPT-4o-mini at temperature 0.0
model_update = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0.0,
)

In [22]:
# Summarize up to 20 transcripts of experiment 3 with the model configured above
generate_everything(
    transcripts_path=TRANSCRIPTS_3_PATH,
    summaries_path=TRANSCRIPTS_3_SUMMARIES_PATH,
    pdfs_path=TRANSCRIPTS_3_PDF_PATH,
    model=model_update,
    n_transcripts=20,
)

Directory '../data/pdfs/pdfs_short_gpt4o-m_basepat_basedoc/' already exists.
Generating summary for patient 11280189
Summary generation successful
Generating PDF for patient 11280189
Generating summary for patient 14717765
Summary generation successful
Generating PDF for patient 14717765
Generating summary for patient 11477097
Summary generation successful
Generating PDF for patient 11477097
Generating summary for patient 19818404
Summary generation successful
Generating PDF for patient 19818404
Generating summary for patient 19973319
Summary generation successful
Generating PDF for patient 19973319
Generating summary for patient 12641479
Summary generation successful
Generating PDF for patient 12641479
Generating summary for patient 19124949
Summary generation successful
Generating PDF for patient 19124949
Generating summary for patient 18435540
Summary generation successful
Generating PDF for patient 18435540
Generating summary for patient 12246674
Summary generation successful
Gener

#### Transcripts 4

- Naturally terminated transcripts
- GPT4o
- Baseline patient prompt
- Baseline doctor prompt

In [23]:
# Short transcripts / GPT-4o / baseline patient / baseline doctor:
# input transcripts file, output summaries file and output PDF folder
TRANSCRIPTS_4_PATH = path_maker(
    type='transcripts', transcript_type='short', model_name='4o',
    patient_prompt='base', doctor_prompt='base',
)
TRANSCRIPTS_4_SUMMARIES_PATH = path_maker(
    type='summaries', transcript_type='short', model_name='4o',
    patient_prompt='base', doctor_prompt='base',
)
TRANSCRIPTS_4_PDF_PATH = path_maker(
    type='pdfs', transcript_type='short', model_name='4o',
    patient_prompt='base', doctor_prompt='base',
)

In [24]:
# Summarizer model for this experiment: GPT-4o at temperature 0.0
model_update = ChatOpenAI(
    model_name="gpt-4o",
    temperature=0.0,
)

In [25]:
# Summarize up to 20 transcripts of experiment 4 with the model configured above
generate_everything(
    transcripts_path=TRANSCRIPTS_4_PATH,
    summaries_path=TRANSCRIPTS_4_SUMMARIES_PATH,
    pdfs_path=TRANSCRIPTS_4_PDF_PATH,
    model=model_update,
    n_transcripts=20,
)

Directory '../data/pdfs/pdfs_short_gpt4o_basepat_basedoc/' already exists.
Generating summary for patient 17548402
Summary generation successful
Generating PDF for patient 17548402
Generating summary for patient 10045960
Summary generation successful
Generating PDF for patient 10045960
Generating summary for patient 18203000
Summary generation successful
Generating PDF for patient 18203000
Generating summary for patient 19854363
Summary generation successful
Generating PDF for patient 19854363
Generating summary for patient 14755254
Summary generation successful
Generating PDF for patient 14755254
Generating summary for patient 14030143
Summary generation successful
Generating PDF for patient 14030143
Generating summary for patient 14834029
Summary generation successful
Generating PDF for patient 14834029
Generating summary for patient 11684618
Summary generation successful
Generating PDF for patient 11684618
Generating summary for patient 16777967
Summary generation successful
Generat

#### Transcripts 5

- Naturally terminated transcripts
- GPT4o-mini
- Reluctant patient prompt
- Baseline doctor prompt

In [31]:
# Short transcripts / GPT-4o-mini / reluctant patient / baseline doctor:
# input transcripts file, output summaries file and output PDF folder
TRANSCRIPTS_5_PATH = path_maker(
    type='transcripts', transcript_type='short', model_name='4o-mini',
    patient_prompt='reluctant', doctor_prompt='base',
)
TRANSCRIPTS_5_SUMMARIES_PATH = path_maker(
    type='summaries', transcript_type='short', model_name='4o-mini',
    patient_prompt='reluctant', doctor_prompt='base',
)
TRANSCRIPTS_5_PDF_PATH = path_maker(
    type='pdfs', transcript_type='short', model_name='4o-mini',
    patient_prompt='reluctant', doctor_prompt='base',
)

In [32]:
# Summarizer model for this experiment: GPT-4o-mini at temperature 0.0
model_update = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0.0,
)

In [33]:
# Summarize up to 20 transcripts of experiment 5 with the model configured above
generate_everything(
    transcripts_path=TRANSCRIPTS_5_PATH,
    summaries_path=TRANSCRIPTS_5_SUMMARIES_PATH,
    pdfs_path=TRANSCRIPTS_5_PDF_PATH,
    model=model_update,
    n_transcripts=20,
)

Directory '../data/pdfs/pdfs_short_gpt4o-m_relupat_basedoc/' already exists.
Generating summary for patient 10112163
Summary generation successful
Generating PDF for patient 10112163
Generating summary for patient 14108973
Summary generation successful
Generating PDF for patient 14108973
Generating summary for patient 15393180
Summary generation successful
Generating PDF for patient 15393180
Generating summary for patient 11169394
Summary generation successful
Generating PDF for patient 11169394
Generating summary for patient 12007928
Summary generation successful
Generating PDF for patient 12007928
Generating summary for patient 17133133
Summary generation successful
Generating PDF for patient 17133133
Generating summary for patient 15862920
Summary generation successful
Generating PDF for patient 15862920
Generating summary for patient 15911683
Summary generation successful
Generating PDF for patient 15911683
Generating summary for patient 15132645
Summary generation successful
Gener

#### Transcripts 6

- Naturally terminated transcripts
- GPT4o-mini
- Distracted patient prompt
- Baseline doctor prompt

In [34]:
# Short transcripts / GPT-4o-mini / distracted patient / baseline doctor:
# input transcripts file, output summaries file and output PDF folder
TRANSCRIPTS_6_PATH = path_maker(
    type='transcripts', transcript_type='short', model_name='4o-mini',
    patient_prompt='distracted', doctor_prompt='base',
)
TRANSCRIPTS_6_SUMMARIES_PATH = path_maker(
    type='summaries', transcript_type='short', model_name='4o-mini',
    patient_prompt='distracted', doctor_prompt='base',
)
TRANSCRIPTS_6_PDF_PATH = path_maker(
    type='pdfs', transcript_type='short', model_name='4o-mini',
    patient_prompt='distracted', doctor_prompt='base',
)

In [35]:
# Summarizer model for this experiment: GPT-4o-mini at temperature 0.0
model_update = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0.0,
)

In [37]:
# Summarize up to 20 transcripts of experiment 6 with the model configured above
generate_everything(
    transcripts_path=TRANSCRIPTS_6_PATH,
    summaries_path=TRANSCRIPTS_6_SUMMARIES_PATH,
    pdfs_path=TRANSCRIPTS_6_PDF_PATH,
    model=model_update,
    n_transcripts=20,
)

Directory '../data/pdfs/pdfs_short_gpt4o-m_distpat_basedoc/' already exists.
Generating summary for patient 17521224
Summary generation successful
Generating PDF for patient 17521224
Generating summary for patient 17133133
Summary generation successful
Generating PDF for patient 17133133
Generating summary for patient 11642460
Summary generation successful
Generating PDF for patient 11642460
Generating summary for patient 14856000
Summary generation successful
Generating PDF for patient 14856000
Generating summary for patient 14015736
Summary generation successful
Generating PDF for patient 14015736
Generating summary for patient 14588689
Summary generation successful
Generating PDF for patient 14588689
Generating summary for patient 19442084
Summary generation successful
Generating PDF for patient 19442084
Generating summary for patient 17697993
Summary generation successful
Generating PDF for patient 17697993
Generating summary for patient 18670109
Summary generation successful
Gener