# Conversation Closing Detector

In [77]:
import csv
import numpy as np
import pandas as pd
import json
import time
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from sklearn.metrics import accuracy_score, confusion_matrix

In [50]:
# Load environment variables from the .env file one directory above the notebook.
# NOTE(review): presumably this provides the OpenAI credentials that ChatOpenAI reads — confirm.
load_dotenv("../.env")

True

## Load Transcripts and Create Test Data

In [51]:
# Location of the patient records that carry the chat transcripts.
TRANSCRIPT_PATH = "../data/patients/patients_1.0_with_transcripts.json"

# Read the JSON explicitly as UTF-8 so that loading does not depend on the
# platform's default text encoding.
with open(TRANSCRIPT_PATH, "r", encoding="utf-8") as file:
    data = json.load(file)

# Map each record's own 'id' field to its transcript.
transcripts = {patient['id']: patient['chat_transcript'] for patient in data.values()}

First, we extract the 'AI:user' conversation segments (a doctor/AI message followed by the patient's reply) and save them so that they can be labeled manually.

In [54]:
def extract_segments(data):
    """
    Collect every pair of adjacent transcript lines in which a doctor line is
    immediately followed by a patient line.

    Args:
        data (dict): Mapping of patient key to a record that holds a
            "chat_transcript" sequence of strings.

    Returns:
        list: One dictionary per matching pair with the keys 'patient_id',
            'ai_output' and 'user_input'; the speaker prefixes are removed.
    """
    doctor_tag, patient_tag = "Doctor: ", "Patient: "

    def _is_exchange(previous_line, current_line):
        # Only a doctor line directly followed by a patient line counts as a segment.
        return previous_line.startswith(doctor_tag) and current_line.startswith(patient_tag)

    return [
        {
            'patient_id': patient_key,
            'ai_output': previous_line[len(doctor_tag):],
            'user_input': current_line[len(patient_tag):],
        }
        for patient_key, record in data.items()
        # Pair each line with its successor: (t[0], t[1]), (t[1], t[2]), ...
        for previous_line, current_line in zip(record["chat_transcript"], record["chat_transcript"][1:])
        if _is_exchange(previous_line, current_line)
    ]

# Extract segments
segments = extract_segments(data)

# Save segments to CSV for manual labeling
csv_file_path = '../data/interim/transcript_segments_for_labeling.csv'

# Write explicitly as UTF-8 so the file does not depend on the platform's
# default encoding (pandas.read_csv assumes UTF-8 when reading it back).
# newline='' lets the csv module control line endings itself.
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['patient_id', 'ai_output', 'user_input']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(segments)

print(f"Segments have been extracted and saved to {csv_file_path}")

Segments have been extracted and saved to ../data/interim/transcript_segments_for_labeling.csv


Once we have manually labeled the dataset, we can import it as our test data. The label column is `end` (1 = the conversation is closing, 0 = it is not).

In [55]:
# Load the manually labeled CSV file
csv_file_path = '../data/interim/transcript_segments_labeled.csv'
labeled_segments = pd.read_csv(csv_file_path)

# Fail fast if the labeled file lacks a column the experiment code reads,
# instead of discovering it after the model has already been called.
required_columns = {'patient_id', 'ai_output', 'user_input', 'end'}
missing_columns = required_columns - set(labeled_segments.columns)
assert not missing_columns, f"Labeled CSV is missing columns: {sorted(missing_columns)}"

## Define and Test Conversation Closure Detection Model

### Define Conversation Closing Detector

In [69]:
def create_prompt_template(prompt_text):
    """
    Wrap raw prompt text in a LangChain PromptTemplate.

    Args:
        prompt_text (str): Template text. It is registered with the
            "ai_output" and "user_input" input variables.

    Returns:
        PromptTemplate: The template object built from ``prompt_text``.
    """
    template_kwargs = {
        "template": prompt_text,
        "input_variables": ["ai_output", "user_input"],
    }
    return PromptTemplate(**template_kwargs)

def check_conversation_closing(chain, ai_output, user_input):
    """
    Check if the conversation is coming to a close based on the last AI output and user input.

    The chain's text reply is normalized before it is compared with "true":
    case and surrounding whitespace are ignored, an optional leading
    "Response:" label is removed, and surrounding quotes, backticks and
    trailing '.'/'!' are dropped. Replies such as "True", "'True'", "True."
    and "Response: True" therefore all count as a positive answer. Anything
    else (including free-form sentences) is treated as False.

    Args:
        chain: Object with an ``invoke(dict)`` method that returns the model's reply as a string.
        ai_output (str): The last AI output in the conversation.
        user_input (str): The last user input in the conversation.

    Returns:
        bool: True if the conversation is coming to a close, False otherwise.
    """
    result = chain.invoke({"ai_output": ai_output, "user_input": user_input})

    answer = result.strip().lower()

    # The few-shot prompts show answers as "Response: True", so the model may
    # echo that label in front of its verdict.
    label = "response:"
    if answer.startswith(label):
        answer = answer[len(label):]

    # Remove whitespace, quoting and sentence punctuation around the verdict.
    answer = answer.strip(" \t\r\n'\"`.!")

    return answer == 'true'

### Define Experiment Functions

In [78]:
def run_experiment(prompt_text, model, temperature):
    """
    Evaluate one prompt/model/temperature configuration on the labeled segments.

    Every row of the module-level ``labeled_segments`` DataFrame is sent through
    the chain, and the predictions are scored against its ``end`` column.

    Args:
        prompt_text (str): The text of the prompt template to use in the experiment.
        model (str): Name of the OpenAI chat model; passed to ``ChatOpenAI`` as ``model_name``.
        temperature (float): The temperature setting for the model.

    Returns:
        tuple: ``(accuracy, conf_matrix, y_pred, response_times)`` where
            ``accuracy`` is the value returned by ``accuracy_score``,
            ``conf_matrix`` is a 2x2 DataFrame (rows = true label, columns =
            predicted label, both ordered 0 then 1), ``y_pred`` is the list of
            0/1 predictions and ``response_times`` holds the per-row latency
            in seconds.
    """
    # Build the chat model under its own name instead of rebinding the `model` argument.
    llm = ChatOpenAI(model_name=model, temperature=temperature)

    # Create the prompt template and chain: prompt -> model -> plain string.
    chain = create_prompt_template(prompt_text) | llm | StrOutputParser()

    # Prepare the data for testing
    y_true = labeled_segments['end'].tolist()
    y_pred = []
    response_times = []

    for ai_output, user_input in zip(labeled_segments['ai_output'], labeled_segments['user_input']):
        # perf_counter is monotonic, so the interval is unaffected by system clock changes.
        start_time = time.perf_counter()
        prediction = check_conversation_closing(chain, ai_output, user_input)
        response_times.append(time.perf_counter() - start_time)
        y_pred.append(int(prediction))

    # Calculate accuracy and confusion matrix. Fixing the label order keeps the
    # matrix 2x2 even when a run produces (or the data contains) a single class.
    # Assumes the `end` column holds 0/1 labels, matching the int(bool) predictions.
    accuracy = accuracy_score(y_true, y_pred)
    conf_matrix = pd.DataFrame(confusion_matrix(y_true, y_pred, labels=[0, 1]))

    return accuracy, conf_matrix, y_pred, response_times

def print_misclassifications(y_true, y_pred, labeled_segments):
    """
    Print out cases where the true labels and predicted labels do not match.

    Rows are looked up by position, so the i-th label pair is always reported
    together with the i-th row of ``labeled_segments`` regardless of the
    DataFrame's index (for example after filtering or sampling).

    Args:
        y_true (list): List of true labels.
        y_pred (list): List of predicted labels, aligned with ``y_true``.
        labeled_segments (DataFrame): DataFrame containing the labeled segments, in the
            same row order as the labels, with the columns 'patient_id',
            'ai_output' and 'user_input'.
    """
    for position, (true_label, pred_label) in enumerate(zip(y_true, y_pred)):
        if true_label == pred_label:
            continue

        # `position` comes from enumerate, i.e. it is positional: use .iloc, not .loc.
        print(f"Index: {position}")
        print(f"Patient ID: {labeled_segments['patient_id'].iloc[position]}")
        print(f"AI Output: {labeled_segments['ai_output'].iloc[position]}")
        print(f"User Input: {labeled_segments['user_input'].iloc[position]}")
        print(f"True Label: {true_label}, Predicted Label: {pred_label}")
        print("---")

### Run Base Experiment

In [79]:
# Base Experiment
# Few-shot prompt with one closing example (Response: True) and one non-closing
# example (Response: False). The {ai_output} and {user_input} placeholders are the
# input variables declared in create_prompt_template.
base_prompt = """
You are analyzing a conversation between a doctor and a patient. Based on the last user input and the previous AI output, determine if the conversation is coming to a close. Respond with 'True' or 'False'.

Examples:

Example 1:
AI Output: Based on our conversation, Kevin, it seems you are mainly experiencing tiredness and leg swelling, and you are currently taking Furosemide, Spironolactone, and fish oil for your heart condition. Is there anything else you would like to share regarding your symptoms, vital signs, or medications?
User Input: No, Doctor, I think that covers everything for now. Thank you for checking in on me.
Response: True

Example 2:
AI Output: Based on our conversation, Kevin, it seems like you are mainly experiencing tiredness and leg swelling. Could you please provide your latest vital signs, starting with your temperature?
User Input: My temperature is 97.4 degrees, Doctor.
Response: False

Now, analyze the following conversation and determine if it is coming to a close.

AI Output:
{ai_output}

User Input:
{user_input}
"""
# Baseline model and temperature; the later experiments reuse these names.
base_model, base_temperature = "gpt-3.5-turbo", 0.7

accuracy, conf_matrix, y_pred, response_times = run_experiment(
    prompt_text=base_prompt,
    model=base_model,
    temperature=base_temperature,
)

# Headline numbers first, then the rich-rendered confusion matrix.
print(
    "Base Experiment Results:",
    f"Accuracy: {accuracy}",
    "Confusion Matrix",
    sep="\n",
)
display(conf_matrix)

# List every segment whose prediction disagrees with the manual label.
print_misclassifications(labeled_segments['end'].tolist(), y_pred, labeled_segments)

# Mean latency per model call.
avg_response_time = sum(response_times) / len(response_times)
print(f"Average response time: {avg_response_time:.4f} seconds")

Base Experiment Results:
Accuracy: 0.872
Confusion Matrix


Unnamed: 0,0,1
0,420,64
1,0,16


Index: 7
Patient ID: 12305811
AI Output: Thank you for letting me know, Kevin. Lastly, have you felt more tired than usual or experienced any sudden changes in your mental clarity?
User Input: No, Doctor, my mental clarity has been the same. Just feeling a bit more tired than usual.
True Label: 0, Predicted Label: 1
---
Index: 12
Patient ID: 12305811
AI Output: Thank you for sharing your oxygen saturation level, Kevin. Lastly, could you provide me with your blood pressure reading?
User Input: My blood pressure is 127/70, Doctor.
True Label: 0, Predicted Label: 1
---
Index: 13
Patient ID: 12305811
AI Output: Based on your responses, Kevin, your vital signs seem stable. Now, let's review the medications you are currently taking. Are you on any of the following medications: ACE inhibitors, Angiotensin II Receptor Blockers, ARNI, Beta-Blockers, Diuretics, Mineralocorticoid Receptor Antagonists, Hydralazine, Nitrate medications, Ivabradine, SGLT2 inhibitors, or GLP-1 agonists? Please confir

### Experiment 2: Expanded Prompts

In [80]:
# Expanded few-shot prompt: the two examples of the base prompt plus three further
# non-closing examples (Examples 3-5), and explicit "(AI)" / "(user)" role hints.
# NOTE(review): Examples 3-5 appear to match segments printed as misclassifications
# from the labeled set; if so, they overlap the evaluation data and will inflate
# the measured accuracy — confirm and consider excluding them when scoring.
prompt_var_1 = """
You are analyzing a conversation between a doctor (AI) and a patient (user). Based on the last user input and the previous AI output, determine if the conversation is coming to a close. Respond with 'True' or 'False'.

Examples:

Example 1:
AI Output: Based on our conversation, Kevin, it seems you are mainly experiencing tiredness and leg swelling, and you are currently taking Furosemide, Spironolactone, and fish oil for your heart condition. Is there anything else you would like to share regarding your symptoms, vital signs, or medications?
User Input: No, Doctor, I think that covers everything for now. Thank you for checking in on me.
Response: True

Example 2:
AI Output: Based on our conversation, Kevin, it seems like you are mainly experiencing tiredness and leg swelling. Could you please provide your latest vital signs, starting with your temperature?
User Input: My temperature is 97.4 degrees, Doctor.
Response: False

Example 3:
AI Output: Thank you for sharing your current medications, Kevin. Is there any other medication you are taking for your heart condition or any other health issue?
User Input: No, Doctor, those are the main ones for my heart. I also take some fish oil for general health.
Response: False

Example 4:
AI Output: Thank you for sharing your oxygen saturation level. Lastly, could you please provide your blood pressure reading for today?
User Input: My blood pressure today is 123/56.
Response: False

Example 5:
AI Output: Based on your responses, it seems like you're experiencing ankle swelling and the need to prop yourself up at night to breathe comfortably. Let's continue with your medications. Besides beta-blockers and diuretics, are you taking any other medications currently?
User Input: No, those are the main ones I'm taking right now. Is there anything specific you want to know about them?
Response: False

Now, analyze the following conversation and determine if it is coming to a close.

AI Output:
{ai_output}

User Input:
{user_input}
"""

# Same model and temperature as the base experiment; only the prompt changes.
accuracy, conf_matrix, y_pred, response_times = run_experiment(
    prompt_text=prompt_var_1,
    model=base_model,
    temperature=base_temperature,
)

print(
    f"Results for Prompt Variation:\n{prompt_var_1}",
    f"Accuracy: {accuracy}",
    f"Confusion Matrix:\n{conf_matrix}",
    sep="\n",
)

# List every segment whose prediction disagrees with the manual label.
print_misclassifications(labeled_segments['end'].tolist(), y_pred, labeled_segments)

# Mean latency per model call, followed by the section separator.
avg_response_time = sum(response_times) / len(response_times)
print(f"Average response time: {avg_response_time:.4f} seconds", "---", sep="\n")

Results for Prompt Variation:

You are analyzing a conversation between a doctor (AI) and a patient (user). Based on the last user input and the previous AI output, determine if the conversation is coming to a close. Respond with 'True' or 'False'.

Examples:

Example 1:
AI Output: Based on our conversation, Kevin, it seems you are mainly experiencing tiredness and leg swelling, and you are currently taking Furosemide, Spironolactone, and fish oil for your heart condition. Is there anything else you would like to share regarding your symptoms, vital signs, or medications?
User Input: No, Doctor, I think that covers everything for now. Thank you for checking in on me.
Response: True

Example 2:
AI Output: Based on our conversation, Kevin, it seems like you are mainly experiencing tiredness and leg swelling. Could you please provide your latest vital signs, starting with your temperature?
User Input: My temperature is 97.4 degrees, Doctor.
Response: False

Example 3:
AI Output: Thank you

### Experiment 3: Model Variations

In [81]:
# Compare model snapshots while holding the base prompt and temperature fixed.
model_variations = [
    "gpt-3.5-turbo-0125",
    "gpt-3.5-turbo-1106",
    "gpt-3.5-turbo",
]

for model_name in model_variations:
    accuracy, conf_matrix, y_pred, response_times = run_experiment(
        prompt_text=base_prompt,
        model=model_name,
        temperature=base_temperature,
    )
    print(
        f"Results for Model: {model_name}",
        f"Accuracy: {accuracy}",
        f"Confusion Matrix:\n{conf_matrix}",
        sep="\n",
    )
    print_misclassifications(labeled_segments['end'].tolist(), y_pred, labeled_segments)

    # Mean latency per call, then the separator and a blank line between runs.
    avg_response_time = sum(response_times) / len(response_times)
    print(f"Average response time: {avg_response_time:.4f} seconds", "---", "", sep="\n")

Results for Model: gpt-3.5-turbo-0125
Accuracy: 0.86
Confusion Matrix:
     0   1
0  414  70
1    0  16
Index: 7
Patient ID: 12305811
AI Output: Thank you for letting me know, Kevin. Lastly, have you felt more tired than usual or experienced any sudden changes in your mental clarity?
User Input: No, Doctor, my mental clarity has been the same. Just feeling a bit more tired than usual.
True Label: 0, Predicted Label: 1
---
Index: 14
Patient ID: 12305811
AI Output: Thank you for sharing your current medications, Kevin. Is there any other medication you are taking for your heart condition or any other health issue?
User Input: No, Doctor, those are the main ones for my heart. I also take some fish oil for general health.
True Label: 0, Predicted Label: 1
---
Index: 35
Patient ID: 14185111
AI Output: Based on your responses, it seems like you're experiencing ankle swelling and the need to prop yourself up at night to breathe comfortably. Let's continue with your medications. Besides beta-b

### Experiment 4: Temperature Variations

In [82]:
# Temperature Variations
# Sweep the sampling temperature while holding the base prompt and model fixed.
temperature_variations = [0.0, 0.3, 0.7]

for temperature in temperature_variations:
    accuracy, conf_matrix, y_pred, response_times = run_experiment(
        prompt_text=base_prompt,
        model=base_model,
        temperature=temperature,
    )
    print(
        f"Results for Temperature: {temperature}",
        f"Accuracy: {accuracy}",
        f"Confusion Matrix:\n{conf_matrix}",
        sep="\n",
    )
    print_misclassifications(labeled_segments['end'].tolist(), y_pred, labeled_segments)

    # Mean latency per call, then the separator and a blank line between runs.
    avg_response_time = sum(response_times) / len(response_times)
    print(f"Average response time: {avg_response_time:.4f} seconds", "---", "", sep="\n")

Results for Temperature: 0.0
Accuracy: 0.882
Confusion Matrix:
     0   1
0  425  59
1    0  16
Index: 12
Patient ID: 12305811
AI Output: Thank you for sharing your oxygen saturation level, Kevin. Lastly, could you provide me with your blood pressure reading?
User Input: My blood pressure is 127/70, Doctor.
True Label: 0, Predicted Label: 1
---
Index: 14
Patient ID: 12305811
AI Output: Thank you for sharing your current medications, Kevin. Is there any other medication you are taking for your heart condition or any other health issue?
User Input: No, Doctor, those are the main ones for my heart. I also take some fish oil for general health.
True Label: 0, Predicted Label: 1
---
Index: 33
Patient ID: 14185111
AI Output: Thank you for sharing your oxygen saturation level. Lastly, could you please provide your blood pressure for today?
User Input: My blood pressure today is 123/56.
True Label: 0, Predicted Label: 1
---
Index: 35
Patient ID: 14185111
AI Output: Based on your responses, it 