# Conversation Closing Detector

In [67]:
import csv
import numpy as np
import pandas as pd
import json
import time
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from sklearn.metrics import accuracy_score, confusion_matrix
import pprint

pp = pprint.PrettyPrinter(indent=2)

In [68]:
load_dotenv("../.env")

True

## Load Transcripts and Create Test Data

In [69]:
TRANSCRIPT_PATH = "../data/patients/patients_1.0_with_transcripts.json"
UNLABELLED_SEGMENTS_PATH = '../data/conversation_termination/transcript_segments_for_labeling.csv'
LABELLED_SEGMENTS_PATH = '../data/conversation_termination/transcript_segments_labeled.csv'

In [70]:
with open(TRANSCRIPT_PATH, "r") as file:
    data = json.load(file)
transcripts = {patient['id']: patient['chat_transcript'] for patient in data.values()}

First we need to extract 'AI:user' conversation segments and then manually label it

In [71]:
def extract_segments(data):
    """
    Extract segments from the chat transcripts where the AI output is followed by the user input.

    Args:
        data (dict): The data dictionary containing the chat transcripts.
    
    Returns:
        list: A list of dictionaries containing the patient_id, AI output, and user input for each segment.
    """
    segments = []

    for patient_id, patient_data in data.items():
        transcript = patient_data["chat_transcript"]
        for i in range(1, len(transcript)):
            ai_output = transcript[i - 1]
            user_input = transcript[i]

            # Check if ai_output starts with 'Doctor' and user_input starts with 'Patient'
            if ai_output.startswith("Doctor: ") and user_input.startswith("Patient: "):
                # Remove the 'Doctor: ' and 'Patient: ' prefixes
                ai_output_clean = ai_output[len("Doctor: "):]
                user_input_clean = user_input[len("Patient: "):]

                segments.append({
                    'patient_id': patient_id,
                    'ai_output': ai_output_clean,
                    'user_input': user_input_clean
                })
    
    return segments

# Extract segments
segments = extract_segments(data)

# Save segments to CSV for manual labeling
with open(UNLABELLED_SEGMENTS_PATH, 'w', newline='') as csvfile:
    fieldnames = ['patient_id', 'ai_output', 'user_input']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for segment in segments:
        writer.writerow(segment)

print(f"Segments have been extracted and saved to {UNLABELLED_SEGMENTS_PATH}")

Segments have been extracted and saved to ../data/conversation_termination/transcript_segments_for_labeling.csv


Once we have manually labeled the dataset we can import it as our test data

In [72]:
# Load the CSV file
labeled_segments = pd.read_csv(LABELLED_SEGMENTS_PATH)

## Define and Test Conversation Closure Detection Model

### Define Conversation Closing Detector

In [73]:
def create_prompt_template(prompt_text):
    """
    Create a prompt template for the language model.

    Args:
        prompt_text (str): The text of the prompt template.

    Returns:
        PromptTemplate: A LangChain PromptTemplate object.
    """
    return PromptTemplate(
        input_variables=["ai_output", "user_input"],
        template=prompt_text,
    )

def check_conversation_closing(chain, ai_output, user_input):
    """
    Check if the conversation is coming to a close based on the last AI output and user input.

    Args:
        chain (LLMChain): The LangChain chain to process the input.
        ai_output (str): The last AI output in the conversation.
        user_input (str): The last user input in the conversation.

    Returns:
        bool: True if the conversation is coming to a close, False otherwise.
    """
    result = chain.invoke({"ai_output": ai_output, "user_input": user_input})
    return result.strip().lower() == 'true'

### Define Experiment Functions

In [74]:
def get_misclassifications (y_true, y_pred, labeled_segments):
    """
    Gets out cases where the true labels and predicted labels do not match.

    Args:
        y_true (list): List of true labels.
        y_pred (list): List of predicted labels.
        labeled_segments (DataFrame): DataFrame containing the labeled segments.
    """
    misclassifications = []
    for index, (true_label, pred_label) in enumerate(zip(y_true, y_pred)):
        if true_label != pred_label:
            misclassifications.append({
                'index': index,
                'true_label': true_label,
                'pred_label': pred_label,
                'ai_output': labeled_segments.loc[index, 'ai_output'],
                'user_input': labeled_segments.loc[index, 'user_input'],
                'patient_id': labeled_segments.loc[index, 'patient_id']
            })
    return misclassifications

def run_experiment(prompt_text, model_name, temperature, labeled_segments):
    """
    Run an experiment with a given prompt template, model, and temperature to evaluate conversation closing detection.

    Args:
        prompt_text (str): The text of the prompt template to use in the experiment.
        model_name (str): The name of the OpenAI model to use in the experiment.
        temperature (float): The temperature setting for the model.
        labeled_segments (DataFrame): DataFrame containing the labeled segments.

    Returns:
        dict: A dictionary containing the accuracy, confusion matrix, predictions, timing information, and misclassifications.
    """
    model = ChatOpenAI(model_name=model_name, temperature=temperature)
    prompt = create_prompt_template(prompt_text)
    chain = prompt | model | StrOutputParser()

    y_true = labeled_segments['end'].tolist()
    y_pred = []
    response_times = []

    for _, row in labeled_segments.iterrows():
        ai_output = row['ai_output']
        user_input = row['user_input']
        start_time = time.time()
        prediction = check_conversation_closing(chain, ai_output, user_input)
        end_time = time.time()
        y_pred.append(int(prediction))
        response_times.append(end_time - start_time)

    accuracy = accuracy_score(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred).tolist()
    misclassifications = get_misclassifications(y_true, y_pred, labeled_segments)

    return {
        'prompt_text': prompt_text,
        'model_name': model_name,
        'temperature': temperature,
        'accuracy': accuracy,
        'confusion_matrix': conf_matrix,
        'avg_response_time': np.mean(response_times),
        'misclassifications': misclassifications
    }

def append_results_to_json(results, file_path='../data/conversation_termination/experiment_results.json'):
    """
    Append the results of an experiment to a JSON file.

    Args:
        results (dict): The results of the experiment.
        file_path (str): The path to the JSON file to append the results to.

    Returns:
        list: The updated list of results in the JSON file.
    """
    try:
        with open(file_path, 'r') as json_file:
            data = json.load(json_file)
    except FileNotFoundError:
        data = []

    data.append(results)

    with open(file_path, 'w') as json_file:
        json.dump(data, json_file, indent=4)

    return data

def append_results_to_dataframe(results, df):
    """
    Append the results of an experiment to a DataFrame.
    
    Args:
        results (dict): The results of the experiment.
        df (DataFrame): The DataFrame to append the results to.

    Returns:
        DataFrame: The updated DataFrame with the results appended.
    """
    new_row = pd.DataFrame([{
        'prompt_text': results['prompt_text'],
        'model_name': results['model_name'],
        'temperature': results['temperature'],
        'accuracy': results['accuracy'],
        'confusion_matrix': results['confusion_matrix'],
        'avg_response_time': results['avg_response_time'],
        'misclassifications': results['misclassifications']
    }])
    df = pd.concat([df, new_row], ignore_index=True)
    return df

## Run Experiments
First we create a dataframe to store all the experiment results

In [75]:
results_df = pd.DataFrame()

### Run Base Experiment

In [76]:
prompt_1 = """
You are analyzing a conversation between a doctor and a patient. Your task is to determine if the conversation is coming to a close based on the AI Output and the User Input. Respond with 'True' if the conversation is coming to a close, and 'False' otherwise.

Follow the provided examples to understand the criteria:

### Example 1:
**AI Output:** Based on our conversation, Kevin, it seems you are mainly experiencing tiredness and leg swelling, and you are currently taking Furosemide, Spironolactone, and fish oil for your heart condition. Is there anything else you would like to share regarding your symptoms, vital signs, or medications?  
**User Input:** No, Doctor, I think that covers everything for now. Thank you for checking in on me.  
**Response:** True

### Example 2:
**AI Output:** Based on our conversation, Kevin, it seems like you are mainly experiencing tiredness and leg swelling. Could you please provide your latest vital signs, starting with your temperature?  
**User Input:** My temperature is 97.4 degrees, Doctor.  
**Response:** False

Now, analyze the following conversation and determine if it is coming to a close:

**AI Output:**  
```{ai_output}```  

**User Input:**  
```{user_input}```
"""

In [77]:
experiment_1_results = run_experiment(
    prompt_text=prompt_1,
    model_name="gpt-3.5-turbo",
    temperature=0,
    labeled_segments=labeled_segments
)

In [78]:
results_df = append_results_to_dataframe(experiment_1_results, results_df)

### Experiment 2: Prompt with More Examples

In [79]:
prompt_2 = """
You are analyzing a conversation between a doctor and a patient. Your task is to determine if the conversation is coming to a close based on the AI Output and the User Input. Respond with 'True' if the conversation is coming to a close, and 'False' otherwise.

Follow the provided examples to understand the criteria:

### Examples:

#### Example 1:
**AI Output:** Based on our conversation, Kevin, it seems you are mainly experiencing tiredness and leg swelling, and you are currently taking Furosemide, Spironolactone, and fish oil for your heart condition. Is there anything else you would like to share regarding your symptoms, vital signs, or medications?  
**User Input:** No, Doctor, I think that covers everything for now. Thank you for checking in on me.  
**Response:** True

#### Example 2:
**AI Output:** Based on our conversation, Kevin, it seems like you are mainly experiencing tiredness and leg swelling. Could you please provide your latest vital signs, starting with your temperature?  
**User Input:** My temperature is 97.4 degrees, Doctor.  
**Response:** False

#### Example 3:
**AI Output:** Thank you for sharing your oxygen saturation level, Kevin. Lastly, could you provide me with your blood pressure reading?  
**User Input:** My blood pressure is 127/70, Doctor.  
**Response:** False

#### Example 4:
**AI Output:** Thank you for sharing your current medications, Kevin. Is there any other medication you are taking for your heart condition or any other health issue?  
**User Input:** No, Doctor, those are the main ones for my heart. I also take some fish oil for general health.  
**Response:** False

#### Example 5:
**AI Output:** Thank you for sharing your respiratory rate, Gregory. Could you now provide your oxygen saturation level for me?  
**User Input:** My oxygen saturation level is 99.0, doctor. It's good to see that it's in a healthy range.  
**Response:** False

#### Example 6:
**AI Output:** Thank you for sharing about your medications. It's important to continue taking them as prescribed. Please remember to reach out to your healthcare provider if you notice any significant changes or worsening of symptoms. If you have any further concerns or questions, feel free to share them with me.
**User Input:** Thank you, Doctor. Um... I will definitely reach out if I notice any changes. I appreciate your help and advice.  
**Response:** True

#### Example 7:
**AI Output:** Thank you for sharing your blood pressure, Maria. Lastly, could you provide me with your current weight?  
**User Input:** I'm sorry, doctor, I don't have a scale at home to check my weight.  
**Response:** False

#### Example 8:
**AI Output:** Thank you for sharing your current medications, Jennifer. Are you taking any other medications apart from Beta-Blockers and Diuretics?  
**User Input:** No, doctor, those are the main ones I'm taking right now. Um... I try to remember to take them at the right times every day.  
**Response:** False

Now, analyze the following conversation and determine if it is coming to a close:

**AI Output:**  
```{ai_output}```  

**User Input:**  
```{user_input}```
"""

In [80]:
experiment_2_results = run_experiment(
    prompt_text=prompt_2,
    model_name="gpt-3.5-turbo",
    temperature=0,
    labeled_segments=labeled_segments
)

In [81]:
results_df = append_results_to_dataframe(experiment_2_results, results_df)

In [82]:
results_df

Unnamed: 0,prompt_text,model_name,temperature,accuracy,confusion_matrix,avg_response_time,misclassifications
0,\nYou are analyzing a conversation between a d...,gpt-3.5-turbo,0,0.902,"[[436, 48], [1, 15]]",0.395545,"[{'index': 12, 'true_label': 0, 'pred_label': ..."
1,\nYou are analyzing a conversation between a d...,gpt-3.5-turbo,0,0.814,"[[391, 93], [0, 16]]",0.641109,"[{'index': 6, 'true_label': 0, 'pred_label': 1..."


Note that adding more examples leads to longer response times and actually leads to a reduction in accuracy (albeit with more true positives)

### Experiment 3: Prompt with Guidance

In [83]:
prompt_3 = """
You are analyzing a conversation between a doctor and a patient. Your task is to determine if the conversation is coming to a close based on the AI Output and the User Input. Respond with 'True' if the conversation is coming to a close, and 'False' otherwise.

### Patterns indicating a closing:
1. AI providing final instructions or reminders.
2. AI providing encouragement for future contact.
3. User saying that they will make further contact if needed.
3. AI expressing care and well wishes.
4. AI asking if the user has any more questions or concerns.
5. User stating that they don't have any more questions or concerns.
5. User thanking the AI or expressing gratitude.
6. AI or user explicitly saying "Goodbye."

### Patters indicating a continuation:
1. AI says asking for more information, even if it includes terms like 'lastly' or 'finally'.
2. User providing additional information or asking questions.

Follow the provided examples to understand the criteria:

### Example 1:
**AI Output:** Based on our conversation, Kevin, it seems you are mainly experiencing tiredness and leg swelling, and you are currently taking Furosemide, Spironolactone, and fish oil for your heart condition. Is there anything else you would like to share regarding your symptoms, vital signs, or medications?  
**User Input:** No, Doctor, I think that covers everything for now. Thank you for checking in on me.  
**Response:** True

### Example 2:
**AI Output:** Based on our conversation, Kevin, it seems like you are mainly experiencing tiredness and leg swelling. Could you please provide your latest vital signs, starting with your temperature?  
**User Input:** My temperature is 97.4 degrees, Doctor.  
**Response:** False

Now, analyze the following conversation and determine if it is coming to a close:

**AI Output:**  
```{ai_output}```  

**User Input:**  
```{user_input}```
"""

### Experiment 3: Model Variations

In [81]:
model_variations = ["gpt-3.5-turbo-0125", "gpt-3.5-turbo-1106", "gpt-3.5-turbo"]

for model_name in model_variations:
    accuracy, conf_matrix, y_pred, response_times = run_experiment(base_prompt, model_name, base_temperature)
    print(f"Results for Model: {model_name}")
    print(f"Accuracy: {accuracy}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print_misclassifications(labeled_segments['end'].tolist(), y_pred, labeled_segments)
    avg_response_time = sum(response_times) / len(response_times)
    print(f"Average response time: {avg_response_time:.4f} seconds")
    print("---")
    print()

Results for Model: gpt-3.5-turbo-0125
Accuracy: 0.86
Confusion Matrix:
     0   1
0  414  70
1    0  16
Index: 7
Patient ID: 12305811
AI Output: Thank you for letting me know, Kevin. Lastly, have you felt more tired than usual or experienced any sudden changes in your mental clarity?
User Input: No, Doctor, my mental clarity has been the same. Just feeling a bit more tired than usual.
True Label: 0, Predicted Label: 1
---
Index: 14
Patient ID: 12305811
AI Output: Thank you for sharing your current medications, Kevin. Is there any other medication you are taking for your heart condition or any other health issue?
User Input: No, Doctor, those are the main ones for my heart. I also take some fish oil for general health.
True Label: 0, Predicted Label: 1
---
Index: 35
Patient ID: 14185111
AI Output: Based on your responses, it seems like you're experiencing ankle swelling and the need to prop yourself up at night to breathe comfortably. Let's continue with your medications. Besides beta-b

### Experiment 4: Temperature Variations

In [82]:
# Temperature Variations
temperature_variations = [0.0, 0.3, 0.7]

for temperature in temperature_variations:
    accuracy, conf_matrix, y_pred, response_times = run_experiment(base_prompt, base_model, temperature)
    print(f"Results for Temperature: {temperature}")
    print(f"Accuracy: {accuracy}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print_misclassifications(labeled_segments['end'].tolist(), y_pred, labeled_segments)
    avg_response_time = sum(response_times) / len(response_times)
    print(f"Average response time: {avg_response_time:.4f} seconds")
    print("---")
    print()

Results for Temperature: 0.0
Accuracy: 0.882
Confusion Matrix:
     0   1
0  425  59
1    0  16
Index: 12
Patient ID: 12305811
AI Output: Thank you for sharing your oxygen saturation level, Kevin. Lastly, could you provide me with your blood pressure reading?
User Input: My blood pressure is 127/70, Doctor.
True Label: 0, Predicted Label: 1
---
Index: 14
Patient ID: 12305811
AI Output: Thank you for sharing your current medications, Kevin. Is there any other medication you are taking for your heart condition or any other health issue?
User Input: No, Doctor, those are the main ones for my heart. I also take some fish oil for general health.
True Label: 0, Predicted Label: 1
---
Index: 33
Patient ID: 14185111
AI Output: Thank you for sharing your oxygen saturation level. Lastly, could you please provide your blood pressure for today?
User Input: My blood pressure today is 123/56.
True Label: 0, Predicted Label: 1
---
Index: 35
Patient ID: 14185111
AI Output: Based on your responses, it 