In [8]:
import csv
import json
import os
import time

from dotenv import load_dotenv
from openai import OpenAI, files

# Load the environment variables from the .env file
load_dotenv()

# Access the OpenAI API key
openai_api_key = os.getenv('OPENAI_API_KEY')

patients = ["307", "323", "324", "328", "335", "345", "347", "348", "350", "352", "369", "379", "387", "390", "402", "412", 
            "413", "419", "428", "431", "432", "437", "461", "463", "468", "480", "485"]

In [9]:
# 1. Truncate the CSV Transcripts
class OpenAILLM():
    def init(self, model_name: str):
        self.model_name = model_name
        self.client = OpenAI()

    def generate(self, prompt: str):
        completion = self.client.chat.completions.create(
            model=self.model_name,
            messages=[
                {"role": "system", "content": "You are a expert therapist and know extensive knowledge on the PHQ-8 diagnostic and how to evaluate patients health given their transcripts."},
                {
                    "role": "user",
                    "content": prompt
                }
            ]
        )


        return completion.choices[0].message.content
    
def get_summarized_transcript(transcript: str, llm) -> str:
    summary_prompt = f"""
    Rewrite the conversation to be shorter. Make sure it is in the same general format as the original conversation. Remove any unnecessary details and keep the main points. Be concise
    {transcript}
    """
    summarized_transcript = llm.generate(summary_prompt)
    return summarized_transcript

def process_transcript(input_csv_path: str, output_csv_path: str, llm) -> None:
    """
    Reads a transcript from a CSV file, summarizes it, and writes the summary to an output CSV file.

    Parameters:
    - input_csv_path (str): Path to the input CSV file containing the transcript.
    - output_csv_path (str): Path to the output CSV file for the summarized transcript.
    - llm: An object representing the language model with a generate method.
    """
    # Read the transcript from the CSV file
    transcript = ""
    with open(input_csv_path, mode='r', encoding='utf-8') as infile:
        reader = csv.DictReader(infile, delimiter='\t')  # Specify tab as the delimiter
        columns = reader.fieldnames
        print(f"Column names in CSV: {columns}")
        for row in reader:
            # Join columns to maintain original formatting in each line
            transcript += f"{row['start_time']} {row['stop_time']} {row['speaker']}: {row['value']}\n"

    # Summarize the transcript
    summarized_transcript = get_summarized_transcript(transcript, llm)

    # Write the summarized transcript to the output CSV file with the same column format
    with open(output_csv_path, mode='w', encoding='utf-8', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['start_time', 'stop_time', 'speaker', 'value'])  # Write header
        
        # Split summarized transcript by lines and parse each line back to columns
        for line in summarized_transcript.strip().split('\n'):
            # Assuming the summarized line has format "start_time stop_time speaker: value"
            parts = line.split(maxsplit=3)  # Split into 4 parts: start_time, stop_time, speaker, and value
            if len(parts) == 4:
                start_time, stop_time, speaker, value = parts
                # Remove any trailing colons or whitespace from the speaker
                speaker = speaker.rstrip(':')
                writer.writerow([start_time, stop_time, speaker, value])

# Sample Snippet
# model = OpenAILLM()
# model.init("gpt-4o-2024-08-06")
# for patient in patients:
#   process_transcript(f"data/DAIC_WOZ/participants/{patient}/{patient}_TRANSCRIPT.csv", f"./preprocessed/{patient}_TRANSCRIPT.csv", model)

In [10]:
# 2. Convert Truncated Transcripts to JSONL
model = "gpt-4o-2024-08-06"

def csv_to_jsonl(csv_file, jsonl_file):
    with open(csv_file, 'r', encoding='utf-8') as csv_f, open(jsonl_file, 'w', encoding='utf-8') as jsonl_f:
        reader = csv.DictReader(csv_f, delimiter=',')
        # Check the column names for debugging
        columns = reader.fieldnames
        print(f"Column names in CSV: {columns}")  # Debugging step to ensure correct columns
        
        # Define the system message
        system_message = {"role": "system", "content": "You are a participant in a study where your depression, anxiety and stress is being assessed."}
        
        messages = [system_message]
        
        for row in reader:
            # Determine the role based on the speaker
            if row['speaker'].strip().lower() == 'ellie':
                role = "user"
            else:
                role = "assistant"
            
            # Append the current message
            messages.append({"role": role, "content": row['value'].strip()})
            
            # Write to JSONL file after each pair (when role is "assistant")
            if role == "assistant":
                jsonl_f.write(json.dumps({"messages": messages}) + '\n')
                # Reset messages for the next pair, keeping the system message
                messages = [system_message]

# Do this Conversion for Each Participant
# for patient in patients:
#     csv_file = f"preprocessed/{patient}_TRANSCRIPT.csv"
#     jsonl_file = f"formatted_final/{patient}_TRANSCRIPT_formatted_data.jsonl"
#     csv_to_jsonl(csv_file=csv_file, jsonl_file=jsonl_file)

In [11]:
# 3. Fine-tune for a Given Transcript

def fine_tune_model(file_path):
    """
    Function to fine-tune a model using OpenAI API.

    Args:
        file_path (str): Path to the training data file in JSONL format.
    """
    # Load the environment variables from the .env file
    load_dotenv()

    # Access the OpenAI API key
    openai_api_key = os.getenv('OPENAI_API_KEY')
    client = OpenAI()

    try:
        # Upload the file
        print("Uploading file...")
        response = files.create(
            file=open(file_path, "rb"),
            purpose="fine-tune"
        )

        # Retrieve file ID from the uploaded file
        file_id = response.id
        print(f"File uploaded successfully. File ID: {file_id}")

        # Start fine-tuning based on the file
        print("Starting fine-tuning job...")
        ft_response = client.fine_tuning.jobs.create(
            training_file=file_id,
            model="gpt-4o-mini-2024-07-18"
        )

        # Fine-tuning job ID
        response_id = ft_response.id
        print(f"Fine-tuning job started. Job ID: {response_id}")

        # Training loop - monitor fine-tuning job
        while True:
            # Retrieve the status of the fine-tuning job
            response = client.fine_tuning.jobs.retrieve(response_id)
            status = response.status

            # Print the current status
            print(f"Fine-tuning job status: {status}")

            # Check if the fine-tuning is completed (either succeeded, failed, or cancelled)
            if status in ['succeeded', 'failed', 'cancelled']:
                break

            # Wait for the specified interval before checking again
            time.sleep(120)

        # Final status update once the loop ends
        if status == 'succeeded':
            print(f"Fine-tuning job completed successfully! Fine-tuned model ID: {response.fine_tuned_model}")
            return response.fine_tuned_model
        elif status == 'failed':
            print("Fine-tuning job failed.")
            return None
        elif status == 'cancelled':
            print("Fine-tuning job was cancelled.")
            return None

    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Sample Usage:
# model = fine_tune_model(file_path)
# completion = client.chat.completions.create(
#   model=model,
#   messages=[
#     {"role": "system", "content": "You are a participant in a psychological study which is measuring your Depression, Anxiety and Stress"},
#     {"role": "user", "content": "How are you feeling today?"}
#   ]
# )

# print(completion.choices[0].message)

In [14]:
# 4. Query the Model (note: the OpenAI Playground gave better responses than the API, whose output quality appeared
# to degrade because of API response-time constraints)

prompts = ["Little interest or pleasure in doing things.", 
           "Feeling down, depressed, or hopeless", 
           "Trouble falling or staying asleep, or sleeping too much",
           "Feeling tired or having little energy",
           "Poor appetite or overeating",
           "Feeling bad about yourself or that you are a failure or have let yourself or your family down",
           "Trouble concentrating on things, such as reading the newspaper or watching television",
           "Moving or speaking so slowly that other people could have noticed. Or the opposite being so fidgety or restless that you have been moving around a lot more than usual"]

# model = fine_tune_model(file_path)

def get_phq8_scores(prompts, model):
    """
    Get PHQ-8 scores for a list of prompts by querying the model.

    Args:
        prompts (list): List of prompts to evaluate.
        model (str): Fine-tuned model ID.

    Returns:
        dict: Dictionary mapping each prompt to its assigned score.
    """
    client = OpenAI()

    prompt_text = "\n".join([f"{i+1}. {prompt}" for i, prompt in enumerate(prompts)])

    system_message = """
    You are a participant in a study where we are measuring your Depression, Anxiety and Stress Levels. Be conservative whilst scoring. Only provide numeric answers.
    Answer the questions as if you're the "user" on the following scale:\n
    0 - Not at all\n
    1 - Several Days\n
    2 - More than half the days\n
    3 - Nearly Every Day\n
    Then add up the score you assign to each prompt, providing a total.\n
    Give me a numerical answer, exclusive to 0, 1, 2, 3.\n
    Analyse the conversation provided and assign it a PHQ-8 Score.
    """

    user_message = f"On a scale of 0 - 3, rate the following:\n\n{prompt_text}"

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message}
        ]
    )

    return completion.choices[0].message.content