# Chatbot

In [1]:
import ast  # for converting embeddings saved as strings back to arrays
import openai    
import pandas as pd  
import tiktoken  # for counting tokens
import os
from scipy import spatial  # for calculating vector similarities for search


# models
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"

# Read the API key from the environment instead of hardcoding it in the notebook.
# "OPENAI_API_KEY" is tried first; the original variable name (which contains a
# space and therefore cannot be exported from most shells) is kept as a fallback.
openai.api_key = os.getenv("OPENAI_API_KEY") or os.getenv("OPENAI KEY")

## Installation

In [2]:
pip install tiktoken

Note: you may need to restart the kernel to use updated packages.


In [1]:
pip install openai pandas scipy

Collecting scipy
  Using cached scipy-1.10.1-cp38-cp38-macosx_10_9_x86_64.whl.metadata (53 kB)
Using cached scipy-1.10.1-cp38-cp38-macosx_10_9_x86_64.whl (35.0 MB)
Installing collected packages: scipy
Successfully installed scipy-1.10.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install --upgrade charset_normalizer

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Load the client records; the 'Transcript' column is the text that gets embedded later.
df_try = pd.read_csv('Client_Data.csv')
df_try    # customised dataset

Unnamed: 0,Client_ID,Client Name,Age,Gender,Transcript
0,1,John Doe,35,Male,"Therapist (T): ""Welcome, and thank you for sha..."
1,2,Jaxonina Peter,25,Female,"Therapist (T): Good morning, and welcome. I'm ..."
2,3,Nate Rust,33,Non-Binary,THERAPIST: I want to welcome you to your first...


In [4]:
# Data cleaning: drop every row that has a missing value in any column.
# NOTE(review): query_message later selects a client by row position
# (iloc[client_number - 1]), so a dropped row shifts the position of every
# client after it — confirm this is acceptable for the real dataset.
df_cleaned = df_try.dropna()
df_cleaned #Data Cleaning

Unnamed: 0,Client_ID,Client Name,Age,Gender,Transcript
0,1,John Doe,35,Male,"Therapist (T): ""Welcome, and thank you for sha..."
1,2,Jaxonina Peter,25,Female,"Therapist (T): Good morning, and welcome. I'm ..."
2,3,Nate Rust,33,Non-Binary,THERAPIST: I want to welcome you to your first...


## Function to get embeddings

In [5]:
df = df_cleaned

# Do not overwrite the key with a literal: prefer the environment variable and
# otherwise keep whatever key an earlier cell already configured.
openai.api_key = os.getenv("OPENAI_API_KEY") or openai.api_key

def get_embeddings(texts, model="text-embedding-ada-002"):
    """Embed each text with the OpenAI embeddings endpoint.

    Args:
        texts: Iterable of items to embed; every item is converted with ``str()``.
        model: Name of the embedding model to request.

    Returns:
        A list with exactly one entry per input, in input order. Each entry is
        the embedding taken from ``response['data'][0]['embedding']``, or
        ``None`` when the request for that text failed. Keeping a placeholder
        for failures means the result always lines up with ``texts``; callers
        should check for ``None`` before using the embeddings.
    """
    embeddings = []
    for text in texts:
        # Convert the text to string in case it's not
        text = str(text)
        embedding = None
        try:
            response = openai.Embedding.create(
                model=model,
                input=text
            )
            embedding = response['data'][0]['embedding']
        except openai.error.InvalidRequestError as e:
            # Rejected request, e.g. when the text is too long
            print(f"An error occurred: {e}")
        except Exception as e:
            # General exception handling
            print(f"An unexpected error occurred: {e}")
        # Always append, so a failure cannot shift later embeddings onto the wrong text.
        embeddings.append(embedding)
    return embeddings

transcripts = df['Transcript'].tolist()
embeddings = get_embeddings(transcripts)

# get_embeddings only prints when a request fails, so verify that every
# transcript actually received an embedding before pairing them up and saving.
embedded_count = sum(embedding is not None for embedding in embeddings)
if embedded_count != len(transcripts):
    raise RuntimeError(
        f"Only {embedded_count} of {len(transcripts)} transcripts were embedded; "
        "fix the errors printed above before saving text_embeddings.csv."
    )

# Creating a DataFrame with the original text and the embeddings
df_embeddings = pd.DataFrame({
    'text': df['Transcript'],  # Replace 'Transcript' with the actual column name
    'embedding': embeddings
})

# Save the new DataFrame to a CSV file
df_embeddings.to_csv('text_embeddings.csv', index=False)

In [7]:
# load the transcripts and embeddings that were saved to text_embeddings.csv above
# (note: this rebinds `df` from the cleaned client table to the text/embedding table)
embeddings_path = "text_embeddings.csv"

df = pd.read_csv(embeddings_path)

In [8]:
# convert embeddings from CSV str type back to list type
# (ast.literal_eval only parses Python literals, so the stored text is never executed as code)
df['embedding'] = df['embedding'].apply(ast.literal_eval)

In [9]:
# the dataframe has two columns: "text" and "embedding"
df

Unnamed: 0,text,embedding
0,"Therapist (T): ""Welcome, and thank you for sha...","[0.004258978180587292, -0.003586169332265854, ..."
1,"Therapist (T): Good morning, and welcome. I'm ...","[-0.010963448323309422, -0.01884562335908413, ..."
2,THERAPIST: I want to welcome you to your first...,"[-0.00798663217574358, -0.018524734303355217, ..."


## Search function on the basis of ranking 

In [10]:
from typing import List, Tuple
import openai
# Do not overwrite the key with a literal: prefer the environment variable and
# otherwise keep whatever key an earlier cell already configured.
openai.api_key = os.getenv("OPENAI_API_KEY") or openai.api_key

EMBEDDING_MODEL = "text-embedding-ada-002"

# search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100
) -> Tuple[List[str], List[float]]:
    """Return texts and their relatedness to the query, most related first.

    Args:
        query: Text to embed and compare against every row of ``df``.
        df: Frame with a "text" column and an "embedding" column.
        relatedness_fn: Callable taking (query_embedding, row_embedding) and
            returning a score where larger means more related. Defaults to
            cosine similarity.
        top_n: Maximum number of results to return.

    Returns:
        A pair ``(strings, relatednesses)`` of equal-length lists, sorted by
        descending relatedness and truncated to ``top_n``. Both lists are
        empty when ``df`` has no rows.
    """
    query_embedding_response = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response['data'][0]['embedding']
    ranked = sorted(
        (
            (row["text"], relatedness_fn(query_embedding, row["embedding"]))
            for _, row in df.iterrows()
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_n]
    # Build the two lists explicitly: zip(*ranked) cannot be unpacked when there are no rows.
    strings = [text for text, _ in ranked]
    relatednesses = [score for _, score in ranked]
    return strings, relatednesses

## Building the prompt and querying the generative model

In [11]:
# Do not overwrite the key with a literal: prefer the environment variable and
# otherwise keep whatever key an earlier cell already configured.
openai.api_key = os.getenv("OPENAI_API_KEY") or openai.api_key

# Alias for the openai module, used by generate_related_questions.
client = openai

# Function for counting tokens, used to keep prompts within the token budget
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens in a string.

    Args:
        text: The string to tokenize.
        model: Model name used to pick the tiktoken encoding.

    Returns:
        The length of the encoded token sequence. When tiktoken has no mapping
        for ``model``, the count is taken with the ``cl100k_base`` encoding
        instead of raising.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model name: fall back to the encoding used by the
        # gpt-3.5-turbo / gpt-4 family rather than aborting the whole query.
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))


def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int,
    client_number: int 
) -> str:
    """Return a message for GPT, with the selected client's transcript pulled from a dataframe.

    Args:
        query: The question to ask about the session.
        df: Frame with "text" and "embedding" columns, one row per client.
        model: Model name, forwarded to ``num_tokens``.
        token_budget: Maximum number of tokens the returned message may contain.
        client_number: 1-based row position of the client in ``df``.

    Returns:
        The instructions, followed by the transcript (only if it fits within
        ``token_budget``), followed by the question.

    Raises:
        IndexError: If ``client_number`` is not between 1 and ``len(df)``.
    """
    # Reject 0 and negative numbers explicitly: iloc would otherwise count from
    # the end of the frame and silently return a different client's transcript.
    if not 1 <= client_number <= len(df):
        raise IndexError(
            f"client_number must be between 1 and {len(df)}, got {client_number}"
        )

    df = df.iloc[[client_number - 1]]  # Subtracting 1 because index starts at 0

    strings, _ = strings_ranked_by_relatedness(query, df)
    # The transcript is appended after these instructions, so refer to it as "below".
    introduction = 'Use the below transcript, which is a therapy session between a therapist and a client, to answer the subsequent question. Keep the answer crisp. If the answer cannot be found in the transcript, write "I could not find an answer."'
    question = f"\n\nQuestion: {query}"
    message = introduction
    for string in strings:
        next_article = f'\n\nTherapy Session History:\n"""\n{string}\n"""'
        # Stop as soon as adding the next transcript would exceed the budget.
        if (
            num_tokens(message + next_article + question, model=model)
            > token_budget
        ):
            break
        message += next_article
    return message + question


def ask(
    query: str,
    df: pd.DataFrame,
    client_number: int,
    model: str = 'gpt-4',
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answer a question about one client's session with a chat model.

    The prompt is assembled by ``query_message`` from ``df`` for the given
    ``client_number`` and sent together with a fixed system instruction.
    When ``print_message`` is true the assembled prompt is printed first.
    Returns the text content of the first choice in the response.
    """
    prompt = query_message(
        query,
        df,
        model=model,
        token_budget=token_budget,
        client_number=client_number,
    )
    if print_message:
        print(prompt)

    system_turn = {"role": "system", "content": "You answer questions about the therapy session history."}
    user_turn = {"role": "user", "content": prompt}

    # temperature=0 is passed through unchanged from the original call
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[system_turn, user_turn],
        temperature=0
    )

    first_choice = completion['choices'][0]
    return first_choice['message']['content']




### Example questions

Finally, let's ask the system some questions about individual clients' therapy sessions:

In [75]:
ask(
    query='what was the issue the client was struggling with?',
    df=df, 
    client_number=1, 
    model="gpt-4")

'The client was struggling with anxiety and stress, which was amplified by a recent minor car accident. This anxiety was affecting their daily routines, sleep patterns, work performance, and relationships. They were also dealing with intrusive thoughts and fears about driving following the accident.'

In [57]:
ask('How long the client has been struggling with issues?', df=df, model="gpt-4", client_number=1)

'The client mentioned that they have been dealing with anxiety for a while, even before the car accident that happened last month. However, the exact duration of their struggle is not specified in the therapy session history.'

In [58]:
# question about a topic the transcript does not cover
ask('Has COVID-19 affected the patient?', df=df, model="gpt-4", client_number=1)

'The therapy session transcript does not mention any impact of COVID-19 on the patient.'

In [76]:
ask(
    query='what was the issue the client was struggling with?', df=df, model="gpt-4", client_number=1)

'The client was struggling with several issues including work-related stress and anxiety, procrastination, difficulty in setting boundaries with others, and feeling disconnected from their partner.'

In [12]:
ask(
    query='what was the issue the client was struggling with?',
    df=df, 
    client_number=3, 
    model="gpt-4")

'The client was struggling with alcohol consumption as a coping mechanism for their problems.'

In [18]:
def generate_related_questions(
    query: str,
    df: pd.DataFrame,
    client_number: int,
    model: str = 'gpt-4',
    token_budget: int = 4096 - 500,
    num_questions: int = 3,  
    print_message: bool = False,
) -> List[str]:
    """
    Generates a set of questions related to a given query using GPT and a dataframe of relevant texts.

    Args:
        query: The original question.
        df: Frame with "text" and "embedding" columns, passed to ``query_message``.
        client_number: Client whose transcript is used, passed to ``query_message``.
        model: Chat model name.
        token_budget: Token budget for the prompt built by ``query_message``.
        num_questions: How many related questions to request.
        print_message: When true, print the prompt and the raw model reply.

    Returns:
        The model's reply split into its non-empty lines, one entry per line.
    """
    message = query_message(query, df, model=model, token_budget=token_budget, client_number=client_number)
    
    if print_message:
        print(message)
        
    prompt_to_generate_questions = f"{message}\n\nCan you generate {num_questions} related questions based on the above question and therapy session history?"
    
    messages = [
        {"role": "system", "content": "You are a helpful assistant. Please generate related questions based on the therapy session history."},
        {"role": "user", "content": prompt_to_generate_questions},
    ]
    
    # token_budget limits the prompt; it is deliberately not reused as max_tokens,
    # because prompt + completion would then be allowed up to twice the budget and
    # could exceed the model's context window.
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0.7,
    )
    
    generated_questions = response['choices'][0]['message']['content']
    
    # Print and return the generated questions
    if print_message:
        print(generated_questions)
    
    # Drop blank lines so every returned entry is an actual question line.
    return [line.strip() for line in generated_questions.split('\n') if line.strip()]



In [21]:
generate_related_questions('what was the issue the client was struggling with?', df, client_number=1)


["1. What major event in the client's recent past has contributed to their heightened state of anxiety?",
 "2. How is the client's anxiety impacting their daily routine and relationships?",
 '3. What strategies did the therapist suggest to help the client manage their anxiety and improve their sleep?']

In [22]:
ask(
    query="What major event in the client's recent past has contributed to their heightened state of anxiety?",
    df=df, 
    client_number=1, 
    model="gpt-4")

"The major event in the client's recent past that has contributed to their heightened state of anxiety is a car accident they had last month."