# Generate multiple responses for 6k questions

In [None]:
!pip install openai

In [None]:
from openai import OpenAI
import os

# Take the key from the OPENAI_API_KEY environment variable so the secret never
# has to be pasted into (and saved with) the notebook. When the variable is
# unset this falls back to the original empty placeholder.
api_key = os.environ.get("OPENAI_API_KEY", '')
client = OpenAI(api_key=api_key)

In [None]:
import pandas as pd
import json
# Seed question/answer pairs used as few-shot examples.
df_question_answer = pd.read_csv("combined_counselchat.csv")
# Augmented questions that still need answers.
df_questions = pd.read_csv("augmented_questions.csv")

# Mid-cell expression statement: the preview frame is built but its value is discarded.
df_question_answer.head()

print(f"Total question answer pairs  {len(df_question_answer)}")
print(f"total number of questions  {len(df_questions)}")

# Plain Python list of the 'question' column.
all_questions = df_questions['question'].tolist()


def generate_prompt(question, df_question_answer):
  """Build the few-shot prompt asking for ten therapist responses to `question`.

  Up to three rows are drawn at random from `df_question_answer` and rendered
  as example client/therapist snippets placed ahead of the new question.

  Args:
    question: The new client question to embed in the prompt.
    df_question_answer: DataFrame with 'question' and 'answer' columns that
      supplies the example snippets.

  Returns:
    The fully rendered prompt string.
  """
  # Cap the sample size so a frame with fewer than three rows does not make
  # DataFrame.sample raise.
  sampled_rows = df_question_answer.sample(n=min(3, len(df_question_answer)))
  snippets = '\n\n'.join(
    f"**Snippet {i+1}:**\nClient: {row['question']}\nTherapist: {row['answer']}"
    for i, (_, row) in enumerate(sampled_rows.iterrows())
  )
  # The f-string performs all interpolation. The rendered text must NOT be run
  # through str.format afterwards: braces inside a question or a sampled answer
  # would be re-parsed as format fields and raise KeyError/IndexError/ValueError.
  prompt = f"""
    Imagine you are a therapist responding to a person struggling with a mental health issue.
    As a mental health therapist, you often engage in conversations similar to the following snippets:
  {snippets}
  Now, a new client asks: "{question}".
  Please generate 10 responses as a mental health therapist.
  Please provide the output in the following JSON format without response number:

    "responses": [
      "Response 1",
      "Response 2",
      "Response 3",
      "Response 4",
      "Response 5",
      "Response 6",
      "Response 7",
      "Response 8",
      "Response 9",
      "Response 10"
    ]

  """
  return prompt

def generate_answers_from_prompt(client, question, answers):
  """Ask the chat model for therapist responses to `question`.

  Args:
    client: Object exposing `chat.completions.create` (the OpenAI client).
    question: Client question forwarded to `generate_prompt`.
    answers: Example question/answer data forwarded to `generate_prompt`.

  Returns:
    A set built from the "responses" entry of the model's JSON reply; empty
    when that key is absent.
  """
  chat_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": generate_prompt(question, answers)},
  ]
  completion = client.chat.completions.create(
    model="gpt-4o",
    response_format={"type": "json_object"},
    messages=chat_messages,
  )
  # The reply body is parsed as JSON; only the "responses" entry is used.
  parsed = json.loads(completion.choices[0].message.content)
  return set(parsed.get("responses", []))



# Only the first 12 questions are sent to the model by this cell.
all_questions = df_questions['question'].values.tolist()[:12]

# Maps each question to the set of answers generated for it.
synthetic_answers_dict = {}
count = 0
for count, q in enumerate(all_questions, start=1):
  # Progress marker every tenth question.
  if count % 10 == 0:
    print("Count is ", count)
  try:
    generated_answers = generate_answers_from_prompt(client, q, df_question_answer)
    synthetic_answers_dict[q] = generated_answers
  except Exception as e:
    # A failed question is reported and skipped; the loop carries on.
    print(f"An error occurred: {e}")

def create_df(synthetic_answers_dict):
  """Flatten a question -> answers mapping into a two-column DataFrame.

  Args:
    synthetic_answers_dict: Mapping from a question to an iterable of answers.

  Returns:
    DataFrame with columns 'question' and 'answer', one row per
    (question, answer) pair, in mapping iteration order.
  """
  pairs = [
    (question, answer)
    for question, answers in synthetic_answers_dict.items()
    for answer in answers
  ]
  return pd.DataFrame(pairs, columns=['question', 'answer'])
# Flatten the generated answers into one row per (question, answer) pair.
df_generated = create_df(synthetic_answers_dict)

# Export the flattened frame to an Excel workbook.
df_generated.to_excel("6k_questions_with_answers.xlsx", index=None)

print(f"length of generadted df is  {len(df_generated)}")


In [None]:
# Write the same generated question/answer frame to a second workbook.
# NOTE(review): index=None is presumably meant to omit the index column — confirm.
df_generated.to_excel("questions_with_answers.xlsx", index=None)


In [None]:
import random
import json
def generate_prompt(questions_list):
  """Build the prompt asking the model for new help-seeking questions.

  Up to five questions are drawn at random from `questions_list` and embedded
  as numbered examples.

  Args:
    questions_list: Sequence of existing questions to sample examples from.

  Returns:
    The rendered prompt string.
  """
  # Cap the sample size: random.sample raises ValueError when asked for more
  # items than the population holds.
  random_sample = random.sample(questions_list, min(5, len(questions_list)))
  sample_string = "\n".join([f"{i+1}. {question}" for i, question in enumerate(random_sample)])
  # The reply is requested in JSON-object mode and read back through the
  # "questions" key, so the prompt names that key explicitly instead of asking
  # for a bare JSON array.
  prompt="""
  Imagine you are someone struggling with a mental health issue and you're seeking help from a therapist.
  Based on common concerns and difficulties people face, generate a list of 10-15 questions that someone in this situation might ask.
  These questions should reflect a range of emotions, from seeking understanding and validation to asking for coping strategies and treatment options.
  Here are some of the questions that might come to mind:
  ```{}```
  Please provide the output as a JSON object with a single key "questions" whose value is an array with each question listed as a separate item, without item numbers, and exclude any example questions given.
  """
  return prompt.format(sample_string)


def generate_questions(client, questions_list):
  """Ask the chat model for a fresh batch of help-seeking questions.

  Args:
    client: Object exposing `chat.completions.create` (the OpenAI client).
    questions_list: Existing questions forwarded to `generate_prompt` as
      examples.

  Returns:
    A set of the question strings found in the model's JSON reply; empty when
    the reply contains no list of strings.
  """
  prompt = generate_prompt(questions_list)
  response = client.chat.completions.create(
    model="gpt-4o",
    response_format={"type": "json_object"},
    messages=[
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": prompt},
    ],
  )
  output = json.loads(response.choices[0].message.content)
  questions = output.get("questions")
  if not isinstance(questions, list):
    # The model is free to choose the key name in JSON-object mode. Fall back
    # to the first list-valued entry instead of silently dropping the batch.
    questions = next((value for value in output.values() if isinstance(value, list)), [])
  # Keep only strings: nested objects are unhashable and would make the set
  # construction raise.
  generated_questions = {item for item in questions if isinstance(item, str)}
  return generated_questions


# Seed pool: the distinct questions already present in the counselling dataset.
df = pd.read_csv("combined_counselchat.csv")
questions_list = list(set(df['question']))

questions_db = set(questions_list)  # every question seen so far (seed + generated)
initial_length = len(questions_db)
new_questions_db = set()            # everything returned by the model
to_be_generated = 2000

# Keep requesting batches until the pool has grown by `to_be_generated` entries.
while len(questions_db) - initial_length < to_be_generated:
  new_questions = generate_questions(client, questions_list)
  questions_db.update(new_questions)
  new_questions_db.update(new_questions)
  print(f"Questions database now... {len(questions_db)}")

new_questions_list = list(new_questions_db)
old_questions = set(df['question'])
print(f"Total old questions  {len(old_questions)}")
print(f"Total new questions  {len(new_questions_db)}")
# Prints the overlapping questions themselves, not a count.
print("Total common questions ", old_questions & new_questions_db)

# Single-column frame ('question') holding the generated questions, saved as CSV.
df_questions = pd.DataFrame(new_questions_list, columns=['question'])
df_questions.to_csv("augmented_questions.csv", index=None)

print(f"Length of df was  {len(df_questions)}")
print("Questions generated successfully.")