In [None]:
!pip install openai



In [None]:
!pip install datasets

Collecting datasets
  Using cached datasets-2.19.1-py3-none-any.whl (542 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-any.whl (401 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m13.5 MB/s[0m eta 

In [None]:
!pip install pandas



In [None]:
import openai

In [None]:
from datasets import load_dataset

In [None]:
import random

In [None]:
import pandas as pd

In [None]:
# load a dataset from huggingface datasets
def load_huggingface_dataset(dataset_name, subset, split = "train"):
  """Fetch one split of a dataset through `datasets.load_dataset`.

  Args:
    dataset_name: first positional argument forwarded to `load_dataset`.
    subset: second positional argument forwarded to `load_dataset`.
    split: forwarded as the `split` keyword; defaults to "train".

  Returns:
    Whatever `load_dataset` returns for the requested split.
  """
  return load_dataset(dataset_name, subset, split = split)

In [None]:
import string
# pre-process the questions in the dataset
def text_preprocess(text):
  """Normalise a question: lower-case it and delete ASCII punctuation.

  Lemmatization and stop-word removal are deliberately not applied, because
  they might affect the model performance in a negative way.

  Args:
    text: the string to normalise.

  Returns:
    The lower-cased string with every character of `string.punctuation` removed.
  """
  # Mapping each punctuation character to None makes str.translate drop it.
  punctuation_remover = str.maketrans(dict.fromkeys(string.punctuation))
  return text.lower().translate(punctuation_remover)

In [None]:
# extract questions from the dataset
def get_questions_from_dataset(dataset):
  """Collect the pre-processed "question" field of every item in `dataset`.

  Args:
    dataset: an iterable of items that can be indexed with the key "question".

  Returns:
    A list with one `text_preprocess`-ed question per item, in iteration order.
  """
  questions = []
  for record in dataset:
    questions.append(text_preprocess(record["question"]))
  return questions

In [None]:
# rephrase the prompt to generate variation in questions
def rephrase_question(original_question):
  """Build a prompt asking the model for a new question derived from `original_question`.

  One focus ("modification") is picked at random so that repeated calls with
  the same seed question produce varied prompts.

  Args:
    original_question: the seed question text placed at the start of the prompt.

  Returns:
    The prompt string: seed question, instruction with the randomly chosen
    focus, and a final sentence restricting the output to the question only.
  """
  modifications = [
      " more details on its key term.",
      " common treatments for this condition.",
      " how this condition affects daily life.",
      " the usual symptoms of this condition.",
      " to describe the progression of this condition if untreated.",
      " the preventive measures for this condition."
  ]

  instruction = " Create a short new question based on the provided question by focusing on asking"
  # The leading space keeps this sentence from being glued to the full stop
  # that ends the chosen modification ("...key term.The output ...").
  output_constraint = " The output should only contain the generated question, nothing more."

  return original_question + instruction + random.choice(modifications) + output_constraint

In [None]:
import os

from openai import OpenAI

# Read the API key from the OPENAI_API_KEY environment variable so a real
# secret never has to be written into (and saved or shared with) the notebook.
# The "private_key" placeholder is kept only as a fallback so the cell still
# runs when the variable is unset.
client = OpenAI(
    api_key = os.environ.get("OPENAI_API_KEY", "private_key")
)

In [None]:
# generate synthetic questions using OpenAI's api
def generate_questions(existing_questions, target_count = 300, max_retries = 5):
  """Extend `existing_questions` with model-generated questions up to `target_count`.

  For every missing question a random seed question is turned into a prompt
  with `rephrase_question` and sent to the chat-completions endpoint. A reply
  is accepted only if it is 3 to 30 words long; otherwise the same prompt is
  sent again.

  Args:
    existing_questions: list of seed questions; it is not modified.
    target_count: desired total number of questions (existing + generated).
    max_retries: number of consecutive API errors tolerated for one question
      before the error is re-raised. Previously the loop retried forever, so
      a persistent failure never terminated.

  Returns:
    A new list: `existing_questions` followed by the generated questions.
    If `existing_questions` already has `target_count` or more entries,
    nothing is generated.

  Raises:
    openai.APIError: after `max_retries` consecutive failed API calls.
    IndexError: from `random.choice` if `existing_questions` is empty while
      more questions are needed.
  """
  import time  # local import: only needed for the retry back-off below

  new_questions = []
  needed_count = target_count - len(existing_questions)

  while len(new_questions) < needed_count:
    seed_question = random.choice(existing_questions)
    prompt = rephrase_question(seed_question)
    consecutive_errors = 0
    question_generated = False

    while not question_generated:
      try:
        response = client.chat.completions.create(
            model = "gpt-3.5-turbo",
            messages = [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ]
        )
      except openai.APIError as err:
        print(f"Encountered an error: {err}")
        consecutive_errors += 1
        if consecutive_errors >= max_retries:
          # Give up instead of looping forever on a persistent failure.
          raise
        # Exponential back-off (capped) so transient errors can clear.
        time.sleep(min(2 ** consecutive_errors, 30))
        continue

      consecutive_errors = 0
      # `content` may be None; treat that as an empty reply and retry.
      question = (response.choices[0].message.content or "").strip()
      word_count = len(question.split())

      if 3 <= word_count <= 30:
        print(f"Generated: {question}")
        new_questions.append(question)
        question_generated = True

  return existing_questions + new_questions


In [None]:
# save the generated synthetic dataset into a csv file
def save_questions_to_csv(questions, file_name = "questions.csv"):
  """Write `questions` to a one-column CSV file and report how many were saved.

  Args:
    questions: the question strings, one per output row.
    file_name: destination path; defaults to "questions.csv".

  The file has a single "question" header column and no index column.
  """
  pd.DataFrame(questions, columns = ["question"]).to_csv(file_name, index = False)
  saved_count = len(questions)
  print(f"Saved {saved_count} questions to {file_name}")

In [None]:
# main
dataset_name = "katielink/healthsearchqa"
subset = "140_question_subset"

# Load the seed questions and extract their text.
loaded_dataset = load_huggingface_dataset(dataset_name, subset)
existing_questions = get_questions_from_dataset(loaded_dataset)
print(f"Loaded {len(existing_questions)} questions from dataset.")

# Top the list up to 300 questions with generated ones, then write it to disk.
extended_questions = generate_questions(existing_questions, target_count = 300)
print(f"Total number of questions after augmentation: {len(extended_questions)}")
save_questions_to_csv(extended_questions)

Loaded 140 questions from dataset.
Generated: How does anaphylactic shock impact daily activities and routines?
Generated: How does disordered thought affect daily functioning?
Generated: What are some specific symptoms of a reduced sense of touch in individuals experiencing this condition?
Generated: What specific symptoms or side effects have you experienced as a result of taking multiple doses of Lasix 40 mg per day?
Generated: What preventive measures should be taken when using fluticasone alongside oxygen therapy?
Generated: What is the typical progression of developmental delays if left untreated?
Generated: What are some preventive measures for avoiding ringworm?
Generated: What are the usual symptoms of leg cramps that could be potentially eased by quinine in seltzer water?
Generated: What are common treatments for nose bleeds that occur for no apparent reason?
Generated: What can happen if diarrhoea is left untreated?
Generated: What are the usual symptoms of SSPE, a rare and 

In [None]:
# Now that we have 300 questions, we generate responses to them in DPO format (i.e., prompt, chosen, rejected)

In [None]:
def save_questions_and_responses_to_csv(extended_questions, chosen_responses, rejected_responses, file_name = 'questions_and_responses.csv'):
  """Write prompt / chosen / rejected triples to a CSV file and report the counts.

  Args:
    extended_questions: values for the 'prompt' column.
    chosen_responses: values for the 'chosen' column.
    rejected_responses: values for the 'rejected' column.
    file_name: destination path; defaults to 'questions_and_responses.csv'.

  The file is written without an index column. One summary line is printed
  per column, in the order prompt, chosen, rejected.
  """
  columns = {
      'prompt': extended_questions,
      'chosen': chosen_responses,
      'rejected': rejected_responses,
  }
  pd.DataFrame(columns).to_csv(file_name, index = False)

  summary = (
      ("questions", extended_questions),
      ("chosen responses", chosen_responses),
      ("rejected responses", rejected_responses),
  )
  for label, values in summary:
    print(f"Saved {len(values)} {label} to {file_name}")

In [None]:
def generate_response(question, chosen = True, max_retries = 3):
  """Ask the model for one answer to `question`, either correct or deliberately wrong.

  Args:
    question: the question text appended to the instruction prompt.
    chosen: True requests a correct answer (the DPO "chosen" response);
      False requests an incorrect one (the "rejected" response).
    max_retries: number of consecutive API errors tolerated before the error
      is re-raised. Retrying keeps one transient failure from aborting a long
      batch whose results are only saved at the very end.

  Returns:
    The stripped text of the first completion choice; an empty string if the
    API returned no text content.

  Raises:
    openai.APIError: after `max_retries` consecutive failed API calls.
  """
  import time  # local import: only needed for the retry back-off below

  # The two modes differ only in one adverb of the prompt and in the log label.
  if chosen:
    adverb, label = "correctly", "Chosen"
  else:
    adverb, label = "incorrectly", "Rejected"
  prompt = f"Answer the following question {adverb} (in less than 70 tokens): " + question

  failed_attempts = 0
  while True:
    try:
      completion = client.chat.completions.create(
          model = "gpt-3.5-turbo",
          messages = [
              {"role": "system", "content": "You are a helpful assistant."},
              {"role": "user", "content": prompt}
          ]
      )
      break
    except openai.APIError as err:
      # Same message format as the retry loop in generate_questions.
      print(f"Encountered an error: {err}")
      failed_attempts += 1
      if failed_attempts >= max_retries:
        raise
      # Exponential back-off (capped) so transient errors can clear.
      time.sleep(min(2 ** failed_attempts, 30))

  # `content` may be None; fall back to an empty string instead of crashing.
  answer = (completion.choices[0].message.content or "").strip()

  print(f"{label} response: {answer}")
  return answer

In [None]:
def generate_responses(questions): # respond to all the questions
  """Produce a chosen and a rejected response for every question.

  Args:
    questions: iterable of question strings.

  Returns:
    A tuple `(chosen_responses, rejected_responses)` of two lists, each with
    one entry per question in input order.
  """
  chosen_responses, rejected_responses = [], []

  for position, question in enumerate(questions, start = 1):
    # The chosen answer is requested first, then the rejected one.
    chosen_responses.append(generate_response(question, chosen = True))
    rejected_responses.append(generate_response(question, chosen = False))

    print(f"Chosen/ Rejected responses generated for question {position}")

  return chosen_responses, rejected_responses

In [None]:
# Generate a chosen/rejected pair for every question, then store the triples as CSV.
chosen_responses, rejected_responses = generate_responses(questions = extended_questions)
save_questions_and_responses_to_csv(
    extended_questions,
    chosen_responses = chosen_responses,
    rejected_responses = rejected_responses,
)

Chosen response: Losing balance can be a symptom of various conditions, including inner ear problems, neurological issues, dehydration, medication side effects, or musculoskeletal problems. It is important to consult a healthcare provider for proper evaluation and diagnosis.
Rejected response: Losing balance is a symptom of eating too much chocolate. It can cause dizziness and stumbling. It is important to limit chocolate consumption to avoid balance problems.
Chosen/ Rejected responses generated for question 1
Chosen response: Social anxiety feels like an overwhelming fear of being judged or embarrassed in social situations, leading to physical symptoms like sweating, rapid heartbeat, and avoidance behaviors.
Rejected response: Social anxiety feels like a warm, cozy blanket wrapping around you, providing comfort and security in social situations. It gives you a sense of confidence and ease when meeting new people.
Chosen/ Rejected responses generated for question 2
Chosen response: No