In [4]:
import os
import openai
import pandas as pd
import logging
import random
import config
import re
import time 
import argparse
import json


In [42]:
def query_gpt(prompt, gpt_version, test, print_output = False):
    """Send one user message to an OpenAI chat model and return the reply text.

    Args:
        prompt: Text sent as the single 'user' message.
        gpt_version: Model name passed to the API as ``model``.
        test: When truthy, skip the API call and return ``prompt`` with
            '.test.response' appended.
        print_output: When truthy, log the raw completion object at DEBUG
            level (useful when something goes wrong).

    Returns:
        The message content of the first (and only requested) choice.
    """
    logging.debug(f'querying gpt {gpt_version}')

    # Dry-run mode: no network traffic, deterministic echo of the prompt.
    if test:
        return prompt + '.test.response'

    openai.api_key = config.OPENAI_API_KEY

    request_kwargs = {
        'model': gpt_version,
        # temperature 0 keeps sampling randomness at its minimum
        'temperature': 0,
        # ask for exactly one completion per prompt
        'n': 1,
        'messages': [{'role': 'user', 'content': prompt}],
    }
    chat_result = openai.ChatCompletion.create(**request_kwargs)

    if print_output:
        logging.debug(chat_result)

    first_choice = chat_result.choices[0]
    return first_choice['message']['content']

In [30]:
def load_hp_synonyms(hpo_path='hp.json'):
    """Load HPO terms together with their synonyms from an ontology JSON file.

    The file is expected to hold the term nodes under
    ``['graphs'][0]['nodes']``. A node is kept only when the last path
    component of its ``id`` contains 'HP_' and it provides ``lbl`` and
    ``meta.synonyms``; nodes missing any of these are skipped.

    Args:
        hpo_path: Path to the ontology JSON file. Defaults to 'hp.json'.

    Returns:
        A list of dicts with keys 'hp_id' (e.g. 'HP_0000016'), 'name' and
        'synonyms' (list of the synonym 'val' strings).
    """
    # Context manager so the file handle is closed deterministically.
    with open(hpo_path, encoding='utf-8') as hpo_file:
        hpo_json = json.load(hpo_file)

    synonym_dict_list = []
    skipped_node_count = 0
    for node in hpo_json['graphs'][0]['nodes']:
        try:
            # "id" : "http://purl.obolibrary.org/obo/HP_0000016"
            hp_id = node['id'].split('/')[-1]
            if 'HP_' not in hp_id:
                continue
            synonym_dict_list.append({
                'hp_id': hp_id,
                'name': node['lbl'],
                'synonyms': [synonym['val'] for synonym in node['meta']['synonyms']],
            })
        except (KeyError, TypeError, AttributeError):
            # Best-effort parse: nodes without an id, label or synonym list are
            # skipped, but only for these expected shape errors -- anything
            # else now surfaces instead of being silently swallowed.
            skipped_node_count += 1

    if skipped_node_count:
        logging.debug(f'skipped {skipped_node_count} nodes lacking id, label or synonyms')
    return synonym_dict_list

In [31]:
def sample_list(input_list, x):
    """Return at most ``x`` randomly chosen elements of ``input_list``.

    When the list holds ``x`` elements or fewer it is returned as-is (the
    same object, not a copy); otherwise ``x`` elements are drawn without
    replacement via ``random.sample``.
    """
    if len(input_list) <= x:
        return input_list
    return random.sample(input_list, x)

In [66]:
# Sample HPO terms, ask GPT for synonyms of each one, and cache the responses
# on disk so repeated runs do not re-query terms that were already answered.
synonym_dict_list = load_hp_synonyms()
sampled_synonym_dict_list = sample_list(synonym_dict_list, 43)
gpt_version = 'gpt-4'
print_output = False
test = False
gpt_response_path = 'sampled_synonym_dict_list_gpt_response.json'

# Load the responses cached by earlier runs, if any.
if os.path.exists(gpt_response_path):
    with open(gpt_response_path) as gpt_response_file:
        sampled_synonym_dict_gpt_response_list = json.load(gpt_response_file)
else:
    sampled_synonym_dict_gpt_response_list = []
hp_id_list_with_gpt_response = [cached_dict['hp_id'] for cached_dict in sampled_synonym_dict_gpt_response_list]

try:
    for sampled_synonym_dict in sampled_synonym_dict_list:
        if sampled_synonym_dict['hp_id'] in hp_id_list_with_gpt_response:
            continue
        prompt = f'Please generate five synonyms for the given phenotype term. For example, if the phenotype term is "Loss of consciousness", return ["Fainting", "Loss of consciousness", "Passing out"]. Phenotype term: "{sampled_synonym_dict["name"]}"'
        sampled_synonym_dict['gpt_response'] = query_gpt(prompt, gpt_version, test, print_output = print_output)
        # Append only terms that were actually queried. Extending with the whole
        # sample would add duplicates of cached terms that carry no
        # 'gpt_response' key and break downstream consumers.
        sampled_synonym_dict_gpt_response_list.append(sampled_synonym_dict)
finally:
    # Persist whatever has been collected, even if a query failed part-way,
    # so responses already obtained are not lost.
    with open(gpt_response_path, 'w') as gpt_response_file:
        json.dump(sampled_synonym_dict_gpt_response_list, gpt_response_file, indent=2)



In [75]:
def convert_to_per_synonyms(gpt_response_path='sampled_synonym_dict_list_gpt_response.json',
                            output_path='synonym_df.csv'):
    """Flatten cached GPT responses into one row per generated synonym.

    Every double-quoted string found in an entry's 'gpt_response' is treated
    as a candidate synonym; candidates of three characters or fewer are
    dropped. The resulting table is written to ``output_path`` as CSV.

    Args:
        gpt_response_path: JSON file holding a list of dicts with 'hp_id',
            'name' and 'gpt_response' keys.
        output_path: Destination CSV path.

    Returns:
        A pandas DataFrame with columns 'synonym', 'name' and 'hp_id'.
    """
    # Context manager so the file handle is closed deterministically.
    with open(gpt_response_path) as gpt_response_file:
        sampled_synonym_dict_gpt_response_list = json.load(gpt_response_file)

    synonym_dict_list = []
    for sampled_synonym_dict in sampled_synonym_dict_gpt_response_list:
        gpt_response = sampled_synonym_dict.get('gpt_response')
        if not isinstance(gpt_response, str):
            # Entries cached without a response cannot contribute synonyms;
            # skip them instead of failing on the missing key.
            continue
        hp_id = sampled_synonym_dict['hp_id']
        name = sampled_synonym_dict['name']
        for synonym in re.findall(r'"(.*?)"', gpt_response):
            if len(synonym) > 3:
                synonym_dict_list.append({'synonym': synonym,'name':name, 'hp_id': hp_id})

    # Explicit columns keep the schema stable even when no synonym was found.
    synonym_df = pd.DataFrame(synonym_dict_list, columns=['synonym', 'name', 'hp_id'])
    synonym_df.to_csv(output_path, index = False)
    return synonym_df


In [76]:
# Build the per-synonym table from the cached GPT responses; the call also
# writes the table to 'synonym_df.csv' as a side effect.
synonym_df = convert_to_per_synonyms()