In [12]:
import pandas as pd

# Workbook location and the worksheet that holds the code list
file_path = '01-code-list-main-db.xlsx'
sheet_name = 'main'

# Load the worksheet into a DataFrame; any failure is reported instead of raised
try:
    df = pd.read_excel(io=file_path, sheet_name=sheet_name)
    print("Excel sheet read successfully:", df, sep="\n")
except Exception as e:
    print(f"An error occurred: {e}")


Excel sheet read successfully:
  Code list  Relevant Event/ Synonym  Search Term  Full Search Term  \
0  Hayfever                      NaN          NaN               NaN   

   Exec Draft  Manager Review  Search Term Finalised  
0         NaN             NaN                    NaN  


## Get synonyms for a disease

In [13]:
def read_api_key(file_path):
    """
    Load the OpenAI API key stored on the first line of a text file.

    Args:
    file_path (str): Location of the text file holding the key.

    Returns:
    str or None: The first line with surrounding whitespace removed
    (an empty string for an empty file), or None when the file is
    missing or cannot be read. Failures are printed, not raised.
    """
    try:
        with open(file_path, 'r') as key_file:
            # Only the first line is considered; anything after it is ignored
            first_line = key_file.readline()
    except FileNotFoundError:
        print("The file was not found.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    else:
        return first_line.strip()

# Load the key once and report the outcome (only a short prefix is echoed)
api_key = read_api_key('00-openai-apikey-codes-list.txt')
if not api_key:
    print("Failed to load API Key.")
else:
    print("API Key loaded successfully:", api_key[:5])


API Key loaded successfully: sk-pr


In [17]:
import pandas as pd
import difflib

# Function to normalize text by removing spaces and converting to lower case
def normalize_text(text):
    """
    Return ``text`` lower-cased with all whitespace removed.

    Args:
    text (str): The string to normalize.

    Returns:
    str: The normalized string, e.g. "Hay Fever" -> "hayfever".
    """
    return ''.join(text.lower().split())

# Function to check for similar disease names in the DataFrame
def find_similar_diseases(df, disease_name, column='Code list', threshold=0.8):
    """
    Report whether ``df[column]`` already holds entries similar to ``disease_name``.

    Both sides are normalized with ``normalize_text`` and compared with
    ``difflib.SequenceMatcher``; an entry matches when its ratio is strictly
    greater than ``threshold``. Matching entries are printed in normalized form.

    Args:
    df (pandas.DataFrame): Table containing the existing entries.
    disease_name (str): The name to look for.
    column (str): Column of ``df`` to search.
    threshold (float): Similarity ratio an entry must exceed to count as a match.

    Returns:
    bool: True if at least one similar entry was found, otherwise False.
    """
    normalized_disease_name = normalize_text(disease_name)
    # Empty (NaN) cells are skipped and non-string cells are stringified, so
    # normalize_text is only ever called on str values.
    normalized_column = df[column].dropna().astype(str).map(normalize_text)

    # Argument order is kept (name first, entry second) because
    # SequenceMatcher.ratio() can depend on it.
    matches = [
        entry
        for entry in normalized_column
        if difflib.SequenceMatcher(None, normalized_disease_name, entry).ratio() > threshold
    ]

    if matches:
        print("Warning: Similar entries found in the Database:")
        for match in matches:
            print(match)
        return True
    else:
        return False

def get_disease_name(df):
    """
    Interactively ask the user for a disease name and confirm it.

    The confirmed name is checked against ``df`` with ``find_similar_diseases``.
    When similar entries already exist, the user is asked whether to proceed,
    enter a new name, or exit; an unrecognised answer re-asks that question
    rather than restarting from the disease-name prompt.

    Args:
    df (pandas.DataFrame): Table passed through to ``find_similar_diseases``.

    Returns:
    str or None: The confirmed disease name with surrounding whitespace
    removed, or None if the user chose to exit.
    """
    while True:
        # Ask for the disease name
        disease = input("What disease do you want to find the synonyms of? ").strip()
        if not disease:
            print("Disease name cannot be empty.")
            continue

        # Ask for confirmation ("yes" or "y", case-insensitive)
        confirmation = input(f"You entered '{disease}'. Is this correct? (yes/no): ")
        if confirmation.strip().lower() not in ('yes', 'y'):
            print("Please re-enter the disease name.")
            continue

        # Check for similar entries in the DataFrame
        if not find_similar_diseases(df, disease):
            print("Disease name confirmed.")
            return disease

        # Keep asking for a choice until a recognised option is given
        while True:
            proceed = input("Similar entries found. Do you want to proceed anyway, enter a new name, or exit? (proceed/new/exit): ").strip().lower()
            if proceed == 'proceed':
                print("Proceeding with the entered disease name.")
                return disease
            if proceed == 'exit':
                print("Exiting the process.")
                return None
            if proceed == 'new':
                print("Please re-enter the disease name.")
                break  # back to the outer loop for a fresh name
            print("Invalid option, please enter a valid choice (proceed, new, or exit).")


# Example DataFrame loading (replace with actual loading from your data source)
# df = pd.read_csv('path_to_your_file.csv')

# Prompt the user for a disease name, checking it against the entries in df.
# get_disease_name returns None if the user chooses to exit.
disease_name = get_disease_name(df)
print("Disease to find synonyms for:", disease_name)

# To load df from a different source, uncomment the example DataFrame loading line above and replace it with your actual loading code.


What disease do you want to find the synonyms of?  Hayfever
You entered 'Hayfever'. Is this correct? (yes/no):  yes


hayfever


Similar entries found. Do you want to proceed anyway, enter a new name, or exit? (proceed/new/exit):  new


Please re-enter the disease name.


What disease do you want to find the synonyms of?  Hayfever
You entered 'Hayfever'. Is this correct? (yes/no):  yes


hayfever


Similar entries found. Do you want to proceed anyway, enter a new name, or exit? (proceed/new/exit):  proceed


Proceeding with the entered disease name.
Disease to find synonyms for: Hayfever


In [18]:
# Display the currently selected disease name
disease_name

'Hayfever'

In [20]:
def read_and_customize_prompt(folder_path, file_name, disease_name):
    """
    Load a prompt template from disk and fill in the disease name.

    Every occurrence of the literal placeholder ``{disease_name}`` in the
    template is replaced; other text is left untouched.

    Args:
    folder_path (str): Folder that contains the template file.
    file_name (str): Name of the template file inside ``folder_path``.
    disease_name (str): Value substituted for the placeholder.

    Returns:
    str or None: The filled-in prompt, or None when the file is missing or
    any other error occurs. Failures are printed, not raised.
    """
    template_path = f"{folder_path}/{file_name}"
    try:
        with open(template_path, 'r', encoding='utf-8') as template_file:
            # Substitution stays inside the try so a bad disease_name is reported too
            return template_file.read().replace("{disease_name}", disease_name)
    except FileNotFoundError:
        print(f"The file {file_name} was not found in {folder_path}.")
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
    # Reached only when one of the handlers above ran
    return None

# Usage Example:
folder_path = '01-prompts'
file_name = '01-get-synonyms.txt'

# disease_name is the value returned by get_disease_name(df) in the earlier cell.
# It is deliberately not reassigned here: a hard-coded name would silently
# discard the user's choice and always build the prompt for the same disease.
if disease_name:
    prompt_text = read_and_customize_prompt(folder_path, file_name, disease_name)
else:
    # get_disease_name returns None when the user chooses to exit
    print("No disease name selected; skipping prompt creation.")
    prompt_text = None

if prompt_text:
    print("Prompt text loaded successfully:")
    print(prompt_text)
else:
    print("Failed to load the prompt text.")


Prompt text loaded successfully:
Prompt:

You are tasked with providing synonyms for a specified disease, along with a score for each synonym to indicate its closeness to the original disease name. The score should range from 0 to 10, where 10 indicates a synonym that is essentially identical to the disease name.

Task:

For the disease "Hayfever", list all relevant synonyms. Each synonym should be accompanied by a score reflecting its similarity to "Hayfever". Scores close to 10 suggest high similarity, whereas lower scores indicate less similarity.

For instance, "hay fever" is also known as "allergic rhinitis." Typically, to find all synonymous terms for "hay fever," I conduct a Google search to gather all relevant terms. These terms are then added to our disease database to ensure we capture all associated codes for the disease.



Expectation:

Direct Synonyms: List terms that are very similar or nearly identical to "Hayfever" with high scores.
Related Terms: List terms that are r

In [None]:
from openai import OpenAI

# Authenticate with the key loaded from file earlier in the notebook. Passing
# None (key missing or empty) lets the client fall back to the OPENAI_API_KEY
# environment variable, which is what a bare OpenAI() call does.
client = OpenAI(api_key=api_key or None)

if not prompt_text:
    raise ValueError("prompt_text is empty; load the prompt before calling the API.")

# JSON mode requires that at least one message explicitly asks for JSON output.
# NOTE(review): no system prompt is defined elsewhere in the visible notebook;
# adjust this wording if a project-specific one exists.
system_prompt = (
    "You are a medical terminology assistant. "
    "Respond only with a valid JSON object."
)
# The customized synonym prompt built earlier is the user message
user_prompt = prompt_text

response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ],
    response_format={"type": "json_object"}  # Setting the response to be in JSON mode
)

# Extracting the JSON output from the response
json_output = response.choices[0].message.content
print(json_output)


In [None]:
import json

# Parse the JSON string returned by the model (json_output) into Python objects.
# json.loads raises json.JSONDecodeError if the text is not valid JSON.
synonyms_data = json.loads(json_output)
print(synonyms_data)
