In [1]:
import os
import sys

In [2]:
import base64
import json
import os
import re
import urllib.parse

import pandas as pd
import requests

In [3]:
# Fallback connection settings. setdefault only fills in a variable that is not
# already defined, so a real key exported in the shell (or loaded from a .env
# file) is never clobbered by the placeholder below.
os.environ.setdefault("API_KEY", "your-API-key")  # placeholder: supply the real key via the environment
os.environ.setdefault("API_VERSION", "2024-02-01")
os.environ.setdefault("RESOURCE_ENDPOINT", "https://unified-api.ucsf.edu/general")

In [11]:
# Pull the connection settings out of the process environment. The names must
# match the variables defined in the environment / .env file; any variable that
# is not set comes back as None.
API_KEY, API_VERSION, RESOURCE_ENDPOINT = (
    os.environ.get(name) for name in ("API_KEY", "API_VERSION", "RESOURCE_ENDPOINT")
)

In [12]:
# Azure OpenAI settings read from the environment, with fallbacks where the
# code provides one. `engine` is the deployment name used when building the LLM.
api_base = os.getenv("RESOURCE_ENDPOINT")
api_key = os.getenv("API_KEY")
api_type = os.environ.get("AZURE_OPENAI_TYPE", "azure")
api_version = os.environ.get("API_VERSION", "2023-06-01-preview")
engine = os.getenv("AZURE_OPENAI_DEPLOYMENT", 'gpt-4o-2024-05-13')
model = os.getenv("AZURE_OPENAI_MODEL")  # None when AZURE_OPENAI_MODEL is unset

# os.environ only accepts strings, so assigning None below would fail with an
# opaque TypeError. Name the missing variable(s) instead.
_unset = [
    name
    for name, value in (("RESOURCE_ENDPOINT", api_base), ("API_KEY", api_key))
    if value is None
]
if _unset:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_unset)
    )

# Re-export the settings under the OPENAI_* names. The API type honours
# AZURE_OPENAI_TYPE (default "azure") rather than being hard-coded.
os.environ["OPENAI_API_TYPE"] = api_type
os.environ["OPENAI_API_VERSION"] = api_version
os.environ["OPENAI_API_BASE"] = api_base
os.environ["OPENAI_API_KEY"] = api_key

In [13]:
# Deployment name handed to the Azure OpenAI embedding client as `deployment_name`.
embedding_deployment = 'text-embedding-ada-002'

In [7]:
# Instruction text placed ahead of the fracture description by construct_prompt.
prefix = (
    "Given the following description, identify the most detailed AO/OTA Classification, "
    "including bone, location, group, and subgroup, as well as universal modifiers and "
    "qualifications if present. Explain your answer."
)


def construct_prompt(text):
    """Return the explanation-style prompt for ``text``.

    The description is first passed through ``clean_text`` and then appended
    to the module-level ``prefix``, separated by ``": \\n"``.
    """
    description = clean_text(text)
    return f'{prefix}: \n{description}'

def clean_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is collapsed as well, not stripped.
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", text)

def construct_prompt_2(text):
    """Return the code-only prompt: the case ``text`` followed by the task instruction.

    Unlike ``construct_prompt``, the text is inserted as-is (no whitespace cleanup).
    """
    return (
        f'Case: {text} '
        'Task: Analyze the provided fracture description, determine the AO Classification '
        'based on the details of the affected bone, location, and characteristics, '
        'and provide as response !only! the appropriate AO Code!'
    )

In [8]:
from typing import Optional, List, Mapping, Any

from llama_index import ServiceContext, SimpleDirectoryReader, SummaryIndex, VectorStoreIndex, GPTVectorStoreIndex
from llama_index.embeddings import AzureOpenAIEmbedding
from llama_index.callbacks import CallbackManager
from llama_index.llms import (
    LLM,
    OpenAI,
    AzureOpenAI,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index import set_global_service_context

In [14]:
# Embedding client, configured from the endpoint, key and API version read
# from the environment plus the embedding deployment name.
embed_model = AzureOpenAIEmbedding(
    deployment_name=embedding_deployment,
    api_version=API_VERSION,
    api_key=API_KEY,
    azure_endpoint=RESOURCE_ENDPOINT,
)

In [15]:
# Chat model: the deployment named by `engine`, reached through the same
# endpoint, key and API version as the embedding client.
llm = AzureOpenAI(
    engine=engine,
    azure_endpoint=RESOURCE_ENDPOINT,
    api_key=API_KEY,
    api_version=API_VERSION,
)

# Bundle the LLM and the embedding model into a single ServiceContext.
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)

In [16]:
# Install `service_context` through llama_index's global setter.
# NOTE(review): presumably so later llama_index calls use it without it being
# passed explicitly — confirm against the installed llama_index version.
set_global_service_context(service_context)  

In [17]:
# Load everything under the "augmented_data" directory and build the vector
# index from it, using the service context configured above.
documents = SimpleDirectoryReader("augmented_data").load_data()
index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)

In [18]:
# Query engine over the index built above, created with similarity_top_k=5.
# NOTE(review): presumably 5 is the number of retrieved chunks per query — confirm
# against the llama_index docs and consider moving it to a named constant.
query_engine = index.as_query_engine(similarity_top_k=5)

# Run Queries

In [None]:
# Load the radiology reports to classify.
# Expected CSV layout: 2 columns, study_id and radiology_report_text.
df = pd.read_csv('/path/to/radiology_reports.csv')

# Fail fast with a readable message if a column read by the query loop is absent.
assert {"study_id", "radiology_report_text"} <= set(df.columns), (
    "input CSV must contain 'study_id' and 'radiology_report_text' columns; "
    f"found {list(df.columns)}"
)

In [None]:
# Instruction text for each prompt category. Both variants ask for the most
# precise AO/OTA label; the first asks for the label only, the second also asks
# for a brief explanation.
prompt_prefixes = {
    "single_prompt_data_augmented": (
        "Provide an AO/OTA classification label for the following fracture description. "
        "Be as precise as possible, including subgroups, universal modifiers, and qualifiers if available. "
        "Provide only the label in your response, with no explanation. "
        "If there are multiple fractures identified, provide a comma-separated list."
    ),
    "data_augmented_with_explanation": (
        "Provide an AO/OTA classification label for the following fracture description. "
        "Be as precise as possible, including subgroups, universal modifiers, and qualifiers if available. "
        "Provide the label in your response, followed by a brief explanation for each part of the classification. "
        "If there are multiple fractures identified, provide a comma-separated list."
    ),
}

# Categories to run, in dict insertion order.
prompt_categories = list(prompt_prefixes)

In [None]:
# Run every prompt category against every report and collect one tuple per
# query: (study_id, category, instruction text, report text, model answer).
results = []
for _, row in df.iterrows():
    report_text = row['radiology_report_text']
    for prompt in prompt_categories:
        prompt_prefix = prompt_prefixes[prompt]
        full_prompt = f"{prompt_prefix}: {report_text}"
        # Query with the assembled prompt (instruction + report text). Passing
        # `prompt` here would send only the category key and the report would
        # never reach the model.
        response = query_engine.query(full_prompt)
        results.append((row['study_id'], prompt, prompt_prefix, report_text, response.response))

In [None]:
# Turn the collected tuples into a DataFrame with one row per (report, prompt) query.
# Note: this rebinds `results` from the list built by the query loop to a DataFrame.
results = pd.DataFrame(results, columns=["study_id", "prompt", "prompt_prefix", "radiology_report_text", "response"])

In [None]:
# Write the results table to CSV.
# NOTE(review): no `index=` argument is passed, so pandas' default index handling
# applies — confirm whether the row index is wanted in the output file.
results.to_csv("/path/to/output_file.csv")