In [24]:
from dotenv import load_dotenv
import os

# Populate os.environ before the AZURE_OPENAI_* settings are read further down
# (presumably from a local .env file -- confirm its location for your setup).
load_dotenv()

True

In [27]:
import csv
from tqdm import tqdm
from langchain_openai import AzureOpenAI, AzureChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [30]:
# Prompt sent for every source (table, column) pair. The {source_table} and
# {source_column} placeholders are filled in when the chain is invoked.
OMOP_MATCH_TEMPLATE = """
    Given the source healthcare data table and column:
    Source Table: {source_table}
    Source Column: {source_column}

    Please match this to the most appropriate table and column in the OMOP Common Data Model.
    Provide your answer in the following format:
    Table: [OMOP table name]
    Column: [OMOP column name]

    If you're unsure, provide your best guess.
    """

# Connection settings for the Azure OpenAI deployment, all taken from the
# environment; a missing variable raises KeyError here rather than mid-run.
azure_settings = {
    "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
    "azure_deployment": os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
    "openai_api_version": os.environ["AZURE_OPENAI_API_VERSION"],
}

# Azure-hosted chat model client, configured with a low sampling temperature.
llm = AzureChatOpenAI(**azure_settings, temperature=0.1)

# Prompt wrapper declaring the two variables the template expects.
prompt_template = PromptTemplate(
    template=OMOP_MATCH_TEMPLATE,
    input_variables=["source_table", "source_column"],
)

# Converts the chat model's message output into a plain string.
parser = StrOutputParser()

# Pipeline: fill the prompt -> call the chat model -> return the reply text.
chain = prompt_template | llm | parser


def _parse_omop_prediction(response):
    """Extract the predicted OMOP table and column from a model reply.

    The reply is scanned line by line for ``Table:`` and ``Column:`` labels,
    so a leading blank line, a preamble sentence, or markdown decoration
    (``**Table:** person``) does not break parsing. The first non-empty value
    found for each label wins.

    Args:
        response: Text returned by the chain. Non-string values are tolerated.

    Returns:
        A ``(table, column)`` tuple; each element is ``None`` when its label
        was not found in the reply.
    """
    prediction = {"table": None, "column": None}
    if not isinstance(response, str):
        return None, None

    for line in response.splitlines():
        label, colon, value = line.partition(":")
        if not colon:
            continue
        # Drop list/markdown decoration around the label and the value.
        key = label.strip(" \t*-`#>").lower()
        value = value.strip(" \t*`")
        if key in prediction and prediction[key] is None and value:
            prediction[key] = value

    return prediction["table"], prediction["column"]


def process_csv(input_file):
    """Ask the LLM chain for an OMOP mapping for every row of a CSV file.

    The file must have a header row containing ``source_table`` and
    ``source_column``. For each row, the module-level ``chain`` is invoked
    with those two values and the reply is parsed into a predicted table
    and column.

    Args:
        input_file: Path to the CSV file to read.

    Returns:
        A list with one dict per input row, holding the keys
        ``source_table``, ``source_column``, ``target_table_pred`` and
        ``target_table_column_pred``. A prediction is ``None`` when the
        corresponding label could not be found in the model reply.

    Raises:
        KeyError: If a row lacks ``source_table`` or ``source_column``.
        Exception: Any error raised by ``chain.invoke`` propagates unchanged.
    """
    results = []

    # newline="" is what the csv module requires so that quoted fields
    # containing line breaks are read back correctly.
    with open(input_file, "r", newline="") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in tqdm(reader):
            source_table = row["source_table"]
            source_column = row["source_column"]

            # Get prediction from the LLM
            response = chain.invoke(
                {"source_table": source_table, "source_column": source_column}
            )

            # Label-based parsing: an unparsable reply yields None values
            # instead of raising, without hiding unrelated exceptions.
            target_table, target_column = _parse_omop_prediction(response)

            results.append(
                {
                    "source_table": source_table,
                    "source_column": source_column,
                    "target_table_pred": target_table,
                    "target_table_column_pred": target_column,
                }
            )

    return results

In [31]:
# Run the zero-shot mapping over the historical mapping file. This makes one
# model call per CSV row, so it is slow (the recorded run took about 7 minutes
# for 503 rows) and should not be re-run casually.
op_results = process_csv("../historical_data/all_to_OMOP_Mapping.csv")

503it [06:58,  1.20it/s]


In [32]:
import pandas as pd

# One row per source column; the dict keys produced by process_csv become the
# DataFrame columns.
df_results = pd.DataFrame(op_results)

In [33]:
# Persist the predictions next to the notebook (relative path), omitting the
# DataFrame index so the file holds only the four result columns.
df_results.to_csv("zero_shot_ops.csv", index=False)