In [None]:
!wget https://raw.githubusercontent.com/shubhamgupta-dat/MIMIC_2_OMOP/main/data/OMOP_Schema.csv -O ingestion_data/OMOP_Schema.csv
!wget https://raw.githubusercontent.com/shubhamgupta-dat/MIMIC_2_OMOP/main/data/MIMIC_to_OMOP_Mapping.csv -O ingestion_data/MIMIC_to_OMOP_Mapping.csv

## With improved LlamaIndex and Azure OpenAI GPT-4 Turbo

In [1]:
## Load Binaries and Modules
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.core.indices import VectorStoreIndex#, SimpleDirectoryReader
import logging, sys, os

from dotenv import load_dotenv

## Setup Logging Configuration
# Install exactly one stdout handler on the root logger.
# basicConfig already attaches a StreamHandler for `stream`, so adding a second
# handler by hand would print every record twice. `force=True` replaces any
# handlers that are already installed, which keeps this cell idempotent when it
# is re-run in the same kernel.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,  # logging.DEBUG for more verbose output
    force=True,
)

# Setup LLM and Embeddings
## Load Environment Variables
load_dotenv()

## Connection settings shared by the chat and the embedding deployment.
## Values come from the environment (populated by load_dotenv above).
_azure_connection = {
    "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
    "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
    "api_version": os.getenv("OPENAI_API_VERSION"),
}

## Configure Azure LLM
llm = AzureOpenAI(
    model="gpt-4-turbo",
    deployment_name="gpt-4-turbo",
    **_azure_connection,
)

## Configure Azure Embedding
embed_model = AzureOpenAIEmbedding(
    model="text-embedding-ada-002",
    deployment_name="text-embedding-ada-002",
    **_azure_connection,
)

Module 'llama_index.core.global_handler' not found.
Module 'llama_index.core.global_handler' not found.


In [2]:
# Global LlamaIndex configuration.
# `Settings` takes the place of the ServiceContext / set_global_service_context
# setup that this cell used previously.
from functools import partial

import tiktoken
from llama_index.core import set_global_tokenizer
from llama_index.core.settings import Settings

# Register the Azure models on the global Settings object.
Settings.llm = llm
Settings.embed_model = embed_model

# Global tokenizer: the tiktoken encoding of the embedding model, with all
# special tokens allowed in the input text.
enc = tiktoken.encoding_for_model("text-embedding-ada-002")
tokenizer = partial(enc.encode, allowed_special="all")
set_global_tokenizer(tokenizer)

Module 'llama_index.core.global_handler' not found.
global_tokenizer updated


In [3]:
import pandas as pd
from typing import Dict, List
from llama_index.core.indices.vector_store import VectorStoreIndex
from llama_index.core.schema import Document
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.storage.storage_context import StorageContext
from llama_index.core.vector_stores import SimpleVectorStore
from llama_index.core.prompts import PromptTemplate
# from llama_index.llms import OpenAI, Ollama

# import faiss

# # dimensions of text-ada-embedding-002
# d = 4096
# faiss_index = faiss.IndexFlatL2(d)

class OMOPIndexer:
    """Index mapping reference data and query it with an LLM.

    Two kinds of CSV files can be ingested, in any order: historical
    source-to-target mappings and OMOP schema metadata. Every CSV row becomes
    one text document, and all documents share a single ``VectorStoreIndex``.
    ``get_mapping_output`` then runs a draft / critique / improve sequence of
    queries against that index.

    Attributes:
        index: The shared vector index, or ``None`` until something is ingested.
        llm: The LLM client passed to the query engine.
    """

    # Abbreviated headers of the historical-mapping CSV -> readable labels.
    _HISTORY_COLUMN_LABELS = {
        "SRC_ENT": "Source Table",
        "SRC_ATT": "Source Associated Table Column",
        "TGT_ENT": "Target Table",
        "TGT_ATT": "Target Associated Table Column",
    }

    # Columns read from the OMOP metadata CSV, in the order they are rendered.
    _METADATA_COLUMNS = ["TableName", "ColumnName", "ColumnType", "ColumnDesc"]

    _MAPPING_TEMPLATE = (
        "Given the following OMOP tables and columns: {table_columns}\n"
        "Provide a mapping to the corresponding source tables and columns based on the historical mappings and OMOP metadata.\n"
        "Include explanations for each mapping decision."
    )

    _CRITIQUE_TEMPLATE = (
        "You are a data mapping expert. Review the following mapping output and provide feedback:\n"
        "{mapping_output}\n"
        "Identify any potential issues, inconsistencies, or areas for improvement in the mapping."
    )

    _IMPROVEMENT_TEMPLATE = (
        "Based on the original mapping and the critique, provide an improved mapping output:\n"
        "Original mapping: {original_mapping}\n"
        "Critique: {critique}\n"
        "Improved mapping:"
    )

    def __init__(self, llm_client):
        """Store the LLM client; the index is created lazily on first ingest.

        Args:
            llm_client: LLM object handed to ``as_query_engine`` when querying.
        """
        self.index = None
        self.llm = llm_client

    def _add_documents(self, documents) -> None:
        """Parse ``documents`` into nodes and add them to the shared index.

        The first call creates the index on a ``SimpleVectorStore``; later
        calls insert into the existing index, so data ingested earlier is kept
        regardless of the order in which the ingest methods are called.
        """
        nodes = SimpleNodeParser.from_defaults().get_nodes_from_documents(documents)
        if self.index is None:
            storage_context = StorageContext.from_defaults(
                vector_store=SimpleVectorStore()
            )
            self.index = VectorStoreIndex(nodes, storage_context=storage_context)
        else:
            self.index.insert_nodes(nodes)

    def ingest_historical_mappings(self, csv_file_path: str):
        """Add historical source-to-target mappings from a CSV file to the index.

        The file may use either the abbreviated headers (``SRC_ENT``,
        ``SRC_ATT``, ``TGT_ENT``, ``TGT_ATT``) or the readable labels they are
        renamed to.

        Args:
            csv_file_path: Path of the historical-mapping CSV.

        Raises:
            KeyError: If a required column is missing from the file.
        """
        df_history = pd.read_csv(csv_file_path).rename(
            columns=self._HISTORY_COLUMN_LABELS
        )
        labels = list(self._HISTORY_COLUMN_LABELS.values())
        documents = [
            Document(
                text=(
                    f"Source Table: {src_table}, Source Column: {src_column}, "
                    f"Target Table: {tgt_table}, Target Column: {tgt_column}"
                )
            )
            for src_table, src_column, tgt_table, tgt_column in df_history[
                labels
            ].itertuples(index=False, name=None)
        ]
        self._add_documents(documents)

    def ingest_omop_metadata(self, metadata_file_path: str):
        """Add OMOP schema metadata from a CSV file to the index.

        Args:
            metadata_file_path: Path of a CSV with the columns ``TableName``,
                ``ColumnName``, ``ColumnType`` and ``ColumnDesc``.

        Raises:
            KeyError: If a required column is missing from the file.
        """
        df_metadata = pd.read_csv(metadata_file_path)
        documents = [
            Document(
                text=(
                    f"Table: {table}, Column: {column}, "
                    f"Data Type: {column_type}, Description: {description}"
                )
            )
            for table, column, column_type, description in df_metadata[
                self._METADATA_COLUMNS
            ].itertuples(index=False, name=None)
        ]
        self._add_documents(documents)

    def get_mapping_output(
        self, table_columns: Dict[str, List[str]], verbose: bool = True
    ) -> str:
        """Produce a source-to-OMOP mapping through draft, critique and revision.

        Three queries are sent to a query engine built on the index: an initial
        mapping, a critique of that mapping, and an improved mapping based on
        both.

        Args:
            table_columns: OMOP table name -> list of column names to map.
            verbose: When true (the default), print the initial mapping and the
                critique as they are produced.

        Returns:
            The text of the improved mapping.

        Raises:
            RuntimeError: If nothing has been ingested yet.
        """
        if self.index is None:
            raise RuntimeError(
                "No index has been built yet; call ingest_historical_mappings() "
                "or ingest_omop_metadata() before get_mapping_output()."
            )

        query_engine = self.index.as_query_engine(
            llm=self.llm,
            similarity_top_k=5,
        )

        # Initial mapping
        initial_mapping = query_engine.query(
            PromptTemplate(self._MAPPING_TEMPLATE).format(table_columns=table_columns)
        ).response
        if verbose:
            print(initial_mapping)

        # Self-critique
        critique = query_engine.query(
            PromptTemplate(self._CRITIQUE_TEMPLATE).format(
                mapping_output=initial_mapping
            )
        ).response
        if verbose:
            print(critique)

        # Improved mapping
        improved_mapping = query_engine.query(
            PromptTemplate(self._IMPROVEMENT_TEMPLATE).format(
                original_mapping=initial_mapping, critique=critique
            )
        ).response

        return improved_mapping


# Usage example
# Build the index from both reference files, then ask for mappings of two OMOP tables.
indexer = OMOPIndexer(llm_client=llm)
indexer.ingest_historical_mappings("ingestion_data/MIMIC_to_OMOP_Mapping.csv")
indexer.ingest_omop_metadata("ingestion_data/OMOP_Schema.csv")

# OMOP target table -> columns that need a source mapping
table_columns = dict(
    person=["person_id", "birth_datetime", "gender_concept_id"],
    condition_occurrence=[
        "condition_occurrence_id",
        "person_id",
        "condition_concept_id",
    ],
)

mapping_output = indexer.get_mapping_output(table_columns)
print(mapping_output)

Module 'llama_index.core.global_handler' not found.
INFO:httpx:HTTP Request: POST https://innoopenaigpt4-2.openai.azure.com//openai/deployments/text-embedding-ada-002/embeddings?api-version=2023-07-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://innoopenaigpt4-2.openai.azure.com//openai/deployments/text-embedding-ada-002/embeddings?api-version=2023-07-01-preview "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://innoopenaigpt4-2.openai.azure.com//openai/deployments/text-embedding-ada-002/embeddings?api-version=2023-07-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://innoopenaigpt4-2.openai.azure.com//openai/deployments/text-embedding-ada-002/embeddings?api-version=2023-07-01-preview "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://innoopenaigpt4-2.openai.azure.com//openai/deployments/text-embedding-ada-002/embeddings?api-version=2023-07-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://innoopenaigpt4-2.openai.azure.com//openai/deployments/text-embedding

In [5]:
print(mapping_output)

The improved mapping from the source tables and columns to the OMOP tables and columns can be outlined as follows:

1. `person` table:
   - `person_id`: Continues to map to the `SUBJECT_ID` column in the `DIAGNOSES_ICD` source table, ensuring each individual's data is linked between the source and OMOP data models.
   - `birth_datetime`: Should be mapped from a source column that contains the birth date information of the individual. The specific source column should be identified and included in the mapping documentation.
   - `gender_concept_id`: Should be mapped from a source column that contains gender information, which is then converted to a standard concept ID in the OMOP model. The specific source column should be identified and included in the mapping documentation.

2. `condition_occurrence` table:
   - `condition_occurrence_id`: Remains an internally generated unique identifier for each condition occurrence and does not require mapping from the source table.
   - `person_id`