In [53]:
import pandas as pd
import json
from typing import Dict, List, Union, Any
import os
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 200)
API = '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'

In [2]:
def read_ndjson(filename):
    with open(filename, 'r') as f:
        data = [json.loads(line) for line in f]
    return pd.DataFrame(data)

conditions = read_ndjson('data/Condition.ndjson')
patients = read_ndjson('data/Patient.ndjson')

In [3]:
conditions.to_json('data/Condition.json')
patients.to_json('data/Patient.json')

In [4]:
# db setup
# 
# create source tables
# create etl tables
# create destination tables - patient

# 


In [5]:
# What files do we want to generate?
#1 - Patient - schema
# sources
# - patients
# - edstays
# - admissions
# - uuid_namespace
# - map_gender
# - map_marital_status
# - fn_patient_extension
#2 - Condition

import json
from genson import SchemaBuilder


In [6]:
import streamlit as st # import the Streamlit library
from langchain.chains import LLMChain, SimpleSequentialChain # import LangChain libraries
from langchain.llms import OpenAI # import OpenAI model
from langchain.prompts import PromptTemplate # import PromptTemplate

In [7]:
# Set the title of the Streamlit app
st.title("✅ What's TRUE  : Using LangChain `SimpleSequentialChain`")

# Add a link to the Github repository that inspired this app
st.markdown("Inspired from [fact-checker](https://github.com/jagilley/fact-checker) by Jagiley")

2023-05-17 14:27:47.847 
  command:

    streamlit run /Users/smithm/Library/Caches/pypoetry/virtualenvs/hackaithon-P5VjHMQO-py3.11/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]


DeltaGenerator()

In [9]:


# If an API key has been provided, create an OpenAI language model instance
if API:
    llm = OpenAI(temperature=0.7, openai_api_key=API)
else:
    # If an API key hasn't been provided, display a warning message
    st.warning("Enter your OPENAI API-KEY. Get your OpenAI API key from [here](https://platform.openai.com/account/api-keys).\n")


In [10]:
user_question = st.text_input(
    "Enter Your Question : ",
    placeholder = "Cyanobacteria can perform photosynthetsis , are they considered as plants?",
)

2023-05-17 12:57:45.466 
  command:

    streamlit run /Users/smithm/Library/Caches/pypoetry/virtualenvs/hackaithon-P5VjHMQO-py3.11/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]


In [25]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import VectorDBQA
from langchain.document_loaders import CSVLoader

In [26]:
loader = CSVLoader('data/mtsamples.csv')
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

embeddings = OpenAIEmbeddings(openai_api_key=API)
vectordb = Chroma.from_documents(texts, embeddings, persist_directory="db/")


Using embedded DuckDB with persistence: data will be stored in: db/


In [43]:
qa = VectorDBQA.from_chain_type(llm=OpenAI(openai_api_key=API), chain_type='stuff', 
                                vectorstore=vectordb)



In [48]:
query = "Which patient ids were diagnosed with heart disease?"
qa.run(query)

' 3338 and 4758'

In [54]:
def flatten_nested_dictionary(dictionary, parent_key='', flattened_dict=None):
    if flattened_dict is None:
        flattened_dict = {}

    for key, value in dictionary.items():
        new_key = f"{parent_key}.{key}" if parent_key else key

        if isinstance(value, dict):
            flatten_nested_dictionary(value, new_key, flattened_dict)
        else:
            flattened_dict[new_key] = value

    return flattened_dict

def generate_schema(ndjson_file='data/Patient.ndjson'):
    builder = SchemaBuilder()

    with open(ndjson_file, 'r') as f:
        for line in f:
            builder.add_object(json.loads(line))

    schema = builder.to_schema()
    return schema

def data_mapping_plan(files, destination_sample_file):
    desired_schema = flatten_nested_dictionary(generate_schema(destination_sample_file))
    sample = pd.read_json(destination_sample_file, lines=True)
    df_normalized = sample.apply(lambda x: pd.json_normalize(x) if isinstance(x, dict) else x)  # type: Union[pd.DataFrame, Any]

    loaded_source_files = {}
    for file in files:
        loaded_source_files[os.path.splitext(os.path.basename(file))[0]] = pd.read_csv(file)

    destination_table = os.path.splitext(os.path.basename(destination_sample_file))[0]
    
    command = f"""
    You are a professional data engineer and healthcare expert. Create a SQL transformation to transform records from the {loaded_source_files.keys()} into records for the {destination_table} table. Don't generate the entire SQL transformation yet.
    """
    destination_prompt = f"""The {destination_table} table schema:"""
    for name, value in desired_schema.items():
        destination_prompt = destination_prompt + f"- {name}:{value}"
    
    destination_sample_prompt = f"""{destination_table} sample:\n {str(df_normalized.head(5))}"""

    source_prompts = []
    for source_name, df in loaded_source_files.items():
        source_prompts.append(f"""{source_name} sample:\n {str(df.head(5))}""")

    context_prompt = '\n'.join([command, destination_prompt, destination_sample_prompt, *source_prompts])
    unique_key = list(desired_schema.keys())[0]
    template = context_prompt + """\n 
        {chat_history}
        Generate the SQL query for id, {desired_column}
    """

    prompt = PromptTemplate(
        input_variables=["chat_history", "desired_column"], 
        template=template
    )
    memory = ConversationBufferMemory(memory_key="chat_history", input_key="desired_column")
    chain = load_qa_chain(OpenAI(temperature=0, openai_api_key=API), chain_type="stuff", memory=memory, prompt=prompt)
    return chain({"input_documents": desired_schema.keys(), "human_input": query}, return_only_outputs=True)
    # prompt chain
        # ask what field should be used to map two files together, foreign keys?
        # provide destination records
        # provide sample inputs

dest_sample = 'data/Patient.ndjson'
source_files=[
    'data/mimic-iv-clinical-database-demo-2.2/hosp/admissions.csv',
    'data/mimic-iv-clinical-database-demo-2.2/hosp/patients.csv',
]
plan = data_mapping_plan(
    files=source_files,
    destination_sample_file=dest_sample
)


    

ValueError: too many values to unpack (expected 2)

In [33]:
dest_schema

{'$schema': 'http://json-schema.org/schema#',
 'type': 'object',
 'properties': {'resourceType': {'type': 'string'},
  'id': {'type': 'string'},
  'meta': {'type': 'object',
   'properties': {'versionId': {'type': 'string'},
    'lastUpdated': {'type': 'string'},
    'source': {'type': 'string'},
    'profile': {'type': 'array', 'items': {'type': 'string'}}},
   'required': ['lastUpdated', 'profile', 'source', 'versionId']},
  'text': {'type': 'object',
   'properties': {'status': {'type': 'string'}, 'div': {'type': 'string'}},
   'required': ['div', 'status']},
  'extension': {'type': 'array',
   'items': {'type': 'object',
    'properties': {'url': {'type': 'string'},
     'extension': {'type': 'array',
      'items': {'type': 'object',
       'properties': {'url': {'type': 'string'},
        'valueCoding': {'type': 'object',
         'properties': {'system': {'type': 'string'},
          'code': {'type': 'string'},
          'display': {'type': 'string'}},
         'required': ['cod

key1: value1
key2.key3: value3
key2.key4.key5: value5
key6: value6
