In [5]:
import re
import json
from bs4 import BeautifulSoup
from glob import glob

In [2]:
files = glob('./ocr/*.json')

In [3]:
file = files[0]

In [32]:
with open(file) as f:
    data = json.load(f)

In [60]:
data

[{'text': "<h1 id='0' style='font-size:18px'>Patient's Chart</h1> <table id='1' style='font-size:14px'><tr><td>No. 00931097</td><td>작성일시 : 2023년 06월 12일 14:37 건강보험</td></tr><tr><td>수 진 자 명</td><td>진 료 과 목 FM1</td></tr><tr><td>입 원 일 시</td><td>2023- 06- 12 14:00 병 실 5병동(511호)</td></tr><tr><td>주 민 번 호</td><td>사업장기호 00000000000</td></tr><tr><td>성 별 연 령</td><td>[F] / 76세(현:77) 사 업 장 명 명칭없음</td></tr><tr><td>전 화 번 호</td><td>증 번 호</td></tr><tr><td>핸드폰 번호</td><td>피보험자명</td></tr><tr><td>주 소</td><td></td></tr><tr><td>부 작 용</td><td>V : BP;116/85- 113 RESP;20 T;36.8 PP2;261 SPO2;98</td></tr><tr><td>Diagnosis</td><td>E785^00 상세불명의 고지질혈증 N10^00 급성 세뇨관- 간질신장염 A419^00 상세불명의 패혈증</td></tr><tr><td>C/C</td><td>J189^00 상세불명의 폐렴 I802^01 심부정맥혈전증 NOS U830^00 반코마이신 내성 I693^00 뇌경색증의 후유증 U8280^00 카바페넴계내성 F009^00 상세불명의 알츠하이머병에서의 치매(G30.9↑) A099^00 상세불명 기원의 위장염 및 결장염 L893^00 욕창궤양 및 압박부위 제4단계 U089^00 상세불명의 코로나- 19의 개인력</td></tr><tr><td>P/ /</td><td>dementia lipidemia CRE PTE DVT CVA</td></tr><tr><td>P/ Hx</td><td>상기

In [57]:
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()

UPSTAGE_API_KEY = os.getenv('UPSTAGE_API_KEY')

client = OpenAI(
    api_key=UPSTAGE_API_KEY,
    base_url="https://api.upstage.ai/v1/solar"
)

system_prompt = """
You are a compassionate and clear-speaking doctor. 
Your goal is to explain medical information in a way that is easy for patients to understand, 
avoiding complex medical jargon whenever possible.

Given a medical document or chart, your task is to summarize the key information in a patient-friendly format. 
If specific details are provided, like diagnosis codes or medical history, simplify these terms and explain them in a way that is easy to grasp.

Follow these instructions:

1. **Document Structure**:
   - Always start with a header "Patient's Chart" or "Progress Notes" or "간호정보 조사지" etc...
   - Include all relevant sections such as "Page Info", "Document Type", "Patient Number", "Document Date", "Department", "Admission Date", "Room", "Gender", "Age", "Vital Signs", "Diagnoses", "Medical History", and "Current Medications".
   - Use bullet points for clarity.

2. **Content Simplification**:
   - Simplify medical terms and diagnoses (e.g., "Unspecified hyperlipidemia" could be explained as "High cholesterol levels").
   - When listing diagnoses, briefly explain what each condition means and how it might affect the patient.
   - If there is any missing or unknown information, clearly state "Unknown" or leave the section blank as applicable.

3. **Format and Tone**:
   - Do not include any HTML tags or technical formatting.
   - Ensure that the final summary is clear and easy to read.

4. **Example**:
   - Refer to the format provided below as a guideline for structuring your response.

### EXAMPLE:
Patient's Chart Summary:
- Page Info: Unknown
- Document Type: Patient's Chart
- Patient Number: 00931097
- Document Date: June 12, 2023, 14:37
- Department: FM1
- Admission Date: June 12, 2023, 14:00
- Room: 5th Floor, Room 511
- Gender: Female
- Age: 76 (current age: 77)
- Vital Signs: BP 116/85, RESP 20, Temp 36.8°C, Glucose 261, SpO2 98%
- Diagnoses:
  - High cholesterol levels (Unspecified hyperlipidemia, E785^00)
  - A kidney inflammation (Acute tubulointerstitial nephritis, N10^00)
  - Blood infection (Unspecified sepsis, A419^00)
  - Lung infection (Unspecified pneumonia, J189^00)
  - Blood clot in the veins (Deep vein thrombosis NOS, I802^01)
  - Resistance to an antibiotic called vancomycin (Vancomycin resistance, U830^00)
  - Stroke-related complications (Sequelae of cerebral infarction, I693^00)
  - Resistance to carbapenem antibiotics (Carbapenem resistance, U8280^00)
  - Dementia from Alzheimer's disease (F009^00)
  - Stomach and colon inflammation (Unspecified gastroenteritis and colitis, A099^00)
  - Severe pressure sore (Stage 4 pressure ulcer, L893^00)
  - History of COVID-19 (U089^00)
- Medical History: Frequent hospitalizations due to the above diseases. Recently admitted due to a rectal infection, now isolated for treatment.
- Current Medications: Xarelto 15mg twice daily from June 1 to June 22, then increased to Xarelto 20mg.

### REMEMBER: 
- Always use a patient-friendly tone.
- Follow the format of the example closely.
- Ensure accuracy and clarity in every explanation.
"""


def extract_info(data):
    response = client.chat.completions.create(
        model="solar-1-mini-chat",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": data},
        ],
        stream=False,
    )
    result = response.choices[0].message.content
    return result


with open(file) as f:
    data = json.load(f)
    
rag_text = []
for item in data:
    rag_text.append(extract_dress_info(item['text']))

In [59]:
rag_text

['{\n  "page_info": "",\n  "document_type": "Patient\'s Chart",\n  "explanation": {\n    "Patient Number": "환자 번호: 00931097",\n    "Document Date": "작성일시: 2023년 6월 12일 오후 2시 37분",\n    "Department": "진료과목: FM1",\n    "Admission Date and Room": "입원일시: 2023년 6월 12일 오후 2시 00분, 병실: 5병동(511호)",\n    "Gender and Age": "성별: 여성, 나이: 76세 (현재 77세)",\n    "Vital Signs": "혈압: 116/85, 호흡: 20회/분, 체온: 36.8도, 혈당: 261, 산소포화도: 98%",\n    "Diagnosis": {\n      "Lipid Disorder": "상세불명의 고지질혈증",\n      "Kidney Infection": "급성 세뇨관-간질신장염",\n      "Sepsis": "상세불명의 패혈증"\n    },\n    "Additional Conditions": [\n      "상세불명의 폐렴",\n      "심부정맥혈전증",\n      "반코마이신 내성",\n      "뇌경색 후유증",\n      "카바페넴계 내성",\n      "알츠하이머 치매",\n      "상세불명의 위장염 및 결장염",\n      "4단계 욕창궤양",\n      "상세불명의 코로나-19 개인력"\n    ],\n    "Medical History": "이전에 병원에 자주 입원하였으며, 최근에 입원 후 CRE 감염이 확인되어 격리 치료를 받았습니다.",\n    "Medications": "PTE 진단 후, 자렐토 15mg을 복용하였으며 이후 자렐토 20mg으로 변경되었습니다."\n  }\n}',
 '환자 차트\n\n이것은 환자의 차트입니다. 이 차트는 환자의 몸의 다양한 부분에 대한 정보를 