In [1]:
import pandas as pd

dataset = pd.read_excel("./sample.xlsx")

In [2]:
print(dataset.iloc[0]['검사결과'])


DIAGNOSIS : 
FA-C) Lung, ( right upper lobe, posterior segment ), segmentectomy:
           - INVASIVE ADENOCARCINOMA, MODERATELY DIFFERENTIATED,
             PAPILLARY, MULTIPLE (x4), 2.4 x 2 x 2 cm, 0.9 cm IN GREATEST DIMENSION, 
             0.6 cm IN GREATEST DIMENSION, AND 0.3 cm IN GREATEST DIMENSION,
             RIGHT UPPER LOBE POSTERIOR SEGMENT,
                 with 1) tumor focality: separate nodules in same lobe (x3)
                          2) confinement to lung
                          3) pleural invasion: no
                          4) lymphovascular invasion: not identified
                          5) perineural invasion: not identified
                          6) bronchial margin: clear
                          7) vascular margin: clear
                          8) parenchymal margin: clear
                          9) no metastasis in 10 lymph nodes ( 0/10 )
                               ( LN #2R, 0/1; LN #4R, 0/5; LN #7, 0/1; LN #10, 0/1; LN #11S, 0/2 ).
  

In [3]:
import re

def normalize_loc(s: str) -> str:
    s = re.sub(r'\s+', ' ', s.strip())
    s = re.sub(r'\s*-\s*margin\b', '', s, flags=re.I)  # "- margin" 제거
    s = s.strip(' ,;.')
    return s

def parse_path(text: str) -> dict:
    out = {"Site": None, "수술위치": None, "수술방법": None}

    # 1) DIAGNOSIS 섹션 같은 한 줄 패턴: "Lung, ( right upper lobe, posterior segment ), segmentectomy:"
    m = re.search(
        r'(?im)^\s*(?:[A-Z]{1,3}(?:-[A-Z])?\)\s*)?(?P<site>Lung)\s*,\s*\(\s*(?P<loc>[^)]+?)\s*\)\s*,\s*(?P<proc>segmentectomy)\s*:',
        text,
        flags=re.I | re.M,
    )
    if m:
        out["Site"] = m.group("site").strip().title()
        out["수술위치"] = normalize_loc(m.group("loc"))
        out["수술방법"] = m.group("proc").strip().lower()
        return out

    # 2) Procedure 라인 보조
    m_proc = re.search(r'(?im)^\s*Procedure\s*:\s*(?P<proc>[^\n]+)', text)
    if m_proc:
        out["수술방법"] = m_proc.group("proc").strip().lower()

    # 3) Specimen 라인 보조: "Specimen: Lung (upper lobe) (posterior segment- margin)"
    m_spec = re.search(
        r'(?im)^\s*Specimen\s*:\s*(?P<site>Lung)\s*\(\s*(?P<lobe>[^)]+)\)\s*\(\s*(?P<seg>[^)]+)\)',
        text,
    )
    if m_spec:
        out["Site"] = m_spec.group("site").strip().title()
        lobe = normalize_loc(m_spec.group("lobe"))
        seg = normalize_loc(m_spec.group("seg"))
        out["수술위치"] = normalize_loc(f"{lobe}, {seg}")

    # 4) DIAGNOSIS 라인에서 Site/위치만 별도로 보조 추출 (콜론 없이 끝나는 경우 고려)
    if not out["Site"] or not out["수술위치"]:
        m_diag = re.search(
            r'(?im)\bLung\s*,\s*\(\s*(?P<loc>[^)]+?)\s*\)',
            text,
        )
        if m_diag:
            out["Site"] = out["Site"] or "Lung"
            out["수술위치"] = out["수술위치"] or normalize_loc(m_diag.group("loc"))

    return out

In [4]:
print(parse_path(dataset.iloc[0]['검사결과']))


{'Site': 'Lung', '수술위치': 'right upper lobe, posterior segment', '수술방법': 'segmentectomy'}


In [5]:
print(dataset.iloc[2]['검사결과'])

DIAGNOSIS : 
FA-D) Lung, ( left lower lobe ), lobectomy:
           - PLEOMORPHIC CARCINOMA, SINGLE, 7.9 x 5.8 x 5.4 cm, LEFT LOWER LOBE,
               with 1) tumor focality: unifocal
                        2) confinement to lung
                        3) pleural invasion: no
                        4) lymphovascular invasion: present
                        5) perineural invasion: not identified
                        6) bronchial margin: clear
                        7) vascular margin: clear
                             ( distance of invasive carcinoma from closest margin: 3.7 cm )
                        8) metastasis in 6 of 40 lymph nodes ( 6/40 )
                             ( peribronchial LN, 0/1; LN #7, 2/7; LN #4L, 0/5; LN #5, 0/5;
                               LN #6, 0/5; LN #9, 1/7; LN #10, 3/8; LN #11, 0/2 )
                             ( metastatic tumor size: 15 mm, without extranodal extension ).
           - Non-neoplastic lung, emphysema.
           - Tumor ass

In [6]:
print(parse_path(dataset.iloc[1]['검사결과']))

{'Site': 'Lung', '수술위치': 'left upper lobe and left lower lobe', '수술방법': None}


In [7]:
print(dataset.iloc[1]['검사결과'])

FA. Lung (Lymph node 4)
FB. Lung (upper lobe, bronchial resection margin & vascular margin)
C. Lung (upper lobe, bronchial resection margin & vascular margin)
D. Lung (upper lobe, superior segment margin)
E. Lung (upper lobe, lymph node 6ea)


DIAGNOSIS : 
FA-E) Lung, ( left upper lobe and left lower lobe ) and mediastinal lymph node,
           lobectomy ( left upper lobe ), segmentectomy ( left lower lobe ), and mediastinal
           lymph node dissection:
           - SQUAMOUS CELL CARCINOMA, MODERATELY DIFFERENTIATED,
             KERATINIZING, SINGLE, 5.6 x 4.7 x 4.2 cm, LEFT UPPER LOBE,
                 with 1) tumor focality: unifocal
                          2) extension to perihilar fat tissue
                          3) no involvement of left lower lobe
                          4) pleural invasion: no
                          5) lymphovascular invasion: present
                          6) perineural invasion: not identified
                          7) bronchial margin:

In [8]:
print(parse_path(dataset.iloc[2]['검사결과']))

{'Site': 'Lung', '수술위치': 'left lower lobe', '수술방법': None}


In [9]:
print(dataset.iloc[2]['검사결과'])

DIAGNOSIS : 
FA-D) Lung, ( left lower lobe ), lobectomy:
           - PLEOMORPHIC CARCINOMA, SINGLE, 7.9 x 5.8 x 5.4 cm, LEFT LOWER LOBE,
               with 1) tumor focality: unifocal
                        2) confinement to lung
                        3) pleural invasion: no
                        4) lymphovascular invasion: present
                        5) perineural invasion: not identified
                        6) bronchial margin: clear
                        7) vascular margin: clear
                             ( distance of invasive carcinoma from closest margin: 3.7 cm )
                        8) metastasis in 6 of 40 lymph nodes ( 6/40 )
                             ( peribronchial LN, 0/1; LN #7, 2/7; LN #4L, 0/5; LN #5, 0/5;
                               LN #6, 0/5; LN #9, 1/7; LN #10, 3/8; LN #11, 0/2 )
                             ( metastatic tumor size: 15 mm, without extranodal extension ).
           - Non-neoplastic lung, emphysema.
           - Tumor ass