## DCM 데이터셋을 LLaMA Factory 형식에 맞게 전처리

이 노트북은 DCM 환자 데이터를 LLaMA Factory의 ShareGPT 형식으로 변환합니다.
3가지 예측 모드별로 별도의 JSON 파일을 생성합니다:
- delta_only: JOA score 변화 예측
- postop_only: 수술 후 JOA score 예측
- binary_only: 환자 개선 여부 예측 (회복률 >= 60%)

**핵심 특징**: 이 노트북은 데이터 로딩과 파일 I/O만 처리하며,
**모든 프롬프트 관련 로직은 `src/prompts.py`에서 가져옵니다.**

따라서 프롬프트를 수정하려면:
1. `src/prompts.py` 편집
2. 이 노트북 재실행
3. 변경사항이 자동으로 반영됨!

In [1]:
import json
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional

import pandas as pd

# Add src directory to path for importing prompts
sys.path.append('../../src')

# Import ALL prompt-related functions from src/prompts.py
from prompts import (
    get_system_prompt,
    construct_user_prompt_sharegpt,
    create_ground_truth_response
)

In [2]:
# Input locations, all resolved against the shared data root.
DATA_DIR = Path("../../data")
PREPROCESSED_DIR = DATA_DIR.joinpath("preprocessed")
PATIENT_LIST_PATH = DATA_DIR.joinpath("patient_list.txt")
METADATA_PATH = DATA_DIR.joinpath("metadata", "tabular_250520.csv")
TABULAR_VARIABLES_PATH = DATA_DIR.joinpath("tabular_variables.txt")

# Output locations: one JSON file per prediction target, written to the
# current working directory (expected to be LLaMA-Factory/data/).
OUTPUT_DIR = Path(".")
OUTPUT_FILES = dict(
    (target_name, OUTPUT_DIR.joinpath(file_name))
    for target_name, file_name in (
        ("delta_only", "dcm_delta_only.json"),
        ("postop_only", "dcm_postop_only.json"),
        ("binary_only", "dcm_binary_only.json"),
    )
)

# Echo the absolute locations so a wrong working directory is spotted early.
print(f"Data directory: {DATA_DIR.absolute()}")
print(f"Preprocessed directory: {PREPROCESSED_DIR.absolute()}")
print(f"Output directory: {OUTPUT_DIR.absolute()}")

Data directory: /home/seungbinyang/dcm_joa_prediction/LLaMA-Factory/data/../../data
Preprocessed directory: /home/seungbinyang/dcm_joa_prediction/LLaMA-Factory/data/../../data/preprocessed
Output directory: /home/seungbinyang/dcm_joa_prediction/LLaMA-Factory/data


In [3]:
def load_patient_list(patient_list_path: Path) -> List[str]:
    """Read patient identifiers from a text file, one identifier per line.

    Surrounding whitespace is stripped from every line, and lines that are
    empty after stripping are dropped. The number of identifiers read is
    printed before returning.
    """
    with open(patient_list_path, 'r') as handle:
        # filter(None, ...) discards the lines that stripped down to "".
        patients = list(filter(None, map(str.strip, handle)))
    print(f"Loaded {len(patients)} patients")
    return patients

def load_allowed_tabular_variables(tabular_vars_path: Path) -> List[str]:
    """Read the whitelist of tabular variable names, one name per line.

    Each line is stripped of surrounding whitespace and blank lines are
    ignored. The number of names read is printed before returning.
    """
    variables: List[str] = []
    with open(tabular_vars_path, 'r') as handle:
        for raw_line in handle:
            name = raw_line.strip()
            if name:
                variables.append(name)
    print(f"Loaded {len(variables)} allowed tabular variables")
    return variables

def filter_tabular_data(tabular_content: str, allowed_variables: List[str]) -> str:
    """Keep only the ``name=value`` items whose name is whitelisted.

    ``tabular_content`` is split on commas. An item is kept when it contains
    ``=`` and the text before the first ``=`` (stripped) appears in
    ``allowed_variables``. Kept items are returned in their original order,
    joined with ``", "``; the result is an empty string when nothing matches.
    """
    kept = []
    for fragment in tabular_content.split(','):
        item = fragment.strip()
        name, separator, _ = item.partition('=')
        # An empty separator means the item had no '=' and is not a pair.
        if separator and name.strip() in allowed_variables:
            kept.append(item)
    return ', '.join(kept)

# Load the cohort definition, the variable whitelist and the metadata table.
patient_list = load_patient_list(PATIENT_LIST_PATH)
allowed_tabular_vars = load_allowed_tabular_variables(TABULAR_VARIABLES_PATH)
metadata_df = pd.read_csv(METADATA_PATH)
# Patient IDs are read from a text file, so CaseNum is compared as a string.
metadata_df['CaseNum'] = metadata_df['CaseNum'].astype(str)
metadata_df = metadata_df[metadata_df['CaseNum'].isin(patient_list)].copy()
print(f"Loaded metadata for {len(metadata_df)} patients")

# The filter above drops listed patients that have no metadata row; report
# them so the difference between the two counts is not silent.
patients_without_metadata = sorted(set(patient_list) - set(metadata_df['CaseNum']))
if patients_without_metadata:
    print(
        f"Warning: no metadata row for {len(patients_without_metadata)} listed patients: "
        f"{', '.join(patients_without_metadata)}"
    )

Loaded 387 patients
Loaded 15 allowed tabular variables
Loaded metadata for 365 patients


In [4]:
def get_t2s_image_paths(patient_dir: Path) -> List[str]:
    """Return the six T2S slice paths of a patient, sorted by file path.

    Looks for ``.png`` files (extension matched case-insensitively) in
    ``<patient_dir>/MR_T2S``. An empty list is returned when that directory
    does not exist or when it does not hold exactly six such files.

    The returned strings are built as
    ``../../data/preprocessed/<patient_dir.name>/MR_T2S/<file name>``.
    """
    slice_dir = patient_dir / "MR_T2S"
    if not slice_dir.exists():
        return []

    slices = [entry for entry in slice_dir.iterdir() if entry.suffix.lower() == '.png']
    slices.sort()
    if len(slices) != 6:
        return []

    # Paths are written relative to the LLaMA-Factory/data/ directory.
    return [
        f"../../data/preprocessed/{patient_dir.name}/MR_T2S/{png.name}"
        for png in slices
    ]

def get_t2a_image_paths(patient_dir: Path) -> List[str]:
    """Return paths of the T2A slices with the lowest ``rank`` values.

    Reads ``<patient_dir>/MR_T2A_ranking.json``, which is expected to be a
    list of objects carrying ``rank`` and ``filename`` keys, sorts the
    entries by ascending ``rank`` and keeps the first six. When fewer than
    six entries are ranked, all of them are returned.

    An empty list is returned (the patient cannot be used) when:
      * the ``MR_T2A`` directory or the ranking file is missing,
      * the ranking file cannot be read or parsed (a message is printed),
      * the ranking is empty,
      * the ranking entries are malformed (a message is printed),
      * any selected slice file does not exist in ``MR_T2A``.

    The returned strings are built as
    ``../../data/preprocessed/<patient_dir.name>/MR_T2A/<filename>``.
    """
    t2a_dir = patient_dir / "MR_T2A"
    ranking_path = patient_dir / "MR_T2A_ranking.json"

    if not t2a_dir.exists() or not ranking_path.exists():
        return []

    try:
        with open(ranking_path, 'r', encoding='utf-8') as f:
            ranking_data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error reading ranking file for {patient_dir.name}: {e}")
        return []

    if not ranking_data:
        return []

    # A ranking file that parses but has the wrong shape must skip this
    # patient, just like an unreadable one, instead of aborting the whole run.
    try:
        top_6 = sorted(ranking_data, key=lambda entry: entry['rank'])[:6]
        selected = [(entry['filename'], t2a_dir / entry['filename']) for entry in top_6]
    except (KeyError, TypeError) as e:
        print(f"Error reading ranking file for {patient_dir.name}: malformed entry ({e})")
        return []

    image_paths = []
    for filename, slice_path in selected:
        if not slice_path.exists():
            return []
        image_paths.append(f"../../data/preprocessed/{patient_dir.name}/MR_T2A/{filename}")

    return image_paths

def load_tabular_data(patient_dir: Path, allowed_vars: List[str]) -> str:
    """Return the patient's whitelisted tabular variables as one string.

    Reads ``<patient_dir>/tabular_data_note.txt`` and passes its stripped
    content through ``filter_tabular_data`` with ``allowed_vars``.

    The literal ``"Not Provided"`` is returned when the file is missing or
    zero bytes long, when its stripped content is empty or one of the
    placeholders ``"Not Available"`` / ``"Error Reading File"``, when no
    variable survives the filter, or when anything raises while reading or
    filtering (the error is printed).
    """
    fallback = "Not Provided"
    note_file = patient_dir / "tabular_data_note.txt"
    if not (note_file.exists() and note_file.stat().st_size != 0):
        return fallback

    try:
        raw = note_file.read_text().strip()
        if raw in ("", "Not Available", "Error Reading File"):
            return fallback

        # Restrict the note to the whitelisted variables.
        selected = filter_tabular_data(raw, allowed_vars)
        return selected or fallback
    except Exception as e:
        print(f"Error reading tabular data for {patient_dir.name}: {e}")
        return fallback

def load_text_reports(patient_dir: Path) -> Dict[str, str]:
    """Collect the usable free-text documents of a patient.

    Four files are looked up in ``patient_dir`` and stored under these keys:
    ``MR_REPORT`` (MR_report.txt), ``XR_REPORT`` (XR_report.txt),
    ``ADMISSION_NOTE`` (admission_note.txt) and ``OPERATIVE_NOTE``
    (operative_note.txt).

    A document is included only when its file exists, is not zero bytes long,
    and its stripped content is neither empty nor one of the placeholders
    ``"Not Available"`` / ``"Error Reading File"``. A file that fails to read
    is reported with a printed message and left out.
    """
    sources = (
        ("MR_REPORT", "MR_report.txt"),
        ("XR_REPORT", "XR_report.txt"),
        ("ADMISSION_NOTE", "admission_note.txt"),
        ("OPERATIVE_NOTE", "operative_note.txt"),
    )
    placeholders = ("Not Available", "Error Reading File")

    reports: Dict[str, str] = {}
    for label, file_name in sources:
        report_file = patient_dir / file_name
        if not report_file.exists() or report_file.stat().st_size <= 0:
            continue
        try:
            body = report_file.read_text().strip()
        except Exception as e:
            print(f"Error reading {label} for {patient_dir.name}: {e}")
            continue
        if body and body not in placeholders:
            reports[label] = body

    return reports

print("Data loading functions defined")

Data loading functions defined


In [5]:
def process_patient_for_target(patient_id: str, patient_metadata: pd.Series,
                               prediction_target: str) -> Optional[Dict[str, Any]]:
    """
    Build one ShareGPT-format dataset entry for a patient and prediction target.

    Steps:
    1. Locate the patient's directory under PREPROCESSED_DIR and collect the
       T2S and T2A image paths.
    2. Load the filtered tabular data and the text reports.
    3. Build the three messages with the functions from src/prompts.py:
       - get_system_prompt() for the system message
       - construct_user_prompt_sharegpt() for the user message
       - create_ground_truth_response() for the assistant message

    All prompt text and logic comes from src/prompts.py.

    Args:
        patient_id: Name of the patient's directory under PREPROCESSED_DIR.
        patient_metadata: Metadata row of the patient; the 'preop_joa' and
            'delta_joa' fields are read.
        prediction_target: Target name forwarded to the prompt functions.

    Returns:
        A dict with a "messages" list (system, user, assistant) and an
        "images" list (T2S paths followed by T2A paths), or None when the
        patient directory is missing or either image set is unavailable.
        A message is printed for every skipped patient.
    """
    patient_dir = PREPROCESSED_DIR / patient_id
    if not patient_dir.exists():
        print(f"Skipping patient {patient_id}: Preprocessed directory not found")
        return None

    # Images are mandatory: validate them before reading any other file so
    # that patients which are skipped anyway cost no further I/O.
    t2s_paths = get_t2s_image_paths(patient_dir)
    t2a_paths = get_t2a_image_paths(patient_dir)
    if not t2s_paths or not t2a_paths:
        print(f"Skipping patient {patient_id}: Missing required image data")
        return None

    tabular_data = load_tabular_data(patient_dir, allowed_tabular_vars)
    text_reports = load_text_reports(patient_dir)

    # === ALL PROMPTS FROM src/prompts.py ===

    system_content = get_system_prompt(prediction_target)

    image_paths_dict = {
        "MR_T2S": t2s_paths,
        "MR_T2A": t2a_paths
    }
    modality_keys = ["MR_T2S", "MR_T2A", "tabular", "text"]

    user_content = construct_user_prompt_sharegpt(
        image_paths_dict,
        tabular_data,
        text_reports,
        modality_keys
    )

    # NOTE(review): the metadata values are forwarded without a missing-value
    # check -- confirm that create_ground_truth_response handles NaN.
    assistant_content = create_ground_truth_response(
        patient_metadata['preop_joa'],
        patient_metadata['delta_joa'],
        prediction_target
    )

    # === END OF PROMPTS ===

    # The "images" list holds the T2S paths first, then the T2A paths.
    all_image_paths = t2s_paths + t2a_paths

    entry = {
        "messages": [
            {
                "content": system_content,
                "role": "system"
            },
            {
                "content": user_content,
                "role": "user"
            },
            {
                "content": assistant_content,
                "role": "assistant"
            }
        ],
        "images": all_image_paths
    }

    return entry

print("Patient processing function defined")

Patient processing function defined


In [6]:
# Generate datasets for all 3 prediction targets.
# For each target, every row of metadata_df is processed again, so the
# per-patient skip messages are printed once per target.
prediction_targets = ["delta_only", "postop_only", "binary_only"]

# Banner describing where the prompt text comes from.
print("="*60)
print("Dataset Generation - Fully Integrated with src/prompts.py")
print("="*60)
print("")
print("This notebook uses the following functions from src/prompts.py:")
print("  1. get_system_prompt() - System message")
print("  2. construct_user_prompt_sharegpt() - User message")
print("  3. create_ground_truth_response() - Assistant message")
print("")
print("NO prompt text is hardcoded in this notebook.")
print("To modify any prompt, edit src/prompts.py and re-run.")
print("="*60)

for target in prediction_targets:
    print(f"\n{'='*60}")
    print(f"Processing dataset for: {target}")
    print(f"{'='*60}")

    dataset = []
    skipped_count = 0

    # idx (the DataFrame index label) is not used; only the row is needed.
    for idx, row in metadata_df.iterrows():
        patient_id = str(row['CaseNum'])

        # Returns None when the patient cannot be used for the dataset.
        entry = process_patient_for_target(patient_id, row, target)

        if entry is not None:
            dataset.append(entry)
            # Progress message after every 50 accepted entries (skipped
            # patients do not advance this counter).
            if len(dataset) % 50 == 0:
                print(f"Processed {len(dataset)} patients...")
        else:
            skipped_count += 1

    # Save dataset as a single JSON array; ensure_ascii=False keeps
    # non-ASCII characters readable in the output file.
    output_path = OUTPUT_FILES[target]
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, indent=2, ensure_ascii=False)

    print(f"\n✓ Saved {len(dataset)} samples to {output_path}")
    print(f"  Skipped {skipped_count} patients due to missing data")

    # Show sample entry (first entry only) as a quick sanity check.
    if len(dataset) > 0:
        print(f"\n  Sample entry structure:")
        print(f"  - Number of images: {len(dataset[0]['images'])}")
        print(f"  - Number of messages: {len(dataset[0]['messages'])}")
        print(f"  - System message length: {len(dataset[0]['messages'][0]['content'])} chars")
        print(f"  - User message length: {len(dataset[0]['messages'][1]['content'])} chars")
        print(f"  - Assistant response: {dataset[0]['messages'][2]['content'][:100]}...")

print(f"\n{'='*60}")
print("All datasets generated successfully!")
print("All prompts sourced from src/prompts.py")
print(f"{'='*60}")

Dataset Generation - Fully Integrated with src/prompts.py

This notebook uses the following functions from src/prompts.py:
  1. get_system_prompt() - System message
  2. construct_user_prompt_sharegpt() - User message
  3. create_ground_truth_response() - Assistant message

NO prompt text is hardcoded in this notebook.
To modify any prompt, edit src/prompts.py and re-run.

Processing dataset for: delta_only
Skipping patient 13: Missing required image data
Skipping patient 27: Missing required image data
Skipping patient 29: Missing required image data
Skipping patient 50: Missing required image data
Processed 50 patients...
Skipping patient 71: Missing required image data
Skipping patient 82: Missing required image data
Skipping patient 83: Missing required image data
Skipping patient 89: Missing required image data
Skipping patient 94: Missing required image data
Skipping patient 98: Missing required image data
Skipping patient 108: Missing required image data
Skipping patient 109: Mi

In [7]:
# Verify dataset format: reload each generated JSON file from disk and print
# a summary of its first sample. Only the first sample of each file is
# inspected; nothing is asserted, the values are printed for manual review.
print("\nVerifying dataset format...\n")

for target, output_path in OUTPUT_FILES.items():
    with open(output_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"{target}:")
    print(f"  Total samples: {len(data)}")

    if len(data) > 0:
        sample = data[0]
        print(f"  Sample validation:")
        print(f"    - Has 'messages' field: {('messages' in sample)}")
        print(f"    - Has 'images' field: {('images' in sample)}")
        print(f"    - Messages count: {len(sample['messages'])} (should be 3: system, user, assistant)")
        print(f"    - Images count: {len(sample['images'])}")
        print(f"    - Message roles: {[msg['role'] for msg in sample['messages']]}")
        print(f"    - First image path: {sample['images'][0]}")
        # Long message contents are truncated to keep the output short.
        print(f"    - System prompt preview: {sample['messages'][0]['content'][:100]}...")
        print(f"    - Assistant response: {sample['messages'][2]['content'][:150]}...")
    print()

# Closing summary of how the notebook and src/prompts.py divide the work.
print("✓ Format verification complete!")
print("\n" + "="*60)
print("SUMMARY: Complete Integration with src/prompts.py")
print("="*60)
print("")
print("This notebook contains:")
print("  ✓ Data loading logic")
print("  ✓ File I/O operations")
print("  ✗ NO prompt text (all in src/prompts.py)")
print("")
print("To modify prompts:")
print("  1. Edit src/prompts.py")
print("  2. Re-run this notebook")
print("  3. Done! Changes automatically reflected")
print("")
print("Functions used from src/prompts.py:")
print("  - get_system_prompt()")
print("  - construct_user_prompt_sharegpt()")
print("  - create_ground_truth_response()")
print("="*60)


Verifying dataset format...

delta_only:
  Total samples: 287
  Sample validation:
    - Has 'messages' field: True
    - Has 'images' field: True
    - Messages count: 3 (should be 3: system, user, assistant)
    - Images count: 12
    - Message roles: ['system', 'user', 'assistant']
    - First image path: ../../data/preprocessed/1/MR_T2S/004.png
    - System prompt preview: 
<PROMPT>
<ROLE>
You are an expert clinical reasoning AI specializing in neurosurgery, radiology, an...
    - Assistant response: {"rationale": "Based on the provided imaging and clinical data, the predicted JOA change is -1.0.", "change": -1}...

postop_only:
  Total samples: 287
  Sample validation:
    - Has 'messages' field: True
    - Has 'images' field: True
    - Messages count: 3 (should be 3: system, user, assistant)
    - Images count: 12
    - Message roles: ['system', 'user', 'assistant']
    - First image path: ../../data/preprocessed/1/MR_T2S/004.png
    - System prompt preview: 
<PROMPT>
<ROLE>
Yo