In [3]:
import fastai
from fastai.vision.all import *
from tqdm import tqdm
from glob import glob
import pandas as pd
import numpy as np
import os
import torch
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, precision_score, recall_score, f1_score, confusion_matrix

# 경로 설정
DATA_PATH = '/userHome/userhome4/kyoungmin/code/Xray/dataset'
OUTPUT_DIR = '/userHome/userhome4/kyoungmin/code/Xray/CoAtNet/output'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 새 모델 경로 설정 (.pkl 파일)
NEW_MODEL_PATH = '/userHome/userhome4/kyoungmin/code/Xray/NIH_EXP/Project/pkl_data/all_labels_14.pkl'  # <-- 여기에 새 모델 경로 입력

# 시드 설정
SEED = 85
def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)

# GPU 설정
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 질병 레이블 정의
disease_labels = ['No finding', 'Atelectasis', 'Consolidation', 'Infiltration', 'Pneumothorax', 'Edema', 
                  'Emphysema', 'Fibrosis', 'Effusion', 'Pneumonia', 'Pleural_Thickening',
                  'Cardiomegaly', 'Nodule', 'Mass', 'Hernia']

# 데이터 로드
print("Loading data...")
labels_df = pd.read_csv(f'{DATA_PATH}/Data_Entry_2017.csv')
labels_df.columns = ['Image_Index', 'Finding_Labels', 'Follow_Up_#', 'Patient_ID',
                    'Patient_Age', 'Patient_Gender', 'View_Position',
                    'Original_Image_Width', 'Original_Image_Height',
                    'Original_Image_Pixel_Spacing_X',
                    'Original_Image_Pixel_Spacing_Y', 'dfd']

# 질병 레이블 원-핫 인코딩
for disease in tqdm(disease_labels, desc="Encoding disease labels"):
    labels_df[disease] = labels_df['Finding_Labels'].map(lambda result: 1 if disease in result else 0)

# 이미지 파일 경로 매핑
print("Mapping image file paths...")
num_glob = glob(f'{DATA_PATH}/*/images/*.png')
img_path = {os.path.basename(x): x for x in num_glob}
labels_df['Paths'] = labels_df['Image_Index'].map(img_path.get)

# 경로가 유효한 데이터만 필터링
valid_paths = labels_df['Paths'].notna()
labels_df = labels_df[valid_paths].reset_index(drop=True)
print(f"Number of images with valid paths: {len(labels_df)}")

# 병변별로 균형 잡힌 테스트셋 생성
print("Creating balanced test set by disease...")
test_indices = set()

for disease in disease_labels:
    # 각 질병에 대해 양성(1) 샘플과 음성(0) 샘플을 찾음
    positive_samples = labels_df[labels_df[disease] == 1].index.tolist()
    negative_samples = labels_df[labels_df[disease] == 0].index.tolist()
    
    # 양성 샘플의 50%를 테스트셋으로 선택
    pos_test_size = min(len(positive_samples) // 2, 1000)  # 최대 1,000개 제한
    pos_test_indices = random.sample(positive_samples, pos_test_size)
    
    # 음성 샘플도 비슷한 수를 테스트셋으로 선택
    neg_test_size = min(len(negative_samples) // 10, pos_test_size)  # 양성 샘플 수와 비슷하게
    neg_test_indices = random.sample(negative_samples, neg_test_size)
    
    # 테스트 인덱스에 추가
    test_indices.update(pos_test_indices)
    test_indices.update(neg_test_indices)

# 테스트셋과 학습셋 분리
test_indices = list(test_indices)
test_df = labels_df.loc[test_indices].reset_index(drop=True)

# 데이터셋 비율 계산
total_dataset_size = len(labels_df)
test_dataset_size = len(test_df)
dataset_percentage = int((test_dataset_size / total_dataset_size) * 100)
print(f"Test dataset size: {len(test_df)} ({dataset_percentage}% of total data)")

# DataBlock 설정
item_transforms = [Resize((224, 224))]
batch_transforms = [Normalize.from_stats(*imagenet_stats)]

def get_x(row):
    return row['Paths']

def get_y(row):
    labels = row[disease_labels].tolist()
    return labels

test_dblock = DataBlock(
    blocks=(ImageBlock, MultiCategoryBlock(encoded=True, vocab=disease_labels)),
    get_x=get_x,
    get_y=get_y,
    item_tfms=item_transforms,
    batch_tfms=batch_transforms
)

# 데이터 로더 생성
print("Creating data loaders...")
test_dls = test_dblock.dataloaders(test_df, bs=32, shuffle=False)
print(f"Number of items in test dataset: {len(test_dls.train.dataset)}")

# ----- 모델 로드 부분 (수정됨) -----
print("Loading new model...")
try:
    # 방법 1: fastai의 load_learner 사용 (.pkl 파일 로드)
    learn = load_learner(NEW_MODEL_PATH)
    print("✅ Model loaded successfully using load_learner!")
except Exception as e:
    print(f"Error loading with load_learner: {e}")
    try:
        # 방법 2: 모델 구조 생성 후 state_dict 로드
        learn = vision_learner(test_dls, 'coatnet_2_rw_224', metrics=[accuracy_multi])
        learn.model.to(device)
        state_dict = torch.load(NEW_MODEL_PATH, map_location=device)
        
        # 로드된 데이터가 state_dict 형태인지 확인
        if hasattr(state_dict, 'model'):
            learn.model.load_state_dict(state_dict.model)
        else:
            learn.model.load_state_dict(state_dict)
        print("✅ Model loaded successfully using load_state_dict!")
    except Exception as e2:
        print(f"Error loading with state_dict: {e2}")
        print("Trying default fastai loader...")
        
        # 방법 3: 기본 fastai 로더 사용
        learn = vision_learner(test_dls, 'coatnet_2_rw_224', metrics=[accuracy_multi])
        learn.model.to(device)
        learn.load(NEW_MODEL_PATH)
        print("✅ Model loaded successfully using default fastai loader!")

# ----- 이하 코드는 동일 -----
# 모델 평가
print("Making predictions...")
learn.model.eval()

# 예측 수행
with torch.no_grad():
    preds, targs = learn.get_preds(dl=test_dls.train)
print(f"Targets tensor shape: {targs.shape}")
print(f"Predictions tensor shape: {preds.shape}")

# PyTorch Tensor → NumPy 배열 변환
probs = preds.detach().cpu().numpy()
targets = targs.detach().cpu().numpy()

# 병변별 샘플 개수 출력
disease_counts = np.sum(targets, axis=0)
disease_count_dict = dict(zip(disease_labels, disease_counts))
print("\n📌 Actual disease sample counts:")
for disease, count in disease_count_dict.items():
    print(f"{disease}: {int(count)} samples")

# (이하 분석 코드는 동일하게 유지)


Using device: cuda:0
Loading data...


Encoding disease labels: 100%|██████████| 15/15 [00:00<00:00, 31.83it/s]


Mapping image file paths...
Number of images with valid paths: 112120
Creating balanced test set by disease...
Test dataset size: 22637 (20% of total data)
Creating data loaders...
Number of items in test dataset: 18110
Loading new model...
Error loading with load_learner: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Error loading with state_dict: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Trying default fastai loader...


FileNotFoundError: [Errno 2] No such file or directory: '/userHome/userhome4/kyoungmin/code/Xray/NIH_EXP/Project/pkl_data/all_labels_14.pkl.pth'

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os
from glob import glob
from tqdm import tqdm
import torch.optim as optim
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt

os.environ['CUDA_VISIBLE_DEVICES'] = "0"

train_val_df = pd.read_csv("../../data/train_val.csv")
test_df = pd.read_csv("../../data/test.csv")

# Extract the paths and labels
disease_labels = ['Atelectasis', 'Consolidation', 'Infiltration', 'Pneumothorax', 'Edema', 'Emphysema', 'Fibrosis', 'Effusion', 'Pneumonia', 'Pleural_Thickening',
'Cardiomegaly', 'Nodule', 'Mass', 'Hernia', 'No Finding']
train_val_labels = train_val_df[disease_labels].values
test_labels = test_df[disease_labels].values

FileNotFoundError: [Errno 2] No such file or directory: '../../data/train_val.csv'

In [1]:
import pickle

# Pickle 파일에서 numpy 배열 불러오기
with open('/userHome/userhome4/kyoungmin/code/Xray/CTransCNN/pkl_data/all_labels_14.pkl', 'rb') as file:
    all_labels = pickle.load(file)
    
with open('/userHome/userhome4/kyoungmin/code/Xray/CTransCNN/pkl_data/all_outputs.pkl', 'rb') as file:
    all_outputs = pickle.load(file)
    
with open('/userHome/userhome4/kyoungmin/code/Xray/CTransCNN/pkl_data/binary_outputs.pkl', 'rb') as file:
    binary_outputs = pickle.load(file)

In [2]:
# one error check
value_dict = {i:0 for i in disease_labels}
for b in all_outputs:
    value_dict[disease_labels[list(b).index(max(b))]] += 1

value_dict

NameError: name 'disease_labels' is not defined