In [3]:
import fastai
from fastai.vision.all import *
from tqdm import tqdm
from glob import glob
import pandas as pd
import numpy as np
import os
import torch
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, precision_score, recall_score, f1_score, confusion_matrix

# Í≤ΩÎ°ú ÏÑ§Ï†ï
DATA_PATH = '/userHome/userhome4/kyoungmin/code/Xray/dataset'
OUTPUT_DIR = '/userHome/userhome4/kyoungmin/code/Xray/CoAtNet/output'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ÏÉà Î™®Îç∏ Í≤ΩÎ°ú ÏÑ§Ï†ï (.pkl ÌååÏùº)
NEW_MODEL_PATH = '/userHome/userhome4/kyoungmin/code/Xray/NIH_EXP/Project/pkl_data/all_labels_14.pkl'  # <-- Ïó¨Í∏∞Ïóê ÏÉà Î™®Îç∏ Í≤ΩÎ°ú ÏûÖÎ†•

# ÏãúÎìú ÏÑ§Ï†ï
SEED = 85
def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)

# GPU ÏÑ§Ï†ï
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ÏßàÎ≥ë Î†àÏù¥Î∏î Ï†ïÏùò
disease_labels = ['No finding', 'Atelectasis', 'Consolidation', 'Infiltration', 'Pneumothorax', 'Edema', 
                  'Emphysema', 'Fibrosis', 'Effusion', 'Pneumonia', 'Pleural_Thickening',
                  'Cardiomegaly', 'Nodule', 'Mass', 'Hernia']

# Îç∞Ïù¥ÌÑ∞ Î°úÎìú
print("Loading data...")
labels_df = pd.read_csv(f'{DATA_PATH}/Data_Entry_2017.csv')
labels_df.columns = ['Image_Index', 'Finding_Labels', 'Follow_Up_#', 'Patient_ID',
                    'Patient_Age', 'Patient_Gender', 'View_Position',
                    'Original_Image_Width', 'Original_Image_Height',
                    'Original_Image_Pixel_Spacing_X',
                    'Original_Image_Pixel_Spacing_Y', 'dfd']

# ÏßàÎ≥ë Î†àÏù¥Î∏î Ïõê-Ìï´ Ïù∏ÏΩîÎî©
for disease in tqdm(disease_labels, desc="Encoding disease labels"):
    labels_df[disease] = labels_df['Finding_Labels'].map(lambda result: 1 if disease in result else 0)

# Ïù¥ÎØ∏ÏßÄ ÌååÏùº Í≤ΩÎ°ú Îß§Ìïë
print("Mapping image file paths...")
num_glob = glob(f'{DATA_PATH}/*/images/*.png')
img_path = {os.path.basename(x): x for x in num_glob}
labels_df['Paths'] = labels_df['Image_Index'].map(img_path.get)

# Í≤ΩÎ°úÍ∞Ä Ïú†Ìö®Ìïú Îç∞Ïù¥ÌÑ∞Îßå ÌïÑÌÑ∞ÎßÅ
valid_paths = labels_df['Paths'].notna()
labels_df = labels_df[valid_paths].reset_index(drop=True)
print(f"Number of images with valid paths: {len(labels_df)}")

# Î≥ëÎ≥ÄÎ≥ÑÎ°ú Í∑†Ìòï Ïû°Ìûå ÌÖåÏä§Ìä∏ÏÖã ÏÉùÏÑ±
print("Creating balanced test set by disease...")
test_indices = set()

for disease in disease_labels:
    # Í∞Å ÏßàÎ≥ëÏóê ÎåÄÌï¥ ÏñëÏÑ±(1) ÏÉòÌîåÍ≥º ÏùåÏÑ±(0) ÏÉòÌîåÏùÑ Ï∞æÏùå
    positive_samples = labels_df[labels_df[disease] == 1].index.tolist()
    negative_samples = labels_df[labels_df[disease] == 0].index.tolist()
    
    # ÏñëÏÑ± ÏÉòÌîåÏùò 50%Î•º ÌÖåÏä§Ìä∏ÏÖãÏúºÎ°ú ÏÑ†ÌÉù
    pos_test_size = min(len(positive_samples) // 2, 1000)  # ÏµúÎåÄ 1,000Í∞ú Ï†úÌïú
    pos_test_indices = random.sample(positive_samples, pos_test_size)
    
    # ÏùåÏÑ± ÏÉòÌîåÎèÑ ÎπÑÏä∑Ìïú ÏàòÎ•º ÌÖåÏä§Ìä∏ÏÖãÏúºÎ°ú ÏÑ†ÌÉù
    neg_test_size = min(len(negative_samples) // 10, pos_test_size)  # ÏñëÏÑ± ÏÉòÌîå ÏàòÏôÄ ÎπÑÏä∑ÌïòÍ≤å
    neg_test_indices = random.sample(negative_samples, neg_test_size)
    
    # ÌÖåÏä§Ìä∏ Ïù∏Îç±Ïä§Ïóê Ï∂îÍ∞Ä
    test_indices.update(pos_test_indices)
    test_indices.update(neg_test_indices)

# ÌÖåÏä§Ìä∏ÏÖãÍ≥º ÌïôÏäµÏÖã Î∂ÑÎ¶¨
test_indices = list(test_indices)
test_df = labels_df.loc[test_indices].reset_index(drop=True)

# Îç∞Ïù¥ÌÑ∞ÏÖã ÎπÑÏú® Í≥ÑÏÇ∞
total_dataset_size = len(labels_df)
test_dataset_size = len(test_df)
dataset_percentage = int((test_dataset_size / total_dataset_size) * 100)
print(f"Test dataset size: {len(test_df)} ({dataset_percentage}% of total data)")

# DataBlock ÏÑ§Ï†ï
item_transforms = [Resize((224, 224))]
batch_transforms = [Normalize.from_stats(*imagenet_stats)]

def get_x(row):
    return row['Paths']

def get_y(row):
    labels = row[disease_labels].tolist()
    return labels

test_dblock = DataBlock(
    blocks=(ImageBlock, MultiCategoryBlock(encoded=True, vocab=disease_labels)),
    get_x=get_x,
    get_y=get_y,
    item_tfms=item_transforms,
    batch_tfms=batch_transforms
)

# Îç∞Ïù¥ÌÑ∞ Î°úÎçî ÏÉùÏÑ±
print("Creating data loaders...")
test_dls = test_dblock.dataloaders(test_df, bs=32, shuffle=False)
print(f"Number of items in test dataset: {len(test_dls.train.dataset)}")

# ----- Î™®Îç∏ Î°úÎìú Î∂ÄÎ∂Ñ (ÏàòÏ†ïÎê®) -----
print("Loading new model...")
try:
    # Î∞©Î≤ï 1: fastaiÏùò load_learner ÏÇ¨Ïö© (.pkl ÌååÏùº Î°úÎìú)
    learn = load_learner(NEW_MODEL_PATH)
    print("‚úÖ Model loaded successfully using load_learner!")
except Exception as e:
    print(f"Error loading with load_learner: {e}")
    try:
        # Î∞©Î≤ï 2: Î™®Îç∏ Íµ¨Ï°∞ ÏÉùÏÑ± ÌõÑ state_dict Î°úÎìú
        learn = vision_learner(test_dls, 'coatnet_2_rw_224', metrics=[accuracy_multi])
        learn.model.to(device)
        state_dict = torch.load(NEW_MODEL_PATH, map_location=device)
        
        # Î°úÎìúÎêú Îç∞Ïù¥ÌÑ∞Í∞Ä state_dict ÌòïÌÉúÏù∏ÏßÄ ÌôïÏù∏
        if hasattr(state_dict, 'model'):
            learn.model.load_state_dict(state_dict.model)
        else:
            learn.model.load_state_dict(state_dict)
        print("‚úÖ Model loaded successfully using load_state_dict!")
    except Exception as e2:
        print(f"Error loading with state_dict: {e2}")
        print("Trying default fastai loader...")
        
        # Î∞©Î≤ï 3: Í∏∞Î≥∏ fastai Î°úÎçî ÏÇ¨Ïö©
        learn = vision_learner(test_dls, 'coatnet_2_rw_224', metrics=[accuracy_multi])
        learn.model.to(device)
        learn.load(NEW_MODEL_PATH)
        print("‚úÖ Model loaded successfully using default fastai loader!")

# ----- Ïù¥Ìïò ÏΩîÎìúÎäî ÎèôÏùº -----
# Î™®Îç∏ ÌèâÍ∞Ä
print("Making predictions...")
learn.model.eval()

# ÏòàÏ∏° ÏàòÌñâ
with torch.no_grad():
    preds, targs = learn.get_preds(dl=test_dls.train)
print(f"Targets tensor shape: {targs.shape}")
print(f"Predictions tensor shape: {preds.shape}")

# PyTorch Tensor ‚Üí NumPy Î∞∞Ïó¥ Î≥ÄÌôò
probs = preds.detach().cpu().numpy()
targets = targs.detach().cpu().numpy()

# Î≥ëÎ≥ÄÎ≥Ñ ÏÉòÌîå Í∞úÏàò Ï∂úÎ†•
disease_counts = np.sum(targets, axis=0)
disease_count_dict = dict(zip(disease_labels, disease_counts))
print("\nüìå Actual disease sample counts:")
for disease, count in disease_count_dict.items():
    print(f"{disease}: {int(count)} samples")

# (Ïù¥Ìïò Î∂ÑÏÑù ÏΩîÎìúÎäî ÎèôÏùºÌïòÍ≤å Ïú†ÏßÄ)


Using device: cuda:0
Loading data...


Encoding disease labels: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15/15 [00:00<00:00, 31.83it/s]


Mapping image file paths...
Number of images with valid paths: 112120
Creating balanced test set by disease...
Test dataset size: 22637 (20% of total data)
Creating data loaders...
Number of items in test dataset: 18110
Loading new model...
Error loading with load_learner: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Error loading with state_dict: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Trying default fastai loader...


FileNotFoundError: [Errno 2] No such file or directory: '/userHome/userhome4/kyoungmin/code/Xray/NIH_EXP/Project/pkl_data/all_labels_14.pkl.pth'

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os
from glob import glob
from tqdm import tqdm
import torch.optim as optim
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt

os.environ['CUDA_VISIBLE_DEVICES'] = "0"

train_val_df = pd.read_csv("../../data/train_val.csv")
test_df = pd.read_csv("../../data/test.csv")

# Extract the paths and labels
disease_labels = ['Atelectasis', 'Consolidation', 'Infiltration', 'Pneumothorax', 'Edema', 'Emphysema', 'Fibrosis', 'Effusion', 'Pneumonia', 'Pleural_Thickening',
'Cardiomegaly', 'Nodule', 'Mass', 'Hernia', 'No Finding']
train_val_labels = train_val_df[disease_labels].values
test_labels = test_df[disease_labels].values

FileNotFoundError: [Errno 2] No such file or directory: '../../data/train_val.csv'

In [1]:
import pickle

# Pickle ÌååÏùºÏóêÏÑú numpy Î∞∞Ïó¥ Î∂àÎü¨Ïò§Í∏∞
with open('/userHome/userhome4/kyoungmin/code/Xray/CTransCNN/pkl_data/all_labels_14.pkl', 'rb') as file:
    all_labels = pickle.load(file)
    
with open('/userHome/userhome4/kyoungmin/code/Xray/CTransCNN/pkl_data/all_outputs.pkl', 'rb') as file:
    all_outputs = pickle.load(file)
    
with open('/userHome/userhome4/kyoungmin/code/Xray/CTransCNN/pkl_data/binary_outputs.pkl', 'rb') as file:
    binary_outputs = pickle.load(file)

In [2]:
# one error check
value_dict = {i:0 for i in disease_labels}
for b in all_outputs:
    value_dict[disease_labels[list(b).index(max(b))]] += 1

value_dict

NameError: name 'disease_labels' is not defined