## 드라이브 마운트

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper module).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 라이브러리 임포트

In [None]:
# 필요한 라이브러리 import
import os
import json
import datetime
import shutil
import time
import copy
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
import numpy as np

## 중간 모델 저장(체크포인트) 기능이 반영된 모델링 코드

In [None]:
# 매핑 파일을 사용하는 수정된 ImageDataset 클래스
import json
from torch.utils.data import Dataset
from PIL import Image
import os

class ImageDatasetWithMapping(Dataset):
    """Image dataset whose labels come from a file-name -> class-name JSON file.

    A file in ``data_dir`` becomes a sample only if it has a supported image
    extension, appears as a key in the mapping file, and its mapped label is
    one of ``class_names``. Everything else is skipped silently.

    Args:
        data_dir: Directory holding the image files (not searched recursively).
        mapping_file_path: Path of a UTF-8 JSON object mapping file name to
            class name.
        class_names: Ordered class names; a name's position is its integer label.
        transform: Optional callable applied to every loaded PIL image.
    """

    def __init__(self, data_dir, mapping_file_path, class_names, transform=None):
        self.data_dir = data_dir
        self.class_names = class_names
        self.transform = transform
        self.class_to_idx = {cls_name: idx for idx, cls_name in enumerate(class_names)}

        # Load the file-name -> label mapping.
        with open(mapping_file_path, 'r', encoding='utf-8') as f:
            self.filename_to_label = json.load(f)

        self.image_paths = []
        self.labels = []

        self._load_from_mapping()

    def _load_from_mapping(self):
        """Fill ``image_paths`` and ``labels`` from the directory listing."""
        valid_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff')

        # os.listdir() returns entries in arbitrary order; sorting makes sample
        # indices (and therefore evaluation order) reproducible across runs.
        for file_name in sorted(os.listdir(self.data_dir)):
            if not file_name.lower().endswith(valid_extensions):
                continue
            if file_name not in self.filename_to_label:
                continue

            label = self.filename_to_label[file_name]
            if label not in self.class_to_idx:
                continue

            self.image_paths.append(os.path.join(self.data_dir, file_name))
            self.labels.append(self.class_to_idx[label])

    def __len__(self):
        """Return the number of usable samples."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        If the image cannot be read, the error is printed and a blank white
        224x224 RGB image is substituted. The sample keeps its original label.
        """
        image_path = self.image_paths[idx]

        try:
            # The context manager closes the file handle immediately;
            # convert() returns an independent in-memory copy.
            with Image.open(image_path) as img:
                image = img.convert('RGB')
        except Exception as e:
            # Deliberate best-effort: one unreadable file must not abort training.
            print(f"Error loading image {image_path}: {e}")
            image = Image.new('RGB', (224, 224), color='white')

        label = self.labels[idx]

        if self.transform:
            image = self.transform(image)

        return image, label

# 수정된 create_data_loaders 함수
def create_data_loaders_with_mapping(
        class_names,
        dataset_dir='/content/drive/MyDrive/의장공정/데이터/image_classification/dataset',
        mapping_dir='/content',
        batch_size=32,
        num_workers=2):
    """Build train/valid/test DataLoaders from per-split mapping files.

    For each split, images are read from ``<dataset_dir>/<split>`` and labels
    from ``<mapping_dir>/<split>_mapping.json``. A split is skipped, with a
    printed warning, when its directory or mapping file is missing or when it
    yields no samples.

    Args:
        class_names: Ordered class names; position defines the integer label.
        dataset_dir: Root directory containing the split sub-directories.
        mapping_dir: Directory containing the ``<split>_mapping.json`` files.
        batch_size: Batch size used for every loader.
        num_workers: Number of DataLoader worker processes.

    Returns:
        Tuple ``(train_loader, valid_loader, test_loader)``. An entry is
        ``None`` when that split was skipped.
    """
    from torchvision import transforms
    from torch.utils.data import DataLoader
    import torch

    # ImageNet channel statistics, matching the pretrained backbone.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    # Training pipeline with augmentation.
    train_transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.RandomCrop((224, 224)),
        transforms.RandomHorizontalFlip(0.5),
        transforms.RandomRotation(10),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.ToTensor(),
        normalize,
    ])

    # Deterministic pipeline for validation and test.
    test_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        normalize,
    ])

    loaders = {}

    for split in ['train', 'valid', 'test']:
        split_dir = os.path.join(dataset_dir, split)
        mapping_file_path = os.path.join(mapping_dir, f'{split}_mapping.json')

        if not (os.path.exists(split_dir) and os.path.exists(mapping_file_path)):
            # Say why the loader will be None instead of failing later.
            print(f"Warning: skipping '{split}' split "
                  f"(missing {split_dir} or {mapping_file_path})")
            continue

        dataset = ImageDatasetWithMapping(
            data_dir=split_dir,
            mapping_file_path=mapping_file_path,
            class_names=class_names,
            transform=train_transform if split == 'train' else test_transform
        )

        if len(dataset) == 0:
            print(f"Warning: skipping '{split}' split (no labelled images found)")
            continue

        loaders[split] = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=(split == 'train'),  # only the training split is shuffled
            num_workers=num_workers,
            pin_memory=torch.cuda.is_available()
        )

        print(f"{split.capitalize()} dataset: {len(dataset)} samples")

    return loaders.get('train'), loaders.get('valid'), loaders.get('test')



def create_model(num_classes):
    """Create an ImageNet-pretrained ResNet50 with a new classification head.

    Args:
        num_classes: Number of output classes for the replaced ``fc`` layer.

    Returns:
        The ResNet50 model with ``fc`` replaced by a fresh ``nn.Linear``.
    """
    print(f"모델 생성 중... (클래스 수: {num_classes})")

    # ``pretrained=True`` is deprecated in newer torchvision releases. The
    # weights enum selects the same IMAGENET1K_V1 weights; fall back to the
    # legacy flag on versions that do not have the enum.
    weights_enum = getattr(models, 'ResNet50_Weights', None)
    if weights_enum is not None:
        model = models.resnet50(weights=weights_enum.IMAGENET1K_V1)
    else:
        model = models.resnet50(pretrained=True)

    # Replace the final fully connected layer to match the class count.
    model.fc = nn.Linear(model.fc.in_features, num_classes)

    return model

def train_model(model, train_loader, valid_loader, num_classes):
    """Train ``model`` and save the best-validation weights to ``Config.model_dir``.

    Reads ``Config.lr``, ``Config.num_epochs``, ``Config.model_dir`` and
    ``Config.timestamp``. Uses Adam with a StepLR schedule (x0.1 every 15
    epochs) and cross-entropy loss.

    Args:
        model: Network to train; it is moved to CUDA when available.
        train_loader: DataLoader for the training split (required).
        valid_loader: DataLoader for validation, or ``None`` to skip validation
            (the last-epoch weights are then saved).
        num_classes: Unused; kept for interface compatibility.

    Returns:
        Tuple ``(model, train_losses, train_accs, valid_losses, valid_accs)``.
        Losses are floats; accuracies are 0-d NumPy arrays.

    Raises:
        ValueError: If ``train_loader`` is ``None``.
    """
    if train_loader is None:
        raise ValueError("train_loader is None: no training data was loaded")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=Config.lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.1)

    def run_phase(loader, training):
        """Run one pass over ``loader``; return (mean loss, accuracy tensor)."""
        model.train(training)
        total_loss = 0.0
        total_correct = 0

        with torch.set_grad_enabled(training):
            for inputs, labels in loader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                if training:
                    optimizer.zero_grad()

                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels)

                if training:
                    loss.backward()
                    optimizer.step()

                # Weight by batch size so the epoch mean is per-sample.
                total_loss += loss.item() * inputs.size(0)
                total_correct += torch.sum(preds == labels)

        sample_count = len(loader.dataset)
        return total_loss / sample_count, total_correct.double() / sample_count

    # Training history.
    train_losses = []
    train_accs = []
    valid_losses = []
    valid_accs = []

    best_valid_acc = 0.0
    best_model_wts = copy.deepcopy(model.state_dict())

    print("학습 시작...")
    start_time = time.time()

    for epoch in range(Config.num_epochs):
        print(f'\nEpoch {epoch+1}/{Config.num_epochs}')
        print('-' * 50)

        epoch_train_loss, epoch_train_acc = run_phase(train_loader, True)

        if valid_loader is not None:
            epoch_valid_loss, epoch_valid_acc = run_phase(valid_loader, False)

            print(f'Train Loss: {epoch_train_loss:.4f} Acc: {epoch_train_acc:.4f}')
            print(f'Valid Loss: {epoch_valid_loss:.4f} Acc: {epoch_valid_acc:.4f}')

            valid_losses.append(epoch_valid_loss)
            valid_accs.append(epoch_valid_acc.cpu().numpy())

            # Keep a copy of the weights with the best validation accuracy.
            if epoch_valid_acc > best_valid_acc:
                best_valid_acc = epoch_valid_acc
                best_model_wts = copy.deepcopy(model.state_dict())
        else:
            print(f'Train Loss: {epoch_train_loss:.4f} Acc: {epoch_train_acc:.4f}')

        train_losses.append(epoch_train_loss)
        train_accs.append(epoch_train_acc.cpu().numpy())

        scheduler.step()

    # Restore the best weights (only meaningful when validation ran).
    if valid_loader is not None:
        model.load_state_dict(best_model_wts)

    # Create the output directory first so the save cannot fail after the
    # whole training run has already been paid for.
    os.makedirs(Config.model_dir, exist_ok=True)
    model_path = os.path.join(Config.model_dir, f'best_model_{Config.timestamp}.pth')
    torch.save(model.state_dict(), model_path)
    print(f'\n최적 모델 저장: {model_path}')
    if valid_loader is not None:
        print(f'최고 검증 정확도: {best_valid_acc:.4f}')

    training_time = time.time() - start_time
    print(f'학습 완료 시간: {training_time/60:.2f}분')

    return model, train_losses, train_accs, valid_losses, valid_accs

def evaluate_model(model, test_loader, class_names):
    """Evaluate ``model`` on the test split and save the metrics as JSON.

    Results are written to
    ``Config.result_dir/test_results_<Config.timestamp>.json``.

    Args:
        model: Trained network; it is moved to CUDA when available.
        test_loader: DataLoader for the test split, or ``None``.
        class_names: Ordered class names; position is the integer label.

    Returns:
        Dict with accuracy, macro/micro F1, confusion matrix, classification
        report and timestamp, or ``None`` when ``test_loader`` is ``None``.
    """
    if test_loader is None:
        print("테스트 데이터가 없어 평가를 건너뜁니다.")
        return None

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)

            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)

            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    accuracy = accuracy_score(all_labels, all_preds)
    f1_macro = f1_score(all_labels, all_preds, average='macro')
    f1_micro = f1_score(all_labels, all_preds, average='micro')

    # Pass the full label set explicitly. Otherwise a class missing from both
    # the test labels and the predictions makes target_names mismatch (an
    # error) and changes the confusion-matrix shape.
    label_ids = list(range(len(class_names)))

    cm = confusion_matrix(all_labels, all_preds, labels=label_ids)

    report = classification_report(
        all_labels,
        all_preds,
        labels=label_ids,
        target_names=class_names,
        output_dict=True,
        zero_division=0
    )

    print(f'\n=== 테스트 결과 ===')
    print(f'Accuracy: {accuracy:.4f}')
    print(f'F1-Score (Macro): {f1_macro:.4f}')
    print(f'F1-Score (Micro): {f1_micro:.4f}')

    results = {
        'accuracy': float(accuracy),
        'f1_macro': float(f1_macro),
        'f1_micro': float(f1_micro),
        'confusion_matrix': cm.tolist(),
        'classification_report': report,
        'timestamp': Config.timestamp
    }

    # Create the output directory so the write cannot fail after evaluation.
    os.makedirs(Config.result_dir, exist_ok=True)
    result_path = os.path.join(Config.result_dir, f'test_results_{Config.timestamp}.json')
    with open(result_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f'결과 저장: {result_path}')

    return results

In [None]:
# 매핑 파일을 사용하는 수정된 ImageDataset 클래스
import json
from torch.utils.data import Dataset
from PIL import Image
import os

class ImageDatasetWithMapping(Dataset):
    """Image dataset whose labels come from a file-name -> class-name JSON file.

    A file in ``data_dir`` becomes a sample only if it has a supported image
    extension, appears as a key in the mapping file, and its mapped label is
    one of ``class_names``. Everything else is skipped silently.

    Args:
        data_dir: Directory holding the image files (not searched recursively).
        mapping_file_path: Path of a UTF-8 JSON object mapping file name to
            class name.
        class_names: Ordered class names; a name's position is its integer label.
        transform: Optional callable applied to every loaded PIL image.
    """

    def __init__(self, data_dir, mapping_file_path, class_names, transform=None):
        self.data_dir = data_dir
        self.class_names = class_names
        self.transform = transform
        self.class_to_idx = {cls_name: idx for idx, cls_name in enumerate(class_names)}

        # Load the file-name -> label mapping.
        with open(mapping_file_path, 'r', encoding='utf-8') as f:
            self.filename_to_label = json.load(f)

        self.image_paths = []
        self.labels = []

        self._load_from_mapping()

    def _load_from_mapping(self):
        """Fill ``image_paths`` and ``labels`` from the directory listing."""
        valid_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff')

        # os.listdir() returns entries in arbitrary order; sorting makes sample
        # indices (and therefore evaluation order) reproducible across runs.
        for file_name in sorted(os.listdir(self.data_dir)):
            if not file_name.lower().endswith(valid_extensions):
                continue
            if file_name not in self.filename_to_label:
                continue

            label = self.filename_to_label[file_name]
            if label not in self.class_to_idx:
                continue

            self.image_paths.append(os.path.join(self.data_dir, file_name))
            self.labels.append(self.class_to_idx[label])

    def __len__(self):
        """Return the number of usable samples."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        If the image cannot be read, the error is printed and a blank white
        224x224 RGB image is substituted. The sample keeps its original label.
        """
        image_path = self.image_paths[idx]

        try:
            # The context manager closes the file handle immediately;
            # convert() returns an independent in-memory copy.
            with Image.open(image_path) as img:
                image = img.convert('RGB')
        except Exception as e:
            # Deliberate best-effort: one unreadable file must not abort training.
            print(f"Error loading image {image_path}: {e}")
            image = Image.new('RGB', (224, 224), color='white')

        label = self.labels[idx]

        if self.transform:
            image = self.transform(image)

        return image, label

# 수정된 create_data_loaders 함수
def create_data_loaders_with_mapping(
        class_names,
        dataset_dir='/content/drive/MyDrive/의장공정/데이터/image_classification/dataset',
        mapping_dir='/content',
        batch_size=32,
        num_workers=2):
    """Build train/valid/test DataLoaders from per-split mapping files.

    For each split, images are read from ``<dataset_dir>/<split>`` and labels
    from ``<mapping_dir>/<split>_mapping.json``. A split is skipped, with a
    printed warning, when its directory or mapping file is missing or when it
    yields no samples.

    Args:
        class_names: Ordered class names; position defines the integer label.
        dataset_dir: Root directory containing the split sub-directories.
        mapping_dir: Directory containing the ``<split>_mapping.json`` files.
        batch_size: Batch size used for every loader.
        num_workers: Number of DataLoader worker processes.

    Returns:
        Tuple ``(train_loader, valid_loader, test_loader)``. An entry is
        ``None`` when that split was skipped.
    """
    from torchvision import transforms
    from torch.utils.data import DataLoader
    import torch

    # ImageNet channel statistics, matching the pretrained backbone.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    # Training pipeline with augmentation.
    train_transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.RandomCrop((224, 224)),
        transforms.RandomHorizontalFlip(0.5),
        transforms.RandomRotation(10),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.ToTensor(),
        normalize,
    ])

    # Deterministic pipeline for validation and test.
    test_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        normalize,
    ])

    loaders = {}

    for split in ['train', 'valid', 'test']:
        split_dir = os.path.join(dataset_dir, split)
        mapping_file_path = os.path.join(mapping_dir, f'{split}_mapping.json')

        if not (os.path.exists(split_dir) and os.path.exists(mapping_file_path)):
            # Say why the loader will be None instead of failing later.
            print(f"Warning: skipping '{split}' split "
                  f"(missing {split_dir} or {mapping_file_path})")
            continue

        dataset = ImageDatasetWithMapping(
            data_dir=split_dir,
            mapping_file_path=mapping_file_path,
            class_names=class_names,
            transform=train_transform if split == 'train' else test_transform
        )

        if len(dataset) == 0:
            print(f"Warning: skipping '{split}' split (no labelled images found)")
            continue

        loaders[split] = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=(split == 'train'),  # only the training split is shuffled
            num_workers=num_workers,
            pin_memory=torch.cuda.is_available()
        )

        print(f"{split.capitalize()} dataset: {len(dataset)} samples")

    return loaders.get('train'), loaders.get('valid'), loaders.get('test')

def create_model(num_classes):
    """Create an ImageNet-pretrained ResNet50 with a new classification head.

    Args:
        num_classes: Number of output classes for the replaced ``fc`` layer.

    Returns:
        The ResNet50 model with ``fc`` replaced by a fresh ``nn.Linear``.
    """
    print(f"모델 생성 중... (클래스 수: {num_classes})")

    # ``pretrained=True`` is deprecated in newer torchvision releases. The
    # weights enum selects the same IMAGENET1K_V1 weights; fall back to the
    # legacy flag on versions that do not have the enum.
    weights_enum = getattr(models, 'ResNet50_Weights', None)
    if weights_enum is not None:
        model = models.resnet50(weights=weights_enum.IMAGENET1K_V1)
    else:
        model = models.resnet50(pretrained=True)

    # Replace the final fully connected layer to match the class count.
    model.fc = nn.Linear(model.fc.in_features, num_classes)

    return model

def train_model(model, train_loader, valid_loader, num_classes):
    """Train ``model`` and save the best-validation weights to ``Config.model_dir``.

    Reads ``Config.lr``, ``Config.num_epochs``, ``Config.model_dir`` and
    ``Config.timestamp``. Uses Adam with a StepLR schedule (x0.1 every 15
    epochs) and cross-entropy loss.

    Args:
        model: Network to train; it is moved to CUDA when available.
        train_loader: DataLoader for the training split (required).
        valid_loader: DataLoader for validation, or ``None`` to skip validation
            (the last-epoch weights are then saved).
        num_classes: Unused; kept for interface compatibility.

    Returns:
        Tuple ``(model, train_losses, train_accs, valid_losses, valid_accs)``.
        Losses are floats; accuracies are 0-d NumPy arrays.

    Raises:
        ValueError: If ``train_loader`` is ``None``.
    """
    if train_loader is None:
        raise ValueError("train_loader is None: no training data was loaded")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=Config.lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.1)

    def run_phase(loader, training):
        """Run one pass over ``loader``; return (mean loss, accuracy tensor)."""
        model.train(training)
        total_loss = 0.0
        total_correct = 0

        with torch.set_grad_enabled(training):
            for inputs, labels in loader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                if training:
                    optimizer.zero_grad()

                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels)

                if training:
                    loss.backward()
                    optimizer.step()

                # Weight by batch size so the epoch mean is per-sample.
                total_loss += loss.item() * inputs.size(0)
                total_correct += torch.sum(preds == labels)

        sample_count = len(loader.dataset)
        return total_loss / sample_count, total_correct.double() / sample_count

    # Training history.
    train_losses = []
    train_accs = []
    valid_losses = []
    valid_accs = []

    best_valid_acc = 0.0
    best_model_wts = copy.deepcopy(model.state_dict())

    print("학습 시작...")
    start_time = time.time()

    for epoch in range(Config.num_epochs):
        print(f'\nEpoch {epoch+1}/{Config.num_epochs}')
        print('-' * 50)

        epoch_train_loss, epoch_train_acc = run_phase(train_loader, True)

        if valid_loader is not None:
            epoch_valid_loss, epoch_valid_acc = run_phase(valid_loader, False)

            print(f'Train Loss: {epoch_train_loss:.4f} Acc: {epoch_train_acc:.4f}')
            print(f'Valid Loss: {epoch_valid_loss:.4f} Acc: {epoch_valid_acc:.4f}')

            valid_losses.append(epoch_valid_loss)
            valid_accs.append(epoch_valid_acc.cpu().numpy())

            # Keep a copy of the weights with the best validation accuracy.
            if epoch_valid_acc > best_valid_acc:
                best_valid_acc = epoch_valid_acc
                best_model_wts = copy.deepcopy(model.state_dict())
        else:
            print(f'Train Loss: {epoch_train_loss:.4f} Acc: {epoch_train_acc:.4f}')

        train_losses.append(epoch_train_loss)
        train_accs.append(epoch_train_acc.cpu().numpy())

        scheduler.step()

    # Restore the best weights (only meaningful when validation ran).
    if valid_loader is not None:
        model.load_state_dict(best_model_wts)

    # Create the output directory first so the save cannot fail after the
    # whole training run has already been paid for.
    os.makedirs(Config.model_dir, exist_ok=True)
    model_path = os.path.join(Config.model_dir, f'best_model_{Config.timestamp}.pth')
    torch.save(model.state_dict(), model_path)
    print(f'\n최적 모델 저장: {model_path}')
    if valid_loader is not None:
        print(f'최고 검증 정확도: {best_valid_acc:.4f}')

    training_time = time.time() - start_time
    print(f'학습 완료 시간: {training_time/60:.2f}분')

    return model, train_losses, train_accs, valid_losses, valid_accs

def evaluate_model(model, test_loader, class_names):
    """Evaluate ``model`` on the test split and save the metrics as JSON.

    Results are written to
    ``Config.result_dir/test_results_<Config.timestamp>.json``.

    Args:
        model: Trained network; it is moved to CUDA when available.
        test_loader: DataLoader for the test split, or ``None``.
        class_names: Ordered class names; position is the integer label.

    Returns:
        Dict with accuracy, macro/micro F1, confusion matrix, classification
        report and timestamp, or ``None`` when ``test_loader`` is ``None``.
    """
    if test_loader is None:
        print("테스트 데이터가 없어 평가를 건너뜁니다.")
        return None

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)

            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)

            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    accuracy = accuracy_score(all_labels, all_preds)
    f1_macro = f1_score(all_labels, all_preds, average='macro')
    f1_micro = f1_score(all_labels, all_preds, average='micro')

    # Pass the full label set explicitly. Otherwise a class missing from both
    # the test labels and the predictions makes target_names mismatch (an
    # error) and changes the confusion-matrix shape.
    label_ids = list(range(len(class_names)))

    cm = confusion_matrix(all_labels, all_preds, labels=label_ids)

    report = classification_report(
        all_labels,
        all_preds,
        labels=label_ids,
        target_names=class_names,
        output_dict=True,
        zero_division=0
    )

    print(f'\n=== 테스트 결과 ===')
    print(f'Accuracy: {accuracy:.4f}')
    print(f'F1-Score (Macro): {f1_macro:.4f}')
    print(f'F1-Score (Micro): {f1_micro:.4f}')

    results = {
        'accuracy': float(accuracy),
        'f1_macro': float(f1_macro),
        'f1_micro': float(f1_micro),
        'confusion_matrix': cm.tolist(),
        'classification_report': report,
        'timestamp': Config.timestamp
    }

    # Create the output directory so the write cannot fail after evaluation.
    os.makedirs(Config.result_dir, exist_ok=True)
    result_path = os.path.join(Config.result_dir, f'test_results_{Config.timestamp}.json')
    with open(result_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f'결과 저장: {result_path}')

    return results

# 체크포인트 저장 기능이 추가된 개선된 Config 클래스
class Config:
    """Paths and hyperparameters for the checkpoint-enabled training run."""

    # --- Run identity -------------------------------------------------------
    # ``timestamp`` is a fixed string, not derived from ``now``; checkpoint,
    # model and result file names are built from it.
    timestamp = '250802_1554'
    now = datetime.datetime.now()  # evaluated once, when the class is defined

    # --- Optimisation -------------------------------------------------------
    num_epochs = 50
    lr = 1e-4
    batch_size = 32
    num_workers = 2
    IMG_SIZE = (224, 224)

    # --- Checkpointing ------------------------------------------------------
    save_checkpoint_every = 5  # save a checkpoint every 5 epochs

    # --- Directory layout (everything lives under ``work_dir``) --------------
    work_dir = '/content/drive/MyDrive/의장공정/데이터/image_classification'
    checkpoint_dir = os.path.join(work_dir, 'checkpoints')  # periodic checkpoints
    result_dir = os.path.join(work_dir, 'results')
    model_dir = os.path.join(work_dir, 'models')
    dataset_dir = os.path.join(work_dir, 'dataset')

# 체크포인트 저장/로드 함수들
def save_checkpoint(model, optimizer, scheduler, epoch, train_losses, train_accs,
                   valid_losses, valid_accs, best_valid_acc, checkpoint_dir, timestamp):
    """Save a resumable training checkpoint and refresh the "latest" copy.

    Writes ``checkpoint_epoch_<epoch>_<timestamp>.pth`` and
    ``latest_checkpoint_<timestamp>.pth`` into ``checkpoint_dir``. Each file is
    written under a temporary name and then renamed, so an interrupted save
    cannot leave a truncated checkpoint under the real name.

    Args:
        model, optimizer, scheduler: Objects whose ``state_dict()`` is stored.
        epoch: Index of the epoch that has just finished.
        train_losses, train_accs, valid_losses, valid_accs: History so far.
        best_valid_acc: Best validation accuracy seen so far.
        checkpoint_dir: Output directory (created if missing).
        timestamp: Run identifier embedded in the file names.

    Returns:
        Path of the per-epoch checkpoint file.
    """
    import shutil

    os.makedirs(checkpoint_dir, exist_ok=True)

    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'train_losses': train_losses,
        'train_accs': train_accs,
        'valid_losses': valid_losses,
        'valid_accs': valid_accs,
        'best_valid_acc': best_valid_acc,
        'timestamp': timestamp
    }

    def tmp_name(final_path):
        # The '.tmp_' prefix keeps leftovers from an interrupted save from
        # matching the 'checkpoint_epoch_*' / 'latest_checkpoint_*' names.
        return os.path.join(checkpoint_dir, '.tmp_' + os.path.basename(final_path))

    checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch}_{timestamp}.pth')
    epoch_tmp = tmp_name(checkpoint_path)
    torch.save(checkpoint, epoch_tmp)
    os.replace(epoch_tmp, checkpoint_path)
    print(f'체크포인트 저장: {checkpoint_path}')

    # Keep a fixed-name copy so the newest checkpoint is easy to find on
    # resume. Copy the finished file rather than serializing a second time.
    latest_path = os.path.join(checkpoint_dir, f'latest_checkpoint_{timestamp}.pth')
    latest_tmp = tmp_name(latest_path)
    shutil.copyfile(checkpoint_path, latest_tmp)
    os.replace(latest_tmp, latest_path)

    return checkpoint_path

def load_checkpoint(checkpoint_path, model, optimizer, scheduler):
    """Restore model, optimizer and scheduler state from a checkpoint file.

    Args:
        checkpoint_path: File written by ``save_checkpoint``.
        model, optimizer, scheduler: Objects updated in place through
            ``load_state_dict``.

    Returns:
        Tuple ``(epoch, train_losses, train_accs, valid_losses, valid_accs,
        best_valid_acc)`` as stored in the checkpoint.
    """
    print(f"체크포인트 로드: {checkpoint_path}")

    # Map tensors onto the model's current device so a checkpoint saved on a
    # GPU runtime can also be restored on a CPU-only runtime.
    target_device = next((p.device for p in model.parameters()), torch.device('cpu'))

    # NOTE(security): weights_only=False unpickles arbitrary Python objects.
    # Only load checkpoint files you created yourself.
    checkpoint = torch.load(checkpoint_path, map_location=target_device, weights_only=False)

    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    return (checkpoint['epoch'], checkpoint['train_losses'], checkpoint['train_accs'],
            checkpoint['valid_losses'], checkpoint['valid_accs'], checkpoint['best_valid_acc'])

def find_latest_checkpoint(checkpoint_dir, timestamp):
    """Return the path of the newest checkpoint for ``timestamp``, or ``None``.

    ``latest_checkpoint_<timestamp>.pth`` is preferred when it exists.
    Otherwise the ``checkpoint_epoch_<N>_<timestamp>.pth`` file with the
    largest ``N`` is returned. Files that do not match that exact pattern
    (other runs, temporary or backup files) are ignored.

    Args:
        checkpoint_dir: Directory to search; it may not exist.
        timestamp: Run identifier embedded in the checkpoint file names.
    """
    import re

    latest_path = os.path.join(checkpoint_dir, f'latest_checkpoint_{timestamp}.pth')
    if os.path.exists(latest_path):
        return latest_path

    if not os.path.exists(checkpoint_dir):
        return None

    # Match the whole file name so stray files cannot crash the int() parse
    # or be mistaken for a checkpoint of this run.
    name_pattern = re.compile(r'checkpoint_epoch_(\d+)_' + re.escape(timestamp) + r'\.pth')

    candidates = []
    for file_name in os.listdir(checkpoint_dir):
        match = name_pattern.fullmatch(file_name)
        if match:
            candidates.append((int(match.group(1)), os.path.join(checkpoint_dir, file_name)))

    if not candidates:
        return None

    # Highest epoch number wins (numeric comparison, not lexicographic).
    return max(candidates)[1]

# 개선된 학습 함수 (체크포인트 기능 포함)
def train_model_with_checkpoints(model, train_loader, valid_loader, num_classes,
                                resume_from_checkpoint=None):
    """Train ``model`` with periodic checkpoints and optional resume.

    Reads ``Config.lr``, ``Config.num_epochs``, ``Config.save_checkpoint_every``,
    ``Config.checkpoint_dir``, ``Config.model_dir`` and ``Config.timestamp``.

    The best-validation weights are written to
    ``Config.model_dir/best_model_<timestamp>.pth`` every time they improve,
    so they survive a lost runtime and can be picked up again on resume.

    Args:
        model: Network to train; it is moved to CUDA when available.
        train_loader: DataLoader for the training split (required).
        valid_loader: DataLoader for validation, or ``None`` to skip validation.
        num_classes: Unused; kept for interface compatibility.
        resume_from_checkpoint: Optional checkpoint path to resume from. If
            loading fails, the error is printed and the epoch counter and
            history start from zero.

    Returns:
        Tuple ``(model, train_losses, train_accs, valid_losses, valid_accs)``.
        Losses are floats; accuracies are 0-d NumPy arrays.

    Raises:
        ValueError: If ``train_loader`` is ``None``.
    """
    if train_loader is None:
        raise ValueError("train_loader is None: no training data was loaded")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=Config.lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.1)

    os.makedirs(Config.model_dir, exist_ok=True)
    model_path = os.path.join(Config.model_dir, f'best_model_{Config.timestamp}.pth')

    def persist_best(state_dict):
        """Write ``state_dict`` to ``model_path`` via a temp file and rename."""
        tmp_path = os.path.join(Config.model_dir, f'.tmp_best_model_{Config.timestamp}.pth')
        torch.save(state_dict, tmp_path)
        os.replace(tmp_path, model_path)

    def run_phase(loader, training):
        """Run one pass over ``loader``; return (mean loss, accuracy tensor)."""
        model.train(training)
        total_loss = 0.0
        total_correct = 0

        with torch.set_grad_enabled(training):
            for inputs, labels in loader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                if training:
                    optimizer.zero_grad()

                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels)

                if training:
                    loss.backward()
                    optimizer.step()

                # Weight by batch size so the epoch mean is per-sample.
                total_loss += loss.item() * inputs.size(0)
                total_correct += torch.sum(preds == labels)

        sample_count = len(loader.dataset)
        return total_loss / sample_count, total_correct.double() / sample_count

    # Training history (replaced by the stored history on a successful resume).
    train_losses = []
    train_accs = []
    valid_losses = []
    valid_accs = []
    best_valid_acc = 0.0
    start_epoch = 0
    resumed = False

    if resume_from_checkpoint:
        try:
            (last_epoch, train_losses, train_accs, valid_losses,
             valid_accs, best_valid_acc) = load_checkpoint(resume_from_checkpoint, model, optimizer, scheduler)
            start_epoch = last_epoch + 1  # continue with the epoch after the saved one
            resumed = True
            # Use the same 1-based numbering as the per-epoch "Epoch N/M" header.
            print(f"에포크 {start_epoch + 1}부터 학습 재개")
        except Exception as e:
            # Best-effort fallback. NOTE: a partially applied load may already
            # have changed the model weights.
            print(f"체크포인트 로드 실패: {e}")
            print("처음부터 학습을 시작합니다.")
            start_epoch = 0

    best_model_wts = copy.deepcopy(model.state_dict())

    # The checkpoint stores the best accuracy but not the weights that reached
    # it. On resume, recover those weights from the persisted best-model file
    # when it is compatible with this model. Otherwise the resume-point
    # weights would be reported as "best" without ever scoring that accuracy.
    if resumed and valid_loader is not None and os.path.exists(model_path):
        try:
            stored_best = torch.load(model_path, map_location=device)
            if set(stored_best.keys()) == set(best_model_wts.keys()):
                best_model_wts = stored_best
        except Exception as e:
            print(f"Warning: could not restore best weights from {model_path}: {e}")

    print("학습 시작..." if start_epoch == 0 else f"학습 재개... (에포크 {start_epoch + 1}부터)")
    start_time = time.time()

    last_saved_epoch = None

    for epoch in range(start_epoch, Config.num_epochs):
        print(f'\nEpoch {epoch+1}/{Config.num_epochs}')
        print('-' * 50)

        epoch_train_loss, epoch_train_acc = run_phase(train_loader, True)

        if valid_loader is not None:
            epoch_valid_loss, epoch_valid_acc = run_phase(valid_loader, False)

            print(f'Train Loss: {epoch_train_loss:.4f} Acc: {epoch_train_acc:.4f}')
            print(f'Valid Loss: {epoch_valid_loss:.4f} Acc: {epoch_valid_acc:.4f}')

            valid_losses.append(epoch_valid_loss)
            valid_accs.append(epoch_valid_acc.cpu().numpy())

            if epoch_valid_acc > best_valid_acc:
                best_valid_acc = epoch_valid_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                # Persist immediately so the best model survives a lost runtime.
                persist_best(best_model_wts)
                print(f"새로운 최고 성능! Valid Acc: {best_valid_acc:.4f}")
        else:
            print(f'Train Loss: {epoch_train_loss:.4f} Acc: {epoch_train_acc:.4f}')

        train_losses.append(epoch_train_loss)
        train_accs.append(epoch_train_acc.cpu().numpy())

        scheduler.step()

        # Periodic checkpoint (every N epochs).
        if (epoch + 1) % Config.save_checkpoint_every == 0:
            save_checkpoint(model, optimizer, scheduler, epoch, train_losses, train_accs,
                          valid_losses, valid_accs, best_valid_acc, Config.checkpoint_dir, Config.timestamp)
            last_saved_epoch = epoch
            print(f"에포크 {epoch+1} 체크포인트 저장 완료")

    # Final checkpoint: save the true last-epoch state BEFORE the best weights
    # are restored, so model, optimizer and scheduler state belong to the same
    # epoch. Skip it when that epoch was just checkpointed or nothing ran.
    final_epoch = Config.num_epochs - 1
    if start_epoch < Config.num_epochs and last_saved_epoch != final_epoch:
        save_checkpoint(model, optimizer, scheduler, final_epoch, train_losses, train_accs,
                       valid_losses, valid_accs, best_valid_acc, Config.checkpoint_dir, Config.timestamp)

    # Restore the best weights (only meaningful when validation ran).
    if valid_loader is not None:
        model.load_state_dict(best_model_wts)

    # Save the final model.
    persist_best(model.state_dict())
    print(f'\n최적 모델 저장: {model_path}')
    if valid_loader is not None:
        print(f'최고 검증 정확도: {best_valid_acc:.4f}')

    training_time = time.time() - start_time
    print(f'학습 완료 시간: {training_time/60:.2f}분')

    return model, train_losses, train_accs, valid_losses, valid_accs

# 체크포인트 복구를 위한 헬퍼 함수
def resume_training_from_latest():
    """Look up the newest checkpoint of the configured run and report it.

    Returns:
        The checkpoint path found by ``find_latest_checkpoint`` for
        ``Config.checkpoint_dir`` / ``Config.timestamp``, or ``None`` when
        there is none.
    """
    checkpoint_path = find_latest_checkpoint(Config.checkpoint_dir, Config.timestamp)

    # Guard clause: nothing to resume from.
    if not checkpoint_path:
        print("체크포인트를 찾을 수 없습니다. 처음부터 시작합니다.")
        return None

    print(f"체크포인트 발견: {checkpoint_path}")
    return checkpoint_path

# 수정된 실행 함수
def run_training_with_checkpoints():
    """Run checkpointed training followed by test-set evaluation.

    Relies on the module-level names ``class_names``, ``train_loader``,
    ``valid_loader`` and ``test_loader`` being defined before the call. If a
    checkpoint exists for the configured run, the user is asked interactively
    whether to resume from it.

    Returns:
        Tuple ``(trained_model, results)`` where ``results`` is the value
        returned by ``evaluate_model``.
    """
    # Make sure the checkpoint directory exists.
    os.makedirs(Config.checkpoint_dir, exist_ok=True)

    # Look for a checkpoint to resume from.
    resume_checkpoint = resume_training_from_latest()

    if resume_checkpoint:
        response = input("체크포인트에서 재개하시겠습니까? (y/n): ")
        # Tolerate surrounding whitespace and a spelled-out "yes". Otherwise a
        # stray space silently restarts training from scratch.
        if response.strip().lower() not in ('y', 'yes'):
            resume_checkpoint = None

    # Build the model.
    model = create_model(len(class_names))

    # Train. The history lists are returned but not used here.
    trained_model, train_losses, train_accs, valid_losses, valid_accs = train_model_with_checkpoints(
        model, train_loader, valid_loader, len(class_names), resume_checkpoint
    )

    # Evaluate on the test split.
    results = evaluate_model(trained_model, test_loader, class_names)

    return trained_model, results

# Banner printed when the cell runs, before any data is loaded or training starts.
print("체크포인트 저장 기능이 추가된 학습 시스템이 준비되었습니다!")
print(f"매 {Config.save_checkpoint_every} 에포크마다 자동으로 저장됩니다.")
print("런타임이 끊어져도 마지막 체크포인트에서 재개할 수 있습니다!")

# Class names. A name's position in this list is used as its integer label
# (see class_to_idx in the dataset class), so the order must not change
# between training and evaluation.
class_names = [
    '고정 불량_불량품', '고정 불량_양품', '고정핀 불량_불량품', '고정핀 불량_양품',
    '단차_불량품', '단차_양품', '스크래치_불량품', '스크래치_양품',
    '실링 불량_불량품', '실링 불량_양품', '연계 불량_불량품', '연계 불량_양품',
    '외관 손상_불량품', '외관 손상_양품', '유격 불량_불량품', '유격 불량_양품',
    '장착 불량_불량품', '장착 불량_양품', '체결 불량_불량품', '체결 불량_양품',
    '헤밍 불량_불량품', '헤밍 불량_양품', '홀 변형_불량품', '홀 변형_양품'
]

# Build the loaders; an entry is None when that split could not be loaded.
train_loader, valid_loader, test_loader = create_data_loaders_with_mapping(class_names)

# 7. Train the model (may prompt to resume from a checkpoint), then evaluate.
trained_model, results = run_training_with_checkpoints()

체크포인트 저장 기능이 추가된 학습 시스템이 준비되었습니다!
매 5 에포크마다 자동으로 저장됩니다.
런타임이 끊어져도 마지막 체크포인트에서 재개할 수 있습니다!
Train dataset: 15076 samples
Valid dataset: 3232 samples
Test dataset: 3245 samples
체크포인트 발견: /content/drive/MyDrive/의장공정/데이터/image_classification/checkpoints/latest_checkpoint_250802_1554.pth
체크포인트에서 재개하시겠습니까? (y/n): y
모델 생성 중... (클래스 수: 24)
Using device: cuda
체크포인트 로드: /content/drive/MyDrive/의장공정/데이터/image_classification/checkpoints/latest_checkpoint_250802_1554.pth
에포크 14부터 학습 재개
학습 재개... (에포크 15부터)

Epoch 16/50
--------------------------------------------------




Train Loss: 0.1749 Acc: 0.9388
Valid Loss: 0.8084 Acc: 0.7509
새로운 최고 성능! Valid Acc: 0.7509

Epoch 17/50
--------------------------------------------------




Train Loss: 0.1286 Acc: 0.9583
Valid Loss: 0.8225 Acc: 0.7559
새로운 최고 성능! Valid Acc: 0.7559

Epoch 18/50
--------------------------------------------------




Train Loss: 0.1059 Acc: 0.9645
Valid Loss: 0.8324 Acc: 0.7571
새로운 최고 성능! Valid Acc: 0.7571

Epoch 19/50
--------------------------------------------------




Train Loss: 0.0931 Acc: 0.9693
Valid Loss: 0.8670 Acc: 0.7624
새로운 최고 성능! Valid Acc: 0.7624

Epoch 20/50
--------------------------------------------------




Train Loss: 0.0838 Acc: 0.9724
Valid Loss: 0.9215 Acc: 0.7562
체크포인트 저장: /content/drive/MyDrive/의장공정/데이터/image_classification/checkpoints/checkpoint_epoch_19_250802_1554.pth
에포크 20 체크포인트 저장 완료

Epoch 21/50
--------------------------------------------------




Train Loss: 0.0810 Acc: 0.9730
Valid Loss: 0.9260 Acc: 0.7630
새로운 최고 성능! Valid Acc: 0.7630

Epoch 22/50
--------------------------------------------------




Train Loss: 0.0712 Acc: 0.9767
Valid Loss: 0.9237 Acc: 0.7633
새로운 최고 성능! Valid Acc: 0.7633

Epoch 23/50
--------------------------------------------------




Train Loss: 0.0653 Acc: 0.9797
Valid Loss: 0.9359 Acc: 0.7587

Epoch 24/50
--------------------------------------------------




Train Loss: 0.0626 Acc: 0.9806
Valid Loss: 0.9779 Acc: 0.7605

Epoch 25/50
--------------------------------------------------




Train Loss: 0.0558 Acc: 0.9828
Valid Loss: 0.9865 Acc: 0.7574
체크포인트 저장: /content/drive/MyDrive/의장공정/데이터/image_classification/checkpoints/checkpoint_epoch_24_250802_1554.pth
에포크 25 체크포인트 저장 완료

Epoch 26/50
--------------------------------------------------


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78ac33e3f9c0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78ac33e3f9c0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 16

KeyboardInterrupt: 

In [None]:
!ls -la /content/drive/MyDrive/의장공정/데이터/image_classification/checkpoints/

total 1106029
-rw------- 1 root root 283144042 Aug  3 09:50 checkpoint_epoch_14_250802_1554.pth
-rw------- 1 root root 283141930 Aug  2 22:00 checkpoint_epoch_4_250802_1554.pth
-rw------- 1 root root 283143018 Aug  3 03:50 checkpoint_epoch_9_250802_1554.pth
-rw------- 1 root root 283144042 Aug  3 09:50 latest_checkpoint_250802_1554.pth


In [None]:
# Inspect the latest checkpoint. weights_only=False is passed so that the whole
# pickled checkpoint dict can be loaded.
# NOTE(review): this unpickles arbitrary objects; only use it on files you trust.
checkpoint = torch.load('/content/drive/MyDrive/의장공정/데이터/image_classification/checkpoints/latest_checkpoint_250802_1554.pth', weights_only=False)
print(f"저장된 에포크: {checkpoint['epoch']}")
print(f"최고 검증 정확도: {checkpoint['best_valid_acc']}")
print(f"학습 기록 길이: {len(checkpoint['train_losses'])}")

저장된 에포크: 14
최고 검증 정확도: 0.7301980198019802
학습 기록 길이: 15


## 고도화 1: 과적합 방지 장치 추가


In [None]:
# 새 폴더로 완전히 처음부터 시작하는 코드

import warnings
import json
import os
import time
import copy
import datetime
import gc

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# PIL: lift the pixel-count limit and silence the matching decompression-bomb warning
Image.MAX_IMAGE_PIXELS = None
warnings.filterwarnings("ignore", category=Image.DecompressionBombWarning)

# Device report; on GPU, release cached CUDA memory before starting
if not torch.cuda.is_available():
    print("CPU 모드")
else:
    torch.cuda.empty_cache()
    gc.collect()
    print(f"GPU: {torch.cuda.get_device_name()}")

# 새로운 Config - 과적합 방지 강화
class ConfigV2:
    """Settings for the "v2" (anti-overfitting) experiment.

    Every value is a plain class attribute and is read as ``ConfigV2.<name>``.
    """

    # --- Run identifier: evaluated once, when this class body executes ---
    now = datetime.datetime.now()
    timestamp = now.strftime('%y%m%d_%H%M')

    # --- Locations on the mounted Google Drive ---
    work_dir = '/content/drive/MyDrive/의장공정/데이터/image_classification'
    dataset_dir = os.path.join(work_dir, 'dataset')
    # Output folders carry a "_v2" suffix to keep them apart from the earlier run
    model_dir = os.path.join(work_dir, 'models_v2')
    result_dir = os.path.join(work_dir, 'results_v2')
    checkpoint_dir = os.path.join(work_dir, 'checkpoints_v2')

    # --- Training schedule ---
    IMG_SIZE = (224, 224)
    batch_size = 12      # deliberately small
    num_workers = 0      # no DataLoader worker processes
    lr = 3e-5            # deliberately low learning rate
    num_epochs = 40

    # --- Regularisation ---
    weight_decay = 1e-4  # L2 penalty
    dropout_rate = 0.7

    # --- Checkpointing ---
    save_checkpoint_every = 3

    # --- Early stopping ---
    patience = 8         # epochs without improvement before stopping
    min_delta = 0.001    # smallest change that counts as an improvement


print(f"새 실험 시작 - 타임스탬프: {ConfigV2.timestamp}")
print("저장 폴더: models_v2, results_v2, checkpoints_v2")

# 개선된 Dataset 클래스
class ImageDatasetV2(Dataset):
    """Image dataset whose labels come from a JSON ``{file name: class name}`` mapping.

    A file is kept only when it sits directly in ``data_dir``, has an image
    extension, appears in the mapping, and its label is one of ``class_names``.

    Args:
        data_dir: Directory holding the image files.
        mapping_file_path: Path of the UTF-8 JSON mapping file.
        class_names: Ordered class names; a name's position is its label index.
        transform: Optional callable applied to the PIL image in ``__getitem__``.
        max_size: ``(width, height)`` bound used when shrinking very large images.
    """

    def __init__(self, data_dir, mapping_file_path, class_names, transform=None, max_size=(800, 800)):
        self.data_dir = data_dir
        self.class_names = class_names
        self.transform = transform
        self.max_size = max_size
        self.class_to_idx = {cls_name: idx for idx, cls_name in enumerate(class_names)}

        with open(mapping_file_path, 'r', encoding='utf-8') as f:
            self.filename_to_label = json.load(f)

        self.image_paths = []
        self.labels = []
        self._load_from_mapping()

    def _load_from_mapping(self):
        """Fill ``image_paths`` / ``labels`` with every usable file and print the count."""
        valid_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff')

        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # index -> sample assignment identical from one run to the next.
        for file_name in sorted(os.listdir(self.data_dir)):
            if not file_name.lower().endswith(valid_extensions):
                continue
            if file_name not in self.filename_to_label:
                continue

            label = self.filename_to_label[file_name]
            if label not in self.class_to_idx:
                continue

            self.image_paths.append(os.path.join(self.data_dir, file_name))
            self.labels.append(self.class_to_idx[label])

        print(f"로드된 이미지: {len(self.image_paths)}개")

    def __len__(self):
        """Return the number of usable images."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, label_index)`` for sample ``idx``.

        An image that cannot be read is replaced by a blank white 224x224 image
        so that one bad file does not abort a whole epoch.
        """
        image_path = self.image_paths[idx]

        try:
            # convert() returns an independent image, so the file can be closed right away.
            with Image.open(image_path) as raw:
                image = raw.convert('RGB')

            # Shrink images above 5 megapixels before the transform pipeline sees them.
            if image.size[0] * image.size[1] > 5000000:
                image.thumbnail(self.max_size, Image.Resampling.LANCZOS)

        except Exception as e:
            # Deliberate best-effort fallback, but report why the file failed.
            # NOTE(review): the placeholder keeps the file's real label, which adds
            # label noise - consider removing files reported here from the dataset.
            print(f"이미지 오류: {image_path} ({type(e).__name__}: {e})")
            image = Image.new('RGB', (224, 224), color='white')

        label = self.labels[idx]

        if self.transform:
            image = self.transform(image)

        return image, label

# 과적합 방지 강화 데이터 로더
def create_robust_loaders(class_names):
    """Build the train / valid / test DataLoaders.

    Each split is read from ``ConfigV2.dataset_dir/<split>`` with labels taken
    from ``/content/<split>_mapping.json``. The train split gets heavy
    augmentation, shuffling and ``drop_last``; valid and test are only resized
    and normalised.

    Args:
        class_names: Ordered class names passed on to ``ImageDatasetV2``.

    Returns:
        ``(train_loader, valid_loader, test_loader)``. An entry is ``None`` when
        its split was skipped (missing folder, missing mapping file, or no
        usable image); a warning is printed for every skipped split.
    """
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    # Heavy augmentation for training only
    train_transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.RandomCrop((224, 224), padding=32, padding_mode='reflect'),
        transforms.RandomHorizontalFlip(0.5),
        transforms.RandomVerticalFlip(0.3),
        transforms.RandomRotation(25),
        transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.25),
        transforms.RandomGrayscale(p=0.15),
        transforms.RandomPerspective(distortion_scale=0.3, p=0.4),
        transforms.RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(0.9, 1.1)),
        transforms.ToTensor(),
        normalize,
        transforms.RandomErasing(p=0.25, scale=(0.02, 0.33), ratio=(0.3, 3.3))
    ])

    valid_test_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        normalize
    ])

    loaders = {}

    for split in ['train', 'valid', 'test']:
        split_dir = os.path.join(ConfigV2.dataset_dir, split)
        mapping_file_path = f'/content/{split}_mapping.json'

        # Say why a split is skipped; otherwise the caller only finds out later
        # through an obscure failure on a None loader.
        if not os.path.exists(split_dir):
            print(f"⚠ {split}: 데이터 폴더를 찾을 수 없습니다 - {split_dir}")
            continue
        if not os.path.exists(mapping_file_path):
            print(f"⚠ {split}: 매핑 파일을 찾을 수 없습니다 - {mapping_file_path}")
            continue

        is_train = split == 'train'
        dataset = ImageDatasetV2(
            data_dir=split_dir,
            mapping_file_path=mapping_file_path,
            class_names=class_names,
            transform=train_transform if is_train else valid_test_transform
        )

        if len(dataset) == 0:
            print(f"⚠ {split}: 사용할 수 있는 이미지가 없어 건너뜁니다")
            continue

        loaders[split] = DataLoader(
            dataset,
            batch_size=ConfigV2.batch_size,
            shuffle=is_train,
            num_workers=ConfigV2.num_workers,
            pin_memory=torch.cuda.is_available(),
            drop_last=is_train
        )
        print(f"{split}: {len(dataset)} samples")

    return loaders.get('train'), loaders.get('valid'), loaders.get('test')

# 과적합 방지 모델 - 더 보수적
def create_anti_overfit_model(num_classes):
    """Build an ImageNet-pretrained ResNet-50 with a heavily regularised head.

    Only ``layer4`` and the new classifier head are trainable; every other
    backbone parameter is frozen.

    Args:
        num_classes: Number of output classes of the final linear layer.

    Returns:
        The configured ``torchvision`` ResNet-50 module.
    """
    print(f"과적합 방지 모델 생성 (클래스: {num_classes})")

    # `pretrained=True` is deprecated in torchvision; IMAGENET1K_V1 is the
    # weight set that flag used to select.
    model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)

    # Freeze everything except layer4 (the original fc is replaced just below).
    for name, param in model.named_parameters():
        param.requires_grad = 'layer4' in name or 'fc' in name

    # Classifier head: dropout -> 256 -> 128 -> num_classes
    model.fc = nn.Sequential(
        nn.Dropout(ConfigV2.dropout_rate),
        nn.Linear(model.fc.in_features, 256),
        nn.BatchNorm1d(256),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(256, 128),
        nn.BatchNorm1d(128),
        nn.ReLU(),
        nn.Dropout(0.3),
        nn.Linear(128, num_classes)
    )

    return model

# Early Stopping 클래스
class EarlyStopping:
    """Signal when the validation loss has stopped improving.

    Call the instance once per epoch with the validation loss and the model.
    It returns ``True`` once ``patience`` calls in a row have failed to beat
    the best loss by more than ``min_delta``; with ``restore_best_weights`` the
    model is then reset to the weights recorded at the best loss.
    """

    def __init__(self, patience=7, min_delta=0, restore_best_weights=True):
        self.patience, self.min_delta = patience, min_delta
        self.restore_best_weights = restore_best_weights
        self.best_loss = None      # lowest loss seen so far
        self.best_weights = None   # state_dict copy taken at best_loss
        self.counter = 0           # calls since the last improvement

    def __call__(self, val_loss, model):
        if self.best_loss is None:
            # The very first call only establishes the baseline.
            improved = True
        else:
            improved = val_loss < self.best_loss - self.min_delta
            self.counter = 0 if improved else self.counter + 1

        if improved:
            self.best_loss = val_loss
            self.save_checkpoint(model)

        if self.counter < self.patience:
            return False

        if self.restore_best_weights:
            model.load_state_dict(self.best_weights)
        return True

    def save_checkpoint(self, model):
        """Keep an independent in-memory copy of the model's current weights."""
        snapshot = model.state_dict()
        self.best_weights = copy.deepcopy(snapshot)

# 체크포인트 저장 함수
def save_checkpoint_v2(model, optimizer, scheduler, epoch, train_losses, train_accs,
                      valid_losses, valid_accs, best_valid_acc):
    """Write a resumable training checkpoint for the current run.

    The same payload is stored twice in ``ConfigV2.checkpoint_dir``: as
    ``checkpoint_epoch_<epoch>_<timestamp>.pth`` and as the rolling
    ``latest_checkpoint_<timestamp>.pth``.

    Args:
        model, optimizer, scheduler: Objects whose ``state_dict()`` is stored.
        epoch: Zero-based index of the epoch that just finished; used as-is in
            the file name and in the payload.
        train_losses, train_accs, valid_losses, valid_accs: Per-epoch history.
        best_valid_acc: Best validation accuracy so far.

    Returns:
        Path of the per-epoch checkpoint file.
    """
    os.makedirs(ConfigV2.checkpoint_dir, exist_ok=True)

    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'train_losses': train_losses,
        # Accuracies may arrive as tensors / 0-d arrays; store plain floats so
        # the history does not depend on the device it was recorded on.
        'train_accs': [float(acc) for acc in train_accs],
        'valid_losses': valid_losses,
        'valid_accs': [float(acc) for acc in valid_accs],
        'best_valid_acc': float(best_valid_acc),
        'config': {
            'lr': ConfigV2.lr,
            'batch_size': ConfigV2.batch_size,
            'dropout_rate': ConfigV2.dropout_rate,
            'weight_decay': ConfigV2.weight_decay
        },
        'timestamp': ConfigV2.timestamp
    }

    checkpoint_path = os.path.join(ConfigV2.checkpoint_dir, f'checkpoint_epoch_{epoch}_{ConfigV2.timestamp}.pth')
    latest_path = os.path.join(ConfigV2.checkpoint_dir, f'latest_checkpoint_{ConfigV2.timestamp}.pth')

    for target_path in (checkpoint_path, latest_path):
        # Write to a temp file first and move it into place, so an interrupted
        # save never leaves a truncated checkpoint under the real name. The
        # ".tmp_" prefix keeps leftovers from matching "checkpoint_epoch_*".
        tmp_path = os.path.join(ConfigV2.checkpoint_dir, '.tmp_' + os.path.basename(target_path))
        torch.save(checkpoint, tmp_path)
        os.replace(tmp_path, target_path)

    print(f'✓ 체크포인트 저장: epoch {epoch+1}')
    return checkpoint_path

# 개선된 학습 함수
def _run_one_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)`` as floats.

    The model is trained when ``optimizer`` is given and only evaluated
    (gradients disabled) otherwise. Both metrics are averaged over the samples
    actually processed, so a loader built with ``drop_last=True`` is not
    under-counted.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    correct = 0
    seen = 0

    with torch.set_grad_enabled(training):
        for batch_idx, (inputs, labels) in enumerate(loader):
            inputs = inputs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            if training:
                optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                # Clip the gradient norm before the update
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

            batch_len = inputs.size(0)
            loss_sum += loss.item() * batch_len
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            seen += batch_len

            # Periodically release cached CUDA memory during training
            if training and batch_idx % 50 == 0 and torch.cuda.is_available():
                torch.cuda.empty_cache()

    if seen == 0:
        raise ValueError("데이터 로더에서 배치를 하나도 읽지 못했습니다")
    return loss_sum / seen, correct / seen


def train_anti_overfit_model(model, train_loader, valid_loader, num_classes):
    """Train ``model`` from scratch with the anti-overfitting recipe.

    Uses label-smoothed cross entropy, AdamW with weight decay, gradient
    clipping, ReduceLROnPlateau on the validation loss and early stopping.
    A checkpoint is written every ``ConfigV2.save_checkpoint_every`` epochs.

    When ``valid_loader`` is ``None`` only the training pass runs: nothing is
    validated, recorded or checkpointed.

    Args:
        model: Network to train; it is moved to GPU when one is available.
        train_loader: Training DataLoader (required).
        valid_loader: Validation DataLoader, or ``None``.
        num_classes: Unused; kept for call compatibility.

    Returns:
        ``(model, train_losses, train_accs, valid_losses, valid_accs)`` with
        one float per validated epoch in each list. The returned (and saved)
        model carries the weights of the epoch with the lowest validation loss.

    Raises:
        ValueError: If ``train_loader`` is ``None`` or a loader yields no batch.
    """
    if train_loader is None:
        raise ValueError("train_loader가 없습니다. train 폴더와 매핑 파일 경로를 확인하세요.")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = optim.AdamW(model.parameters(), lr=ConfigV2.lr, weight_decay=ConfigV2.weight_decay)
    # `verbose` is deprecated in recent PyTorch; the learning rate is printed every epoch below.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.5)

    early_stopping = EarlyStopping(patience=ConfigV2.patience, min_delta=ConfigV2.min_delta)

    # History
    train_losses, train_accs = [], []
    valid_losses, valid_accs = [], []
    best_valid_acc = 0.0

    print(f"=== 과적합 방지 학습 시작 ===")
    print(f"학습률: {ConfigV2.lr}, 배치크기: {ConfigV2.batch_size}")
    print(f"Weight Decay: {ConfigV2.weight_decay}, Dropout: {ConfigV2.dropout_rate}")

    start_time = time.time()

    for epoch in range(ConfigV2.num_epochs):
        epoch_start = time.time()
        print(f'\nEpoch {epoch+1}/{ConfigV2.num_epochs}')
        print('-' * 60)

        epoch_train_loss, epoch_train_acc = _run_one_epoch(
            model, train_loader, criterion, device, optimizer)

        if valid_loader is None:
            continue

        epoch_valid_loss, epoch_valid_acc = _run_one_epoch(
            model, valid_loader, criterion, device)

        scheduler.step(epoch_valid_loss)

        epoch_time = time.time() - epoch_start
        overfitting = epoch_train_acc - epoch_valid_acc

        print(f'Train Loss: {epoch_train_loss:.4f} Acc: {epoch_train_acc:.4f}')
        print(f'Valid Loss: {epoch_valid_loss:.4f} Acc: {epoch_valid_acc:.4f}')
        print(f'과적합 정도: {overfitting:.4f} | 시간: {epoch_time:.1f}초')
        print(f'현재 학습률: {optimizer.param_groups[0]["lr"]:.2e}')

        train_losses.append(epoch_train_loss)
        train_accs.append(epoch_train_acc)
        valid_losses.append(epoch_valid_loss)
        valid_accs.append(epoch_valid_acc)

        if epoch_valid_acc > best_valid_acc:
            best_valid_acc = epoch_valid_acc
            print(f'★ 새로운 최고 성능! Valid Acc: {best_valid_acc:.4f}')

        if early_stopping(epoch_valid_loss, model):
            print(f"\n⏹ Early stopping at epoch {epoch+1}")
            break

        if (epoch + 1) % ConfigV2.save_checkpoint_every == 0:
            save_checkpoint_v2(model, optimizer, scheduler, epoch,
                              train_losses, train_accs, valid_losses, valid_accs, best_valid_acc)

            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                gc.collect()

    # The file is named "best_model", so save the weights with the lowest
    # validation loss on every exit path, not only when early stopping fired.
    if early_stopping.best_weights is not None:
        model.load_state_dict(early_stopping.best_weights)

    os.makedirs(ConfigV2.model_dir, exist_ok=True)
    model_path = os.path.join(ConfigV2.model_dir, f'best_model_{ConfigV2.timestamp}.pth')
    torch.save(model.state_dict(), model_path)

    training_time = time.time() - start_time
    print(f'\n=== 학습 완료 ===')
    print(f'최고 검증 정확도: {best_valid_acc:.4f}')
    print(f'학습 시간: {training_time/60:.1f}분')
    print(f'모델 저장: {model_path}')

    return model, train_losses, train_accs, valid_losses, valid_accs

# 클래스 이름
# Twelve defect types, each split into a defective ("불량품") and a good ("양품") class
_DEFECT_TYPES = (
    '고정 불량', '고정핀 불량', '단차', '스크래치', '실링 불량', '연계 불량',
    '외관 손상', '유격 불량', '장착 불량', '체결 불량', '헤밍 불량', '홀 변형',
)
class_names = [
    f'{defect}_{status}'
    for defect in _DEFECT_TYPES
    for status in ('불량품', '양품')
]

# Banner: next steps and a summary of what changed in this experiment
print("\n".join([
    "=== 새 실험 준비 완료 ===",
    "다음 단계:",
    "1. train_loader, valid_loader, test_loader = create_robust_loaders(class_names)",
    "2. model = create_anti_overfit_model(len(class_names))",
    "3. trained_model, losses, accs, val_losses, val_accs = train_anti_overfit_model(model, train_loader, valid_loader, len(class_names))",
    "",
    "주요 개선사항:",
    "- 새 폴더로 완전히 새 시작",
    "- 매우 강한 정규화 (Dropout 0.7, Weight Decay, Label Smoothing)",
    "- 강화된 데이터 증강",
    "- Early Stopping",
    "- 대부분 레이어 고정 (layer4만 학습)",
    "- 작은 배치 크기 (12)",
    "- 낮은 학습률 (3e-5)",
]))

# Build the loaders and the model, then train; only the trained model is kept
train_loader, valid_loader, test_loader = create_robust_loaders(class_names)
model = create_anti_overfit_model(len(class_names))
trained_model, *_ = train_anti_overfit_model(
    model, train_loader, valid_loader, len(class_names)
)

GPU: Tesla T4
새 실험 시작 - 타임스탬프: 250804_2305
저장 폴더: models_v2, results_v2, checkpoints_v2
=== 새 실험 준비 완료 ===
다음 단계:
1. train_loader, valid_loader, test_loader = create_robust_loaders(class_names)
2. model = create_anti_overfit_model(len(class_names))
3. trained_model, losses, accs, val_losses, val_accs = train_anti_overfit_model(model, train_loader, valid_loader, len(class_names))

주요 개선사항:
- 새 폴더로 완전히 새 시작
- 매우 강한 정규화 (Dropout 0.7, Weight Decay, Label Smoothing)
- 강화된 데이터 증강
- Early Stopping
- 대부분 레이어 고정 (layer4만 학습)
- 작은 배치 크기 (12)
- 낮은 학습률 (3e-5)
로드된 이미지: 15076개
train: 15076 samples
로드된 이미지: 3232개
valid: 3232 samples
로드된 이미지: 3245개
test: 3245 samples
과적합 방지 모델 생성 (클래스: 24)




=== 과적합 방지 학습 시작 ===
학습률: 3e-05, 배치크기: 12
Weight Decay: 0.0001, Dropout: 0.7

Epoch 1/40
------------------------------------------------------------
Train Loss: 2.9595 Acc: 0.1482
Valid Loss: 2.3569 Acc: 0.3159
과적합 정도: -0.1677 | 시간: 16878.1초
현재 학습률: 3.00e-05
★ 새로운 최고 성능! Valid Acc: 0.3159

Epoch 2/40
------------------------------------------------------------
Train Loss: 2.4502 Acc: 0.2885
Valid Loss: 1.9853 Acc: 0.3957
과적합 정도: -0.1073 | 시간: 10862.7초
현재 학습률: 3.00e-05
★ 새로운 최고 성능! Valid Acc: 0.3957

Epoch 3/40
------------------------------------------------------------
이미지 오류: /content/drive/MyDrive/의장공정/데이터/image_classification/dataset/train/204_101_10_4a61c180-d8ef-4311-923e-0ebea02a1368.jpg
Train Loss: 2.2265 Acc: 0.3408
Valid Loss: 1.8157 Acc: 0.4542
과적합 정도: -0.1134 | 시간: 11747.6초
현재 학습률: 3.00e-05
★ 새로운 최고 성능! Valid Acc: 0.4542
✓ 체크포인트 저장: epoch 3

Epoch 4/40
------------------------------------------------------------


## 고도화 2) 과적합 방지 장치 + 좀 더 좁은 간격으로 중간 모델 저장
- 3 에포크마다 체크포인트 저장
- `checkpoints_v2` 폴더에 체크포인트를 저장하고, 해당 폴더에 저장된 최신 체크포인트를 불러와 학습을 이어서 진행하는 코드
- `의장공정/데이터/image_classification/` 위치에 3개의 `_mapping.json` 파일(train/valid/test)을 업로드해 둠
- 주의: 구글 드라이브 마운트 코드를 먼저 실행하고, 위 json 파일 3개를 코랩 로컬(`/content/`)에 업로드한 뒤 아래의 훈련 셀을 실행해야 함

In [None]:
import warnings
import json
import os
import time
import copy
import datetime
import gc

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# PIL: lift the pixel-count limit and silence the matching decompression-bomb warning
Image.MAX_IMAGE_PIXELS = None
warnings.filterwarnings("ignore", category=Image.DecompressionBombWarning)

# Device report; on GPU, release cached CUDA memory before starting
if not torch.cuda.is_available():
    print("CPU 모드")
else:
    torch.cuda.empty_cache()
    gc.collect()
    print(f"GPU: {torch.cuda.get_device_name()}")


# 개선된 Dataset 클래스
class ImageDatasetV2(Dataset):
    """Image dataset whose labels come from a JSON ``{file name: class name}`` mapping.

    A file is kept only when it sits directly in ``data_dir``, has an image
    extension, appears in the mapping, and its label is one of ``class_names``.

    Args:
        data_dir: Directory holding the image files.
        mapping_file_path: Path of the UTF-8 JSON mapping file.
        class_names: Ordered class names; a name's position is its label index.
        transform: Optional callable applied to the PIL image in ``__getitem__``.
        max_size: ``(width, height)`` bound used when shrinking very large images.
    """

    def __init__(self, data_dir, mapping_file_path, class_names, transform=None, max_size=(800, 800)):
        self.data_dir = data_dir
        self.class_names = class_names
        self.transform = transform
        self.max_size = max_size
        self.class_to_idx = {cls_name: idx for idx, cls_name in enumerate(class_names)}

        with open(mapping_file_path, 'r', encoding='utf-8') as f:
            self.filename_to_label = json.load(f)

        self.image_paths = []
        self.labels = []
        self._load_from_mapping()

    def _load_from_mapping(self):
        """Fill ``image_paths`` / ``labels`` with every usable file and print the count."""
        valid_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff')

        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # index -> sample assignment identical from one run to the next.
        for file_name in sorted(os.listdir(self.data_dir)):
            if not file_name.lower().endswith(valid_extensions):
                continue
            if file_name not in self.filename_to_label:
                continue

            label = self.filename_to_label[file_name]
            if label not in self.class_to_idx:
                continue

            self.image_paths.append(os.path.join(self.data_dir, file_name))
            self.labels.append(self.class_to_idx[label])

        print(f"로드된 이미지: {len(self.image_paths)}개")

    def __len__(self):
        """Return the number of usable images."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, label_index)`` for sample ``idx``.

        An image that cannot be read is replaced by a blank white 224x224 image
        so that one bad file does not abort a whole epoch.
        """
        image_path = self.image_paths[idx]

        try:
            # convert() returns an independent image, so the file can be closed right away.
            with Image.open(image_path) as raw:
                image = raw.convert('RGB')

            # Shrink images above 5 megapixels before the transform pipeline sees them.
            if image.size[0] * image.size[1] > 5000000:
                image.thumbnail(self.max_size, Image.Resampling.LANCZOS)

        except Exception as e:
            # Deliberate best-effort fallback, but report why the file failed.
            # NOTE(review): the placeholder keeps the file's real label, which adds
            # label noise - consider removing files reported here from the dataset.
            print(f"이미지 오류: {image_path} ({type(e).__name__}: {e})")
            image = Image.new('RGB', (224, 224), color='white')

        label = self.labels[idx]

        if self.transform:
            image = self.transform(image)

        return image, label

# 과적합 방지 강화 데이터 로더
def create_robust_loaders(class_names):
    """Build the train / valid / test DataLoaders.

    Each split is read from ``ConfigV2.dataset_dir/<split>`` with labels taken
    from ``/content/<split>_mapping.json``. The train split gets heavy
    augmentation, shuffling and ``drop_last``; valid and test are only resized
    and normalised.

    Args:
        class_names: Ordered class names passed on to ``ImageDatasetV2``.

    Returns:
        ``(train_loader, valid_loader, test_loader)``. An entry is ``None`` when
        its split was skipped (missing folder, missing mapping file, or no
        usable image); a warning is printed for every skipped split.
    """
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    # Heavy augmentation for training only
    train_transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.RandomCrop((224, 224), padding=32, padding_mode='reflect'),
        transforms.RandomHorizontalFlip(0.5),
        transforms.RandomVerticalFlip(0.3),
        transforms.RandomRotation(25),
        transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.25),
        transforms.RandomGrayscale(p=0.15),
        transforms.RandomPerspective(distortion_scale=0.3, p=0.4),
        transforms.RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(0.9, 1.1)),
        transforms.ToTensor(),
        normalize,
        transforms.RandomErasing(p=0.25, scale=(0.02, 0.33), ratio=(0.3, 3.3))
    ])

    valid_test_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        normalize
    ])

    loaders = {}

    for split in ['train', 'valid', 'test']:
        split_dir = os.path.join(ConfigV2.dataset_dir, split)
        mapping_file_path = f'/content/{split}_mapping.json'

        # Say why a split is skipped; otherwise the caller only finds out later
        # through an obscure failure on a None loader.
        if not os.path.exists(split_dir):
            print(f"⚠ {split}: 데이터 폴더를 찾을 수 없습니다 - {split_dir}")
            continue
        if not os.path.exists(mapping_file_path):
            print(f"⚠ {split}: 매핑 파일을 찾을 수 없습니다 - {mapping_file_path}")
            continue

        is_train = split == 'train'
        dataset = ImageDatasetV2(
            data_dir=split_dir,
            mapping_file_path=mapping_file_path,
            class_names=class_names,
            transform=train_transform if is_train else valid_test_transform
        )

        if len(dataset) == 0:
            print(f"⚠ {split}: 사용할 수 있는 이미지가 없어 건너뜁니다")
            continue

        loaders[split] = DataLoader(
            dataset,
            batch_size=ConfigV2.batch_size,
            shuffle=is_train,
            num_workers=ConfigV2.num_workers,
            pin_memory=torch.cuda.is_available(),
            drop_last=is_train
        )
        print(f"{split}: {len(dataset)} samples")

    return loaders.get('train'), loaders.get('valid'), loaders.get('test')

# 과적합 방지 모델 - 더 보수적
def create_anti_overfit_model(num_classes):
    """Build an ImageNet-pretrained ResNet-50 with a heavily regularised head.

    Only ``layer4`` and the new classifier head are trainable; every other
    backbone parameter is frozen.

    Args:
        num_classes: Number of output classes of the final linear layer.

    Returns:
        The configured ``torchvision`` ResNet-50 module.
    """
    print(f"과적합 방지 모델 생성 (클래스: {num_classes})")

    # `pretrained=True` is deprecated in torchvision; IMAGENET1K_V1 is the
    # weight set that flag used to select.
    model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)

    # Freeze everything except layer4 (the original fc is replaced just below).
    for name, param in model.named_parameters():
        param.requires_grad = 'layer4' in name or 'fc' in name

    # Classifier head: dropout -> 256 -> 128 -> num_classes
    model.fc = nn.Sequential(
        nn.Dropout(ConfigV2.dropout_rate),
        nn.Linear(model.fc.in_features, 256),
        nn.BatchNorm1d(256),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(256, 128),
        nn.BatchNorm1d(128),
        nn.ReLU(),
        nn.Dropout(0.3),
        nn.Linear(128, num_classes)
    )

    return model

# Early Stopping 클래스
class EarlyStopping:
    """Signal when the validation loss has stopped improving.

    Call the instance once per epoch with the validation loss and the model.
    It returns ``True`` once ``patience`` calls in a row have failed to beat
    the best loss by more than ``min_delta``; with ``restore_best_weights`` the
    model is then reset to the weights recorded at the best loss.
    """

    def __init__(self, patience=7, min_delta=0, restore_best_weights=True):
        self.patience, self.min_delta = patience, min_delta
        self.restore_best_weights = restore_best_weights
        self.best_loss = None      # lowest loss seen so far
        self.best_weights = None   # state_dict copy taken at best_loss
        self.counter = 0           # calls since the last improvement

    def __call__(self, val_loss, model):
        if self.best_loss is None:
            # The very first call only establishes the baseline.
            improved = True
        else:
            improved = val_loss < self.best_loss - self.min_delta
            self.counter = 0 if improved else self.counter + 1

        if improved:
            self.best_loss = val_loss
            self.save_checkpoint(model)

        if self.counter < self.patience:
            return False

        if self.restore_best_weights:
            model.load_state_dict(self.best_weights)
        return True

    def save_checkpoint(self, model):
        """Keep an independent in-memory copy of the model's current weights."""
        snapshot = model.state_dict()
        self.best_weights = copy.deepcopy(snapshot)

# 타임스탬프 고정 (이전 실험과 동일하게)
class ConfigV2:
    """Settings for resuming the "v2" (anti-overfitting) experiment.

    Every value is a plain class attribute and is read as ``ConfigV2.<name>``.
    """

    # --- Run identifier: pinned to the earlier run so its checkpoints are found ---
    timestamp = '250804_2305'

    # --- Locations on the mounted Google Drive ---
    work_dir = '/content/drive/MyDrive/의장공정/데이터/image_classification'
    dataset_dir = os.path.join(work_dir, 'dataset')
    # Output folders carry a "_v2" suffix to keep them apart from the earlier run
    model_dir = os.path.join(work_dir, 'models_v2')
    result_dir = os.path.join(work_dir, 'results_v2')
    checkpoint_dir = os.path.join(work_dir, 'checkpoints_v2')

    # --- Training schedule ---
    IMG_SIZE = (224, 224)
    batch_size = 12      # deliberately small
    num_workers = 0      # no DataLoader worker processes
    lr = 3e-5            # deliberately low learning rate
    num_epochs = 40

    # --- Regularisation ---
    weight_decay = 1e-4  # L2 penalty
    dropout_rate = 0.7

    # --- Checkpointing ---
    save_checkpoint_every = 3

    # --- Early stopping ---
    patience = 8         # epochs without improvement before stopping
    min_delta = 0.001    # smallest change that counts as an improvement


print(f"이전 실험 재개 - 타임스탬프: {ConfigV2.timestamp}")
print("저장 폴더: models_v2, results_v2, checkpoints_v2")

# 체크포인트 찾기 및 로드 함수들
def find_latest_checkpoint_v2():
    """Locate the checkpoint to resume from for the run ``ConfigV2.timestamp``.

    The rolling ``latest_checkpoint_<timestamp>.pth`` wins when it exists.
    Otherwise the ``checkpoint_epoch_<n>_...`` file with the highest epoch
    number is chosen.

    Returns:
        Path of the chosen checkpoint, or ``None`` when there is none.
    """
    ckpt_dir, run_id = ConfigV2.checkpoint_dir, ConfigV2.timestamp

    print(f"체크포인트 디렉토리 확인: {ckpt_dir}")

    # 1) Rolling "latest" file
    rolling_path = os.path.join(ckpt_dir, f'latest_checkpoint_{run_id}.pth')
    if os.path.exists(rolling_path):
        print(f"✓ 최신 체크포인트 발견: {rolling_path}")
        return rolling_path

    # 2) Per-epoch files of this run
    found = []
    entries = os.listdir(ckpt_dir) if os.path.exists(ckpt_dir) else []
    for entry in entries:
        if not (entry.startswith('checkpoint_epoch_') and run_id in entry):
            continue
        try:
            # The third "_"-separated field of the file name is the epoch number
            epoch_num = int(entry.split('_')[2])
        except (IndexError, ValueError):
            continue
        found.append((epoch_num, os.path.join(ckpt_dir, entry)))
        print(f"  발견된 체크포인트: {entry} (에포크 {epoch_num})")

    if not found:
        print("✗ 체크포인트를 찾을 수 없습니다.")
        return None

    newest_epoch, newest_path = max(found)
    print(f"✓ 가장 최근 체크포인트: {newest_path} (에포크 {newest_epoch})")
    return newest_path

def load_checkpoint_v2(checkpoint_path, model, optimizer, scheduler):
    """Restore model, optimizer and scheduler state from a checkpoint file.

    The restore is all-or-nothing: if anything fails, ``model``, ``optimizer``
    and ``scheduler`` are put back into the state they had before the call, so
    a caller that falls back to training from scratch does not start from a
    half-loaded model.

    Args:
        checkpoint_path: Path of a checkpoint written by ``save_checkpoint_v2``.
        model, optimizer, scheduler: Objects that receive the stored state.

    Returns:
        ``(start_epoch, train_losses, train_accs, valid_losses, valid_accs,
        best_valid_acc)`` where ``start_epoch`` is the stored epoch plus one and
        ``best_valid_acc`` is a float, or ``None`` when loading failed.
    """
    print(f"체크포인트 로드 중: {checkpoint_path}")

    # Snapshot the current state so a failure half-way through can be undone.
    rollback = (
        copy.deepcopy(model.state_dict()),
        copy.deepcopy(optimizer.state_dict()),
        copy.deepcopy(scheduler.state_dict()),
    )

    try:
        # NOTE(review): weights_only=False unpickles arbitrary objects; only load
        # checkpoint files produced by this notebook.
        checkpoint = torch.load(checkpoint_path, weights_only=False, map_location='cpu')

        # Read the bookkeeping first so a missing key fails before any state is touched.
        start_epoch = checkpoint['epoch'] + 1  # resume with the following epoch
        train_losses = checkpoint['train_losses']
        train_accs = checkpoint['train_accs']
        valid_losses = checkpoint['valid_losses']
        valid_accs = checkpoint['valid_accs']
        # May have been stored as a tensor; hand back a plain float.
        best_valid_acc = float(checkpoint['best_valid_acc'])

        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    except Exception as e:
        model.load_state_dict(rollback[0])
        optimizer.load_state_dict(rollback[1])
        scheduler.load_state_dict(rollback[2])
        print(f"✗ 체크포인트 로드 실패: {e}")
        return None

    print(f"✓ 체크포인트 로드 완료!")
    print(f"  - 재개 에포크: {start_epoch}")
    print(f"  - 최고 검증 정확도: {best_valid_acc:.4f}")
    print(f"  - 기존 학습 기록: {len(train_losses)}개 에포크")

    return start_epoch, train_losses, train_accs, valid_losses, valid_accs, best_valid_acc

# 수정된 학습 함수 (체크포인트 재개 기능 추가)
def _run_one_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)`` as floats.

    The model is trained when ``optimizer`` is given and only evaluated
    (gradients disabled) otherwise. Both metrics are averaged over the samples
    actually processed, so a loader built with ``drop_last=True`` is not
    under-counted.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    correct = 0
    seen = 0

    with torch.set_grad_enabled(training):
        for batch_idx, (inputs, labels) in enumerate(loader):
            inputs = inputs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            if training:
                optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                # Clip the gradient norm before the update
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

            batch_len = inputs.size(0)
            loss_sum += loss.item() * batch_len
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            seen += batch_len

            # Periodically release cached CUDA memory during training
            if training and batch_idx % 50 == 0 and torch.cuda.is_available():
                torch.cuda.empty_cache()

    if seen == 0:
        raise ValueError("데이터 로더에서 배치를 하나도 읽지 못했습니다")
    return loss_sum / seen, correct / seen


def train_anti_overfit_model_resume(model, train_loader, valid_loader, num_classes):
    """Train ``model``, resuming from the newest checkpoint of this run if one exists.

    The recipe is label-smoothed cross entropy, AdamW with weight decay,
    gradient clipping, ReduceLROnPlateau on the validation loss and early
    stopping. A checkpoint is written every ``ConfigV2.save_checkpoint_every``
    epochs. When no checkpoint is found, or it cannot be loaded, training
    starts at epoch 0.

    The early-stopping tracker is created fresh on every call, so its best
    loss and patience counter start over after a resume.

    When ``valid_loader`` is ``None`` only the training pass runs: nothing is
    validated, recorded or checkpointed.

    Args:
        model: Network to train; it is moved to GPU when one is available.
        train_loader: Training DataLoader (required).
        valid_loader: Validation DataLoader, or ``None``.
        num_classes: Unused; kept for call compatibility.

    Returns:
        ``(model, train_losses, train_accs, valid_losses, valid_accs)``; the
        history lists include the epochs restored from the checkpoint. When at
        least one epoch was validated in this call, the returned (and saved)
        model carries the weights of the lowest validation loss seen in it.

    Raises:
        ValueError: If ``train_loader`` is ``None`` or a loader yields no batch.
    """
    if train_loader is None:
        raise ValueError("train_loader가 없습니다. train 폴더와 매핑 파일 경로를 확인하세요.")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = optim.AdamW(model.parameters(), lr=ConfigV2.lr, weight_decay=ConfigV2.weight_decay)
    # `verbose` is deprecated in recent PyTorch; the learning rate is printed every epoch below.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.5)

    # Defaults for a fresh start; replaced below when a checkpoint loads.
    start_epoch = 0
    train_losses, train_accs = [], []
    valid_losses, valid_accs = [], []
    best_valid_acc = 0.0

    latest_checkpoint = find_latest_checkpoint_v2()

    if latest_checkpoint:
        resume_data = load_checkpoint_v2(latest_checkpoint, model, optimizer, scheduler)
        if resume_data:
            start_epoch, train_losses, train_accs, valid_losses, valid_accs, best_valid_acc = resume_data
            # Older checkpoints may hold the accuracy as a tensor.
            best_valid_acc = float(best_valid_acc)
            print(f"\n🔄 에포크 {start_epoch}부터 학습 재개!")
        else:
            print("\n❌ 체크포인트 로드 실패 - 처음부터 시작")
    else:
        print("\n🆕 체크포인트 없음 - 처음부터 시작")

    early_stopping = EarlyStopping(patience=ConfigV2.patience, min_delta=ConfigV2.min_delta)

    print(f"\n=== 학습 시작/재개 ===")
    print(f"시작 에포크: {start_epoch + 1}/{ConfigV2.num_epochs}")
    print(f"학습률: {ConfigV2.lr}, 배치크기: {ConfigV2.batch_size}")
    print(f"Weight Decay: {ConfigV2.weight_decay}, Dropout: {ConfigV2.dropout_rate}")

    start_time = time.time()

    for epoch in range(start_epoch, ConfigV2.num_epochs):
        epoch_start = time.time()
        print(f'\nEpoch {epoch+1}/{ConfigV2.num_epochs}')
        print('-' * 60)

        epoch_train_loss, epoch_train_acc = _run_one_epoch(
            model, train_loader, criterion, device, optimizer)

        if valid_loader is None:
            continue

        epoch_valid_loss, epoch_valid_acc = _run_one_epoch(
            model, valid_loader, criterion, device)

        scheduler.step(epoch_valid_loss)

        epoch_time = time.time() - epoch_start
        overfitting = epoch_train_acc - epoch_valid_acc

        print(f'Train Loss: {epoch_train_loss:.4f} Acc: {epoch_train_acc:.4f}')
        print(f'Valid Loss: {epoch_valid_loss:.4f} Acc: {epoch_valid_acc:.4f}')
        print(f'과적합 정도: {overfitting:.4f} | 시간: {epoch_time:.1f}초')
        print(f'현재 학습률: {optimizer.param_groups[0]["lr"]:.2e}')

        train_losses.append(epoch_train_loss)
        train_accs.append(epoch_train_acc)
        valid_losses.append(epoch_valid_loss)
        valid_accs.append(epoch_valid_acc)

        if epoch_valid_acc > best_valid_acc:
            best_valid_acc = epoch_valid_acc
            print(f'★ 새로운 최고 성능! Valid Acc: {best_valid_acc:.4f}')

        if early_stopping(epoch_valid_loss, model):
            print(f"\n⏹ Early stopping at epoch {epoch+1}")
            break

        if (epoch + 1) % ConfigV2.save_checkpoint_every == 0:
            save_checkpoint_v2(model, optimizer, scheduler, epoch,
                              train_losses, train_accs, valid_losses, valid_accs, best_valid_acc)

            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                gc.collect()

    # The file is named "best_model", so save the weights with the lowest
    # validation loss on every exit path, not only when early stopping fired.
    if early_stopping.best_weights is not None:
        model.load_state_dict(early_stopping.best_weights)

    os.makedirs(ConfigV2.model_dir, exist_ok=True)
    model_path = os.path.join(ConfigV2.model_dir, f'best_model_{ConfigV2.timestamp}.pth')
    torch.save(model.state_dict(), model_path)

    training_time = time.time() - start_time
    print(f'\n=== 학습 완료 ===')
    print(f'최고 검증 정확도: {best_valid_acc:.4f}')
    print(f'학습 시간: {training_time/60:.1f}분')
    print(f'모델 저장: {model_path}')

    return model, train_losses, train_accs, valid_losses, valid_accs

def save_checkpoint_v2(model, optimizer, scheduler, epoch, train_losses, train_accs,
                      valid_losses, valid_accs, best_valid_acc):
    """Write the current V2 training state to ``ConfigV2.checkpoint_dir``.

    The checkpoint directory is created if it does not exist. The same
    payload is then written with ``torch.save`` to two files: one whose name
    contains ``epoch`` and ``ConfigV2.timestamp``, and one named
    ``latest_checkpoint_<timestamp>.pth`` that is overwritten on every call
    made with the same timestamp.

    Args:
        model: Object whose ``state_dict()`` is stored under
            ``'model_state_dict'``.
        optimizer: Object whose ``state_dict()`` is stored under
            ``'optimizer_state_dict'``.
        scheduler: Object whose ``state_dict()`` is stored under
            ``'scheduler_state_dict'``.
        epoch: Epoch value stored as-is and embedded as-is in the per-epoch
            file name. The console message prints ``epoch + 1``, so this is
            presumably a zero-based index (confirm against the caller).
        train_losses: Training loss history, stored as passed.
        train_accs: Training accuracy history, stored as passed.
        valid_losses: Validation loss history, stored as passed.
        valid_accs: Validation accuracy history, stored as passed.
        best_valid_acc: Best validation accuracy so far, stored as passed.

    Returns:
        Path of the per-epoch checkpoint file.
    """
    target_dir = ConfigV2.checkpoint_dir
    os.makedirs(target_dir, exist_ok=True)

    # Snapshot of the hyperparameters read from ConfigV2 at save time.
    hyperparams = {
        field: getattr(ConfigV2, field)
        for field in ('lr', 'batch_size', 'dropout_rate', 'weight_decay')
    }
    run_stamp = ConfigV2.timestamp

    state = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'train_losses': train_losses,
        'train_accs': train_accs,
        'valid_losses': valid_losses,
        'valid_accs': valid_accs,
        'best_valid_acc': best_valid_acc,
        'config': hyperparams,
        'timestamp': run_stamp,
    }

    checkpoint_path = os.path.join(target_dir, f'checkpoint_epoch_{epoch}_{run_stamp}.pth')
    latest_path = os.path.join(target_dir, f'latest_checkpoint_{run_stamp}.pth')

    # Per-epoch file first, then the rolling "latest" file.
    for destination in (checkpoint_path, latest_path):
        torch.save(state, destination)

    print(f'✓ 체크포인트 저장: epoch {epoch+1}')
    return checkpoint_path

# Run section: print a short banner describing the call order used below.
print(
    "=== 체크포인트에서 학습 재개 준비 완료 ===",
    "실행 순서:",
    "1. train_loader, valid_loader, test_loader = create_robust_loaders(class_names)",
    "2. model = create_anti_overfit_model(len(class_names))",
    "3. trained_model, *_ = train_anti_overfit_model_resume(model, train_loader, valid_loader, len(class_names))",
    sep="\n",
)


# Class names, one pair of labels per line. The list position of each name is
# what len(class_names) and the loaders below are given, so keep the order.
class_names = [
    '고정 불량_불량품', '고정 불량_양품',
    '고정핀 불량_불량품', '고정핀 불량_양품',
    '단차_불량품', '단차_양품',
    '스크래치_불량품', '스크래치_양품',
    '실링 불량_불량품', '실링 불량_양품',
    '연계 불량_불량품', '연계 불량_양품',
    '외관 손상_불량품', '외관 손상_양품',
    '유격 불량_불량품', '유격 불량_양품',
    '장착 불량_불량품', '장착 불량_양품',
    '체결 불량_불량품', '체결 불량_양품',
    '헤밍 불량_불량품', '헤밍 불량_양품',
    '홀 변형_불량품', '홀 변형_양품',
]

# Automatic execution (same flow as the earlier cells): build the loaders,
# build the model, then run the resume-capable training function.
train_loader, valid_loader, test_loader = create_robust_loaders(class_names)
model = create_anti_overfit_model(len(class_names))
# Only the first returned value (the trained model) is kept by name; the
# remaining return values are collected into `_`.
trained_model, *_ = train_anti_overfit_model_resume(
    model, train_loader, valid_loader, len(class_names)
)