<a href="https://colab.research.google.com/github/serizard/Maching-Learning/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 이미지 압축 해제

In [1]:
import os
from glob import glob

In [2]:
# Mount Google Drive so the project archive and CSV files under
# /content/drive/MyDrive are readable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
directory_path = '/content/SceneImages'

if not os.path.exists(directory_path):
    os.makedirs(directory_path)

%cd $directory_path
!unzip -qq '/content/drive/MyDrive/ML_Project3/SceneImages.zip'

/content/SceneImages


In [4]:
# Sanity check: collect every extracted JPEG and show how many there are.
# glob() already returns a list, so no extra conversion is needed.
file_path = glob('/content/SceneImages/*.jpg')
len(file_path)

4225

## 데이터셋 생성

In [5]:
import pandas as pd
import numpy as np
from PIL import Image
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torchvision.transforms.functional import InterpolationMode

In [6]:
# Load the two CSV tables shipped with the project from Google Drive.
# Later cells read the 'label' and 'image_name' columns of train_set.
train_set = pd.read_csv('/content/drive/MyDrive/ML_Project3/train.csv')
# NOTE(review): this test_set is replaced by a hold-out slice of train.csv in the
# next cell; the submission cell re-reads test.csv on its own.
test_set = pd.read_csv('/content/drive/MyDrive/ML_Project3/test.csv')

In [7]:
# Stratified 80/10/10 split of train.csv: first carve off a 20% hold-out,
# then cut that hold-out in half to obtain the validation and test parts.
_split_options = dict(shuffle=True, random_state=42)

train_set, _holdout_set = train_test_split(
    train_set,
    test_size=0.2,
    stratify=train_set['label'],
    **_split_options,
)
val_set, test_set = train_test_split(
    _holdout_set,
    test_size=0.5,
    stratify=_holdout_set['label'],
    **_split_options,
)

In [8]:
class ImageDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs read from disk.

    Args:
        labels: Sequence of labels, one per image; stored as a NumPy array.
        img_dir: Directory that contains the image files.
        img_list: Sequence of file names relative to ``img_dir``.
        transform: Optional callable applied to the RGB PIL image.
    """

    def __init__(self, labels, img_dir, img_list, transform=None):
        self.img_labels = np.array(labels)
        self.img_dir = img_dir
        # Resolve every file name to a full path once, up front.
        full_paths = []
        for file_name in img_list:
            full_paths.append(os.path.join(self.img_dir, file_name))
        self.img_paths = np.array(full_paths)
        self.transform = transform

    def __len__(self):
        # One sample per label.
        return len(self.img_labels)

    def __getitem__(self, idx):
        # Force three channels regardless of how the file is stored.
        rgb_image = Image.open(self.img_paths[idx]).convert("RGB")
        target = self.img_labels[idx]
        if not self.transform:
            return rgb_image, target
        return self.transform(rgb_image), target

In [9]:
# Earlier 150x150 pipeline with colour jitter and normalisation, kept disabled for reference:
# train_transform = transforms.Compose([
#     transforms.Resize(size=(150 , 150)) ,
#     transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),
#     transforms.RandomRotation(degrees=15),
#     transforms.RandomHorizontalFlip(p=0.5) ,
#     # transforms.RandomCrop(size=(150,150)),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
# ])

# val_transform = transforms.Compose([
#     transforms.Resize((150, 150)),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
# ])

# Training pipeline: resize to 128x128, then random flip and rotation as augmentation.
train_transform = transforms.Compose(
    [
        transforms.Resize(size=(128, 128)),
        transforms.RandomHorizontalFlip(p=.5),
        transforms.RandomRotation(
            degrees=(-45, 45),
            interpolation=InterpolationMode.NEAREST,
        ),
        transforms.ToTensor(),
    ]
)

# Evaluation pipeline: the same resize but no random augmentation.
test_transform = transforms.Compose(
    [
        transforms.Resize(size=(128, 128)),
        transforms.ToTensor(),
    ]
)

In [10]:
def _build_image_dataset(frame, transform):
    """Wrap one split DataFrame ('label' / 'image_name' columns) in an ImageDataset."""
    return ImageDataset(
        labels=frame['label'],
        img_dir='/content/SceneImages',
        img_list=frame['image_name'],
        transform=transform,
    )


# Only the training split gets the augmenting transform.
train_dataset = _build_image_dataset(train_set, train_transform)
val_dataset = _build_image_dataset(val_set, test_transform)
test_dataset = _build_image_dataset(test_set, test_transform)

# Shuffle the training batches only; evaluation order stays fixed.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

## 모델링

In [11]:
import torch
import torch.nn as nn
from transformers import ViTModel, Trainer, TrainingArguments
import torchvision.models as models
from transformers import ViTFeatureExtractor, ViTForImageClassification

In [14]:
class CustomEfficientNet(nn.Module):
    """ImageNet-pretrained EfficientNetV2-L feature extractor with an MLP head.

    The last child of the torchvision ``features`` stack is removed and the
    remaining feature map is globally average-pooled, batch-normalised and
    passed through three linear layers.

    Args:
        num_classes: Number of output classes (size of the final linear layer).

    Returns (forward):
        Raw class logits of shape ``(batch, num_classes)``. No softmax is
        applied, because ``nn.CrossEntropyLoss`` (the loss used in this
        notebook) applies log-softmax itself; use ``torch.softmax(logits, 1)``
        when probabilities are needed.
    """

    def __init__(self, num_classes=6):
        super(CustomEfficientNet, self).__init__()
        self.base_model = models.efficientnet_v2_l(weights=models.EfficientNet_V2_L_Weights.IMAGENET1K_V1)
        # Drop the final feature stage; the head below is sized for 640 input
        # channels (NOTE(review): confirm against the torchvision architecture).
        self.base_model.features = nn.Sequential(*list(self.base_model.features.children())[:-1])
        # forward() pools the feature map, but this layer was never created,
        # so every forward pass raised AttributeError.
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.batch_norm = nn.BatchNorm1d(640, eps=0.001, momentum=0.01)
        self.fc1 = nn.Linear(640, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.base_model.features(x)
        x = self.pool(x)            # (batch, C, H, W) -> (batch, C, 1, 1)
        x = torch.flatten(x, 1)     # -> (batch, C)
        x = self.batch_norm(x)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        # Return logits: feeding softmax output into CrossEntropyLoss would
        # apply softmax twice and flatten the gradients.
        return self.fc3(x)

class CustomResNet152(nn.Module):
    """ImageNet-pretrained ResNet whose final layer is replaced for ``num_classes``.

    NOTE(review): despite the class name, the backbone built here is
    ``resnet50``; the name is kept so existing references keep working.

    Args:
        num_classes: Number of output classes of the replacement ``fc`` layer.
    """

    def __init__(self, num_classes=6):
        super(CustomResNet152, self).__init__()
        # `pretrained=True` is deprecated in torchvision; the explicit weights
        # enum below selects the same IMAGENET1K_V1 checkpoint.
        self.model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        # Swap the 1000-way ImageNet classifier for a task-specific layer.
        self.model.fc = nn.Linear(self.model.fc.in_features, num_classes)

        # # Freeze all parameters
        # for param in self.model.parameters():
        #     param.requires_grad = False

        # # Make only the final fc layer's parameters trainable
        # for param in self.model.fc.parameters():
        #     param.requires_grad = True

    def forward(self, x):
        # Returns the raw logits of the wrapped network.
        return self.model(x)



In [15]:
# Train on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# model = CustomResNet152().to(device)

# Pretrained Swin-V2-B backbone; its classification head is replaced by a
# fresh 6-way linear layer.
model = models.swin_v2_b(weights='DEFAULT')
_head_in_features = model.head.in_features
model.head = nn.Linear(_head_in_features, 6)
model = model.to(device)

# Cross-entropy on logits, optimised with Adam over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

In [16]:
# Fine-tuning loop: one training pass and one validation pass per epoch.
# Relies on `model`, `criterion`, `optimizer`, `device`, the dataloaders and
# the datasets defined in the cells above.
import torch
from tqdm import tqdm

num_epochs = 10

# Per-epoch history (one entry appended per epoch).
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

# The progress bar counts batches: (train batches + val batches) per epoch, times the number of epochs.
with tqdm(total=(len(train_dataloader) + len(val_dataloader)) * num_epochs, desc='Fine-tuning') as pbar:
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        correct_train = 0

        for images, labels in train_dataloader:
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # Weight the batch loss by the batch size so that dividing by the
            # dataset size below gives a per-sample average.
            train_loss += loss.item() * images.size(0)

            # Training accuracy: predicted class is the arg-max over dim 1.
            _, preds = torch.max(outputs, 1)
            correct_train += (preds == labels).sum().item()
            pbar.update(1)

        train_loss = train_loss / len(train_dataset)
        train_accuracy = correct_train / len(train_dataset)
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)

        # Validation pass: eval mode and no gradient tracking.
        model.eval()
        val_loss = 0.0
        correct_val = 0

        with torch.no_grad():
            for images, labels in val_dataloader:
                images = images.to(device)
                labels = labels.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * images.size(0)

                # Validation accuracy, computed the same way as for training.
                _, preds = torch.max(outputs, 1)
                correct_val += (preds == labels).sum().item()
                pbar.update(1)

        val_loss = val_loss / len(val_dataset)
        val_accuracy = correct_val / len(val_dataset)
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)

        print(f" Epoch {epoch+1}/{num_epochs}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, "
              f"Train Acc: {train_accuracy:.4f}, Val Acc: {val_accuracy:.4f}")


Fine-tuning:  10%|█         | 54/540 [00:25<01:45,  4.59it/s]

 Epoch 1/10: Train Loss: 0.6473, Val Loss: 0.2459, Train Acc: 0.7592, Val Acc: 0.8995


Fine-tuning:  20%|██        | 108/540 [00:51<01:33,  4.60it/s]

 Epoch 2/10: Train Loss: 0.2701, Val Loss: 0.2157, Train Acc: 0.9003, Val Acc: 0.9259


Fine-tuning:  30%|███       | 162/540 [01:17<01:22,  4.56it/s]

 Epoch 3/10: Train Loss: 0.2082, Val Loss: 0.2500, Train Acc: 0.9234, Val Acc: 0.9339


Fine-tuning:  40%|████      | 216/540 [01:44<01:11,  4.55it/s]

 Epoch 4/10: Train Loss: 0.1830, Val Loss: 0.2297, Train Acc: 0.9339, Val Acc: 0.9259


Fine-tuning:  50%|█████     | 270/540 [02:10<00:59,  4.52it/s]

 Epoch 5/10: Train Loss: 0.1382, Val Loss: 0.2425, Train Acc: 0.9538, Val Acc: 0.9233


Fine-tuning:  60%|██████    | 324/540 [02:36<00:49,  4.40it/s]

 Epoch 6/10: Train Loss: 0.1204, Val Loss: 0.2123, Train Acc: 0.9594, Val Acc: 0.9312


Fine-tuning:  70%|███████   | 378/540 [03:03<00:35,  4.55it/s]

 Epoch 7/10: Train Loss: 0.1006, Val Loss: 0.2571, Train Acc: 0.9627, Val Acc: 0.9233


Fine-tuning:  80%|████████  | 432/540 [03:29<00:23,  4.54it/s]

 Epoch 8/10: Train Loss: 0.0955, Val Loss: 0.2566, Train Acc: 0.9653, Val Acc: 0.9339


Fine-tuning:  90%|█████████ | 486/540 [03:56<00:11,  4.54it/s]

 Epoch 9/10: Train Loss: 0.0944, Val Loss: 0.2831, Train Acc: 0.9633, Val Acc: 0.9180


Fine-tuning: 100%|██████████| 540/540 [04:22<00:00,  2.05it/s]

 Epoch 10/10: Train Loss: 0.0751, Val Loss: 0.2823, Train Acc: 0.9749, Val Acc: 0.9339





In [21]:
def test(model, test_dataloader):
    model.eval()
    test_loss = 0.0
    correct_test = 0

    with torch.no_grad():
        for images, labels in test_dataloader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            test_loss += loss.item() * images.size(0)

            _, preds = torch.max(outputs, 1)
            correct_test += (preds == labels).sum().item()

    test_loss = test_loss / len(test_dataset)
    test_accuracy = correct_test / len(test_dataset)

    print(test_accuracy)
    print(test_loss)

# NOTE(review): the output recorded below is a ValueError about a 224*224 model
# receiving 128*128 input, which suggests this cell was last executed after
# `model` had been rebound to the ViT in a later cell. Re-run it directly after
# the Swin training cell to obtain the Swin test metrics.
test(model, test_dataloader)

ValueError: Input image size (128*128) doesn't match model (224*224).

## ViT 모델

In [1]:
import torch
import torch.nn as nn
from transformers import ViTModel, Trainer, TrainingArguments
import torchvision.models as models
from transformers import ViTFeatureExtractor, ViTForImageClassification
import pandas as pd
import numpy as np
from PIL import Image
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

In [6]:
# Reload the project CSVs for the ViT experiment (same files as in the first section).
train_set = pd.read_csv('/content/drive/MyDrive/ML_Project3/train.csv')
test_set = pd.read_csv('/content/drive/MyDrive/ML_Project3/test.csv')

In [7]:
# Stratified 90/10 train/validation split on the 'label' column (fixed seed 42).
train_set, val_set = train_test_split(train_set, test_size=0.1, stratify=train_set['label'], shuffle=True, random_state=42)

In [22]:
class ViTImageDataset(Dataset):
    """Map-style dataset that preprocesses images with a feature extractor.

    Args:
        labels: Sequence of labels, one per image; stored as a NumPy array.
        img_dir: Directory that contains the image files.
        img_list: Sequence of file names relative to ``img_dir``.
        feature_extractor: Callable invoked on the RGB PIL image; its result
            is indexed with ``['pixel_values'][0]``.
    """

    def __init__(self, labels, img_dir, img_list, feature_extractor):
        self.img_labels = np.array(labels)
        self.img_dir = img_dir
        # Resolve every file name to a full path once, up front.
        full_paths = []
        for file_name in img_list:
            full_paths.append(os.path.join(self.img_dir, file_name))
        self.img_paths = np.array(full_paths)
        self.fe = feature_extractor

    def __len__(self):
        # One sample per label.
        return len(self.img_labels)

    def __getitem__(self, idx):
        rgb_image = Image.open(self.img_paths[idx]).convert("RGB")
        target = self.img_labels[idx]
        # The extractor is called on a single image; take its first (only) entry.
        encoded = self.fe(rgb_image)
        return encoded['pixel_values'][0], target

In [23]:
# Preprocessing that matches the pretrained ViT checkpoint used below.
feature_extractor = ViTFeatureExtractor.from_pretrained(
    'google/vit-base-patch16-224-in21k',
    return_tensors='pt',
)


def _build_vit_dataset(frame):
    """Wrap one split DataFrame ('label' / 'image_name' columns) in a ViTImageDataset."""
    return ViTImageDataset(
        labels=frame['label'],
        img_dir='/content/SceneImages',
        img_list=frame['image_name'],
        feature_extractor=feature_extractor,
    )


train_dataset = _build_vit_dataset(train_set)
val_dataset = _build_vit_dataset(val_set)

# Shuffle the training batches only.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)



In [24]:
# Use the GPU when available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained ViT-Base with a newly initialised 6-class classification head.
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224-in21k', num_labels=6).to(device)
criterion = nn.CrossEntropyLoss()
# NOTE(review): lr=1e-3 is 10x the rate used by the other fine-tuning runs in
# this notebook, and the validation loss recorded for this run fluctuates
# strongly — consider a smaller learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Fine-tuning loop for the Hugging Face ViT: one training pass and one
# validation pass per epoch. The model returns an output object, so the
# logits are read from its `.logits` attribute.
import torch
from tqdm import tqdm

num_epochs = 10

# Per-epoch history (one entry appended per epoch).
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

# The progress bar counts batches: (train batches + val batches) per epoch, times the number of epochs.
with tqdm(total=(len(train_dataloader) + len(val_dataloader)) * num_epochs, desc='Fine-tuning') as pbar:
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        correct_train = 0

        for images, labels in train_dataloader:
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(images).logits
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # Weight the batch loss by the batch size so that dividing by the
            # dataset size below gives a per-sample average.
            train_loss += loss.item() * images.size(0)

            # Training accuracy: predicted class is the arg-max over dim 1.
            _, preds = torch.max(outputs, 1)
            correct_train += (preds == labels).sum().item()
            pbar.update(1)

        train_loss = train_loss / len(train_dataset)
        train_accuracy = correct_train / len(train_dataset)
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)

        # Validation pass: eval mode and no gradient tracking.
        model.eval()
        val_loss = 0.0
        correct_val = 0

        with torch.no_grad():
            for images, labels in val_dataloader:
                images = images.to(device)
                labels = labels.to(device)
                outputs = model(images).logits
                loss = criterion(outputs, labels)
                val_loss += loss.item() * images.size(0)

                # Validation accuracy, computed the same way as for training.
                _, preds = torch.max(outputs, 1)
                correct_val += (preds == labels).sum().item()
                pbar.update(1)

        val_loss = val_loss / len(val_dataset)
        val_accuracy = correct_val / len(val_dataset)
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)

        print(f"Epoch {epoch+1}/{num_epochs}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, "
              f"Train Acc: {train_accuracy:.4f}, Val Acc: {val_accuracy:.4f}")


Fine-tuning:  10%|█         | 60/600 [00:58<04:40,  1.92it/s]

Epoch 1/10: Train Loss: 0.8552, Val Loss: 1.0605, Train Acc: 0.6923, Val Acc: 0.6412


Fine-tuning:  20%|██        | 120/600 [01:57<04:06,  1.95it/s]

Epoch 2/10: Train Loss: 0.5741, Val Loss: 0.4839, Train Acc: 0.8089, Val Acc: 0.8311


Fine-tuning:  30%|███       | 180/600 [02:55<03:35,  1.95it/s]

Epoch 3/10: Train Loss: 0.3878, Val Loss: 0.5095, Train Acc: 0.8717, Val Acc: 0.8311


Fine-tuning:  40%|████      | 240/600 [03:54<03:05,  1.94it/s]

Epoch 4/10: Train Loss: 0.3067, Val Loss: 0.4220, Train Acc: 0.9019, Val Acc: 0.8443


Fine-tuning:  50%|█████     | 300/600 [04:52<02:34,  1.94it/s]

Epoch 5/10: Train Loss: 0.2497, Val Loss: 0.6160, Train Acc: 0.9213, Val Acc: 0.7863


Fine-tuning:  60%|██████    | 360/600 [05:51<02:04,  1.93it/s]

Epoch 6/10: Train Loss: 0.2293, Val Loss: 0.3834, Train Acc: 0.9287, Val Acc: 0.8760


Fine-tuning:  70%|███████   | 420/600 [06:49<01:33,  1.93it/s]

Epoch 7/10: Train Loss: 0.2149, Val Loss: 0.5061, Train Acc: 0.9266, Val Acc: 0.8338


Fine-tuning:  80%|████████  | 480/600 [07:47<01:01,  1.94it/s]

Epoch 8/10: Train Loss: 0.2268, Val Loss: 0.6991, Train Acc: 0.9245, Val Acc: 0.7678


Fine-tuning:  90%|█████████ | 540/600 [08:46<00:30,  1.94it/s]

Epoch 9/10: Train Loss: 0.2113, Val Loss: 0.5522, Train Acc: 0.9301, Val Acc: 0.8232


Fine-tuning: 100%|██████████| 600/600 [09:44<00:00,  1.03it/s]

Epoch 10/10: Train Loss: 0.1950, Val Loss: 0.9273, Train Acc: 0.9363, Val Acc: 0.7573





In [23]:
import torch
import torch.nn as nn
from transformers import ViTModel, Trainer, TrainingArguments
import torchvision.models as models
from transformers import ViTFeatureExtractor, ViTForImageClassification
import pandas as pd
import numpy as np
from PIL import Image
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Reload the project CSVs so this cell starts from the unsplit train.csv.
train_set = pd.read_csv('/content/drive/MyDrive/ML_Project3/train.csv')
# NOTE(review): this frame is overwritten by the hold-out split just below.
test_set = pd.read_csv('/content/drive/MyDrive/ML_Project3/test.csv')

# Stratified 80/10/10 split of train.csv: carve off a 20% hold-out, then cut
# that hold-out in half into the validation and test parts.
_split_options = dict(shuffle=True, random_state=42)

train_set, _holdout_set = train_test_split(
    train_set,
    test_size=0.2,
    stratify=train_set['label'],
    **_split_options,
)
val_set, test_set = train_test_split(
    _holdout_set,
    test_size=0.5,
    stratify=_holdout_set['label'],
    **_split_options,
)


class ViTImageDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs read from disk.

    This definition replaces the earlier feature-extractor based class of the
    same name; preprocessing is done by an optional ``transform`` instead.

    Args:
        labels: Sequence of labels, one per image; stored as a NumPy array.
        img_dir: Directory that contains the image files.
        img_list: Sequence of file names relative to ``img_dir``.
        transform: Optional callable applied to the RGB PIL image.
    """

    def __init__(self, labels, img_dir, img_list, transform=None):
        self.img_labels = np.array(labels)
        self.img_dir = img_dir
        # Resolve every file name to a full path once, up front.
        full_paths = []
        for file_name in img_list:
            full_paths.append(os.path.join(self.img_dir, file_name))
        self.img_paths = np.array(full_paths)
        self.transform = transform

    def __len__(self):
        # One sample per label.
        return len(self.img_labels)

    def __getitem__(self, idx):
        # Force three channels regardless of how the file is stored.
        rgb_image = Image.open(self.img_paths[idx]).convert("RGB")
        target = self.img_labels[idx]
        if not self.transform:
            return rgb_image, target
        return self.transform(rgb_image), target

# Training pipeline: resize to 224x224, then random flip and rotation as augmentation.
train_transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.RandomHorizontalFlip(p=.5),
        transforms.RandomRotation(
            degrees=(-45, 45),
            interpolation=InterpolationMode.NEAREST,
        ),
        transforms.ToTensor(),
    ]
)

# Evaluation pipeline: the same resize but no random augmentation.
test_transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
    ]
)


# NOTE(review): the feature extractor is loaded here but the datasets below
# are built with the torchvision transforms instead of it.
feature_extractor = ViTFeatureExtractor.from_pretrained(
    'google/vit-base-patch16-224-in21k',
    return_tensors='pt',
)


def _build_vit_dataset(frame, transform):
    """Wrap one split DataFrame ('label' / 'image_name' columns) in a ViTImageDataset."""
    return ViTImageDataset(
        labels=frame['label'],
        img_dir='/content/SceneImages',
        img_list=frame['image_name'],
        transform=transform,
    )


# Only the training split gets the augmenting transform.
train_dataset = _build_vit_dataset(train_set, train_transform)
val_dataset = _build_vit_dataset(val_set, test_transform)
test_dataset = _build_vit_dataset(test_set, test_transform)

# Shuffle the training batches only; evaluation order stays fixed.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)


# Use the GPU when available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



# Pretrained ViT-Base with a newly initialised 6-class classification head.
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224-in21k', num_labels=6).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
# StepLR: multiply the learning rate by gamma=0.1 every 4 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.1)
class EarlyStopping:
    """Signal that training should stop once the validation loss stalls.

    Call the instance with each new validation loss. A loss counts as an
    improvement unless it is greater than ``best_loss - min_delta``. After
    ``patience`` non-improving calls since the last improvement,
    ``early_stop`` becomes True.

    Args:
        patience: Number of non-improving calls tolerated before stopping.
        min_delta: Margin subtracted from the best loss in the comparison.
    """

    def __init__(self, patience=5, min_delta=0):
        self.patience, self.min_delta = patience, min_delta
        self.counter = 0          # non-improving calls since the last improvement
        self.best_loss = None     # unset until the first call
        self.early_stop = False

    def __call__(self, val_loss):
        # The very first observation only seeds the baseline.
        if self.best_loss is None:
            self.best_loss = val_loss
            return

        threshold = self.best_loss - self.min_delta
        stalled = val_loss > threshold
        if not stalled:
            # Improvement: remember it and restart the patience window.
            self.best_loss = val_loss
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

# Stop once the validation loss has failed to improve for 5 epochs.
early_stopping = EarlyStopping(patience=5)

import torch
from tqdm import tqdm

# Upper bound on epochs; early stopping may end training sooner.
num_epochs = 30

# Per-epoch history (one entry appended per epoch).
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

# The progress bar counts batches: (train batches + val batches) per epoch, times the number of epochs.
with tqdm(total=(len(train_dataloader) + len(val_dataloader)) * num_epochs, desc='Fine-tuning') as pbar:
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        correct_train = 0

        for images, labels in train_dataloader:
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(images).logits
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # Weight the batch loss by the batch size so that dividing by the
            # dataset size below gives a per-sample average.
            train_loss += loss.item() * images.size(0)

            # Training accuracy: predicted class is the arg-max over dim 1.
            _, preds = torch.max(outputs, 1)
            correct_train += (preds == labels).sum().item()
            pbar.update(1)

        train_loss = train_loss / len(train_dataset)
        train_accuracy = correct_train / len(train_dataset)
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)

        # Validation pass: eval mode and no gradient tracking.
        model.eval()
        val_loss = 0.0
        correct_val = 0

        with torch.no_grad():
            for images, labels in val_dataloader:
                images = images.to(device)
                labels = labels.to(device)
                outputs = model(images).logits
                loss = criterion(outputs, labels)
                val_loss += loss.item() * images.size(0)

                # Validation accuracy, computed the same way as for training.
                _, preds = torch.max(outputs, 1)
                correct_val += (preds == labels).sum().item()
                pbar.update(1)

        val_loss = val_loss / len(val_dataset)
        val_accuracy = correct_val / len(val_dataset)
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)

        print(f" Epoch {epoch+1}/{num_epochs}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, "
              f"Train Acc: {train_accuracy:.4f}, Val Acc: {val_accuracy:.4f}")

        # Advance the learning-rate schedule once per epoch.
        scheduler.step()

        # Feed this epoch's validation loss to the early-stopping tracker and
        # leave the epoch loop as soon as it signals a stop.
        early_stopping(val_loss)
        if early_stopping.early_stop:
            print("Early stopping")
            break


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fine-tuning:   3%|▎         | 54/1620 [00:48<11:30,  2.27it/s]

 Epoch 1/30: Train Loss: 0.7761, Val Loss: 0.4167, Train Acc: 0.8279, Val Acc: 0.9259


Fine-tuning:   7%|▋         | 108/1620 [01:36<11:14,  2.24it/s]

 Epoch 2/30: Train Loss: 0.2956, Val Loss: 0.3214, Train Acc: 0.9346, Val Acc: 0.9074


Fine-tuning:  10%|█         | 162/1620 [02:25<11:01,  2.20it/s]

 Epoch 3/30: Train Loss: 0.2265, Val Loss: 0.2533, Train Acc: 0.9422, Val Acc: 0.9233


Fine-tuning:  13%|█▎        | 216/1620 [03:15<10:36,  2.21it/s]

 Epoch 4/30: Train Loss: 0.1494, Val Loss: 0.2464, Train Acc: 0.9647, Val Acc: 0.9312


Fine-tuning:  17%|█▋        | 270/1620 [04:04<10:16,  2.19it/s]

 Epoch 5/30: Train Loss: 0.1095, Val Loss: 0.2426, Train Acc: 0.9775, Val Acc: 0.9286


Fine-tuning:  20%|██        | 324/1620 [04:54<09:47,  2.21it/s]

 Epoch 6/30: Train Loss: 0.0909, Val Loss: 0.2282, Train Acc: 0.9848, Val Acc: 0.9365


Fine-tuning:  23%|██▎       | 378/1620 [05:44<09:22,  2.21it/s]

 Epoch 7/30: Train Loss: 0.0804, Val Loss: 0.2204, Train Acc: 0.9894, Val Acc: 0.9392


Fine-tuning:  27%|██▋       | 432/1620 [06:34<09:01,  2.20it/s]

 Epoch 8/30: Train Loss: 0.0725, Val Loss: 0.2167, Train Acc: 0.9914, Val Acc: 0.9418


Fine-tuning:  30%|███       | 486/1620 [07:24<08:34,  2.20it/s]

 Epoch 9/30: Train Loss: 0.0678, Val Loss: 0.2182, Train Acc: 0.9931, Val Acc: 0.9418


Fine-tuning:  33%|███▎      | 540/1620 [08:14<08:11,  2.20it/s]

 Epoch 10/30: Train Loss: 0.0682, Val Loss: 0.2196, Train Acc: 0.9921, Val Acc: 0.9392


Fine-tuning:  37%|███▋      | 594/1620 [09:04<07:49,  2.18it/s]

 Epoch 11/30: Train Loss: 0.0668, Val Loss: 0.2196, Train Acc: 0.9924, Val Acc: 0.9392


Fine-tuning:  40%|████      | 648/1620 [09:54<07:33,  2.14it/s]

 Epoch 12/30: Train Loss: 0.0673, Val Loss: 0.2206, Train Acc: 0.9921, Val Acc: 0.9392


Fine-tuning:  43%|████▎     | 702/1620 [10:44<14:02,  1.09it/s]

 Epoch 13/30: Train Loss: 0.0665, Val Loss: 0.2208, Train Acc: 0.9924, Val Acc: 0.9392
Early stopping





In [26]:
def test(model, test_dataloader):
    model.eval()
    test_loss = 0.0
    correct_test = 0

    with torch.no_grad():
        for images, labels in test_dataloader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images).logits
            loss = criterion(outputs, labels)
            test_loss += loss.item() * images.size(0)

            _, preds = torch.max(outputs, 1)
            correct_test += (preds == labels).sum().item()

    test_loss = test_loss / len(test_dataset)
    test_accuracy = correct_test / len(test_dataset)

    print(test_accuracy)
    print(test_loss)

# Report accuracy and mean loss of the current `model` on the held-out test split.
test(model, test_dataloader)

0.941952506596306
0.21461144600820414


In [27]:
# Persist only the weights (state_dict) of the fine-tuned model to Google Drive.
torch.save(model.state_dict(), '/content/drive/MyDrive/fine_tuned_vit_model.pth')

In [31]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import pandas as pd
import numpy as np
from PIL import Image
import os

# Dataset for submission
class ViTImageDatasetForSubmission(Dataset):
    """Label-free dataset that yields images only, for generating predictions.

    Args:
        img_dir: Directory that contains the image files.
        img_list: Sequence of file names relative to ``img_dir``.
        transform: Optional callable applied to the RGB PIL image.
    """

    def __init__(self, img_dir, img_list, transform=None):
        self.img_dir = img_dir
        # Resolve every file name to a full path once, up front.
        full_paths = []
        for file_name in img_list:
            full_paths.append(os.path.join(self.img_dir, file_name))
        self.img_paths = np.array(full_paths)
        self.transform = transform

    def __len__(self):
        # One sample per image path.
        return len(self.img_paths)

    def __getitem__(self, idx):
        # Force three channels regardless of how the file is stored.
        rgb_image = Image.open(self.img_paths[idx]).convert("RGB")
        if not self.transform:
            return rgb_image
        return self.transform(rgb_image)

# Load the list of images that need a predicted label.
submission_set = pd.read_csv('/content/drive/MyDrive/ML_Project3/test.csv')

# Same deterministic preprocessing as the validation/test splits
# (`test_transform` comes from the training cell above).
submission_dataset = ViTImageDatasetForSubmission(
    img_dir='/content/SceneImages',
    img_list=submission_set['image_name'],
    transform=test_transform
)

# Use the same fixed batch size as the other loaders. Pushing the whole set
# through the model as one batch makes memory use grow with the set size, and
# an empty set would make DataLoader reject batch_size=0.
# shuffle=False keeps predictions aligned with the CSV rows.
submission_dataloader = DataLoader(submission_dataset, batch_size=64, shuffle=False)

# Inference only: eval mode and no gradient tracking.
model.eval()

all_preds = []
with torch.no_grad():
    for images in submission_dataloader:
        images = images.to(device)
        outputs = model(images).logits
        # Predicted class is the arg-max over the class dimension.
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())

# Attach the predictions in row order.
submission_set['label'] = all_preds

# Write only the two columns used for the submission file.
submission_set[['image_name', 'label']].to_csv('/content/drive/MyDrive/ML_Project3/submission.csv', index=False)
