# 7/25更新
- 可以使用pretrain weight嗎? Ans: No
- 一定要使用Resnet18 model嗎? Ans: 不一定，可以使用其他model

# 常見問題：
- 沒有GPU或其他環境問題：請你使用 colab
- Train 很慢：請你檢查 get_device() 看有沒有成功使用到 GPU
- 被 colab 限制 GPU 用量：好像使用超過12小時會被 google 限制用量，
請你開其他 google 帳號，或是等個幾小時再繼續
- dimension 對不起來：請你檢查 tensor.shape
- dataset permission denied：google 有流量限制，請你嘗試其他下載連結

# Install & import packages

In [1]:
!pip install torchtoolbox

Collecting torchtoolbox
  Downloading torchtoolbox-0.1.5-py3-none-any.whl (58 kB)
[?25l[K     |█████▋                          | 10 kB 19.5 MB/s eta 0:00:01[K     |███████████▏                    | 20 kB 11.4 MB/s eta 0:00:01[K     |████████████████▊               | 30 kB 8.8 MB/s eta 0:00:01[K     |██████████████████████▍         | 40 kB 8.0 MB/s eta 0:00:01[K     |████████████████████████████    | 51 kB 4.1 MB/s eta 0:00:01[K     |████████████████████████████████| 58 kB 2.6 MB/s 
Installing collected packages: torchtoolbox
Successfully installed torchtoolbox-0.1.5


In [2]:
import os
import torch
import torchvision
import tarfile
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torchvision.datasets.utils import download_url
from torchvision.datasets import ImageFolder, CIFAR100
from torchtoolbox.tools import summary
from torch.utils.data import DataLoader
import torchvision.transforms as tt
from torch.utils.data import random_split
from torchvision.utils import make_grid
import matplotlib
import matplotlib.pyplot as plt
from collections import OrderedDict

%matplotlib inline

matplotlib.rcParams['figure.facecolor'] = '#ffffff'

# Check GPU

In [3]:
def get_default_device():
    """Pick GPU if available, else CPU"""
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')
    
def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

device = get_default_device()
print(device)
!nvidia-smi

cuda
Tue Aug 24 10:01:36 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P8    32W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| P

# Download dataset

In [4]:
!pip install opendatasets


Collecting opendatasets
  Downloading opendatasets-0.1.20-py3-none-any.whl (14 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.20


In [6]:
import opendatasets as od

od.download('https://www.kaggle.com/c/2021-ai-training-final-project/data')

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: shihyuhuang
Your Kaggle Key: ··········
Downloading 2021-ai-training-final-project.zip to ./2021-ai-training-final-project


100%|██████████| 140M/140M [00:06<00:00, 23.4MB/s]



Extracting archive ./2021-ai-training-final-project/2021-ai-training-final-project.zip to ./2021-ai-training-final-project


# Dataloader

In [7]:
data_dir = './2021-ai-training-final-project/CIFAR100'
classes = os.listdir(data_dir + "/TRAIN")
print(classes)
print(len(classes))

['mouse', 'oak_tree', 'camel', 'raccoon', 'snake', 'seal', 'crab', 'boy', 'crocodile', 'squirrel', 'tractor', 'man', 'possum', 'rose', 'willow_tree', 'keyboard', 'lamp', 'apple', 'poppy', 'caterpillar', 'wardrobe', 'bed', 'cloud', 'fox', 'flatfish', 'bicycle', 'sea', 'wolf', 'pear', 'table', 'tulip', 'dinosaur', 'mushroom', 'rabbit', 'clock', 'mountain', 'palm_tree', 'train', 'beaver', 'dolphin', 'shrew', 'whale', 'streetcar', 'bridge', 'castle', 'bear', 'lawn_mower', 'cup', 'couch', 'cockroach', 'tank', 'otter', 'woman', 'tiger', 'pine_tree', 'house', 'turtle', 'shark', 'road', 'sunflower', 'pickup_truck', 'leopard', 'rocket', 'skyscraper', 'bus', 'orchid', 'hamster', 'butterfly', 'trout', 'orange', 'maple_tree', 'television', 'skunk', 'girl', 'bottle', 'lion', 'telephone', 'plate', 'bowl', 'sweet_pepper', 'plain', 'spider', 'worm', 'can', 'motorcycle', 'bee', 'chimpanzee', 'lizard', 'snail', 'beetle', 'chair', 'forest', 'kangaroo', 'lobster', 'elephant', 'ray', 'cattle', 'porcupine',

In [8]:
# Data transforms (normalization & data augmentation)
normalize = tt.Normalize(mean=[0.507, 0.487, 0.441], std=[0.267, 0.256, 0.276])
train_tfms = tt.Compose([
        tt.RandomCrop(32, padding=4),
        tt.RandomHorizontalFlip(),
        tt.ToTensor(),
        normalize,
])

valid_tfms = tt.Compose([
        tt.ToTensor(),
        normalize,
])

In [9]:
# PyTorch datasets
train_ds = ImageFolder(root=data_dir+"/TRAIN",transform=train_tfms)
valid_ds = ImageFolder(root=data_dir+"/TEST",transform=valid_tfms)

In [10]:
# PyTorch data loaders
BATCH_SIZE=64
train_dl = DataLoader(train_ds,batch_size=BATCH_SIZE,shuffle=True, num_workers=2)
valid_dl = DataLoader(valid_ds,batch_size=64, shuffle=False, num_workers=2)

In [11]:
class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device
        
    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        for b in self.dl: 
            yield to_device(b, self.device)

    def __len__(self):
        """Number of batches"""
        return len(self.dl)

In [12]:
train_dl = DeviceDataLoader(train_dl, device)
valid_dl = DeviceDataLoader(valid_dl, device)

# Model

In [60]:
def accuracy(outputs, labels):
    _, preds = torch.max(outputs, dim=1)
    return torch.tensor(torch.sum(preds == labels).item() / len(preds))

def adjust_learning_rate(optimizer, epoch):
    if epoch < 100:
        lrr = lr
    elif epoch < 150:
        lrr = lr * 0.1
    else:
        lrr = lr * 0.01
    for param_group in optimizer.param_groups:
        param_group['lr'] = lrr

class ImageClassificationBase(nn.Module):
    def training_step(self,batch):
        images,labels = batch
        out = self(images)
        loss = F.cross_entropy(out,labels)
        return loss
    
    def validation_step(self,batch):
        images,labels = batch
        out = self(images)
        loss = F.cross_entropy(out,labels)
        acc = accuracy(out,labels)
        return {'val_loss': loss.detach(), 'val_acc': acc}
    
    def validation_epoch_end(self, outputs):
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()   # Combine losses
        batch_accs = [x['val_acc'] for x in outputs]
        epoch_acc = torch.stack(batch_accs).mean()      # Combine accuracies

        return {'val_loss': epoch_loss.item(), 'val_acc': epoch_acc.item()}

    def epoch_end(self, epoch, result):
        print("Epoch [{}], last_lr: {:.5f}, train_loss: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}".format(
            epoch, result['lrs'][-1], result['train_loss'], result['val_loss'], result['val_acc']))
        

In [61]:
class PatchEmbed(nn.Module):
    def __init__(self, img_size, patch_size, in_chans=3, embed_dim=768):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.n_patches = (img_size // patch_size) ** 2


        self.proj = nn.Conv2d(
                in_chans,
                embed_dim,
                kernel_size=patch_size,
                stride=patch_size,
        )

    def forward(self, x):
        x = self.proj(
                x
            )  # (n_samples, embed_dim, n_patches ** 0.5, n_patches ** 0.5)
        x = x.flatten(2)  # (n_samples, embed_dim, n_patches)
        x = x.transpose(1, 2)  # (n_samples, n_patches, embed_dim)

        return x


class Attention(nn.Module):
    def __init__(self, dim, n_heads=12, qkv_bias=True, attn_p=0., proj_p=0.):
        super().__init__()
        self.n_heads = n_heads
        self.dim = dim
        self.head_dim = dim // n_heads
        self.scale = self.head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_p)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_p)

    def forward(self, x):
        n_samples, n_tokens, dim = x.shape

        if dim != self.dim:
            raise ValueError

        qkv = self.qkv(x)  # (n_samples, n_patches + 1, 3 * dim)
        qkv = qkv.reshape(
                n_samples, n_tokens, 3, self.n_heads, self.head_dim
        )  # (n_smaples, n_patches + 1, 3, n_heads, head_dim)
        qkv = qkv.permute(
                2, 0, 3, 1, 4
        )  # (3, n_samples, n_heads, n_patches + 1, head_dim)

        q, k, v = qkv[0], qkv[1], qkv[2]
        k_t = k.transpose(-2, -1)  # (n_samples, n_heads, head_dim, n_patches + 1)
        dp = (
           q @ k_t
        ) * self.scale # (n_samples, n_heads, n_patches + 1, n_patches + 1)
        attn = dp.softmax(dim=-1)  # (n_samples, n_heads, n_patches + 1, n_patches + 1)
        attn = self.attn_drop(attn)

        weighted_avg = attn @ v  # (n_samples, n_heads, n_patches +1, head_dim)
        weighted_avg = weighted_avg.transpose(
                1, 2
        )  # (n_samples, n_patches + 1, n_heads, head_dim)
        weighted_avg = weighted_avg.flatten(2)  # (n_samples, n_patches + 1, dim)

        x = self.proj(weighted_avg)  # (n_samples, n_patches + 1, dim)
        x = self.proj_drop(x)  # (n_samples, n_patches + 1, dim)

        return x


class MLP(nn.Module):
    def __init__(self, in_features, hidden_features, out_features, p=0.):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(p)

    def forward(self, x):
        x = self.fc1(
                x
        ) # (n_samples, n_patches + 1, hidden_features)
        x = self.act(x)  # (n_samples, n_patches + 1, hidden_features)
        x = self.drop(x)  # (n_samples, n_patches + 1, hidden_features)
        x = self.fc2(x)  # (n_samples, n_patches + 1, hidden_features)
        x = self.drop(x)  # (n_samples, n_patches + 1, hidden_features)

        return x


class Transformer(nn.Module):
    def __init__(self, dim, n_heads, mlp_ratio=4.0, qkv_bias=True, p=0., attn_p=0.):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
        self.attn = Attention(
                dim,
                n_heads=n_heads,
                qkv_bias=qkv_bias,
                attn_p=attn_p,
                proj_p=p
        )
        self.norm2 = nn.LayerNorm(dim, eps=1e-6)
        hidden_features = int(dim * mlp_ratio)
        self.mlp = MLP(
                in_features=dim,
                hidden_features=hidden_features,
                out_features=dim,
        )

    def forward(self, x):
        x = x + self.attn(self.norm1(x))
        x = x + self.mlp(self.norm2(x))

        return x


class ViT(ImageClassificationBase):
    def __init__(
            self,
            img_size=32,
            patch_size=32,
            in_chans=3,
            n_classes=100,
            embed_dim=768,
            depth=12,
            n_heads=12,
            mlp_ratio=4.,
            qkv_bias=True,
            p=0.,
            attn_p=0.,
    ):
        super().__init__()

        self.patch_embed = PatchEmbed(
                img_size=img_size,
                patch_size=patch_size,
                in_chans=in_chans,
                embed_dim=embed_dim,
        )
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(
                torch.zeros(1, 1 + self.patch_embed.n_patches, embed_dim)
        )
        self.pos_drop = nn.Dropout(p=p)

        self.Transformer = nn.ModuleList(
            [
                Transformer(
                    dim=embed_dim,
                    n_heads=n_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    p=p,
                    attn_p=attn_p,
                )
                for _ in range(depth)
            ]
        )

        self.norm = nn.LayerNorm(embed_dim, eps=1e-6)
        self.head = nn.Linear(embed_dim, n_classes)


    def forward(self, x):
        n_samples = x.shape[0]
        x = self.patch_embed(x)

        cls_token = self.cls_token.expand(
                n_samples, -1, -1
        )  # (n_samples, 1, embed_dim)
        x = torch.cat((cls_token, x), dim=1)  # (n_samples, 1 + n_patches, embed_dim)
        x = x + self.pos_embed  # (n_samples, 1 + n_patches, embed_dim)
        x = self.pos_drop(x)

        for Transformer in self.Transformer:
            x = Transformer(x)

        x = self.norm(x)

        cls_token_final = x[:, 0]  # just the CLS token
        x = self.head(cls_token_final)

        return x



In [62]:
model = ViT().to(device)

# Set Config

In [63]:
epochs=200
lr=0.1
momentum=0.9
weight_decay=1e-4
optimizer = torch.optim.SGD(model.parameters(), lr, momentum=momentum, weight_decay=weight_decay)

# Training


In [64]:
def Train (epochs,train_dl,valid_dl,model,optimizer):
    torch.cuda.empty_cache()
    
    history = []
        
    for epoch in range(epochs):
        adjust_learning_rate(optimizer, epoch)
        model.train()
        train_loss = []
        lrs = []
        
        for batch in train_dl:
            loss = model.training_step(batch)
            train_loss.append(loss)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lrs.append(get_lr(optimizer))
   
        result = evaluate(model,valid_dl)
        result["train_loss"] = torch.stack(train_loss).mean().item()
        result["lrs"] = lrs
        
        model.epoch_end(epoch,result)
        history.append(result)
        
    return history
            

@torch.no_grad()
def evaluate(model,valid_dl):
    model.eval()
    outputs = [model.validation_step(batch) for batch in valid_dl]
    return model.validation_epoch_end(outputs)

def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

In [65]:
%%time
history = []
history += Train(epochs=epochs,train_dl=train_dl,valid_dl=valid_dl,model=model,optimizer=optimizer)

Epoch [0], last_lr: 0.10000, train_loss: 5.5023, val_loss: 4.5672, val_acc: 0.0231
Epoch [1], last_lr: 0.10000, train_loss: 4.4231, val_loss: 4.4221, val_acc: 0.0397
Epoch [2], last_lr: 0.10000, train_loss: 4.3256, val_loss: 4.3806, val_acc: 0.0451
Epoch [3], last_lr: 0.10000, train_loss: 4.2360, val_loss: 4.3341, val_acc: 0.0509
Epoch [4], last_lr: 0.10000, train_loss: 4.1606, val_loss: 4.2309, val_acc: 0.0522
Epoch [5], last_lr: 0.10000, train_loss: 4.0910, val_loss: 4.1952, val_acc: 0.0738
Epoch [6], last_lr: 0.10000, train_loss: 4.0338, val_loss: 4.1095, val_acc: 0.0703
Epoch [7], last_lr: 0.10000, train_loss: 3.9983, val_loss: 4.1253, val_acc: 0.0770
Epoch [8], last_lr: 0.10000, train_loss: 3.9491, val_loss: 4.0378, val_acc: 0.0864
Epoch [9], last_lr: 0.10000, train_loss: 3.9039, val_loss: 3.9563, val_acc: 0.0970
Epoch [10], last_lr: 0.10000, train_loss: 3.8609, val_loss: 3.9406, val_acc: 0.0983
Epoch [11], last_lr: 0.10000, train_loss: 3.8281, val_loss: 3.8482, val_acc: 0.1111
Ep

KeyboardInterrupt: ignored

# Model / Parameter statistics

In [None]:
# Print model
print(model)

# Print parameter
net = ViT().to(device)
input = torch.randn(1,3, 32, 32)
input = input.to(device)
summary(net, input)

# Plot Learning curve

In [None]:
def plot_loss(history):
    plt.plot([x.get("train_loss") for x in history], "-bx")
    plt.plot([x["val_loss"] for x in history],"-rx")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend(["train loss","val loss"])

plot_loss(history)

# Testing

In [None]:
def test(model,data_loader):
    with torch.no_grad():
        model.eval()
        valid_loss = 0
        correct = 0
        bs = 64
        result = []
        check_names = []
        for i, (data, target) in enumerate(valid_dl):
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability
            arr = pred.data.cpu().numpy()
            for j in range(pred.size()[0]):
                file_name = valid_ds.samples[i*bs+j][0].split('/')[-1]
                result.append((file_name,pred[j].cpu().numpy()[0])) 
        
    return result

In [None]:
result = test(model, valid_dl)

# Save Result

In [None]:
with open ('ID_result.csv','w') as f:
    f.write('Id,Category\n')
    for data in result:
        f.write(data[0]+','+str(data[1])+'\n')