In [1]:
%%time
#%%writefile cifar-10_pytorch.py

# ====================================================
# Library
# ====================================================
import sys
import os
import gc
import re
import math
import time
import random
import yaml
import shutil
import glob
import pickle
import pathlib
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter, OrderedDict

import scipy as sp
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.metrics import accuracy_score, log_loss
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

from functools import partial

import cv2
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
import torch.backends.cudnn as cudnn
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision.datasets import CIFAR10
from torchvision import transforms
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence  # 文字列の長さを揃えてくれる関数
from torch.optim.lr_scheduler import (
    CosineAnnealingWarmRestarts,
    CosineAnnealingLR,
    ReduceLROnPlateau,
)
from torch.cuda.amp import autocast, GradScaler
#sys.path.append(r'C:\Users\yokoi.shingo\GitHub\Ranger-Deep-Learning-Optimizer')
#sys.path.append(r'C:\Users\yokoi.shingo\GitHub\pytorch-optimizer')
#from torch_optimizer import RAdam, Lookahead

import timm
print("timm version:", timm.__version__)

import warnings
warnings.filterwarnings("ignore")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))


def train_fn(
    train_loader, model, criterion, optimizer, epoch, scheduler, device, scaler
):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to train mode
    model.train()
    start = end = time.time()
    global_step = 0
    for step, (images, labels) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = images.size(0)
        #print("images.shape, labels.shape, batch_size:", images.shape, labels.shape, batch_size)
        with autocast():
            logits = model(images)
            #print("logits.shape, logits:", logits.shape, logits)
            loss = criterion(logits, labels)
            #print("loss:", loss)
            # record loss
            losses.update(loss.item(), batch_size)
            if CFG.gradient_accumulation_steps > 1:
                loss = loss / CFG.gradient_accumulation_steps
            scaler.scale(loss).backward()
            if (step + 1) % CFG.gradient_accumulation_steps == 0:
                scaler.unscale_(optimizer)
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), CFG.max_grad_norm, norm_type=2.0
                )
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                global_step += 1

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader) - 1):
            print(
                "Epoch: [{0}][{1}/{2}] "
                "Data {data_time.val:.3f} ({data_time.avg:.3f}) "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                "Grad Norm: {grad_norm:.4f}  "
                "LR: {lr:.4e}  ".format(
                    epoch + 1,
                    step,
                    len(train_loader),
                    batch_time=batch_time,
                    data_time=data_time,
                    loss=losses,
                    remain=timeSince(start, float(step + 1) / len(train_loader)),
                    grad_norm=grad_norm,
                    lr=scheduler.get_lr()[0],
                )
            )

            # セッション切れても大丈夫なように都度driveに保存する
            torch.save(
                {
                    "model": model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "scheduler": scheduler.state_dict(),
                },
                "per_steps.pth",
            )
            shutil.copyfile("per_steps.pth", CP_DIR + f"/{NAME}_per_steps.pth")

    return losses.avg


def valid_fn(valid_loader, model, device):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    # switch to evaluation mode
    model.eval()
    preds = []
    start = end = time.time()
    for step, (images, labels) in enumerate(valid_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        batch_size = images.size(0)
        with torch.no_grad():
            #predictions = model.forward_argmax(images)
            predictions = model(images).argmax(1)
        pred = predictions.detach().cpu().numpy()
        preds.append(pred)
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader) - 1):
            print(
                "EVAL: [{0}/{1}] "
                "Data {data_time.val:.3f} ({data_time.avg:.3f}) "
                "Elapsed {remain:s} ".format(
                    step,
                    len(valid_loader),
                    batch_time=batch_time,
                    data_time=data_time,
                    remain=timeSince(start, float(step + 1) / len(valid_loader)),
                )
            )
    preds = np.concatenate(preds)
    return preds


# ====================================================
# Train loop
# ====================================================
def train_loop():

    # ====================================================
    # loader
    # ====================================================
    train_loader = dm.train_dataloader()
    valid_loader = dm.test_dataloader()
    valid_labels = np.array(dm.cifar_test.targets)  # dm.cifar_valがおかしいのでtest使う
    valid_ids = np.array(range(len(valid_labels)))

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(optimizer):
        if CFG.scheduler == "ReduceLROnPlateau":
            scheduler = ReduceLROnPlateau(
                optimizer,
                mode="min",
                factor=CFG.factor,
                patience=CFG.patience,
                verbose=True,
                eps=CFG.eps,
            )
        elif CFG.scheduler == "CosineAnnealingLR":
            scheduler = CosineAnnealingLR(
                optimizer, T_max=CFG.T_max, eta_min=CFG.min_lr, last_epoch=-1
            )
        elif CFG.scheduler == "CosineAnnealingWarmRestarts":
            scheduler = CosineAnnealingWarmRestarts(
                optimizer, T_0=CFG.T_0, T_mult=1, eta_min=CFG.min_lr, last_epoch=-1
            )
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    model = TimmModel(CFG.n_classes, model_name=CFG.model_name, pretrained=True)
    model.to(device)
    
    # https://aru47.hatenablog.com/entry/2020/11/06/225052
    if len(CFG.device_ids) > 1:
        LOGGER.info(f"=> is_multiGPU {CFG.device_ids}")
        model = nn.DataParallel(model, device_ids=CFG.device_ids)  # make parallel
        cudnn.benchmark = True

    if CFG.optimizer == "adam":
        optimizer = Adam(
            model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay, amsgrad=False
        )
    elif CFG.optimizer == "radam":
        optimizer = RAdam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)
        optimizer = Lookahead(optimizer, alpha=0.5, k=5)

    scheduler = get_scheduler(optimizer)
    scaler = GradScaler()

    if os.path.exists(CFG.load_model_path):
        # モデルロード
        LOGGER.info("=> loading checkpoint '{}'".format(CFG.load_model_path))
        states = torch.load(CFG.load_model_path, map_location=torch.device("cpu"))
        model.load_state_dict(states["model"])
        model.to(device)
        if CFG.is_load_opt:
            LOGGER.info("=> loading optimizer and scheduler")
            optimizer.load_state_dict(states["optimizer"])
            scheduler.load_state_dict(states["scheduler"])

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.CrossEntropyLoss()  # loss計算したくないクラスは, ignore_index=1 で指定できる

    best_score = -1  # np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(
            train_loader, model, criterion, optimizer, epoch, scheduler, device, scaler
        )

        # eval
        preds = valid_fn(valid_loader, model, device)
        LOGGER.info(f"labels: {valid_labels[:5]}")
        LOGGER.info(f"preds: {preds[:5]}")

        # scoring
        score = get_score(valid_labels, preds)

        elapsed = time.time() - start_time

        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  lr: {scheduler.get_lr()[0]:.4e}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Score: {score:.4f}")

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(score)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            best_pth = OUTPUT_DIR + f"/{NAME}_best.pth"
            #torch.save(
            #    {
            #        "model": model.state_dict(),
            #        "optimizer": optimizer.state_dict(),
            #        "scheduler": scheduler.state_dict(),
            #        "preds": preds,
            #    },
            #    best_pth,
            #)
            val_pred_df = pd.DataFrame(
                {"id": valid_ids, "label": valid_labels, "pred": preds}
            )
            #val_pred_df.to_csv(CP_DIR + f"/{NAME}_val_pred.csv", index=False)
            #
            ## セッション切れても大丈夫なように都度driveに保存する
            #shutil.copyfile(
            #    best_pth, CP_DIR + f"/{NAME}_" + Path(best_pth).name
            #)
            #shutil.copyfile(OUTPUT_DIR + "/train.log", CP_DIR + f"/{NAME}_train.log")
    
    return val_pred_df


            
            
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    return accuracy_score(y_true, y_pred)


def init_logger(log_file='train.log'):
    """
    学習ログファイル出す
    """
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


def seed_torch(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True


# ====================================================
# Model
# ====================================================
class TimmModel(nn.Module):
    def __init__(self, n_classes, model_name="resnet18", pretrained=True):
        super().__init__()
        self.cnn = timm.create_model(model_name, pretrained=pretrained)
        if "efficient" in model_name:
            self.cnn.classifier = nn.Linear(self.cnn.classifier.in_features, n_classes)
        elif "vit" in model_name:
            self.cnn.head = nn.Linear(self.cnn.head.in_features, n_classes)
        elif "nfnet" in model_name:
            self.cnn.head.fc = nn.Linear(self.cnn.head.fc.in_features, n_classes)
        else:
            self.cnn.fc = nn.Linear(self.cnn.fc.in_features, n_classes)

    def forward(self, x):
        return self.cnn(x)
    
    def forward_argmax(self, x):
        return self.cnn(x).argmax(1)
        

# ====================================================
# Data Load
# ====================================================
# cifar10 ---------------
class CIFAR10DataModule():
    def __init__(self, batch_size=512, data_dir: str = "./"):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),])
        self.dims = (3, 32, 32)
        self.num_classes = 10

    def prepare_data(self):
        # download
        CIFAR10(self.data_dir, train=True, download=True)
        CIFAR10(self.data_dir, train=False, download=True)

    def setup(self, stage=None):
        # Assign train/val datasets for use in dataloaders
        if stage == "fit" or stage is None:
            cifar_full = CIFAR10(self.data_dir, train=True, transform=self.transform)
            self.cifar_train, self.cifar_val = random_split(cifar_full, [45000, 5000])

        # Assign test dataset for use in dataloader(s)
        if stage == "test" or stage is None:
            self.cifar_test = CIFAR10(
                self.data_dir, train=False, transform=self.transform
            )

    def train_dataloader(self):
        return DataLoader(self.cifar_train, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.cifar_val, batch_size=100)

    def test_dataloader(self):
        return DataLoader(self.cifar_test, batch_size=100)


# https://github.com/sinpcw/kaggle-whale2/blob/master/models.py
def loadpth(pth: str, map_location=None) -> OrderedDict:
    """
    パラメータロードのヘルパー関数.
    DataParallel化したモデルは module.xxxx という形式で保存されるため読込み時にmodule.から始まる場合はそれを取除く.
    """
    ostate = torch.load(pth, map_location=map_location)
    nstate = OrderedDict()
    for k, v in ostate.items():
        if k.startswith('module.'):
            nstate[k[len('module.'):]] = v
        else:
            nstate[k] = v
    return nstate

    
# ====================================================
# Param
# ====================================================
CP_DIR = "output"
OUTPUT_DIR = "."
os.makedirs(CP_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
NAME = "cifar10"

epochs = 3
class Config:
    def __init__(self):  # init__()に変数入れないと、CFG.__dict__したに型がすべて文字列になってしまう
        self.num_workers = 8  # os.cpu_count()
        self.seeds = [0]
        self.n_fold = 5
        self.n_classes = 10  # len(set(train["label"].values))  # 回帰にする場合は、Dataloaderのラベルの型をfloatにする + loss と score の式を nn.MSELoss とかにする + valid の model.forward_argmax() を  model() にする
        self.lr = 1e-4
        self.min_lr = 1e-6
        self.weight_decay = 1e-6
        self.optimizer = "adam"
        self.scheduler = "CosineAnnealingLR"  # ['ReduceLROnPlateau', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts']
        self.T_max = epochs  # CosineAnnealingLR
        self.gradient_accumulation_steps = 1
        self.max_grad_norm = 5
        #self.model_name = "resnet18"
        #self.model_name = "resnet101"
        self.model_name = "tf_efficientnet_b7_ns"
        #self.model_name = "tf_efficientnet_l2_ns"
        self.load_model_path = "none"
        #self.load_model_path = "fold0_best.pth"
        self.is_load_opt = True
        self.epochs = epochs
        self.print_freq = 10000  # 学習結果をprintするstep数
        self.device_ids = [0,1]
        #self.batch_size = 512
        self.batch_size = 128
CFG = Config()

if len(CFG.device_ids) > 1:
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(_) for _ in CFG.device_ids])
    print("os.environ['CUDA_VISIBLE_DEVICES']:", os.environ["CUDA_VISIBLE_DEVICES"])
    CFG.batch_size = CFG.batch_size*2
print(CFG.__dict__)

with open("cfg.yaml", "w") as wf:
    yaml.dump(CFG.__dict__, wf)
shutil.copyfile("cfg.yaml", CP_DIR+f"/{NAME}_cfg.yaml")

# ====================================================
# Data
# ====================================================
dm = CIFAR10DataModule(batch_size=CFG.batch_size)
dm.prepare_data()
dm.setup()
n_classes = dm.num_classes

# ====================================================
# LOGGER
# ====================================================
LOGGER = init_logger(OUTPUT_DIR + "/train.log")


# ====================================================
# main
# ====================================================
def main():
    for seed in CFG.seeds:
        seed_torch(seed=seed)

        val_pred_df = train_loop()
        val_pred_df.to_csv(CP_DIR + f"{NAME}_val_pred_seed{seed}.csv", index=False)
        display(val_pred_df)
        
        
if __name__ == '__main__':
    main()
    
LOGGER.info("\ntrain finish!!!")

timm version: 0.5.5
cuda
os.environ['CUDA_VISIBLE_DEVICES']: 0,1
{'num_workers': 8, 'seeds': [0], 'n_fold': 5, 'n_classes': 10, 'lr': 0.0001, 'min_lr': 1e-06, 'weight_decay': 1e-06, 'optimizer': 'adam', 'scheduler': 'CosineAnnealingLR', 'T_max': 3, 'gradient_accumulation_steps': 1, 'max_grad_norm': 5, 'model_name': 'tf_efficientnet_b7_ns', 'load_model_path': 'none', 'is_load_opt': True, 'epochs': 3, 'print_freq': 10000, 'device_ids': [0, 1], 'batch_size': 256}
Files already downloaded and verified
Files already downloaded and verified


Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ns-1dbc32de.pth" to /root/.cache/torch/hub/checkpoints/tf_efficientnet_b7_ns-1dbc32de.pth
=> is_multiGPU [0, 1]


Epoch: [1][0/176] Data 0.049 (0.049) Elapsed 0m 4s (remain 13m 40s) Loss: 2.4065(2.4065) Grad Norm: nan  LR: 1.0000e-04  
Epoch: [1][175/176] Data 0.029 (0.048) Elapsed 0m 58s (remain 0m 0s) Loss: 1.6381(1.9520) Grad Norm: 9.0291  LR: 1.0000e-04  
EVAL: [0/100] Data 0.015 (0.015) Elapsed 0m 0s (remain 1m 5s) 


labels: [3 8 8 0 6]
preds: [9 8 1 8 6]
Epoch 1 - avg_train_loss: 1.9520  lr: 1.0000e-04  time: 76s
Epoch 1 - Score: 0.4206
Epoch 1 - Save Best Score: 0.4206 Model


EVAL: [99/100] Data 0.014 (0.014) Elapsed 0m 10s (remain 0m 0s) 
Epoch: [2][0/176] Data 0.045 (0.045) Elapsed 0m 0s (remain 1m 22s) Loss: 1.6053(1.6053) Grad Norm: 6.9071  LR: 5.6688e-05  
Epoch: [2][175/176] Data 0.028 (0.066) Elapsed 0m 56s (remain 0m 0s) Loss: 1.2165(1.3864) Grad Norm: 7.5109  LR: 5.6688e-05  
EVAL: [0/100] Data 0.015 (0.015) Elapsed 0m 0s (remain 0m 9s) 


labels: [3 8 8 0 6]
preds: [5 8 8 0 6]
Epoch 2 - avg_train_loss: 1.3864  lr: 5.6688e-05  time: 71s
Epoch 2 - Score: 0.5298
Epoch 2 - Save Best Score: 0.5298 Model


EVAL: [99/100] Data 0.014 (0.014) Elapsed 0m 9s (remain 0m 0s) 
Epoch: [3][0/176] Data 0.043 (0.043) Elapsed 0m 0s (remain 0m 54s) Loss: 1.2380(1.2380) Grad Norm: 5.5180  LR: 9.2500e-06  
Epoch: [3][175/176] Data 0.028 (0.068) Elapsed 0m 56s (remain 0m 0s) Loss: 1.1763(1.0763) Grad Norm: 9.6154  LR: 9.2500e-06  
EVAL: [0/100] Data 0.015 (0.015) Elapsed 0m 0s (remain 0m 9s) 


labels: [3 8 8 0 6]
preds: [5 8 8 0 6]
Epoch 3 - avg_train_loss: 1.0763  lr: 9.2500e-06  time: 72s
Epoch 3 - Score: 0.5580
Epoch 3 - Save Best Score: 0.5580 Model


EVAL: [99/100] Data 0.014 (0.014) Elapsed 0m 9s (remain 0m 0s) 


Unnamed: 0,id,label,pred
0,0,3,5
1,1,8,8
2,2,8,8
3,3,0,0
4,4,6,6
...,...,...,...
9995,9995,8,0
9996,9996,3,3
9997,9997,5,5
9998,9998,1,2



train finish!!!


In [2]:
#!CUDA_VISIBLE_DEVICES=0,1 python cifar-10_pytorch.py

In [3]:
%%time

CFG.device_ids = [0]
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
CFG.batch_size = 512

dm = CIFAR10DataModule(batch_size=CFG.batch_size)
dm.prepare_data()
dm.setup()
n_classes = dm.num_classes

if __name__ == '__main__':
    main()
    
LOGGER.info("\ntrain finish!!!")

Files already downloaded and verified
Files already downloaded and verified
Epoch: [1][0/88] Data 0.132 (0.132) Elapsed 0m 3s (remain 4m 29s) Loss: 2.3471(2.3471) Grad Norm: nan  LR: 1.0000e-04  
Epoch: [1][87/88] Data 0.065 (0.094) Elapsed 0m 38s (remain 0m 0s) Loss: 1.8541(2.0792) Grad Norm: 15.4361  LR: 1.0000e-04  
EVAL: [0/100] Data 0.015 (0.015) Elapsed 0m 0s (remain 1m 9s) 


labels: [3 8 8 0 6]
preds: [5 9 8 9 3]
Epoch 1 - avg_train_loss: 2.0792  lr: 1.0000e-04  time: 52s
Epoch 1 - Score: 0.3477
Epoch 1 - Save Best Score: 0.3477 Model


EVAL: [99/100] Data 0.014 (0.014) Elapsed 0m 7s (remain 0m 0s) 
Epoch: [2][0/88] Data 0.087 (0.087) Elapsed 0m 0s (remain 0m 35s) Loss: 1.7268(1.7268) Grad Norm: 11.8674  LR: 5.6688e-05  
Epoch: [2][87/88] Data 0.066 (0.135) Elapsed 0m 36s (remain 0m 0s) Loss: 1.4726(1.6070) Grad Norm: 6.7869  LR: 5.6688e-05  
EVAL: [0/100] Data 0.016 (0.016) Elapsed 0m 0s (remain 0m 7s) 


labels: [3 8 8 0 6]
preds: [5 8 8 8 6]
Epoch 2 - avg_train_loss: 1.6070  lr: 5.6688e-05  time: 49s
Epoch 2 - Score: 0.4465
Epoch 2 - Save Best Score: 0.4465 Model


EVAL: [99/100] Data 0.014 (0.014) Elapsed 0m 6s (remain 0m 0s) 
Epoch: [3][0/88] Data 0.088 (0.088) Elapsed 0m 0s (remain 0m 33s) Loss: 1.3659(1.3659) Grad Norm: 12.9139  LR: 9.2500e-06  
Epoch: [3][87/88] Data 0.066 (0.137) Elapsed 0m 36s (remain 0m 0s) Loss: 1.3561(1.3117) Grad Norm: 8.4783  LR: 9.2500e-06  
EVAL: [0/100] Data 0.015 (0.015) Elapsed 0m 0s (remain 0m 7s) 


labels: [3 8 8 0 6]
preds: [5 8 8 8 6]
Epoch 3 - avg_train_loss: 1.3117  lr: 9.2500e-06  time: 50s
Epoch 3 - Score: 0.4674
Epoch 3 - Save Best Score: 0.4674 Model


EVAL: [99/100] Data 0.014 (0.014) Elapsed 0m 6s (remain 0m 0s) 


Unnamed: 0,id,label,pred
0,0,3,5
1,1,8,8
2,2,8,8
3,3,0,8
4,4,6,6
...,...,...,...
9995,9995,8,0
9996,9996,3,3
9997,9997,5,5
9998,9998,1,2



train finish!!!


CPU times: user 2min 20s, sys: 20.3 s, total: 2min 40s
Wall time: 2min 34s


In [4]:
%%time

CFG.device_ids = [0,1]
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
CFG.batch_size = 512*2

dm = CIFAR10DataModule(batch_size=CFG.batch_size)
dm.prepare_data()
dm.setup()
n_classes = dm.num_classes

if __name__ == '__main__':
    main()
    
LOGGER.info("\ntrain finish!!!")

Files already downloaded and verified
Files already downloaded and verified


=> is_multiGPU [0, 1]


Epoch: [1][0/44] Data 0.216 (0.216) Elapsed 0m 3s (remain 2m 21s) Loss: 2.3448(2.3448) Grad Norm: nan  LR: 1.0000e-04  
Epoch: [1][43/44] Data 0.125 (0.185) Elapsed 0m 30s (remain 0m 0s) Loss: 2.0220(2.1699) Grad Norm: 11.2781  LR: 1.0000e-04  
EVAL: [0/100] Data 0.015 (0.015) Elapsed 0m 0s (remain 0m 9s) 


labels: [3 8 8 0 6]
preds: [5 8 1 8 6]
Epoch 1 - avg_train_loss: 2.1699  lr: 1.0000e-04  time: 46s
Epoch 1 - Score: 0.2518
Epoch 1 - Save Best Score: 0.2518 Model


EVAL: [99/100] Data 0.016 (0.014) Elapsed 0m 9s (remain 0m 0s) 
Epoch: [2][0/44] Data 0.195 (0.195) Elapsed 0m 0s (remain 0m 25s) Loss: 1.9445(1.9445) Grad Norm: 9.4949  LR: 5.6688e-05  
Epoch: [2][43/44] Data 0.134 (0.263) Elapsed 0m 27s (remain 0m 0s) Loss: 1.7242(1.8052) Grad Norm: 11.5136  LR: 5.6688e-05  
EVAL: [0/100] Data 0.014 (0.014) Elapsed 0m 0s (remain 0m 14s) 


labels: [3 8 8 0 6]
preds: [8 8 8 8 2]
Epoch 2 - avg_train_loss: 1.8052  lr: 5.6688e-05  time: 42s
Epoch 2 - Score: 0.3722
Epoch 2 - Save Best Score: 0.3722 Model


EVAL: [99/100] Data 0.014 (0.014) Elapsed 0m 9s (remain 0m 0s) 
Epoch: [3][0/44] Data 0.217 (0.217) Elapsed 0m 0s (remain 0m 24s) Loss: 1.5841(1.5841) Grad Norm: 9.3706  LR: 9.2500e-06  
Epoch: [3][43/44] Data 0.137 (0.268) Elapsed 0m 27s (remain 0m 0s) Loss: 1.5785(1.5692) Grad Norm: 7.8752  LR: 9.2500e-06  
EVAL: [0/100] Data 0.015 (0.015) Elapsed 0m 0s (remain 0m 9s) 


labels: [3 8 8 0 6]
preds: [5 8 9 8 6]
Epoch 3 - avg_train_loss: 1.5692  lr: 9.2500e-06  time: 42s
Epoch 3 - Score: 0.3934
Epoch 3 - Save Best Score: 0.3934 Model


EVAL: [99/100] Data 0.014 (0.014) Elapsed 0m 9s (remain 0m 0s) 


Unnamed: 0,id,label,pred
0,0,3,5
1,1,8,8
2,2,8,9
3,3,0,8
4,4,6,6
...,...,...,...
9995,9995,8,8
9996,9996,3,3
9997,9997,5,3
9998,9998,1,7



train finish!!!


CPU times: user 2min 33s, sys: 19.2 s, total: 2min 52s
Wall time: 2min 13s


In [5]:
# https://www.programcreek.com/python/example/107676/torch.nn.DataParallel
def main(batch_size=512, device_ids=[0]):
    best_acc = 0

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    print('==> Preparing data..')
    transforms_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])

    dataset_train = CIFAR10(root='.', train=True, download=True, 
                            transform=transforms_train)

    train_loader = DataLoader(dataset_train, batch_size=batch_size, 
                              shuffle=True, num_workers=8)

    # there are 10 classes so the dataset name is cifar-10
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 
               'dog', 'frog', 'horse', 'ship', 'truck')

    print('==> Making model..')

    net = TimmModel(10, model_name="resnet50", pretrained=True)
    net.to(device)
    
    if len(device_ids) > 1:
        net = nn.DataParallel(net, device_ids=device_ids)
        net = net.to(device)
    
    num_params = sum(p.numel() for p in net.parameters() if p.requires_grad)
    print('The number of parameters of model is', num_params)

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(net.parameters(), lr=1e-3)
    # optimizer = SGD(net.parameters(), lr=1e-3, 
    #                       momentum=0.9, weight_decay=1e-4)
    
    train(net, criterion, optimizer, train_loader, device) 
    
main()

==> Preparing data..
Files already downloaded and verified
==> Making model..
The number of parameters of model is 23528522


NameError: name 'train' is not defined