In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import math
import pdb
import cv2
import time
import glob
import timm
import random
from PIL import Image
from glob import glob
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp
import torch.nn.functional as F
import albumentations as A

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold, KFold

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



In [2]:
# Rule of thumb used in this notebook for picking a batch size.
# 1. Size of one image in MB = H * W * channels * bits-per-value / 8 (bits -> bytes),
#    then divided by 1024 * 1024 (bytes -> MB). Here: a 512 x 512 RGB image stored as 32-bit floats.
image_size = (512 * 512 * 3 * 32 / 8) / (1024 * 1024)  # MB
# 2. The author's guideline: one batch should take between 1/4 and 1/2 of GPU memory and never
#    more than 1/2, because training has to hold both the forward graph and the backward
#    gradients. Assuming an 8 GB GPU and the 1/4 budget, about 2 GB (2000 MB) is left for a batch.
batch_size_train = int(2000 // image_size)
print(batch_size_train)
# 3. Validation does no back-propagation, so its batch size is twice the training batch size.
batch_size_val = 2 * batch_size_train

666


In [3]:
def set_seed(seed=42):
    """Seed every random number generator used here and enable deterministic PyTorch.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy and PyTorch
            (CPU and CUDA generators).
    """
    # Environment variables. PYTHONHASHSEED is exported for any interpreter started
    # afterwards; it is written as a string because environment values must be strings.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Required by PyTorch for deterministic cuBLAS kernels once
    # torch.use_deterministic_algorithms(True) is active (see the PyTorch
    # reproducibility notes); it is a determinism setting, not a speed setting.
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

    # Seed Python, NumPy, PyTorch (CPU) and PyTorch (current CUDA device), in that order.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True  # restrict cuDNN to deterministic convolution algorithms
    cudnn.benchmark = False  # disable cuDNN's per-shape algorithm auto-tuning
    # Ask PyTorch to use deterministic implementations (usually slower than the
    # non-deterministic ones) so identical inputs and parameters give identical results.
    torch.use_deterministic_algorithms(True)

In [4]:
class CFG:
    """Central configuration shared by the training and inference cells."""

    # step 1: hyperparameter
    seed = 42
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # step 2: data
    n_fold = 4
    img_size = [512, 512]
    train_bs = 32
    valid_bs = train_bs * 2
    # step 3: model
    backbone = 'efficientnet_b0'
    # Class name -> class index. The indices follow alphabetical order because the
    # training cell encodes labels with sklearn's LabelEncoder, which numbers the
    # sorted class names. The previous order (HGSC=0, EC=1, CC=2, ...) disagreed with
    # that encoding, so predictions for CC and HGSC were written out under each
    # other's name.
    label_def = {"CC": 0, "EC": 1, "HGSC": 2, "LGSC": 3, "MC": 4}
    # Class index -> class name; derived so the two maps can never drift apart.
    label_def_re = {index: name for name, index in label_def.items()}
    num_classes = len(label_def)
    # step 4: optimizer
    epoch = 10
    lr = 1e-3
    wd = 1e-5
    lr_drop = 8  # step size (in epochs) handed to the StepLR scheduler
    # step 5: infer
    TTA = True
    # step 6: files
    ckpt_fold = 'ckpt-1120'
    ckpt_name = "efficientnetb0_img512512_bs32_fold4_epoch10_lr1e3"  # for submit
    train_path = "/kaggle/input/ubc-ocean-512/train_images"

In [5]:
def build_transforms(CFG):
    """Build the albumentations pipelines for each dataset split.

    Args:
        CFG: Configuration object; only ``CFG.img_size`` (two values unpacked into
            ``A.Resize``) is read.

    Returns:
        dict with two independent ``A.Compose`` pipelines, keyed ``"train"`` and
        ``"valid_test"``. Both currently apply the same resize + normalisation.
    """
    def _resize_and_normalize():
        # Fresh transform instances on every call so the two pipelines share no objects.
        return [
            A.Resize(*CFG.img_size, interpolation=cv2.INTER_NEAREST, p=1.0),
            # Mean / std commonly used with ImageNet-pretrained backbones.
            A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], p=1.0),
        ]

    # NOTE: the training pipeline has no augmentation enabled. The original cell carried
    # commented-out HorizontalFlip / VerticalFlip / ShiftScaleRotate, a OneOf of
    # Grid / Optical / Elastic distortions, and CoarseDropout; add them after the
    # normalisation step of "train" if augmentation is wanted.
    return {
        "train": A.Compose(_resize_and_normalize()),
        "valid_test": A.Compose(_resize_and_normalize()),
    }

In [6]:
class build_dataset(Dataset):
    """Image dataset backed by a DataFrame of file paths.

    Args:
        df: DataFrame with columns ``img_path`` and ``image_id``; when
            ``train_val_flag`` is true it must also have ``img_label``.
        train_val_flag: True for train/validation (items are ``(image, label)``),
            False for test (items are ``(image, image_id)``).
        transforms: Optional callable invoked as ``transforms(image=ndarray)`` and
            returning a mapping with an ``'image'`` entry (albumentations style).
            ``None`` means the decoded RGB array is used unchanged.
    """

    def __init__(self, df, train_val_flag=True, transforms=None):
        self.df = df
        self.train_val_flag = train_val_flag
        self.img_paths = df['img_path'].tolist()
        self.ids = df['image_id'].tolist()
        self.transforms = transforms

        if self.train_val_flag:
            self.label = df['img_label'].tolist()

    def __len__(self):
        return len(self.df)

    def _load_image(self, index):
        """Read image ``index`` from disk and return it as a [c, h, w] tensor."""
        img = np.array(Image.open(self.img_paths[index]).convert("RGB"))  # [h, w, c]
        # `transforms` defaults to None; calling it unconditionally used to raise
        # "'NoneType' object is not callable".
        if self.transforms is not None:
            img = self.transforms(image=img)['image']
        return torch.tensor(np.transpose(img, (2, 0, 1)))  # [c, h, w]

    def __getitem__(self, index):
        img = self._load_image(index)
        if self.train_val_flag:
            # Train / validation: pair the image with its ground-truth label.
            label = self.label[index]
            return img, torch.from_numpy(np.array(label).astype(int))
        # Test: no ground truth is available, but the id is kept for the submission.
        return img, self.ids[index]

In [7]:
def build_dataloader(df, fold, data_transforms):
    """Create the train and validation loaders for one cross-validation fold.

    Args:
        df: DataFrame with a ``fold`` column plus the columns ``build_dataset`` reads.
        fold: Fold id held out for validation; all other rows are used for training.
        data_transforms: Mapping with ``'train'`` and ``'valid_test'`` pipelines.

    Returns:
        ``(train_loader, valid_loader)``. Batch sizes come from the module-level ``CFG``.
    """
    held_out = df[df["fold"] == fold].reset_index(drop=True)
    remaining = df[df["fold"] != fold].reset_index(drop=True)

    common = dict(num_workers=0, pin_memory=True)
    train_loader = DataLoader(
        build_dataset(remaining, train_val_flag=True, transforms=data_transforms['train']),
        batch_size=CFG.train_bs,
        shuffle=True,
        drop_last=False,
        **common,
    )
    valid_loader = DataLoader(
        build_dataset(held_out, train_val_flag=True, transforms=data_transforms['valid_test']),
        batch_size=CFG.valid_bs,
        shuffle=False,
        **common,
    )
    return train_loader, valid_loader

In [8]:
def build_model(CFG, pretrain_flag=False):
    """Create the timm classifier described by ``CFG`` and move it to ``CFG.device``.

    Args:
        CFG: Configuration object; reads ``backbone``, ``num_classes`` and ``device``.
        pretrain_flag: Truthy to ask timm for pretrained weights, falsy for random
            initialisation.

    Returns:
        The model, already placed on ``CFG.device``.
    """
    # timm's `pretrained` argument is a boolean. The previous code passed the string
    # "imagenet", which only worked because any non-empty string is truthy.
    model = timm.create_model(
        CFG.backbone,
        pretrained=bool(pretrain_flag),
        num_classes=CFG.num_classes,
    )
    model.to(CFG.device)
    return model

In [9]:
def build_loss():
    """Return the training losses keyed by name (currently only ``"CELoss"``)."""
    return dict(CELoss=torch.nn.CrossEntropyLoss())

In [10]:
def train_one_epoch(model, train_loader, optimizer, losses_dict, CFG):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Network to optimise; switched to training mode here.
        train_loader: Iterable of ``(images, labels)`` batches that supports ``len()``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        losses_dict: Mapping containing a ``"CELoss"`` callable ``(logits, targets)``.
        CFG: Configuration object; only ``CFG.device`` is read.

    Returns:
        Mean loss per sample over the epoch as a float (0.0 if the loader was empty).
    """
    model.train()
    # Mixed precision is only enabled on CUDA; on CPU the scaler and autocast are
    # switched off explicitly instead of relying on torch's fallback warnings.
    use_amp = torch.device(CFG.device).type == "cuda"
    # NOTE: the scaler is created on every call, so its loss-scale state restarts
    # each epoch.
    scaler = amp.GradScaler(enabled=use_amp)
    loss_sum, ce_sum, n_samples = 0.0, 0.0, 0

    pbar = tqdm(enumerate(train_loader), total=len(train_loader), desc="Train ")
    for _, (images, gt) in pbar:
        optimizer.zero_grad()

        images = images.to(CFG.device, dtype=torch.float)  # [b, c, h, w]
        gt = gt.to(CFG.device)

        with amp.autocast(enabled=use_amp):
            y_preds = model(images)  # [b, num_classes] logits
            ce_loss = losses_dict["CELoss"](y_preds, gt.long())
            losses = ce_loss

        scaler.scale(losses).backward()
        scaler.step(optimizer)
        scaler.update()

        # Weight each batch loss by its size before summing. The old code summed the
        # per-batch values and divided by the number of samples, which under-reported
        # the loss by roughly a factor of the batch size.
        # Assumes the loss uses mean reduction (the CrossEntropyLoss default) - TODO confirm
        # if a different loss is plugged in.
        batch_size = gt.shape[0]
        loss_sum += losses.item() * batch_size
        ce_sum += ce_loss.item() * batch_size
        n_samples += batch_size

    denom = max(n_samples, 1)  # avoid ZeroDivisionError on an empty loader
    mean_loss = loss_sum / denom
    current_lr = optimizer.param_groups[0]['lr']
    print("lr: {:.4f}".format(current_lr), flush=True)
    # flush=True belongs to print(); it used to be passed to str.format() by mistake.
    print("loss: {:.3f}, ce_all: {:.3f}".format(mean_loss, ce_sum / denom), flush=True)
    return mean_loss

补充知识：各类归一化层（例如 Batch Normalization）都包含两个可学习的参数：缩放系数（scale factor，记作 $\gamma$）和平移系数（shift factor，记作 $\beta$），即 $y_i = \gamma \hat{x}_i + \beta$。

参考：https://blog.csdn.net/wildridder/article/details/88534844 （文中 $y_i$ 公式里的 gamma 和 beta 就是这两个参数）

In [11]:
@torch.no_grad()  # 不要梯度图
def valid_one_epoch(model, valid_loader, CFG):
    model.eval()  # 计算出当前epoch中，把Norm中的scale/shift factor固定住
    correct = 0
    total = 0
    
    pbar = tqdm(enumerate(valid_loader), total=len(valid_loader), desc="Valid ")
    for _, (images, gt) in pbar:
        images = images.to(CFG.device, dtype=torch.float) # [b, c, w, h]
        gt = gt.to(CFG.device)
        
        y_preds = model(images)
        _, y_preds = torch.max(y_preds.data, dim=1)
        correct += (y_preds==gt).sum()
        
        total += gt.shape[0]
    
    val_acc = correct / total
    print("val_acc: {:.2f}".format(val_acc), flush=True)
    
    return val_acc

@torch.no_grad()
def test_one_epoch(ckpt_paths, test_loader, CFG):
    pred_ids = []
    pred_cls = []
    pbar = tqdm(enumerate(test_loader), total=len(test_loader), desc="Test: ")
    for _, (images, ids) in pbar:
        images = images.to(CFG.device, dtype=torch.float)  # [b, c, w, h]
        model = build_model(CFG, pretrain_flag=False)
        model.load_state_dict(torch.load(ckpt_paths))
        model.eval()  # 固定 BN layer
        
        y_preds = model(images) # [bs, num_cls]  tensor([1.6443, -0.7230, -2.0762, 0.5059, -1.1870])
        y_prob = F.softmax(y_preds, dim=1) # tensor([0.6679, 0.0626, 0.0162, 0.2139, 0.0394]) 和为1的
        cls_pred = y_prob.argmax(1)  
        
        for pred in cls_pred.data.cpu().numpy():
            pred_cls.append(pred)
        for id in ids.data.cpu().numpy().tolist():
            pred_ids.append(id) # 41 即41.png
    return pred_ids, pred_cls

### 训练过程 

将 `train_val_flag` 设为 `True` 才会执行训练；下面的单元格当前将其设为 `False`，因此训练会被跳过。

In [12]:
set_seed(CFG.seed)
ckpt_path = f"/kaggle/working/{CFG.ckpt_fold}/{CFG.ckpt_name}"
os.makedirs(ckpt_path, exist_ok=True)  # no separate existence check needed

# Set to True to run the cross-validated training below; False skips it.
train_val_flag = False
if train_val_flag:
    ############################################
    ###### part 0: data preprocess & simple EDA
    ############################################
    df_ori = pd.read_csv("/kaggle/input/train-csv/train.csv")
    # Optional, project-specific sample selection (disabled): keeping only rows with
    # is_tma == True selects the small images (about 25 images of roughly 3000 x 3000,
    # per the original author's note).

    # Label mapping. The indices come from CFG.label_def so that training uses the same
    # class order as CFG.label_def_re, which inference uses to turn predictions back into
    # names. The previous LabelEncoder assigned indices alphabetically, independent of
    # CFG, so the two could disagree.
    df_ori['label_str'] = df_ori['label']
    encoded = df_ori['label_str'].map(CFG.label_def)
    unknown_labels = df_ori.loc[encoded.isna(), 'label_str'].unique().tolist()
    if unknown_labels:
        raise ValueError(f"labels missing from CFG.label_def: {unknown_labels}")
    df_ori['img_label'] = encoded.astype(int)
    df_ori['label'] = df_ori['img_label']  # 'label' is overwritten with the encoded value, as before

    # Path mapping: image id -> file path, from the PNG files found under CFG.train_path.
    FILES = sorted(glob(CFG.train_path + "/*.png"))
    ID2FILE = {
        int(os.path.basename(file).split(".")[0]): file for file in FILES
    }

    def get_train_file_path(image_id):
        """Return the training image path for ``image_id`` (KeyError if not on disk)."""
        return ID2FILE[image_id]
    df_ori['img_path'] = df_ori['image_id'].apply(get_train_file_path)

    ############################################################
    ###### trick 1: cross validation train
    ###### Rows are split into CFG.n_fold folds. For each fold k, the rows tagged
    ###### fold == k are the validation set and all other rows are the training set.
    ############################################################
    # documents: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
    kf = KFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
    for fold, (train_idx, val_idx) in enumerate(kf.split(df_ori)):
        df_ori.loc[val_idx, 'fold'] = fold  # adds a 'fold' column holding 0 .. n_fold-1

    for fold in range(CFG.n_fold):
        print("#" * 40, flush=True)
        print(f"##### Fold: {fold}", flush=True)
        print("#" * 40, flush=True)
        print("Device: ", CFG.device)

        ############################################################
        ###### step 2: combination
        ###### build_transform & build_dataset() & build_dataloader()
        ###### build_model() & build_loss()
        ############################################################
        data_transforms = build_transforms(CFG)
        train_loader, valid_loader = build_dataloader(df_ori, fold, data_transforms)
        model = build_model(CFG, pretrain_flag=False)
        optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.wd)
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, CFG.lr_drop)
        losses_dict = build_loss()  # loss

        best_val_acc = 0
        best_epoch = 0

        for epoch in range(1, CFG.epoch + 1):
            start_time = time.time()
            ############################################################
            ###### step 3: train & val
            ############################################################
            train_one_epoch(model, train_loader, optimizer, losses_dict, CFG)
            lr_scheduler.step()
            # float() keeps the bookkeeping on the host even if a 0-dim tensor is returned.
            val_acc = float(valid_one_epoch(model, valid_loader, CFG))
            ############################################################
            ###### step 4: Save best model
            ############################################################
            is_best = (val_acc > best_val_acc)
            best_val_acc = max(val_acc, best_val_acc)
            if is_best:
                best_epoch = epoch
                save_path = f"{ckpt_path}/best_fold{fold}_epoch{epoch}.pth"
                # torch.save overwrites an existing file, so no remove-first step is needed.
                torch.save(model.state_dict(), save_path)

            epoch_time = time.time() - start_time
            print("epoch: {}, time: {:.2f}s, best: {:.2f}\n".format(epoch, epoch_time, best_val_acc), flush=True)
                

### 测试过程

`test_flag = True`

In [13]:
test_flag = True
if test_flag:
    ############################################
    ###### part 0: data preprocess & simple EDA
    ############################################
    ROOT_DIR = '/kaggle/input/'
    TEST_DIR = '/kaggle/input/test-thumbnails'
    ALT_TEST_DIR = '/kaggle/input/test_images'
    # Despite the name this is the checkpoint *file*. For a local run point it at
    # f"/kaggle/working/{CFG.ckpt_fold}/{CFG.ckpt_name}/best_fold1_epoch4.pth" instead
    # (the file name changes from run to run).
    CKPT_DIR = "/kaggle/input/ubc-temp/best_fold1_epoch4.pth"  # for kaggle submission

    def get_test_file_path(image_id):
        """Return the thumbnail path for ``image_id`` if that file exists, else the full-image path."""
        thumbnail_path = f"{TEST_DIR}/{image_id}_thumbnail.png"
        if not os.path.exists(thumbnail_path):
            return f"{ALT_TEST_DIR}/{image_id}.png"
        return thumbnail_path

    df = pd.read_csv(f"{ROOT_DIR}/ubc-test-csv/test.csv")
    df['img_path'] = df['image_id'].map(get_test_file_path)
    df['label'] = 0  # placeholder column; the test dataset does not read labels

    ############################################
    ###### part 1: data load & pred
    ############################################
    data_transforms = build_transforms(CFG)
    test_dataset = build_dataset(
        df,
        train_val_flag=False,
        transforms=data_transforms['valid_test'],
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=CFG.valid_bs,
        num_workers=0,
        shuffle=False,
        pin_memory=True,
    )

    pred_ids, pred_cls = test_one_epoch(CKPT_DIR, test_loader, CFG)

    ############################################
    ###### part 2: submit
    ############################################
    def label_mapping(image_str):
        """Translate a predicted class index into its class name via ``CFG.label_def_re``."""
        return CFG.label_def_re[image_str]

    pred_df = pd.DataFrame({"image_id": pred_ids, "label": pred_cls})
    pred_df['label'] = pred_df['label'].apply(label_mapping)
    pred_df.to_csv('submission.csv', index=False)

Test: 100%|██████████| 1/1 [00:08<00:00,  8.04s/it]
