In [1]:
import os
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from models.resnet import resnet50
from my_dataset import MyDataset
from my_lossfunc import JointLoss, MultilabelLoss, DiscriminativeLoss
from my_transform import data_transforms
from scipy.spatial.distance import pdist, cdist  # 一集合点距, 两集合点距
from torch.utils.data import DataLoader
from tqdm import tnrange
from tqdm import tqdm_notebook as tqdm
from utils import *

# Restrict which GPUs are visible to this process. A value that is already
# exported in the environment wins, so the notebook also runs on machines whose
# GPU layout differs from the default "1,2,3". This must happen before the
# first CUDA call below.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "1,2,3")
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Device:', DEVICE)


# Dataset root. Override with the REID_DATA_ROOT environment variable instead
# of editing this cell; the default keeps the original location.
BASE = os.environ.get('REID_DATA_ROOT', '/home/zengrui/datasets')
DUKE_DIR_TRAIN = f'{BASE}/ReID_Duke/bounding_box_train'
DUKE_DIR_TEST = f'{BASE}/ReID_Duke/bounding_box_test'
DUKE_IMG_AMOUNT = 16522  # presumably the number of Duke training images -- TODO confirm
DUKE_ID_AMOUNT = 702     # classifier width (num_classes) and soft-multilabel dimension
MARKET_DIR_TRAIN = f'{BASE}/ReID_Market/bounding_box_train'
MARKET_DIR_GALLERY = f'{BASE}/ReID_Market/bounding_box_test'
MARKET_DIR_PROBE = f'{BASE}/ReID_Market/query'
MARKET_IMG_AMOUNT = 12936  # number of rows in the Trainer's soft-multilabel memory
MARKET_ID_AMOUNT = 751
ML_PATH = 'data/ml_Market.dat'                        # cache for the initial soft multilabels
PRETRAIN_PATH = 'data/pretrained_weight.pkl'          # default checkpoint (loaded / overwritten)
PRETRAIN_OUT_PATH = 'data/pretrained_weight_{}.pkl'   # timestamped checkpoint template

# Optimisation schedule.
BATCH_SIZE = 96
EPOCH = 30
LR = 0.01

# Loss weights: total = tgt + LAMB1 * ml + LAMB2 * (src + BETA * st).
BETA = 0.2
LAMB1 = 2e-4
LAMB2 = 50
MARGIN = 1     # passed to JointLoss
SCALA_CE = 30  # scale applied to similarities before softmax / cross-entropy

Device: cuda


# Preparation

In [2]:
# Data loaders, keyed by role. Each row of the table below is
# (role, image directory, transform stage, extra MyDataset keyword arguments).
# The two training splits are shuffled; the two evaluation splits keep their
# on-disk order.
data_loader = {
    role: DataLoader(
        dataset=MyDataset(root,
                          transform=data_transforms(stage),
                          **dataset_kwargs),
        batch_size=BATCH_SIZE,
        shuffle=(stage == 'train'),
    )
    for role, root, stage, dataset_kwargs in (
        ('source', DUKE_DIR_TRAIN, 'train',
         dict(require_view=False, encode_label=True)),
        ('target', MARKET_DIR_TRAIN, 'train',
         dict(require_view=True, encode_label=True)),
        ('gallery', MARKET_DIR_GALLERY, 'test',
         dict(require_view=True)),
        ('probe', MARKET_DIR_PROBE, 'test',
         dict(require_view=True)),
    )
}
print('data_loader: ok.')

data_loader: ok.


# Trainer

In [3]:
class Trainer(object):
    """Trains a ResNet-50 on the source (Duke) and target (Market) loaders.

    The total loss of every step combines four terms::

        total = tgt + LAMB1 * ml + LAMB2 * (src + BETA * st)

    where ``src`` is cross-entropy on source batches, ``st`` is the joint
    source/target loss, ``ml`` is the multilabel loss on target batches and
    ``tgt`` is the discriminative loss on target batches (computed from the
    second epoch on).
    """

    def __init__(self):
        """Build the network, the four losses, the optimizer and the label memory."""
        # Network: the classifier has one output per source identity.
        self.net = resnet50(pretrained=False,
                            num_classes=DUKE_ID_AMOUNT)
        self.net = nn.DataParallel(self.net).to(DEVICE)
        if PRETRAIN_PATH is not None and os.path.exists(PRETRAIN_PATH):
            # map_location lets a checkpoint saved on another device (or GPU
            # index) load on whatever DEVICE resolved to here.
            self.net.load_state_dict(
                torch.load(PRETRAIN_PATH, map_location=DEVICE))
            print('Pretrained model loaded.')
        else:
            print('Pretrained model not found. Train from scratch.')

        # Losses
        self.mdl_loss = DiscriminativeLoss(0.001).to(DEVICE)
        self.al_loss = nn.CrossEntropyLoss().to(DEVICE)
        # NOTE(review): the original author flagged this call as
        # "lack 1 param" -- confirm against JointLoss's signature.
        self.rj_loss = JointLoss(MARGIN).to(DEVICE)
        self.cml_loss = MultilabelLoss(BATCH_SIZE).to(DEVICE)

        # Optimizer; the schedule steps at 5/8 and 7/8 of the epochs.
        self.optimizer = torch.optim.SGD(
            self.net.parameters(), lr=LR, momentum=0.9)
        self.lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            self.optimizer, milestones=[int(EPOCH / 8 * 5), int(EPOCH / 8 * 7)])

        # Memory: one soft multilabel row per target image (kept on the CPU),
        # plus a flag per row telling whether it has been written yet.
        self.ml_mem = torch.zeros(MARKET_IMG_AMOUNT, DUKE_ID_AMOUNT)
        self.inited = self.ml_mem.sum(dim=1) != 0

    def train(self):
        '''Run the complete training schedule (EPOCH epochs).'''
        print('Training start. Epochs: %d' % EPOCH)
        self.net.train()
        for epoch in tnrange(EPOCH):
            self.train_epoch(epoch)

    def _update_ml_memory(self, ml, b_idx):
        '''Write the batch's soft multilabels into the memory.

        Rows that were never written are set to the new value; rows that
        already hold a value are blended as 0.9 * old + 0.1 * new.

        :param ml: soft multilabels of the current target batch
        :param b_idx: dataset indices of the samples in the batch
        :return: the updated memory rows for ``b_idx``
        '''
        ml_cpu = ml.detach().cpu()
        is_inited_batch = self.inited[b_idx]
        inited_idx = b_idx[is_inited_batch]
        uninited_idx = b_idx[~is_inited_batch]
        # Empty rows: full update.
        self.ml_mem[uninited_idx] = ml_cpu[~is_inited_batch]
        self.inited[uninited_idx] = True
        # Non-empty rows: small (moving-average) update.
        self.ml_mem[inited_idx] = 0.9 * self.ml_mem[inited_idx] \
                                + 0.1 * ml_cpu[is_inited_batch]
        return self.ml_mem[b_idx]

    def train_epoch(self, epoch):
        '''Train for one epoch over the source loader.

        One target batch is drawn for every source batch; the target iterator
        is restarted whenever it runs out.

        :param epoch: zero-based epoch index; the ``tgt`` loss is only
            computed when ``epoch > 0``
        '''
        stats = ('total', 'src', 'st', 'ml', 'tgt')
        running_loss = {stat: AverageMeter() for stat in stats}

        if not self.mdl_loss.initialized:
            self.init_losses(data_loader['target'])

        with tqdm(total=len(data_loader['source'])) as pbar:
            tgt_iter = iter(data_loader['target'])
            for ax, ay in data_loader['source']:
                # a - source, b - target
                ax = ax.to(DEVICE)
                ay = ay.to(DEVICE)
                try:
                    b = next(tgt_iter)
                except StopIteration:
                    tgt_iter = iter(data_loader['target'])
                    b = next(tgt_iter)
                (bx, by, b_view, b_idx) = b
                bx, by, b_view = bx.to(DEVICE), by.to(DEVICE), b_view.to(DEVICE)

                a_f, a_sim, _ = self.net(ax)
                b_f, b_sim, _ = self.net(bx)

                # Every term starts at zero so that terms which are skipped
                # (e.g. 'tgt' in the first epoch) still log and sum cleanly.
                loss = {stat: torch.zeros(1, device=DEVICE)
                        for stat in stats}

                # Supervised cross-entropy on the source batch.
                loss['src'] = self.al_loss(a_sim * SCALA_CE, ay)

                # Row-normalised classifier weights: renorm caps each row's
                # L2 norm at 1e-5 and the multiplication rescales it by 1e5.
                # presumably shape (702, 2048) -- TODO confirm
                agents = self.net.module.fc.weight.renorm(2, 0, 1e-5).mul(1e5)
                loss['st'] = self.rj_loss(agents.detach(), a_f, a_sim.detach(), ay,
                                          b_f, b_sim.detach())

                with torch.no_grad():
                    # Out-of-place transpose: same values as the in-place
                    # t_(), but `agents` (part of the autograd graph) is left
                    # untouched.
                    ml = F.softmax(b_f.mm(agents.t() * SCALA_CE), dim=1)
                loss['ml'] = self.cml_loss(torch.log(ml), b_view)

                # NOTE(review): the original author asked why mdl_loss is
                # skipped during the first epoch -- still unanswered.
                if epoch > 0:
                    ml_target = self._update_ml_memory(ml, b_idx)
                    loss['tgt'] = self.mdl_loss(b_f, ml_target, by)

                self.optimizer.zero_grad()
                loss['total'] = loss['tgt'] + LAMB1 * loss['ml'] \
                              + LAMB2 * (loss['src'] + BETA * loss['st'])
                loss['total'].backward()
                self.optimizer.step()

                for stat in stats:
                    running_loss[stat].update(loss[stat].item())
                pbar.set_description('Loss: %.4f' % running_loss['total'].avg)
                pbar.update()

            self.lr_scheduler.step()
            pbar.set_description('Progress:')
            print('Epoch: %d, Loss: %.4f (%.4f + %.4f + %.4f + %.4f)'
                  % (epoch,
                     running_loss['total'].avg,
                     running_loss['src'].avg * LAMB2,
                     running_loss['st'].avg * LAMB2 * BETA,
                     running_loss['ml'].avg * LAMB1,
                     running_loss['tgt'].avg))

    def eval_performance(self, target_loader, gallery_loader, probe_loader):
        '''Evaluate retrieval of the probe set against the gallery set.

        Puts the network in eval mode, extracts features for both sets,
        computes the cosine distance matrix and scores it with
        ``eval_cmc_map``.

        :param target_loader: unused; kept for interface compatibility
        :param gallery_loader: loader of gallery images
        :param probe_loader: loader of probe (query) images
        :return: dict of AverageMeter keyed by 'r1', 'r5', 'r10' and 'MAP'
        '''
        stats = ('r1', 'r5', 'r10', 'MAP')
        val = {stat: AverageMeter() for stat in stats}
        self.net.eval()

        gallery_f, gallery_y, gallery_views = extract_features(
            gallery_loader, self.net, index_feature=0)
        probe_f, probe_y, probe_views = extract_features(
            probe_loader, self.net, index_feature=0)
        dist = cdist(gallery_f, probe_f, metric='cosine')
        CMC, MAP = eval_cmc_map(
            dist, gallery_y, probe_y, gallery_views, probe_views, ignore_MAP=False)

        # Explicit mapping instead of a locals() lookup through an undefined
        # name; float() accepts Python floats, numpy scalars and 0-d tensors.
        results = {'r1': CMC[0], 'r5': CMC[4], 'r10': CMC[9], 'MAP': MAP}
        for stat in stats:
            val[stat].update(float(results[stat]), BATCH_SIZE)
        return val

    def init_losses(self, tgt_loader):
        '''Initialise the loss parameters before training.

        Loads the cached soft multilabels from ML_PATH when the file exists;
        otherwise computes them from ``tgt_loader`` and writes the cache.
        Then initialises the multilabel-loss centers and the
        discriminative-loss threshold.

        :param tgt_loader: loader of target-domain training images
        '''
        print('#' * 8, 'Initializing losses', '#' * 8)
        if os.path.isfile(ML_PATH):
            # NOTE(review): torch.load unpickles the file -- only load caches
            # that this notebook produced.
            (ml, views, pairwise_agreements) = torch.load(ML_PATH)
            print('Ml loaded.')
        else:
            print('Ml not found, computing...')
            sim, _, views = extract_features(
                tgt_loader, self.net, index_feature=1, return_numpy=False)
            ml = F.softmax(sim * SCALA_CE, dim=1)
            ml_np = ml.cpu().numpy()
            # Agreement of two soft multilabels: 1 - (L1 distance) / 2.
            pairwise_agreements = 1 - pdist(ml_np, 'minkowski', p=1) / 2
            print('Ml saving to %s...' % ML_PATH)
            torch.save((ml, views, pairwise_agreements), ML_PATH)

        self.cml_loss.init_centers(torch.log(ml), views)
        print('Cml_loss centers inited.')
        self.mdl_loss.init_threshold(pairwise_agreements)
        print('Mdl_loss threshold inited.')
        print('#' * 8, 'OK', '#' * 8)

    def save_model(self, cover=False):
        '''
        Save the parameters of the current network.

        :param cover: True overwrites the default file (PRETRAIN_PATH);
            False writes a new file whose name carries a timestamp
        '''
        if cover:
            torch.save(self.net.state_dict(), PRETRAIN_PATH)
        else:
            torch.save(self.net.state_dict(), PRETRAIN_OUT_PATH.format(time.time()))
        print('Model weight saved.')

# Train

In [4]:
# Build the trainer (loads the checkpoint at PRETRAIN_PATH when it exists),
# then run the full EPOCH-epoch training schedule.
trainer = Trainer()
trainer.train()

Pre-trained model loaded.
Training start. Epochs: 30


HBox(children=(IntProgress(value=0, max=30), HTML(value='')))

######## Initializing losses ########
Ml loaded.
Cml_loss centers inited.
Mdl_loss threshold inited.
######## OK ########


HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 0, Loss: 71.3993 (58.4740 + 12.0500 + 0.8753 + 0.0000)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 1, Loss: 69.7113 (56.9485 + 11.9994 + 0.1691 + 0.5943)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 2, Loss: 68.7956 (56.1195 + 11.9559 + 0.1456 + 0.5747)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 3, Loss: 66.7055 (54.0475 + 11.9342 + 0.1389 + 0.5849)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 4, Loss: 65.7732 (53.1588 + 11.8853 + 0.1410 + 0.5881)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 5, Loss: 64.2783 (51.7005 + 11.8379 + 0.1421 + 0.5978)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 6, Loss: 64.1345 (51.5798 + 11.8195 + 0.1424 + 0.5927)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 7, Loss: 62.4723 (49.9944 + 11.7448 + 0.1418 + 0.5914)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 8, Loss: 61.0411 (48.5983 + 11.7114 + 0.1404 + 0.5910)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 9, Loss: 59.8699 (47.4327 + 11.6880 + 0.1449 + 0.6044)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 10, Loss: 59.2143 (46.8263 + 11.6403 + 0.1417 + 0.6060)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 11, Loss: 58.0917 (45.7545 + 11.5975 + 0.1435 + 0.5962)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 12, Loss: 56.8820 (44.5744 + 11.5534 + 0.1424 + 0.6119)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 13, Loss: 55.7237 (43.4499 + 11.5211 + 0.1443 + 0.6085)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 14, Loss: 54.3753 (42.1523 + 11.4678 + 0.1448 + 0.6104)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 15, Loss: 53.0909 (40.9400 + 11.4091 + 0.1432 + 0.5987)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 16, Loss: 53.2075 (41.0686 + 11.3962 + 0.1420 + 0.6007)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 17, Loss: 51.3622 (39.2890 + 11.3300 + 0.1415 + 0.6016)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 18, Loss: 48.3347 (36.3719 + 11.2089 + 0.1413 + 0.6127)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 19, Loss: 47.3945 (35.4275 + 11.2277 + 0.1397 + 0.5997)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 20, Loss: 47.7074 (35.7434 + 11.2270 + 0.1389 + 0.5981)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 21, Loss: 47.1715 (35.1937 + 11.2285 + 0.1394 + 0.6099)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 22, Loss: 47.2792 (35.3174 + 11.2165 + 0.1428 + 0.6026)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 23, Loss: 46.6880 (34.7435 + 11.2100 + 0.1406 + 0.5939)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 24, Loss: 47.1880 (35.2148 + 11.2292 + 0.1406 + 0.6033)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 25, Loss: 46.5468 (34.5813 + 11.2104 + 0.1383 + 0.6167)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 26, Loss: 46.2470 (34.2921 + 11.2072 + 0.1406 + 0.6071)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 27, Loss: 46.0727 (34.1213 + 11.2022 + 0.1406 + 0.6086)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 28, Loss: 46.1970 (34.2461 + 11.1988 + 0.1377 + 0.6143)



HBox(children=(IntProgress(value=0, max=173), HTML(value='')))

Epoch: 29, Loss: 46.1508 (34.2094 + 11.1932 + 0.1394 + 0.6089)




In [6]:
# Save the weights twice: first to a new timestamped file, then over the
# default checkpoint at PRETRAIN_PATH so the next run picks them up.
trainer.save_model()
trainer.save_model(cover=True)

Model weight saved.


# Eval

In [None]:
# Score probe-vs-gallery retrieval on Market; the result holds the
# 'r1', 'r5', 'r10' and 'MAP' meters.
trainer.eval_performance(data_loader['target'], data_loader['gallery'], data_loader['probe'])