In [1]:
import matplotlib.pyplot as plt
import os
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from models.resnet import resnet50
from my_dataset import MyDataset
from my_lossfunc import JointLoss, MultilabelLoss, DiscriminativeLoss
from my_transform import data_transforms
from PIL import Image
from scipy.spatial.distance import pdist, cdist  # 一集合点距, 两集合点距
from torch.utils.data import DataLoader
from tqdm import tnrange
from tqdm import tqdm_notebook as tqdm
from utils import *

# Respect a GPU selection already made by the launcher / job scheduler;
# only fall back to GPUs 0-3 when nothing was set.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0,1,2,3")
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Device:', DEVICE)


# Dataset root. Override with the REID_DATA_ROOT environment variable to run
# on another machine; the default is the original author's path.
BASE = os.environ.get('REID_DATA_ROOT', '/home/zengrui/datasets')
DUKE_DIR_TRAIN = f'{BASE}/ReID_Duke/bounding_box_train'
DUKE_DIR_TEST = f'{BASE}/ReID_Duke/bounding_box_test'
DUKE_DIR_QUERY = f'{BASE}/ReID_Duke/query'
DUKE_IMG_AMOUNT = 16522
DUKE_ID_AMOUNT = 702
MARKET_DIR_TRAIN = f'{BASE}/ReID_Market/bounding_box_train'
MARKET_DIR_TEST = f'{BASE}/ReID_Market/bounding_box_test'
MARKET_DIR_QUERY = f'{BASE}/ReID_Market/query'
MARKET_IMG_AMOUNT = 12936
MARKET_ID_AMOUNT = 751

# Transfer direction: Duke is the source domain, Market the target domain.
SOURCE_DIR_TRAIN = DUKE_DIR_TRAIN
TARGET_DIR_TRAIN = MARKET_DIR_TRAIN
TARGET_DIR_GALLERY = MARKET_DIR_TEST
TARGET_DIR_PROBE = MARKET_DIR_QUERY
SOURCE_ID_AMOUNT = DUKE_ID_AMOUNT      # classifier width of the network
TARGET_IMG_AMOUNT = MARKET_IMG_AMOUNT  # number of rows in the multilabel memory
ML_PATH = 'data/ml_Market.dat'                       # cache written/read by Trainer.init_losses
PRETRAIN_PATH = 'data/pretrained_weight.pkl'         # checkpoint loaded at start, overwritten by save_model(cover=True)
PRETRAIN_OUT_PATH = 'data/pretrained_weight_{}.pkl'  # '{}' is filled with time.time() by save_model()

BATCH_SIZE = 135
EPOCH = 20
LR = 0.01

# Loss weights, as combined in Trainer.train_epoch:
#   total = tgt + LAMB1 * ml + LAMB2 * (src + BETA * st)
BETA = 0.2
LAMB1 = 2e-4
LAMB2 = 50
MARGIN = 1      # passed to JointLoss
SCALA_CE = 30   # multiplier applied to similarities before softmax / cross-entropy
WD = 2.5e-2     # SGD weight decay (disabled for the 'bn' parameter group)

Device: cuda


# Preparation

In [2]:
# Data loaders for the source train set and the target train / gallery / probe sets.
def _make_loader(root, split, shuffle, **dataset_kwargs):
    """Wrap MyDataset(root) with the `split` transforms in a BATCH_SIZE DataLoader."""
    return DataLoader(
        dataset=MyDataset(root,
                          transform=data_transforms(split),
                          **dataset_kwargs),
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
    )


data_loader = {
    'source': _make_loader(SOURCE_DIR_TRAIN, 'train', True,
                           require_view=False, encode_label=True),
    'target': _make_loader(TARGET_DIR_TRAIN, 'train', True,
                           require_view=True, encode_label=True),
    'gallery': _make_loader(TARGET_DIR_GALLERY, 'val', False,
                            require_view=True),
    'probe': _make_loader(TARGET_DIR_PROBE, 'val', False,
                          require_view=True),
}
print('data_loader: ok.')

data_loader: ok.


# Trainer

In [3]:
%matplotlib inline
class Trainer(object):
    """Trains and evaluates a re-ID network on a source and a target dataset.

    The network is a ResNet-50 with SOURCE_ID_AMOUNT output classes wrapped in
    ``nn.DataParallel``. Each step combines four losses:

        total = tgt + LAMB1 * ml + LAMB2 * (src + BETA * st)

    where ``src`` is cross-entropy on source labels, ``st`` comes from
    JointLoss, ``ml`` from MultilabelLoss and ``tgt`` from DiscriminativeLoss.
    """

    def __init__(self):
        # Network
        self.net = resnet50(pretrained=False,
                            num_classes=SOURCE_ID_AMOUNT)
        self.net = nn.DataParallel(self.net).to(DEVICE)
        if PRETRAIN_PATH is not None and os.path.exists(PRETRAIN_PATH):
            # map_location lets a checkpoint saved on GPU load when DEVICE is CPU.
            self.net.load_state_dict(
                torch.load(PRETRAIN_PATH, map_location=DEVICE))
            print('Pretrained model loaded.')
        else:
            print('Pretrained model not found. Train from scratch.')

        # Losses
        self.mdl_loss = DiscriminativeLoss(0.001).to(DEVICE)
        self.al_loss = nn.CrossEntropyLoss().to(DEVICE)
        # NOTE: the original author flagged this call as "lack 1 param" --
        # TODO confirm against the JointLoss constructor.
        self.rj_loss = JointLoss(MARGIN).to(DEVICE)
        self.cml_loss = MultilabelLoss(BATCH_SIZE).to(DEVICE)

        # Optimizer: the 'bn' parameter group is excluded from weight decay.
        bn_params, other_params = partition_params(self.net, 'bn')
        self.optimizer = torch.optim.SGD([
                {'params': bn_params, 'weight_decay': 0},
                {'params': other_params},
            ], lr=LR, momentum=0.9, weight_decay=WD
        )
        # LR is decayed at 5/8 and 7/8 of the total number of epochs.
        self.lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            self.optimizer, milestones=[int(EPOCH / 8 * 5), int(EPOCH / 8 * 7)])

        # Memory bank: one soft-multilabel row per target image (kept on CPU),
        # plus a per-row flag telling whether the row has been written yet.
        self.ml_mem = torch.zeros(TARGET_IMG_AMOUNT, SOURCE_ID_AMOUNT)
        self.inited = self.ml_mem.sum(dim=1) != 0

    def train(self):
        '''Run one complete training session of EPOCH epochs.'''
        print('Training start. Epochs: %d' % EPOCH)
        self.net.train()
        for epoch in tnrange(EPOCH):
            self.train_epoch(epoch)

    def train_epoch(self, epoch):
        '''Train for one epoch over the source loader.

        The target loader is cycled alongside the source loader and restarted
        whenever it is exhausted. Prints the averaged loss terms at the end.

        :param epoch: zero-based epoch index; the ``tgt`` loss and the memory
            bank update are skipped when it is 0.
        '''
        stats = ('total', 'src', 'st', 'ml', 'tgt')
        running_loss = {stat: AverageMeter() for stat in stats}

        if not self.mdl_loss.initialized:
            self.init_losses(data_loader['target'])
            # Feature extraction may have switched the net to eval mode.
            self.net.train()

        with tqdm(total=len(data_loader['source'])) as pbar:
            tgt_iter = iter(data_loader['target'])
            for ax, ay in data_loader['source']:
                # a - source, b - target
                ax = ax.to(DEVICE)
                ay = ay.to(DEVICE)
                try:
                    b = next(tgt_iter)
                except StopIteration:
                    tgt_iter = iter(data_loader['target'])
                    b = next(tgt_iter)
                (bx, by, b_view, b_idx) = b
                bx, by, b_view = bx.to(DEVICE), by.to(DEVICE), b_view.to(DEVICE)

                a_f, a_sim, _ = self.net(ax)
                b_f, b_sim, _ = self.net(bx)

                loss = {stat: torch.zeros(1, device=DEVICE)
                        for stat in stats}

                # Supervised cross-entropy on the source batch.
                loss['src'] = self.al_loss(a_sim * SCALA_CE, ay)

                # Classifier rows rescaled to L2 norm <= 1: renorm caps each
                # row's norm at 1e-5, then mul(1e5) scales it back up.
                # One row per source identity (SOURCE_ID_AMOUNT rows).
                agents = self.net.module.fc.weight.renorm(2, 0, 1e-5).mul(1e5)
                loss['st'] = self.rj_loss(agents.detach(), a_f, a_sim.detach(), ay,
                                          b_f, b_sim.detach())

                with torch.no_grad():
                    # Non-mutating transpose: `agents` itself is left untouched.
                    ml = F.softmax(b_f.mm(agents.t() * SCALA_CE), dim=1)
                # NOTE(review): `ml` is built under no_grad, so this term cannot
                # back-propagate into the network; it can only affect training
                # through parameters of cml_loss itself, if it has any.
                # Confirm this is intended.
                loss['ml'] = self.cml_loss(torch.log(ml), b_view)

                # Open question from the original author: why is mdl_loss not
                # computed during the first epoch?
                if epoch > 0:
                    ml_cpu = ml.detach().cpu()
                    is_inited_batch = self.inited[b_idx]
                    inited_idx = b_idx[is_inited_batch]
                    uninited_idx = b_idx[~is_inited_batch]
                    # Rows never written before are overwritten outright ...
                    self.ml_mem[uninited_idx] = ml_cpu[~is_inited_batch]
                    self.inited[uninited_idx] = True
                    # ... rows already written get a 0.9 / 0.1 moving average.
                    self.ml_mem[inited_idx] = 0.9 * self.ml_mem[inited_idx] \
                                            + 0.1 * ml_cpu[is_inited_batch]
                    loss['tgt'] = self.mdl_loss(b_f, self.ml_mem[b_idx], by)

                self.optimizer.zero_grad()
                loss['total'] = loss['tgt'] + LAMB1 * loss['ml'] \
                              + LAMB2 * (loss['src'] + BETA * loss['st'])
                loss['total'].backward()
                self.optimizer.step()

                # Weight by the real batch size: the last batch of the epoch
                # can be smaller than BATCH_SIZE.
                n_samples = ax.size(0)
                for stat in stats:
                    # .item() works for both 0-d and one-element tensors.
                    running_loss[stat].update(loss[stat].item(), n_samples)
                pbar.set_description('Loss: %.4f' % running_loss['total'].avg)
                pbar.update()

            self.lr_scheduler.step()
            pbar.set_description('Progress:')
            print('Epoch: %d, Loss: %.4f (%.4f + %.4f + %.4f + %.4f)'
                  % (epoch,
                     running_loss['total'].avg,
                     running_loss['src'].avg * LAMB2,
                     running_loss['st'].avg * LAMB2 * BETA,
                     running_loss['ml'].avg * LAMB1,
                     running_loss['tgt'].avg))

    def eval_performance(self, gallery_loader, probe_loader):
        '''Evaluate retrieval quality of the current network.

        Extracts features for both loaders, ranks by cosine distance and
        stores the results on ``self.r1``, ``self.r5``, ``self.r10`` and
        ``self.MAP``. Leaves the network in eval mode.

        :param gallery_loader: loader for the gallery set
        :param probe_loader: loader for the probe (query) set
        :return: dict mapping 'r1', 'r5', 'r10', 'MAP' to an AverageMeter
            holding the corresponding value
        '''
        stats = ('r1', 'r5', 'r10', 'MAP')
        val = {stat: AverageMeter() for stat in stats}
        self.net.eval()

        gallery_f, gallery_y, gallery_views = extract_features(
            gallery_loader, self.net, index_feature=0)
        probe_f, probe_y, probe_views = extract_features(
            probe_loader, self.net, index_feature=0)
        # scipy's 'cosine' metric is 1 - cos, in [0, 2]; smaller = more similar.
        dist = cdist(gallery_f, probe_f, metric='cosine')
        # The views are passed to the metric as well; an earlier variant of
        # this call omitted gallery_views / probe_views.
        CMC, MAP, _example = eval_cmc_map(
            dist, gallery_y, probe_y, gallery_views, probe_views,
            ignore_MAP=False, show_example=True)
        r1, r5, r10 = CMC[0], CMC[4], CMC[9]
        self.r1, self.r5, self.r10, self.MAP = r1, r5, r10, MAP

        # Explicit mapping instead of a locals() lookup.
        results = {'r1': r1, 'r5': r5, 'r10': r10, 'MAP': MAP}
        for stat in stats:
            val[stat].update(results[stat].item(), BATCH_SIZE)

        return val

    def init_losses(self, tgt_loader):
        '''Initialise the loss modules' internal statistics before training.

        Loads cached soft multilabels from ML_PATH when that file exists;
        otherwise computes them from ``tgt_loader`` with the current network
        and writes the cache.

        :param tgt_loader: loader over the target training set
        '''
        print('#' * 8, 'Initializing losses', '#' * 8)
        if os.path.isfile(ML_PATH):
            # NOTE(review): torch.load unpickles the file -- only load caches
            # you produced yourself. The cache is not tied to a checkpoint, so
            # delete it when the pretrained weights change.
            (ml, views, pairwise_agreements) = torch.load(ML_PATH)
            print('Ml loaded.')
        else:
            print('Ml not found, computing...')
            # Use the loader handed in by the caller, not the global dict.
            sim, _, views = extract_features(
                tgt_loader, self.net, index_feature=1, return_numpy=False)
            ml = F.softmax(sim * SCALA_CE, dim=1)
            ml_np = ml.cpu().numpy()
            # 1 - (L1 distance / 2) between every pair of softmax rows.
            pairwise_agreements = 1 - pdist(ml_np, 'minkowski', p=1) / 2
            print('Ml saving to %s...' % ML_PATH)
            torch.save((ml, views, pairwise_agreements), ML_PATH)

        self.cml_loss.init_centers(torch.log(ml), views)
        print('Cml_loss centers inited.')
        self.mdl_loss.init_threshold(pairwise_agreements)
        print('Mdl_loss threshold inited.')
        print('#' * 8, 'OK', '#' * 8)

    def save_model(self, cover=False):
        '''
        Save the current parameters of ``self.net``.

        :param cover: True overwrites the default file at PRETRAIN_PATH;
            False writes a new file whose name carries a timestamp.
        '''
        if cover:
            torch.save(self.net.state_dict(), PRETRAIN_PATH)
            print('Model weight saved(cover).')
        else:
            path = PRETRAIN_OUT_PATH.format(time.time())
            torch.save(self.net.state_dict(), path)
            print('Model weight saved(%s).' % path)

# Train

In [4]:
# Build the trainer (loads PRETRAIN_PATH when that file exists) and run all EPOCH epochs.
trainer = Trainer()
trainer.train()

Pretrained model loaded.
Training start. Epochs: 20


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

######## Initializing losses ########
Ml loaded.
Cml_loss centers inited.
Mdl_loss threshold inited.
######## OK ########


HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

Epoch: 0, Loss: 5.8766 (1.1384 + 4.6037 + 0.1344 + 0.0000)



HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

Epoch: 1, Loss: 6.4664 (1.2119 + 4.6429 + 0.1094 + 0.5022)



HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

Epoch: 2, Loss: 6.5649 (1.2799 + 4.6522 + 0.1055 + 0.5273)



HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

Epoch: 3, Loss: 6.7088 (1.3432 + 4.7182 + 0.1120 + 0.5355)



HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

Epoch: 4, Loss: 8.6022 (2.8322 + 5.1125 + 0.1121 + 0.5453)



HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

Epoch: 5, Loss: 14.6287 (7.8522 + 6.0889 + 0.1220 + 0.5655)



HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

Epoch: 6, Loss: 29.3314 (21.1651 + 7.4341 + 0.1325 + 0.5997)



HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

Epoch: 7, Loss: 48.8997 (39.9620 + 8.2002 + 0.1427 + 0.5947)



HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

Epoch: 8, Loss: 59.8443 (50.7297 + 8.3704 + 0.1400 + 0.6043)



HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

Epoch: 9, Loss: 65.8259 (56.5264 + 8.5390 + 0.1434 + 0.6172)



HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

Epoch: 10, Loss: 64.8356 (55.4091 + 8.6538 + 0.1451 + 0.6276)



HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

Epoch: 11, Loss: 63.3450 (53.6327 + 8.9444 + 0.1439 + 0.6241)



HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

Epoch: 12, Loss: 33.8628 (25.1132 + 7.9811 + 0.1342 + 0.6343)



HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

Epoch: 13, Loss: 21.9608 (13.7871 + 7.3938 + 0.1324 + 0.6476)



HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

Epoch: 14, Loss: 19.1229 (11.0867 + 7.2617 + 0.1298 + 0.6448)



HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

Epoch: 15, Loss: 17.4543 (9.5215 + 7.1569 + 0.1319 + 0.6440)



HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

Epoch: 16, Loss: 15.8727 (8.0517 + 7.0642 + 0.1307 + 0.6261)



HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

Epoch: 17, Loss: 14.5699 (6.8455 + 6.9688 + 0.1306 + 0.6250)



HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

Epoch: 18, Loss: 14.3251 (6.6311 + 6.9395 + 0.1330 + 0.6214)



HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

Epoch: 19, Loss: 13.9450 (6.2386 + 6.9248 + 0.1297 + 0.6519)




In [5]:
# Save twice: first a timestamped snapshot, then overwrite the default checkpoint at PRETRAIN_PATH.
trainer.save_model()
trainer.save_model(cover=True)

Model weight saved.
Model weight saved.


# Eval

In [6]:
# trainer = Trainer()

In [7]:
# Evaluate on the target gallery / probe sets. The returned dict holds AverageMeter
# objects; the plain numbers are also stored on trainer.r1 / r5 / r10 / MAP.
trainer.eval_performance(data_loader['gallery'], data_loader['probe'])

{'r1': <utils.AverageMeter at 0x7f25f3603b50>,
 'r5': <utils.AverageMeter at 0x7f25f3603d10>,
 'r10': <utils.AverageMeter at 0x7f25f3603f90>,
 'MAP': <utils.AverageMeter at 0x7f25f3603250>}

In [8]:
# CMC at ranks 1, 5 and 10 plus MAP, as set by the last eval_performance call.
trainer.r1, trainer.r5, trainer.r10, trainer.MAP

(30.997624703087883, 48.07007125890736, 55.99762470308789, 15.470718581181242)

In [9]:
# NOTE(review): ends the session -- presumably to release the GPUs; anything placed after this cell will not run under "Run All".
exit()