In [None]:
import sys

# Put the parent directory at the front of the module search path.
# NOTE(review): presumably this is what lets the `core.*` imports in the cells
# below resolve when the notebook runs from its own folder - confirm.
sys.path.insert(0, "../")

In [None]:
# default_exp trainer

In [None]:
#export
import builtins
import math
import os
import random
import shutil
import time
import warnings
from tqdm import tqdm
import numpy as np
import argparse

In [None]:
#export

from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
from torch.utils.data import DataLoader

In [None]:
#export

from core.model.model import MoCo
from core.dataloader import CLEVR_train, collate_boxes, CLEVR_train_onlyquery, collate_boxes_onlyquery
from core.utils import compute_features, run_kmeans, AverageMeter, ProgressMeter, adjust_learning_rate, accuracy, save_checkpoint, DoublePool_O, store_to_pool, random_retrieve_topk, plot_query_retrieval

In [None]:
# Command-line interface for the training run.
parser = argparse.ArgumentParser(description='Relational 2d Training')

# The positional dataset argument is currently disabled:
# parser.add_argument('data', metavar='DIR',
#                     help='path to datasets root directory')

# ---- data loading and run length -------------------------------------------
parser.add_argument(
    '-j', '--num-worker',
    type=int,
    default=1,
    metavar='N',
    help='number of data loading workers (default: 1)',
)
parser.add_argument(
    '--epochs',
    type=int,
    default=200,
    metavar='N',
    help='number of total epochs to run',
)
parser.add_argument(
    '--start-epoch',
    type=int,
    default=0,
    metavar='N',
    help='manual epoch number (useful on restarts)',
)
parser.add_argument(
    '-b', '--batch-size',
    type=int,
    default=16,
    metavar='N',
    help='mini-batch size (default: 16), this is the total '
         'batch size of all GPUs on the current node when '
         'using Data Parallel or Distributed Data Parallel',
)

# ---- optimiser --------------------------------------------------------------
parser.add_argument(
    '--lr', '--learning-rate',
    dest='lr',
    type=float,
    default=0.03,
    metavar='LR',
    help='initial learning rate',
)
parser.add_argument(
    '--schedule',
    type=int,
    nargs='*',
    default=[120, 160],
    help='learning rate schedule (when to drop lr by 10x)',
)
parser.add_argument(
    '--momentum',
    type=float,
    default=0.9,
    metavar='M',
    help='momentum of SGD solver',
)
parser.add_argument(
    '--wd', '--weight-decay',
    dest='weight_decay',
    type=float,
    default=1e-4,
    metavar='W',
    help='weight decay (default: 1e-4)',
)

# ---- logging and checkpoints ------------------------------------------------
parser.add_argument(
    '-p', '--print-freq',
    type=int,
    default=100,
    metavar='N',
    help='print iter frequency (default: 100)',
)
parser.add_argument(
    '--resume',
    type=str,
    default='',
    metavar='PATH',
    help='path to latest checkpoint (default: none)',
)

# ---- reproducibility, devices and clustering --------------------------------
parser.add_argument(
    '--seed',
    type=int,
    default=None,
    help='seed for initializing training. ',
)
parser.add_argument(
    "--gpu",
    type=int,
    nargs='+',
    default=None,
    help='GPU id to use.',
)
parser.add_argument(
    '--warmup-epoch',
    type=int,
    default=10,
    help='number of warm-up epochs to only train with InfoNCE loss',
)
parser.add_argument(
    '--cos',
    action='store_true',
    help='use cosine lr schedule',
)
parser.add_argument(
    '--exp-dir',
    type=str,
    default='experiment_pcl',
    help='experiment directory to store tb logs and checkpoints',
)
# Kept as a comma-separated string here; it is parsed as plain text by argparse.
parser.add_argument(
    '--num-cluster',
    type=str,
    default='50,100,200',
    help='number of clusters (should be less than equal to number of samples)',
)

_StoreAction(option_strings=['--num-cluster'], dest='num_cluster', nargs=None, const=None, default='50,100,200', type=<class 'str'>, choices=None, help='number of clusters (should be less than equal to number of samples)', metavar=None)

In [None]:
import os

# Create the TensorBoard log root. `exist_ok=True` replaces the separate
# exists-then-create check, so the cell is safe to re-run and cannot fail if
# the directory appears between the check and the creation.
os.makedirs('./tb_logs', exist_ok=True)

In [None]:
#export

def setup_tb(exp_name):
    """Build the TensorBoard writer for one experiment.

    Args:
        exp_name: Experiment name; it becomes the sub-directory of
            ``./tb_logs`` that the writer is pointed at.

    Returns:
        The ``SummaryWriter`` constructed with ``./tb_logs/<exp_name>``.
    """
    return SummaryWriter(os.path.join('./tb_logs', exp_name))

In [None]:
#export

def run_training(args):
    """Train the MoCo model, with periodic retrieval validation and checkpoints.

    Steps: optional seeding, output-directory creation, construction of the
    training / k-means / validation loaders and their retrieval pools, model
    and SGD optimizer setup, optional checkpoint restore, then the epoch loop.
    From ``args.warmup_epoch`` onwards each epoch first recomputes features
    and clusters them with ``run_kmeans``; the result is passed to ``train``.

    Args:
        args: Parsed options namespace. Attributes read here: ``seed``,
            ``gpu``, ``num_cluster``, ``exp_dir``, ``data``, ``hyp_N``,
            ``batch_size``, ``mode``, ``moco_r``, ``lr``, ``momentum``,
            ``weight_decay``, ``resume``, ``use_pretrained``, ``start_epoch``,
            ``epochs`` and ``warmup_epoch``. The namespace is also forwarded
            to the helper functions, which may read more.

    Side effects:
        * ``args.num_cluster`` is replaced by a list of strings when it is
          given as a comma-separated string.
        * ``args.start_epoch`` is overwritten when resuming from a checkpoint.
        * Creates ``args.exp_dir`` and ``./tb_logs/<args.exp_dir>``; writes
          TensorBoard events and checkpoints under the latter.
    """
    if args.seed is not None:
        random.seed(args.seed)
        # Seed NumPy too so every RNG the process may draw from is pinned.
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    # Split the CLI string once. The isinstance guard keeps a second call with
    # the same (already mutated) namespace from failing on `list.split`.
    if isinstance(args.num_cluster, str):
        args.num_cluster = args.num_cluster.split(',')

    # makedirs(exist_ok=True) also creates a missing './tb_logs' parent and
    # nested experiment names, both of which made os.mkdir raise.
    os.makedirs(args.exp_dir, exist_ok=True)
    os.makedirs(os.path.join('./tb_logs', args.exp_dir), exist_ok=True)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    print('==> Preparing data..')

    traindir = args.data
    # The validation list path is derived by swapping the last five characters
    # of the training path for 'v.txt'.
    # NOTE(review): assumes args.data ends in a five-character suffix such as
    # 't.txt' - confirm against the dataset layout.
    valdir = args.data[:-5] + 'v.txt'

    moco_train_dataset = CLEVR_train(root_dir=traindir, hyp_N=args.hyp_N)
    moco_train_loader = DataLoader(moco_train_dataset, batch_size=args.batch_size,
                                   shuffle=True, collate_fn=collate_boxes)

    # Unshuffled, query-only view of the training data used for clustering.
    kmeans_train_dataset = CLEVR_train_onlyquery(root_dir=traindir, hyp_N=args.hyp_N)
    kmeans_train_loader = DataLoader(kmeans_train_dataset, batch_size=5*args.batch_size,
                                     shuffle=False, collate_fn=collate_boxes_onlyquery)

    isnode = args.mode == 'node'

    train_pool_size = len(moco_train_dataset)
    pool_e_train = DoublePool_O(train_pool_size, isnode)
    pool_g_train = DoublePool_O(train_pool_size, isnode)

    moco_val_dataset = CLEVR_train(root_dir=valdir, hyp_N=args.hyp_N)
    moco_val_loader = DataLoader(moco_val_dataset, batch_size=1, shuffle=True,
                                 collate_fn=collate_boxes)

    val_pool_size = len(moco_val_dataset)
    pool_e_val = DoublePool_O(val_pool_size, isnode)
    pool_g_val = DoublePool_O(val_pool_size, isnode)

    print('==> Making model..')

    model = MoCo(mode=args.mode, r=args.moco_r)
    model = model.to(device)
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('The number of parameters of model is', num_params)

    # Follow the selected device instead of hard-coding CUDA.
    criterion = nn.CrossEntropyLoss().to(device)

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            # map_location lets a checkpoint saved on GPU load on a CPU-only host.
            checkpoint = torch.load(args.resume, map_location=device)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # optionally initialise weights from a pretrained checkpoint
    if args.use_pretrained:
        if os.path.isfile(args.use_pretrained):
            print("=> loading checkpoint '{}'".format(args.use_pretrained))
            state_dict = torch.load(args.use_pretrained, map_location=device)['state_dict']
            # Keep this model's own 'queue' / 'queue_ptr' entries; pop() tolerates
            # checkpoints that do not carry them.
            state_dict.pop('queue', None)
            state_dict.pop('queue_ptr', None)

            model_dict = model.state_dict()
            model_dict.update(state_dict)

            model.load_state_dict(model_dict)
        else:
            print("=> no checkpoint found at '{}'".format(args.use_pretrained))

    tb_logger = setup_tb(args.exp_dir)
    try:
        for epoch in range(args.start_epoch, args.epochs):

            cluster_result = None

            if epoch >= args.warmup_epoch:
                # Recompute features over the k-means loader and cluster them.
                # NOTE(review): `.numpy()` is called directly on the result, so
                # compute_features presumably returns a CPU tensor - confirm.
                features = compute_features(kmeans_train_loader, model, args)
                features = features.numpy()
                cluster_result = run_kmeans(features, args)

            adjust_learning_rate(optimizer, epoch, args)

            train(moco_train_loader, model, criterion, optimizer, epoch, args,
                  cluster_result, tb_logger, pool_e_train, pool_g_train)

            if (epoch+1) % 5 == 0:
                val_retrieval(moco_val_loader, model, epoch, args, tb_logger,
                              pool_e_val, pool_g_val)
            if (epoch+1) % 50 == 0:
                # The stored 'epoch' is the next epoch to run (epoch + 1) while the
                # file name carries the zero-based index of the epoch just finished.
                save_checkpoint({
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'optimizer' : optimizer.state_dict(),
                }, is_best=False, filename='./tb_logs/{}/checkpoint_{}.pth.tar'.format(args.exp_dir, str(epoch)))
    finally:
        # Flush and release the event file even when training is interrupted.
        tb_logger.close()

    

In [None]:
#export

def train(train_loader, model, criterion, optimizer, epoch, args, cluster_result=None, tb_logger=None, pool_e=None, pool_g=None):
    """Run one training epoch and optionally log it to TensorBoard.

    Args:
        train_loader: Iterable yielding ``(feed_dict_q, feed_dict_k, metadata)``;
            ``metadata["index"]`` is forwarded to the model. Must support ``len``.
        model: Called as ``model(feed_dict_q, feed_dict_k, metadata,
            cluster_result=..., index=...)`` and expected to return
            ``(output, target, output_proto, target_proto)``; the two
            prototype entries may be ``None``.
        criterion: Loss callable applied to ``(output, target)`` pairs.
        optimizer: Optimizer stepped once per batch.
        epoch: Current epoch index (progress prefix and TensorBoard step).
        args: Options namespace; reads ``batch_size``, ``num_cluster``,
            ``print_freq`` and ``ret_freq`` and is forwarded to helpers.
        cluster_result: Clustering result handed to the model, or ``None``.
        tb_logger: Optional writer exposing ``add_scalar`` / ``add_figure``.
            When ``None`` nothing is logged.
        pool_e, pool_g: Optional retrieval pools. When either is ``None`` the
            batches are not stored and no retrieval figure is produced.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    acc_inst = AverageMeter('Acc@Inst', ':6.2f')
    acc_proto = AverageMeter('Acc@Proto', ':6.2f')

    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, acc_inst, acc_proto],
        prefix="Epoch: [{}]".format(epoch))

    # The pool arguments default to None, so pool work has to be optional.
    have_pools = pool_e is not None and pool_g is not None

    # switch to train mode
    model.train()

    end = time.time()
    for i, (feed_dict_q, feed_dict_k, metadata) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # compute output
        index = metadata["index"]
        output, target, output_proto, target_proto = model(
            feed_dict_q, feed_dict_k, metadata,
            cluster_result=cluster_result, index=index)

        # InfoNCE loss
        loss = criterion(output, target)

        # ProtoNCE loss
        if output_proto is not None:
            loss_proto = 0
            for proto_out, proto_target in zip(output_proto, target_proto):
                loss_proto += criterion(proto_out, proto_target)
                accp = accuracy(proto_out, proto_target)[0]
                acc_proto.update(accp[0], args.batch_size)

            # average loss across all sets of prototypes
            loss_proto /= len(args.num_cluster)
            loss += loss_proto

        losses.update(loss.item(), args.batch_size)
        acc = accuracy(output, target)[0]
        acc_inst.update(acc[0], args.batch_size)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if have_pools:
            # store to pool
            store_to_pool(pool_e, pool_g, feed_dict_q, feed_dict_k, metadata, model, args)
            # Re-enter train mode after the pool update, as the original loop did.
            # NOTE(review): presumably store_to_pool switches the model to eval - confirm.
            model.train()

        # Measure the whole iteration and restart the clock only after the pool
        # update, so its cost is not counted as data-loading time next iteration.
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)

    if tb_logger is not None:
        print("Logging to TB....")
        tb_logger.add_scalar('Train Acc Inst', acc_inst.avg, epoch)
        tb_logger.add_scalar('Train Acc Prototype', acc_proto.avg, epoch)
        tb_logger.add_scalar('Train Total Loss', losses.avg, epoch)

        if have_pools and epoch % args.ret_freq == 0:
            figures = random_retrieve_topk(args, pool_e, pool_g, imgs_to_view=3)
            tb_logger.add_figure('Train Top10 Retrieval', figures, epoch)
        

In [None]:
# run_training reads its options as attributes (args.exp_dir, args.seed, ...),
# so the CLI-style token list has to be parsed first; passing the raw list
# raised "AttributeError: 'list' object has no attribute 'exp_dir'".
# NOTE(review): run_training also reads args.data, args.hyp_N, args.mode,
# args.moco_r and args.use_pretrained (and train reads args.ret_freq), none of
# which the parser in this notebook defines - set them on the namespace before
# a real run.
run_training(args=parser.parse_args("--exp-dir test_run".split()))

AttributeError: 'list' object has no attribute 'exp_dir'

In [None]:
#export 

def val_retrieval(val_loader, model, epoch, args, tb_logger=None, pool_e=None, pool_g=None):
    """Fill the validation pools and optionally log a retrieval figure.

    For every validation batch the model is evaluated on the query and key
    inputs, each feature row is paired with its source image (and boxes), and
    the rows are pushed into ``pool_e`` (query side) and ``pool_g`` (key side).

    Args:
        val_loader: Iterable yielding ``(feed_dict_q, feed_dict_k, metadata)``.
            Each feed dict provides ``"images"`` and ``"objects_boxes"``; the
            buffers below are allocated as ``(rows, 3, 256, 256)`` and
            ``(rows, 4)``, so images and boxes must fit those shapes.
        model: Switched to eval mode and called as
            ``model(feed_dict, None, metadata, is_eval=True)``.
        epoch: Step value used for the TensorBoard figure.
        args: Options namespace; reads ``mode`` and ``hyp_N`` and is forwarded
            to ``random_retrieve_topk``.
        tb_logger: Optional writer exposing ``add_figure``. When ``None`` no
            figure is built or logged.
        pool_e, pool_g: Pools exposing ``update``. When either is ``None``
            there is nothing to fill and the function returns right after
            switching the model to eval mode.

    Returns:
        None.
    """
    model.eval()

    # Both pool arguments default to None; without them there is nothing to do.
    if pool_e is None or pool_g is None:
        return

    node_mode = args.mode == 'node'
    n_hyp = args.hyp_N

    for feed_dict_q, feed_dict_k, metadata in val_loader:
        with torch.no_grad():
            feat_q = model(feed_dict_q, None, metadata, is_eval=True)
            feat_k = model(feed_dict_k, None, metadata, is_eval=True)

            boxes_q = feed_dict_q["objects_boxes"]
            boxes_k = feed_dict_k["objects_boxes"]

            num_rows = feat_q.shape[0]
            # Boxes are stored n_hyp per scene, so this is the scene count.
            num_scenes = boxes_q.shape[0] // n_hyp

            img_q = torch.zeros([num_rows, 3, 256, 256])
            img_k = torch.zeros([num_rows, 3, 256, 256])

            if node_mode:
                # One row per object: repeat each scene image n_hyp times.
                cnt = 0
                for b in range(num_scenes):
                    for _ in range(n_hyp):
                        img_q[cnt] = feed_dict_q["images"][b]
                        img_k[cnt] = feed_dict_k["images"][b]
                        cnt += 1

                pool_e.update(feat_q, img_q, boxes_q, None)
                pool_g.update(feat_k, img_k, boxes_k, None)

            else:
                subj_q = torch.zeros([num_rows, 4])
                subj_k = torch.zeros([num_rows, 4])
                obj_q = torch.zeros([num_rows, 4])
                obj_k = torch.zeros([num_rows, 4])

                # One row per ordered (subject, object) pair inside a scene. A
                # single pass fills the image, subject and object buffers with
                # the same row ordering the three separate loops produced.
                cnt = 0
                for b in range(num_scenes):
                    start_idx = b * n_hyp
                    for s in range(n_hyp):
                        for o in range(n_hyp):
                            img_q[cnt] = feed_dict_q["images"][b]
                            img_k[cnt] = feed_dict_k["images"][b]
                            subj_q[cnt] = boxes_q[start_idx + s]
                            obj_q[cnt] = boxes_q[start_idx + o]
                            subj_k[cnt] = boxes_k[start_idx + s]
                            obj_k[cnt] = boxes_k[start_idx + o]
                            cnt += 1

                pool_e.update(feat_q, img_q, subj_q, obj_q)
                pool_g.update(feat_k, img_k, subj_k, obj_k)

    # tb_logger defaults to None; only build the figure when it can be logged.
    if tb_logger is not None:
        figures = random_retrieve_topk(args, pool_e, pool_g, imgs_to_view=5)
        tb_logger.add_figure('Validation Top10 Retrieval', figures, epoch)

    return

