In [1]:
import sys
sys.path.append("../")

In [2]:
# default_exp trainer

In [3]:
#export
import builtins
import math
import os
import random
import shutil
import time
import warnings
from tqdm import tqdm
import numpy as np
import argparse

In [4]:
#export

from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
from torch.utils.data import DataLoader

In [5]:
#export

from core.model.model import MoCo
from core.dataloader import GQNDataset_pdisco, collate_boxes
from core.utils import compute_features, run_kmeans, AverageMeter, ProgressMeter, adjust_learning_rate, accuracy, save_checkpoint

In [6]:
def _build_parser():
    """Assemble the command-line options for relational 2D training.

    Returns:
        An ``argparse.ArgumentParser`` carrying the optimisation, scheduling,
        checkpointing and logging flags used by the training cells.
    """
    cli = argparse.ArgumentParser(description='Relational 2d Training')
    add = cli.add_argument

    # A positional `data` directory option is deliberately not registered:
    # the dataset location is set inside run_training.

    # --- data loading / run length ---
    add('-j', '--num-worker', type=int, default=1, metavar='N',
        help='number of data loading workers (default: 1)')
    add('--epochs', type=int, default=200, metavar='N',
        help='number of total epochs to run')
    add('--start-epoch', type=int, default=0, metavar='N',
        help='manual epoch number (useful on restarts)')
    add('-b', '--batch-size', type=int, default=16, metavar='N',
        help='mini-batch size (default: 16), this is the total '
             'batch size of all GPUs on the current node when '
             'using Data Parallel or Distributed Data Parallel')

    # --- optimiser ---
    add('--lr', '--learning-rate', dest='lr', type=float, default=0.03,
        metavar='LR', help='initial learning rate')
    add('--schedule', type=int, nargs='*', default=[120, 160],
        help='learning rate schedule (when to drop lr by 10x)')
    add('--momentum', type=float, default=0.9, metavar='M',
        help='momentum of SGD solver')
    add('--wd', '--weight-decay', dest='weight_decay', type=float,
        default=1e-4, metavar='W', help='weight decay (default: 1e-4)')

    # --- reporting / restart ---
    add('-p', '--print-freq', type=int, default=100, metavar='N',
        help='print iter frequency (default: 100)')
    add('--resume', type=str, default='', metavar='PATH',
        help='path to latest checkpoint (default: none)')

    # --- reproducibility, devices, schedule shape, output location ---
    add('--seed', type=int, default=None,
        help='seed for initializing training. ')
    add("--gpu", type=int, nargs='+', default=None, help='GPU id to use.')
    add('--warmup-epoch', type=int, default=10,
        help='number of warm-up epochs to only train with InfoNCE loss')
    add('--cos', action='store_true',
        help='use cosine lr schedule')
    add('--exp-dir', type=str, default='experiment_pcl',
        help='experiment directory to store tb logs and checkpoints')
    return cli


parser = _build_parser()

_StoreAction(option_strings=['--exp-dir'], dest='exp_dir', nargs=None, const=None, default='experiment_pcl', type=<class 'str'>, choices=None, help='experiment directory to store tb logs and checkpoints', metavar=None)

In [7]:
import os
# Create the shared TensorBoard root directory. exist_ok=True makes the cell
# safe to re-run and avoids the race in a separate exists()/makedirs() pair.
os.makedirs('../tb_logs', exist_ok=True)

In [8]:
#export

def setup_tb(exp_name, log_root='../tb_logs'):
    """Create a TensorBoard writer for one experiment.

    Args:
        exp_name: experiment name; used as the sub-directory under ``log_root``.
        log_root: directory holding the logs of all experiments. Defaults to
            ``'../tb_logs'``, the location this function previously hard-coded,
            so existing callers are unaffected.

    Returns:
        A ``SummaryWriter`` that logs to ``<log_root>/<exp_name>``.
    """
    tb_directory = os.path.join(log_root, exp_name)
    return SummaryWriter(tb_directory)

In [9]:
#export

def run_training(args):
    """Train MoCo on the GQN dataset, adding k-means prototypes after warm-up.

    Builds the data loaders, model, loss and SGD optimizer, optionally resumes
    from ``args.resume``, then runs epochs ``args.start_epoch`` up to (but not
    including) ``args.epochs``. From epoch ``args.warmup_epoch`` onwards,
    features are computed with ``compute_features`` and clustered with
    ``run_kmeans`` before each epoch, and the clustering result is handed to
    ``train``. A checkpoint is written to
    ``<args.exp_dir>/checkpoint.pth.tar`` after every fifth epoch.

    Args:
        args: parsed options (e.g. an ``argparse.Namespace``). Read here:
            ``exp_dir``, ``seed``, ``gpu``, ``batch_size``, ``lr``,
            ``momentum``, ``weight_decay``, ``resume``, ``start_epoch``,
            ``epochs`` and ``warmup_epoch``; ``arch`` is stored in checkpoints
            when present. ``args`` is also forwarded to ``compute_features``,
            ``run_kmeans``, ``adjust_learning_rate`` and ``train``, which may
            read further attributes. ``args.start_epoch`` is overwritten when
            a checkpoint is resumed.
    """
    # Output locations for checkpoints and TensorBoard events. makedirs with
    # exist_ok also copes with nested experiment names and repeated runs.
    os.makedirs(args.exp_dir, exist_ok=True)
    os.makedirs(os.path.join('../tb_logs', args.exp_dir), exist_ok=True)

    if args.seed is not None:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    print('==> Preparing data..')

    # NOTE(review): the dataset index file is a machine-specific absolute path.
    train_dataset = GQNDataset_pdisco(root_dir='/home/mprabhud/dataset/clevr_veggies/npys/be_lt.txt')
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, collate_fn=collate_boxes)
    # Loader used for the feature pass that precedes clustering. It was
    # previously referenced but never defined (NameError at the first
    # post-warm-up epoch).
    # NOTE(review): assumes compute_features wants an unshuffled pass over the
    # training set - confirm against core.utils.compute_features.
    eval_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=False, collate_fn=collate_boxes)

    print('==> Making model..')

    model = MoCo()
    model = model.to(device)
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('The number of parameters of model is', num_params)

    # Keep the loss on the same device as the model so CPU-only runs work.
    criterion = nn.CrossEntropyLoss().to(device)

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            # map_location lets a checkpoint saved on GPU load on a CPU-only host.
            checkpoint = torch.load(args.resume, map_location=device)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    tb_logger = setup_tb(args.exp_dir)
    try:
        for epoch in range(args.start_epoch, args.epochs):

            cluster_result = None

            if epoch >= args.warmup_epoch:
                # Feature pass followed by k-means; run_kmeans returns the
                # clustering result that train() forwards to the model.
                features = compute_features(eval_loader, model, args)
                features[torch.norm(features, dim=1) > 1.5] /= 2  # account for the few samples that are computed twice
                features = features.numpy()
                cluster_result = run_kmeans(features, args)

            adjust_learning_rate(optimizer, epoch, args)

            # train for one epoch
            train(train_loader, model, criterion, optimizer, epoch, args, cluster_result, tb_logger)

            if (epoch + 1) % 5 == 0:
                save_checkpoint({
                    'epoch': epoch + 1,
                    # 'arch' is metadata only; tolerate namespaces without it.
                    'arch': getattr(args, 'arch', None),
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }, is_best=False, filename='{}/checkpoint.pth.tar'.format(args.exp_dir))
    finally:
        # Flush and release the event file even if training is interrupted.
        tb_logger.close()

    

In [10]:
#export

def train(train_loader, model, criterion, optimizer, epoch, args, cluster_result=None, tb_logger=None):
    """Run one training epoch and optionally log its averages to TensorBoard.

    For every batch the model is called with the query/key feed dicts and the
    batch metadata; it returns ``(output, target, output_proto, target_proto)``.
    The loss is ``criterion(output, target)`` plus, when ``output_proto`` is
    not ``None``, the prototype losses summed and divided by
    ``len(args.num_cluster)``.

    Args:
        train_loader: iterable yielding ``(feed_dict_q, feed_dict_k, metadata)``
            batches; ``metadata["scene_number"]`` is passed to the model as
            ``index``.
        model: network being trained.
        criterion: loss applied to each ``(logits, target)`` pair.
        optimizer: optimizer stepped once per batch.
        epoch: current epoch number, used for the progress prefix and as the
            TensorBoard step.
        args: options namespace; reads ``print_freq`` and, when prototype
            outputs are produced, ``num_cluster``.
        cluster_result: clustering result forwarded to the model, or ``None``.
        tb_logger: object with ``add_scalar(tag, value, step)``; when ``None``
            nothing is logged.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    acc_inst = AverageMeter('Acc@Inst', ':6.2f')
    acc_proto = AverageMeter('Acc@Proto', ':6.2f')

    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, acc_inst, acc_proto],
        prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    end = time.time()
    for i, (feed_dict_q, feed_dict_k, metadata) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # compute output
        index = metadata["scene_number"]
        output, target, output_proto, target_proto = model(
            feed_dict_q, feed_dict_k, metadata, cluster_result=cluster_result, index=index)

        # InfoNCE loss
        loss = criterion(output, target)
        # Weight the running averages by the number of samples actually scored
        # in this batch, so a short final batch is not over-counted.
        num_samples = target.size(0)

        # ProtoNCE loss
        if output_proto is not None:
            loss_proto = 0
            for proto_out, proto_target in zip(output_proto, target_proto):
                loss_proto += criterion(proto_out, proto_target)
                accp = accuracy(proto_out, proto_target)[0]
                acc_proto.update(accp[0], proto_target.size(0))

            # average loss across all sets of prototypes
            loss_proto /= len(args.num_cluster)
            loss += loss_proto

        losses.update(loss.item(), num_samples)
        acc = accuracy(output, target)[0]
        acc_inst.update(acc[0], num_samples)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)

    # tb_logger is optional (defaults to None), so only log when one is given.
    if tb_logger is not None:
        print("Logging to TB....")
        tb_logger.add_scalar('Train Acc Inst', acc_inst.avg, epoch)
        tb_logger.add_scalar('Train Acc Prototype', acc_proto.avg, epoch)
        tb_logger.add_scalar('Train Total Loss', losses.avg, epoch)

In [11]:
# run_training expects an already-parsed namespace (it has no `default_args`
# keyword), so parse the smoke-test flags with the notebook's parser first.
# --epochs 1 limits this check to a single epoch.
run_training(parser.parse_args("--exp-dir test_run --epochs 1".split()))

==> Preparing data..
Initialised..... 27495  files...
==> Making model..
The number of parameters of model is 29279144


  boxes_q = utils_disco.get_bounding_boxes(torch.tensor(query_image), boxes, camXs_T_origin_q, pix_T_camXs_q, num_boxes)
  xmin,ymin,zmin,xmax,ymax,zmax = torch.unbind(torch.tensor(aligned_boxes), dim=-1)
  boxes_k = utils_disco.get_bounding_boxes(torch.tensor(key_image), boxes, camXs_T_origin_k, pix_T_camXs_k, num_boxes)
  return torch.tensor(query_image), num_boxes, torch.tensor(boxes_q), torch.tensor(key_image), num_boxes, torch.tensor(boxes_k), scene_num, key_img_view, torch.tensor(pix_T_cams_raw), torch.tensor(camR_T_origin_raw), torch.tensor(origin_T_camXs_raw), torch.tensor(rel_viewpoint)
  metadata = {"scene_number":scene_num, "key_image_index":key_img_view, "pix_T_cams_raw":torch.tensor(pix_T_cams_raw).cuda(), "camR_T_origin_raw":torch.tensor(camR_T_origin_raw).cuda(), "origin_T_camXs_raw":torch.tensor(origin_T_camXs_raw).cuda(), "rel_viewpoint":torch.tensor(gt_egomotion).cuda()}
  feed_dict_q = {"images":torch.tensor(query_image).cuda(), "objects":num_boxes_q, "objects_boxes"

Viewpoint Transformation of Node feature vectors
Node: Pose with Node Concat :  Batch Ind: 0
Node: Pose with Node Concat :  Batch Ind: 1
Node: Pose with Node Concat :  Batch Ind: 2
Node: Pose with Node Concat :  Batch Ind: 3
Node: Pose with Node Concat :  Batch Ind: 4
Node: Pose with Node Concat :  Batch Ind: 5
Node: Pose with Node Concat :  Batch Ind: 6
Node: Pose with Node Concat :  Batch Ind: 7
Node: Pose with Node Concat :  Batch Ind: 8
Node: Pose with Node Concat :  Batch Ind: 9
Node: Pose with Node Concat :  Batch Ind: 10
Node: Pose with Node Concat :  Batch Ind: 11
Node: Pose with Node Concat :  Batch Ind: 12
Node: Pose with Node Concat :  Batch Ind: 13
Node: Pose with Node Concat :  Batch Ind: 14
Node: Pose with Node Concat :  Batch Ind: 15
Node: Transformation:  Batch Ind: 0
Node: Transformation:  Batch Ind: 1
Node: Transformation:  Batch Ind: 2
Node: Transformation:  Batch Ind: 3
Node: Transformation:  Batch Ind: 4
Node: Transformation:  Batch Ind: 5
Node: Transformation:  Ba