In [2]:
import os
import json
import time
import random
import datetime
import argparse

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, DistributedSampler

import numpy as np
from pathlib import Path

import datasets
import util.misc as utils
from models import build_model
from datasets import build_dataset
from engine import evaluate, train_one_epoch

In [5]:
# Load the rolling `checkpoint.pth` of the fish run. No map_location is given, so
# tensors are restored to the device they were saved from.
m1 = torch.load(
    '/home/marufm/intr-projects/INTR/output/fish_trained/output_sub/checkpoint.pth'
)

In [17]:
# Load the epoch-tagged `checkpoint0139.pth` of the same run for comparison. No
# map_location is given, so tensors are restored to the device they were saved from.
m2 = torch.load(
    '/home/marufm/intr-projects/INTR/output/fish_trained/output_sub/checkpoint0139.pth'
)

In [16]:
# Count the elements of `backbone.0.body.layer4.2.bn3.weight` that differ between
# the two loaded checkpoints; 0 means the tensors are identical.
(
    m1['model']['backbone.0.body.layer4.2.bn3.weight']
    != m2['model']['backbone.0.body.layer4.2.bn3.weight']
).sum()

tensor(0, device='cuda:0')

In [14]:
# Restore the run configuration saved in `args_train.pt` and show it.
# NOTE: torch.load unpickles arbitrary Python objects, so only point this at a
# file you trust.
args = torch.load('args_train.pt')
print(args)

Namespace(backbone='resnet50', batch_size=12, clip_max_norm=0.1, dataset_name='cub', dataset_path='/home/marufm/intr-projects/INTR/datasets', dec_layers=6, device='cuda', dilation=False, dim_feedforward=2048, dist_url='env://', dropout=0.1, enc_layers=6, epochs=140, eval=False, finetune='/home/marufm/intr-projects/INTR/checkpoints/detr-r50-e632da11.pth', hidden_dim=256, lr=0.0001, lr_backbone=1e-05, lr_drop=80, lr_scheduler='StepLR', min_lr=1e-06, nheads=8, noise_frac=0.1, num_queries=190, num_workers=2, output_dir='output', output_sub_dir='output_sub', position_embedding='sine', pre_norm=False, resume='', seed=42, start_epoch=0, test='val', weight_decay=1e-06, world_size=1)


In [15]:
# Let the project helper set up distributed mode for this configuration, then record
# the code revision and the resulting configuration next to the run's output.
utils.init_distributed_mode(args)
print(f"git:\n  {utils.get_sha()}\n")
print(args)

Not using distributed mode
git:
  sha: 478f94b3ad77eda1997d037a52b3b1ca400d8997, status: has uncommited changes, branch: main

Namespace(backbone='resnet50', batch_size=12, clip_max_norm=0.1, dataset_name='cub', dataset_path='/home/marufm/intr-projects/INTR/datasets', dec_layers=6, device='cuda', dilation=False, dim_feedforward=2048, dist_url='env://', distributed=False, dropout=0.1, enc_layers=6, epochs=140, eval=False, finetune='/home/marufm/intr-projects/INTR/checkpoints/detr-r50-e632da11.pth', hidden_dim=256, lr=0.0001, lr_backbone=1e-05, lr_drop=80, lr_scheduler='StepLR', min_lr=1e-06, nheads=8, noise_frac=0.1, num_queries=190, num_workers=2, output_dir='output', output_sub_dir='output_sub', position_embedding='sine', pre_norm=False, resume='', seed=42, start_epoch=0, test='val', weight_decay=1e-06, world_size=1)


In [16]:
# CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node=4 --master_port 12345 --use_env main.py --finetune /home/marufm/intr-projects/INTR/checkpoints/detr-r50-e632da11.pth --dataset_path /home/marufm/intr-projects/INTR/datasets --dataset_name cub --num_queries 190

In [None]:
# CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node=4 --master_port 12345 --use_env main.py --finetune /home/marufm/intr-projects/INTR/checkpoints/detr-r50-e632da11.pth --dataset_path /home/marufm/intr-projects/INTR/datasets --dataset_name butterfly --num_queries 51

In [None]:
# CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node=4 --master_port 12345 --use_env main.py --finetune /home/marufm/intr-projects/INTR/checkpoints/detr-r50-e632da11.pth --dataset_path /home/marufm/intr-projects/INTR/datasets --dataset_name fish --num_queries 38

In [17]:
# Resolve the compute device named in the run configuration.
device = torch.device(args.device)

# Seed every RNG in use (Python, NumPy, PyTorch) for reproducibility. The base seed
# is offset by the process rank so that each process gets its own seed.
seed = args.seed + utils.get_rank()
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Build the model and its criterion, then move the model onto the device.
model, criterion = build_model(args)
model.to(device)
# Separate handle on the bare model, kept apart from `model`, which may be wrapped
# for distributed training.
model_without_ddp = model



In [18]:
# In distributed mode wrap the model in DDP and keep a handle on the inner module.
if args.distributed:
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
    ## for 2-phase training
    # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True)
    model_without_ddp = model.module

# Report how many scalar parameters will receive gradients.
n_parameters = sum(param.numel() for param in model.parameters() if param.requires_grad)
print('number of params:', n_parameters)

# Split the trainable parameters into two optimizer groups in a single pass:
# everything whose name contains "backbone" gets its own learning rate, the rest
# falls back to the optimizer's default.
head_params, backbone_params = [], []
for name, param in model_without_ddp.named_parameters():
    if not param.requires_grad:
        continue
    if "backbone" in name:
        backbone_params.append(param)
    else:
        head_params.append(param)

param_dicts = [
    {"params": head_params},
    {"params": backbone_params, "lr": args.lr_backbone},
]

number of params: 41171969


In [25]:
# Upper bound (exclusive) of the epoch range used by the training loop.
args.epochs

140

In [19]:
# Both supported schedules use the same AdamW optimizer over the two parameter
# groups, so build it once instead of once per branch.
optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                              weight_decay=args.weight_decay)

# Choose the learning-rate schedule by name. An unrecognised name used to fall through
# both `if`s silently, leaving `optimizer`/`lr_scheduler` undefined (or, in a live
# kernel, stale from a previous run); fail loudly instead.
if args.lr_scheduler == "StepLR":
    # Decay the learning rate every `lr_drop` epochs.
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)
elif args.lr_scheduler == "CosineAnnealingLR":
    # Cosine decay over the full run, bottoming out at `min_lr`.
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=args.epochs, eta_min=args.min_lr)
else:
    raise ValueError(
        "Unsupported lr_scheduler {!r}; expected 'StepLR' or 'CosineAnnealingLR'".format(
            args.lr_scheduler))

In [20]:
# Build the training split and the evaluation split (the latter named by `args.test`)
# with the same project factory, in that order.
dataset_train, dataset_val = (
    build_dataset(image_set=split, args=args) for split in ('train', args.test)
)

In [21]:
# Display the training dataset's repr (size, root directory, transform pipeline).
dataset_train

Dataset CreateImageFolder
    Number of datapoints: 5695
    Root location: /home/marufm/intr-projects/INTR/datasets/cub/train
    StandardTransform
Transform: Compose(
               <datasets.transforms.RandomHorizontalFlip object at 0x7f1b00e09bb0>
               <datasets.transforms.RandomSelect object at 0x7f1b05b7ebb0>
               Compose(
               <datasets.transforms.ToTensor object at 0x7f1b05c63d00>
               <datasets.transforms.Normalize object at 0x7f1b00e09a30>
           )
           )

In [22]:
# Pick samplers: a single process shuffles the training set and walks the validation
# set in order; distributed runs use DistributedSampler for both (validation unshuffled).
if not args.distributed:
    sampler_train = torch.utils.data.RandomSampler(dataset_train)
    sampler_val = torch.utils.data.SequentialSampler(dataset_val)
else:
    sampler_train = DistributedSampler(dataset_train)
    sampler_val = DistributedSampler(dataset_val, shuffle=False)

# Training batches are fixed-size: the trailing incomplete batch is dropped.
batch_sampler_train = torch.utils.data.BatchSampler(
    sampler_train, batch_size=args.batch_size, drop_last=True)

# Options shared by both loaders.
loader_kwargs = dict(collate_fn=utils.collate_fn, num_workers=args.num_workers)

data_loader_train = DataLoader(
    dataset_train, batch_sampler=batch_sampler_train, **loader_kwargs)
# Validation keeps every sample, including the last partial batch.
data_loader_val = DataLoader(
    dataset_val, batch_size=args.batch_size, sampler=sampler_val,
    drop_last=False, **loader_kwargs)

In [24]:
# Create <output_dir>/<dataset_name>/<output_sub_dir>, where checkpoints and the log
# are written. `parents=True` creates the intermediate dataset directory as well and
# `exist_ok=True` tolerates an existing tree, so no separate (and racy) existence
# checks are needed.
output_dir = Path(args.output_dir)
(output_dir / args.dataset_name / args.output_sub_dir).mkdir(parents=True, exist_ok=True)

# Optionally restore a previous run from `args.resume` (an https URL or a local path).
if args.resume:
    # Always deserialize onto the CPU first; load_state_dict copies into the model.
    checkpoint = (
        torch.hub.load_state_dict_from_url(
            args.resume, map_location='cpu', check_hash=True)
        if args.resume.startswith('https')
        else torch.load(args.resume, map_location='cpu')
    )
    model_without_ddp.load_state_dict(checkpoint['model'])

    # Restore optimizer/scheduler state and the epoch counter only when training
    # (not in eval mode) and only if the checkpoint carries all three entries.
    if not args.eval and all(
            key in checkpoint for key in ('optimizer', 'lr_scheduler', 'epoch')):
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        # Continue with the epoch after the one stored in the checkpoint.
        args.start_epoch = checkpoint['epoch'] + 1

# Directory holding this run's checkpoints and its append-only JSON-lines log.
run_dir = output_dir / args.dataset_name / args.output_sub_dir
log_path = run_dir / "log.txt"

if args.eval:
    # Evaluation-only run: score the validation loader once and log the result.
    # This branch used to end in a commented-out `return` (a notebook cell cannot
    # return), which let execution fall through into fine-tuning and a full training
    # run. Fine-tuning and training now live in the `else` branch instead.
    test_stats = evaluate(model, criterion,
                          data_loader_val, device, args.output_dir)
    if args.output_dir and utils.is_main_process():
        with log_path.open("a") as f:
            f.write(json.dumps(test_stats) + "\n")
else:
    if args.finetune:
        # Initialise from pretrained weights (https URL or local path), deserialized
        # onto the CPU first.
        # NOTE(review): this runs after the resume step, so when both args.resume and
        # args.finetune are set the resumed model weights are overwritten here —
        # confirm that is intended.
        if args.finetune.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.finetune, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.finetune, map_location='cpu')
        state_dict = checkpoint['model']
        # Project helper; presumably adapts the pretrained weights to this model's
        # configuration — verify against util.misc.load_model.
        state_dict = utils.load_model(args, state_dict)

        model_without_ddp.load_state_dict(state_dict)

        # Mark every parameter as trainable.
        for param in model_without_ddp.parameters():
            param.requires_grad = True
        model_without_ddp.to(device)

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):

        ## for 2-phase training
        # if epoch>=args.rm_freeze:
        #     for param in model_without_ddp.transformer.encoder.parameters():
        #         param.requires_grad = True

        if args.distributed:
            # Tell the distributed sampler which epoch this is.
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(
            model, criterion, data_loader_train, optimizer, device, epoch,
            args.clip_max_norm)

        lr_scheduler.step()
        if args.output_dir:
            # `checkpoint.pth` is overwritten every epoch; an epoch-tagged copy is
            # additionally kept at every `lr_drop` boundary and after the final epoch.
            checkpoint_paths = [run_dir / 'checkpoint.pth']

            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) == args.epochs:
                checkpoint_paths.append(run_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        # Evaluate on the validation loader after every epoch.
        test_stats = evaluate(
            model, criterion, data_loader_val, device, args.output_dir
        )

        # One flat record per epoch: train_* and test_* metrics plus bookkeeping.
        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     **{f'test_{k}': v for k, v in test_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        if args.output_dir and utils.is_main_process():
            with log_path.open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))


Start training
Epoch: [0]  [  0/474]  eta: 0:06:28  lr: 0.000100  loss: 5.5047 (5.5047)  acc1: 0.0000 (0.0000)  acc5: 0.0000 (0.0000)  time: 0.8201  data: 0.3794  max mem: 11888
Epoch: [0]  [ 10/474]  eta: 0:03:24  lr: 0.000100  loss: 5.3324 (5.3465)  acc1: 0.0000 (0.0000)  acc5: 0.0000 (0.7576)  time: 0.4402  data: 0.0472  max mem: 15618
Epoch: [0]  [ 20/474]  eta: 0:03:13  lr: 0.000100  loss: 5.3402 (5.3553)  acc1: 0.0000 (0.3968)  acc5: 0.0000 (2.7778)  time: 0.4075  data: 0.0140  max mem: 17136
Epoch: [0]  [ 30/474]  eta: 0:03:10  lr: 0.000100  loss: 5.3585 (5.3529)  acc1: 0.0000 (0.2688)  acc5: 0.0000 (3.2258)  time: 0.4229  data: 0.0142  max mem: 21137
Epoch: [0]  [ 40/474]  eta: 0:03:02  lr: 0.000100  loss: 5.3608 (5.3479)  acc1: 0.0000 (0.2033)  acc5: 0.0000 (2.6423)  time: 0.4139  data: 0.0140  max mem: 21137
Epoch: [0]  [ 50/474]  eta: 0:03:01  lr: 0.000100  loss: 5.3518 (5.3479)  acc1: 0.0000 (0.1634)  acc5: 0.0000 (2.4510)  time: 0.4286  data: 0.0140  max mem: 21137
Epoch: 

Traceback (most recent call last):
  File "/home/marufm/miniconda/envs/intr/lib/python3.8/multiprocessing/queues.py", line 245, in _feed
    send_bytes(obj)
  File "/home/marufm/miniconda/envs/intr/lib/python3.8/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/home/marufm/miniconda/envs/intr/lib/python3.8/multiprocessing/connection.py", line 411, in _send_bytes
    self._send(header + buf)
  File "/home/marufm/miniconda/envs/intr/lib/python3.8/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe


KeyboardInterrupt: 