In [3]:
import json

# Directory prefix placed in front of every video name in the label file
root_path = "rgb"

# Parse the Diving48 V2 training annotations (iterated below as a sequence of
# records that each carry a 'vid_name' and a 'label' entry)
with open('/svl/data/kinetics-400/diving48/Diving48_V2_train.json', 'r') as annotation_file:
    data = json.load(annotation_file)

# Emit one "<root_path>/<vid_name> <label>" line per annotation record
with open('diving48_video_train_labels.txt', 'w') as label_file:
    label_file.writelines(
        f"{root_path}/{entry['vid_name']} {entry['label']}\n"
        for entry in data
    )

In [4]:
# Parse the Diving48 V2 test annotations; they are written out as the "val" label file.
# Relies on `json` and `root_path` from the first cell.
with open('/svl/data/kinetics-400/diving48/Diving48_V2_test.json', 'r') as annotation_file:
    data = json.load(annotation_file)

# Emit one "<root_path>/<vid_name> <label>" line per annotation record
with open('diving48_video_val_labels.txt', 'w') as label_file:
    label_file.writelines(
        f"{root_path}/{entry['vid_name']} {entry['label']}\n"
        for entry in data
    )

In [5]:
from functools import partial
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
import sys
sys.path.append("../")
import diving48
from datasets import get_dataloaders, get_toy_dataloader


In [19]:
# ArgumentParser was not imported anywhere in the notebook, so this cell raised
# NameError on a fresh kernel; import it here where the parser is built.
from argparse import ArgumentParser

# Command-line style hyper-parameter parser; later cells feed it an explicit
# argument list instead of reading sys.argv.
parser = ArgumentParser()

# training arguments
# NOTE: boolean flags are parsed with a lambda that is True only for the
# (case-insensitive) string "true". Plain `type=bool` would treat any non-empty
# string, including "False", as True.
parser.add_argument('--batch_size', default=512, type=int)
parser.add_argument("--criterion_name", type=str, default="binary_crossentropy", choices=["binary_crossentropy"])
parser.add_argument('--balance_classes', default=False, type=lambda x: (str(x).lower() == 'true'))
parser.add_argument('--epochs', default=40, type=int)
parser.add_argument('--gradient_clip_val', default=1, type=float)
parser.add_argument('--gpus', default=1, type=int)
parser.add_argument('--num_workers', default=0, type=int)
parser.add_argument('--seed', default=None, type=int)
parser.add_argument('--checkpoint_every_n_epochs', type=int, default=5)
parser.add_argument('--wandb_group', type=str, default="latest")
parser.add_argument('--toy_dataloader', default=False, type=lambda x: (str(x).lower() == 'true'))
parser.add_argument('--validate_before_train', default=False, type=lambda x: (str(x).lower() == 'true'))
parser.add_argument('--mode', type=str, default="train", choices=["train", "eval"])
parser.add_argument('--checkpoint_path', type=str, default=None)
parser.add_argument('--included_classes_path', type=str, default=None)
parser.add_argument('--log_per_class_acc', default=False, type=lambda x: (str(x).lower() == 'true'))

# Optimizer arguments
parser.add_argument('--lr', default=5e-4, type=float)
# Help text previously advertised 1e-5 while the actual default is 1e-6.
parser.add_argument('--min_lr', type=float, default=1e-6, metavar='LR',
                    help='lower lr bound for cyclic schedulers that hit 0 (default: 1e-6)')
parser.add_argument('--warmup_epochs', type=int, default=5, metavar='N',
                    help='epochs to warmup LR, if scheduler supports')
parser.add_argument('--warmup_steps', type=int, default=-1, metavar='N',
                    help='num of steps to warmup LR, will override warmup_epochs if set > 0')
parser.add_argument('--wd', default=0.05, type=float,
                    help="Weight decay (will use Adam if set to 0, AdamW otherwise).")
parser.add_argument('--gradient_accumulation_steps', default=1, type=int)
# Adam betas are a pair (beta1, beta2); nargs=2 rejects malformed input at parse
# time instead of letting a wrong-length list reach the optimizer.
parser.add_argument('--adam_betas', nargs=2, type=float, default=(0.9, 0.999), help='Adam betas')
parser.add_argument('--adam_eps', type=float, default=1e-8, help='Adam epsilon')
# -1 is a sentinel meaning "fall back to the corresponding model-level value" (per the help text).
parser.add_argument('--backbone_lr', type=float, default=-1, help='backbone learning rate (if -1 uses model lr)')
parser.add_argument('--min_backbone_lr', type=float, default=-1, help='backbone min learning rate (if -1 uses model min lr)')


# Augmentation params
parser.add_argument('--smoothing', type=float, default=0.1, help='Label smoothing (default: 0.1) - only works if mixup is enabled')
parser.add_argument('--num_aug_sample', type=int, default=2,
                    help='Repeated_aug (default: 2)')
# The original statement ended in a stray comma (building a throw-away tuple) and its
# help text contained a literal '" + "' left over from a broken string concatenation.
parser.add_argument('--aa', type=str, default='rand-m7-n4-mstd0.5-inc1', metavar='NAME',
                    help='Use AutoAugment policy. "v0" or "original" '
                         '(default: rand-m7-n4-mstd0.5-inc1). Set to "None" to disable.')
parser.add_argument('--train_interpolation', type=str, default='bicubic',
                    help='Training interpolation (random, bilinear, bicubic default: "bicubic")')

# Random Erase params, registered from a table so each option reads as one row
for _flag, _options in (
    ('--reprob', dict(type=float, default=0, metavar='PCT',
                      help='Random erase prob (default: 0)')),
    ('--remode', dict(type=str, default='pixel',
                      help='Random erase mode (default: "pixel")')),
    ('--recount', dict(type=int, default=1,
                       help='Random erase count (default: 1)')),
):
    parser.add_argument(_flag, **_options)

# Mixup params, registered from a table in the original declaration order
for _flag, _options in (
    ('--mixup', dict(type=float, default=0,
                     help='mixup alpha, mixup enabled if > 0.')),
    ('--cutmix', dict(type=float, default=0,
                      help='cutmix alpha, cutmix enabled if > 0.')),
    ('--cutmix_minmax', dict(type=float, nargs='+', default=None,
                             help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)')),
    ('--mixup_prob', dict(type=float, default=1.0,
                          help='Probability of performing mixup or cutmix when either/both is enabled')),
    ('--mixup_switch_prob', dict(type=float, default=0.5,
                                 help='Probability of switching to cutmix when both mixup and cutmix enabled')),
    ('--mixup_mode', dict(type=str, default='batch',
                          help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"')),
):
    parser.add_argument(_flag, **_options)

# dataset arguments
# The three string options below are mandatory: parsing fails if any is missing.
for _required_flag in ('--task_name', '--data_path', '--label_path'):
    parser.add_argument(_required_flag, type=str, required=True)

# Integer clip-sampling / test-view options with their defaults
for _flag, _default in (
    ('--n_frames', 32),
    ('--test_temporal_views', 1),
    ('--test_spatial_views', 3),
    ('--frame_sample_rate', 4),
):
    parser.add_argument(_flag, default=_default, type=int)

# Restricted to the two listed source kinds
parser.add_argument('--load_from', type=str, default="video", choices=["video", "rgb"])

# model structure arguments
parser.add_argument("--model_name", type=str, default="encode_pool_classify", choices=["encode_pool_classify", "text4vis"])

# backbone arguments
# Boolean options accept the string "true" (case-insensitive) as True; anything else is False.
parser.add_argument("--backbone_name", type=str, default="clip_ViT-B/32")
parser.add_argument('--backbone_freeze', default=False, type=lambda x: (str(x).lower() == 'true'))
parser.add_argument('--backbone_unfreeze_layer_norm', default=False, type=lambda x: (str(x).lower() == 'true'))
parser.add_argument('--backbone_drop_path_rate', default=0.1, type=float,
                    help="Drop path rate (Stochastic Depth) (default: 0.1). Currently only implemented for non default backbones")
parser.add_argument('--backbone_proj_after', default=True, type=lambda x: (str(x).lower() == 'true'),
                    help="Whether to apply projection to output of backbone (currently not supported for default clip backbone)")
# Help text is built with implicit string concatenation; the previous backslash
# continuations inside the literal embedded the source indentation in the string.
parser.add_argument('--temporal_pool_backbone', default=False, type=lambda x: (str(x).lower() == 'true'),
                    help="Whether to apply mean temporal pooling in backbone instead of after. "
                         "Automatically sets temporal pooling name to None. "
                         "(currently not supported for default clip backbone)")

# AIM + QRNN Shared
parser.add_argument('--adapter_upsample_zero_init', default=False, type=lambda x: (str(x).lower() == 'true'))

## AIM Backbone arguments
# Help text is built with implicit string concatenation; the previous backslash
# continuations inside the literal embedded the source indentation in the string.
parser.add_argument('--adapter_settings', type=str, default="i",
                    help="Adapter settings if using Adapter backbone. Choices of i, t, s, m "
                         "(e.g. to use input adapter + temporal adapter (which implies temporal attention), "
                         "you would input 'it'; to use all 4 adapters, do `itsm`. Order doesn't matter)")
parser.add_argument('--adapter_checkpoint', type=str, default=None,
                    help="Optional checkpoint to use when using adapter backbone - used for loading pretrained AIM backbones")

## QRNN-Adapter arguments
parser.add_argument("--backbone_qrnn_bidirectional", default=False, type=lambda x: (str(x).lower() == 'true'))
parser.add_argument('--num_qrnn_adapters', type=int, default=1, help="Number of QRNN adapters to use if using QRNN Adapter backbone")
parser.add_argument('--vanilla_adapter', default=False, type=lambda x: (str(x).lower() == 'true'), help="Whether to use vanilla adapter (i.e. no QRNN)")
parser.add_argument('--downsample_qrnn_adapter', default=False, type=lambda x: (str(x).lower() == 'true'), help="Whether to use downsample QRNN adapter (downsample before QRNN)")
parser.add_argument('--num_qrnn_layers', type=int, default=1, help="Number of QRNN layers to use if using QRNN Adapter backbone")
parser.add_argument('--qrnn_lookback', type=int, default=1, help="Number of previous frames to look at if using QRNN Adapter backbone")
parser.add_argument('--qrnn_lookahead', type=int, default=0, help="Number of future frames to look at if using QRNN Adapter backbone")
parser.add_argument('--adapter_downsample_ratio', type=float, default=0.25, help="Ratio to downsample in adapter")

# encoder arguments
parser.add_argument(
    "--temporal_pooling_name",
    type=str,
    default="mean",
    choices=["mean", "transformer", "identity", "last"],
)
# transformer specific arguments used if `temporal_pooling_name` is `transformer`
for _suffix, _default in (
    ("depth", 3),
    ("heads", 4),
    ("dim", 512),
    ("ff_dim", 512),
    ("input_dim", 512),
):
    parser.add_argument(f"--temporal_pooling_transformer_{_suffix}", default=_default, type=int)
parser.add_argument("--temporal_pooling_transformer_emb_dropout", default=0.1, type=float)

# classifier arguments
parser.add_argument(
    "--classification_layer_name",
    type=str,
    default="linear",
    choices=["linear"],
)
parser.add_argument("--classification_input_dim", type=int, default=512)
# Mandatory: there is no default number of classes.
parser.add_argument("--num_classes", type=int, required=True)
# Kept as the final bare expression so the cell still displays the returned action.
parser.add_argument("--classification_layer_dropout", type=float, default=0.5)



_StoreAction(option_strings=['--classification_layer_dropout'], dest='classification_layer_dropout', nargs=None, const=None, default=0.5, type=<class 'float'>, choices=None, required=False, help=None, metavar=None)

In [20]:
# Parse an explicit argument list (instead of sys.argv) so the notebook can build the
# same namespace a training script would receive.
# Changes from the original call:
#   * `--downsample_qrnn_adapter` was passed twice (False, then True). argparse keeps
#     the last occurrence, so only the effective value (True) is kept.
#   * `--num_classes` was 174, which does not match Diving48's 48 dive classes
#     (NOTE(review): 174 looks copied from another dataset's config - confirm).
#   * Arguments are given as a list rather than a backslash-continued string + split().
hparams = parser.parse_args([
    # dataset
    "--task_name", "diving48",
    "--data_path", "/svl/data/kinetics-400/diving48",
    "--label_path", "/vision/u/eatang/leaky_video/datasets/diving48",
    "--n_frames", "8",
    "--num_classes", "48",
    # augmentation
    "--num_aug_sample", "1",
    "--aa", "rand-m9-n2-mstd0.5-inc1",
    # backbone
    "--backbone_name", "qrnn_adapter_clip_ViT-B/16",
    "--temporal_pooling_name", "last",
    "--backbone_freeze", "True",
    "--backbone_unfreeze_layer_norm", "False",
    "--backbone_proj_after", "False",
    "--temporal_pool_backbone", "True",
    "--classification_input_dim", "768",
    # QRNN adapter
    "--backbone_qrnn_bidirectional", "False",
    "--num_qrnn_adapters", "2",
    "--num_qrnn_layers", "1",
    "--adapter_upsample_zero_init", "True",
    "--qrnn_lookahead", "0",
    "--qrnn_lookback", "1",
    "--adapter_downsample_ratio", "0.282",
    "--downsample_qrnn_adapter", "True",
])


In [26]:
# Build the loaders from the parsed hyper-parameters; the result is a pair that is
# unpacked into the training loader and the collection of validation loaders.
dataloaders = get_dataloaders(hparams)
train_loader, val_loaders = dataloaders


In [None]:
# Grab the first batch of the first validation loader for inspection.
# next(iter(...)) raises StopIteration if the loader is empty, whereas the previous
# for/break loop silently left `batch` undefined (or stale from an earlier run).
i = 0  # index of the fetched batch, kept for parity with the original loop variable
batch = next(iter(val_loaders[0]))

In [24]:
import matplotlib.pyplot as plt

# Value passed as the dataset's third positional argument and the number of frames
# drawn below (presumably the clip length - TODO confirm against diving48.Diving48).
N_FRAMES = 8

mode = "val"
# `args` was never defined in this notebook (NameError on a fresh kernel); the parsed
# hyper-parameter namespace is `hparams`, so pass that instead.
dataset = diving48.Diving48(
    "/svl/data/kinetics-400/diving48/",
    "/vision/u/eatang/leaky_video/datasets/diving48",
    N_FRAMES,
    mode=mode,
    crop_size=224,
    short_side_size=224,
    num_aug_sample=1,
    args=hparams,
    test_num_segment=1,
    test_num_crop=1,
    load_from="rgb",
)

# Fetch one sample and pull out its frame tensor
j = 0
data = dataset[j]
frames = data["video_features"]

# Draw the frames on a 4x4 grid (only the first N_FRAMES cells are filled)
columns = 4
rows = 4
fig = plt.figure(figsize=(8, 8))
for i in range(1, N_FRAMES + 1):
    # Axis 1 is indexed per frame; the remaining axes are transposed from
    # (0, 1, 2) to (1, 2, 0) so the leading axis becomes the last one for imshow.
    img = frames[:, i - 1, :, :]
    fig.add_subplot(rows, columns, i)
    plt.imshow(img.numpy().transpose((1, 2, 0)))
plt.show()


<torch.utils.data.dataloader.DataLoader at 0x7fa6b5859180>