In [3]:
%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import torch
import matplotlib.pyplot as plt
frames = torch.load("../video.pt")
w = 10
h = 10
fig = plt.figure(figsize=(8, 8))
columns = 4
rows = 4
for i in range(1, 16):
    img = frames[i,:,:,:]
    fig.add_subplot(rows, columns, i)
    plt.imshow(img.transpose((1,2,0)))
plt.show()


In [4]:
frames.shape

torch.Size([18, 3, 224, 224])

In [4]:
import numpy as np
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data.distributed import DistributedSampler

class BaseDataLoader(DataLoader):
    """
    Base class for all data loaders
    """
    def __init__(self, dataset, batch_size, shuffle, validation_split, num_workers, collate_fn=default_collate):
        self.validation_split = validation_split
        self.shuffle = shuffle

        self.batch_idx = 0
        self.n_samples = len(dataset)

        self.sampler, self.valid_sampler = self._split_sampler(self.validation_split)

        self.init_kwargs = {
            'dataset': dataset,
            'batch_size': batch_size,
            'shuffle': self.shuffle,
            'collate_fn': collate_fn,
            'num_workers': num_workers
        }
        super().__init__(sampler=self.sampler, **self.init_kwargs)

    def _split_sampler(self, split):
        if split == 0.0:
            return None, None

        idx_full = np.arange(self.n_samples)

        np.random.seed(0)
        np.random.shuffle(idx_full)

        if isinstance(split, int):
            assert split > 0
            assert split < self.n_samples, "validation set size is configured to be larger than entire dataset."
            len_valid = split
        else:
            len_valid = int(self.n_samples * split)

        valid_idx = idx_full[0:len_valid]
        train_idx = np.delete(idx_full, np.arange(0, len_valid))

        train_sampler = SubsetRandomSampler(train_idx)
        valid_sampler = SubsetRandomSampler(valid_idx)
        # turn off shuffle option which is mutually exclusive with sampler
        self.shuffle = False
        self.n_samples = len(train_idx)

        return train_sampler, valid_sampler

    def split_validation(self, diff_kwargs=None):
        init_kwargs = self.init_kwargs
        if diff_kwargs is not None:
            init_kwargs.update(diff_kwargs)
        if self.valid_sampler is None:
            return None
        else:
            return DataLoader(sampler=self.valid_sampler, **self.init_kwargs)

    def num_samples(self):
        return len(self.sampler)


class BaseDataLoaderExplicitSplit(DataLoader):
    """
    Base class for all data loaders
    """
    def __init__(self, dataset, batch_size, shuffle, num_workers, collate_fn=default_collate):
        self.shuffle = shuffle

        self.batch_idx = 0
        self.n_samples = len(dataset)

        self.init_kwargs = {
            'dataset': dataset,
            'batch_size': batch_size,
            'shuffle': self.shuffle,
            'collate_fn': collate_fn,
            'num_workers': num_workers,
            'pin_memory': True
        }
        super().__init__(**self.init_kwargs)


class DistBaseDataLoaderExplicitSplit(DataLoader):
    """
    Base class for all data loaders
    """
    def __init__(self, dataset, batch_size, shuffle, num_workers, collate_fn=default_collate):
        self.shuffle = shuffle

        self.batch_idx = 0
        self.n_samples = len(dataset)
        self.train_sampler = DistributedSampler(dataset)
        self.init_kwargs = {
            'dataset': dataset,
            'batch_size': batch_size,
            'shuffle': False,
            'collate_fn': collate_fn,
            'num_workers': num_workers,
            'pin_memory': True,
            'sampler': self.train_sampler
        }
        super().__init__(**self.init_kwargs)


class MultiDistBaseDataLoaderExplicitSplit(DataLoader):
    """
    Base class for all data loaders
    """
    def __init__(self, args, dataset, batch_size, shuffle, num_workers, collate_fn=default_collate):
        self.shuffle = shuffle

        self.batch_idx = 0
        self.n_samples = len(dataset)
        self.args = args
        self.train_sampler = DistributedSampler(dataset, num_replicas=self.args.world_size, rank=self.args.rank, drop_last=False)
        self.init_kwargs = {
            'dataset': dataset,
            'batch_size': batch_size,
            'shuffle': False,
            'collate_fn': collate_fn,
            'num_workers': num_workers,
            'pin_memory': True,
            'sampler': self.train_sampler
        }
        super().__init__(**self.init_kwargs)

class BaseMultiDataLoader:
    """
    Currently implemented as undersample the bigger dataloaders...
    """
    def __init__(self, dataloaders):
        self.dataloaders = dataloaders
        self.batch_size = self.dataloaders[0].batch_size

    def __getitem__(self, item):
        dl_idx = item % len(self.dataloaders)
        return next(iter(self.dataloaders[dl_idx]))

    def __len__(self):
        return min(len(x) for x in self.dataloaders) * len(self.dataloaders)

    def num_samples(self):
        return sum(len(x.sampler) for x in self.dataloaders)

In [5]:
import os
import pdb

import tqdm
import random
from abc import abstractmethod

import av
import cv2
import decord
import ffmpeg
import numpy as np
import torch
from PIL import Image
from torch.utils.data import Dataset, get_worker_info
from torchvision import transforms


class TextVideoDataset(Dataset):
    def __init__(self,
                 dataset_name,
                 text_params,
                 video_params,
                 data_dir,
                 meta_dir=None,
                 split='train',
                 tsfms=None,
                 cut=None,
                 subsample=1,
                 sliding_window_stride=-1,
                 reader='decord',
                 neg_param=None,
                 data_source="clip",
                 ):
        self.dataset_name = dataset_name
        self.text_params = text_params
        self.video_params = video_params
        self.data_source = data_source
        # check for environment variables
        self.data_dir = os.path.expandvars(data_dir)
        if meta_dir is not None:
            self.meta_dir = os.path.expandvars(meta_dir)
        else:
            self.meta_dir = self.data_dir
        self.split = split
        self.transforms = tsfms
        self.cut = cut
        self.subsample = subsample
        self.sliding_window_stride = sliding_window_stride
        self.video_reader = video_reader[reader]
        self.label_type = 'caption'
        self.neg_param = neg_param
        self._load_metadata()
        if self.sliding_window_stride != -1:
            if self.split != 'test':
                raise ValueError('Fixing frame sampling is for test time only. can remove but...')
            self._fix_temporal_samples()

    @abstractmethod
    def _load_metadata(self):
        raise NotImplementedError("Metadata loading must be implemented by subclass")

    @abstractmethod
    def _get_video_path(self, sample):
        raise NotImplementedError("Get video path function must be implemented by subclass")

    def _get_caption(self, sample):
        raise NotImplementedError("Get caption function must be implemented by subclass")

    def _get_video_lens(self):
        vlen_li = []
        for idx, row in self.metadata.iterrows():
            video_path = self._get_video_path(row)[0]
            vlen_li.append(get_video_len(video_path))

        return vlen_li

    def _fix_temporal_samples(self):
        self.metadata['vlen'] = self._get_video_lens()
        self.metadata['frame_intervals'] = self.metadata['vlen'].apply(
            lambda x: np.linspace(start=0, stop=x, num=min(x, self.video_params['num_frames']) + 1).astype(int))
        self.metadata['fix_start'] = self.metadata['frame_intervals'].apply(
            lambda x: np.arange(0, int(x[-1] / len(x - 1)), self.sliding_window_stride)
        )
        self.metadata = self.metadata.explode('fix_start')

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, item):
        item = item % len(self.metadata)
        sample = self.metadata.iloc[item]
        video_fp, rel_fp = self._get_video_path(sample)
        caption = self._get_caption(sample)

        video_loading = self.video_params.get('loading', 'strict')
        frame_sample = 'rand'
        fix_start = None
        if self.split == 'test':
            frame_sample = 'uniform'
        if self.sliding_window_stride != -1:
            fix_start = sample['fix_start']

        try:
            if os.path.isfile(video_fp):
                imgs, idxs = self.video_reader(video_fp, self.video_params['num_frames'], frame_sample,
                                               fix_start=fix_start)
            else:
                print(f"Warning: missing video file {video_fp}.")
                assert False
        except Exception as e:
            if video_loading == 'strict':
                raise ValueError(
                    f'Video loading failed for {video_fp}, video loading for this dataset is strict.') from e
            else:
                imgs = Image.new('RGB', (self.video_params['input_res'], self.video_params['input_res']), (0, 0, 0))
                imgs = transforms.ToTensor()(imgs).unsqueeze(0)

        # if self.transforms is not None:
        #     imgs = self.transforms(imgs)

        if self.transforms is not None:
            if self.video_params['num_frames'] > 1:
                imgs = imgs.transpose(0, 1)  # [T, C, H, W] ---> [C, T, H, W]
                imgs = self.transforms(imgs)
                imgs = imgs.transpose(0, 1)  # recover
            else:
                imgs = self.transforms(imgs)

        # if self.transforms is not None:
        #     # for video aug if T>1 else just do image aug
        #     if imgs.size(0) > 1:
        #         # added by Mr. YAN
        #         imgs = imgs.transpose(0, 1) # [T, C, H, W] ---> [C, T, H, W]
        #         imgs = self.transforms(imgs)
        #         imgs = imgs.transpose(0, 1) # recover
        #     else:
        #         imgs = self.transforms(imgs)    # t

        final = torch.zeros([self.video_params['num_frames'], 3, self.video_params['input_res'],
                             self.video_params['input_res']])
        final[:imgs.shape[0]] = imgs

        meta_arr = {'raw_captions': caption, 'paths': rel_fp, 'dataset': self.dataset_name}
        data = {'video': final, 'text': caption, 'meta': meta_arr}
        return data


def sample_frames(num_frames, vlen, sample='rand', fix_start=None):
    acc_samples = min(num_frames, vlen)
    intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
    ranges = []
    for idx, interv in enumerate(intervals[:-1]):
        ranges.append((interv, intervals[idx + 1] - 1))
    if sample == 'rand':
        frame_idxs = [random.choice(range(x[0], x[1])) for x in ranges]
    elif fix_start is not None:
        frame_idxs = [x[0] + fix_start for x in ranges]
    elif sample == 'uniform':                                             
        frame_idxs = [(x[0] + x[1]) // 2 for x in ranges]
    else:
        raise NotImplementedError

    return frame_idxs

def sample_frames_clips(start, end, vlen, acc_samples):
    start = max(0, start)
    end = min(vlen, end)

    intervals = np.linspace(start=start, stop=end, num=int(acc_samples) + 1).astype(int)
    ranges = []
    for idx, interv in enumerate(intervals[:-1]):
        ranges.append((interv, intervals[idx + 1] - 1))
        frame_idxs = [(x[0] + x[1]) // 2 for x in ranges
                      ]
    return frame_idxs

def sample_frames_start_end(num_frames, start, end, sample='rand', fix_start=None):
    acc_samples = min(num_frames, end)
    intervals = np.linspace(start=start, stop=end, num=acc_samples + 1).astype(int)
    ranges = []
    for idx, interv in enumerate(intervals[:-1]):
        ranges.append((interv, intervals[idx + 1] - 1))
    if sample == 'rand':
        frame_idxs = [random.choice(range(x[0], x[1])) for x in ranges]
    elif fix_start is not None:
        frame_idxs = [x[0] + fix_start for x in ranges]
    elif sample == 'uniform':
        frame_idxs = [(x[0] + x[1]) // 2 for x in ranges]
    else:
        raise NotImplementedError

    return frame_idxs

decord.bridge.set_bridge("torch")

def read_frames_decord(video_path, num_frames, sample='rand', fix_start=None):
    video_reader = decord.VideoReader(video_path, num_threads=1)
    vlen = len(video_reader)
    frame_idxs = sample_frames(num_frames, vlen, sample=sample, fix_start=fix_start)
    video_reader.skip_frames(1)
    frames = video_reader.get_batch(frame_idxs)

    frames = frames.float() / 255
    frames = frames.permute(0, 3, 1, 2)
    return frames, frame_idxs

def read_frames_decord_start_end(video_path, start, end, num_frames):
    video_reader = decord.VideoReader(video_path, num_threads=1)
    vlen = len(video_reader)
    frame_idxs = sample_frames_clips(start, end, vlen, num_frames + 1)
    video_reader.skip_frames(1)
    frames = video_reader.get_batch(frame_idxs)

    frames = frames.float() / 255
    frames = frames.permute(0, 3, 1, 2)
    return frames, frame_idxs

def get_video_len(video_path):
    cap = cv2.VideoCapture(video_path)
    if not (cap.isOpened()):
        return False
    vlen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.release()
    return vlen

video_reader = {
#     'av': read_frames_av,
#     'cv2': read_frames_cv2,
#     'cv2_epic': read_frames_cv2_epic,
#     'cv2_charades': read_frames_cv2_charades,
#     'cv2_egoclip': read_frames_cv2_egoclip,
    'decord': read_frames_decord,
    'decord_start_end': read_frames_decord_start_end,
}

In [34]:
import os
import pdb
import sys
import json
import pandas as pd

from transforms import init_transform_dict, init_video_transform_dict

class SQDES(TextVideoDataset):
    def _load_metadata(self):
        split_files = {
            'train': 'train_v1.json',
#             'val': 'nlq_val.json',            # there is no test
#             'test': 'nlq_test_unannotated.json'
        }
        # target_split_fp = split_files[self.split]
        assert self.subsample in ['video', 'text',]

        self.metadata = pd.DataFrame(columns=['video_uid', 'clip_uid',
                                              'video_start_time', 'video_end_time',
                                              'query'])

        # for split in ['train', 'val']:
        target_split_fp = split_files[split]
        ann_file = os.path.join(self.meta_dir, target_split_fp)
        with open(ann_file) as f:
            anno_json = json.load(f)

        # forward clip features
        if self.subsample == 'video':
            for anno_video in anno_json["videos"]:
                for anno_clip in anno_video["clips"]:
                    for annotator in anno_clip["annotations"]:
                        for label in annotator["labels"]:
                            times = float(label["video_start_time"]), float(label["video_end_time"])
                            label_duration = times[1] - times[0]
                            query = None
                            if 'query' in label:
                                query = label['query']['query']
                                print("here")
                            new = pd.DataFrame({
                                'video_uid': anno_video['video_uid'],
                                'clip_uid': anno_clip['clip_uid'],
                                'video_start_time': times[0],
                                'video_end_time': times[1],
                                'query': query}, index=[1])
                            self.metadata = self.metadata.append(new, ignore_index=True)

        self.transforms = init_video_transform_dict()['test']

    def _get_video_path(self, sample):
        if self.data_source == "clip":
            rel_video_fp = sample["clip_uid"]
        else:
            rel_video_fp = sample["video_uid"]
        full_video_fp = os.path.join(self.data_dir, rel_video_fp + '.mp4')
        return full_video_fp, rel_video_fp
    
    def _get_caption(self, sample):
        caption = sample['query']
        return caption

    def _get_video_feats(self, item):
        sample = self.metadata.iloc [item]
        print(sample)
        video_fp, rel_fp = self._get_video_path(sample)

        fps = 1.87
        try:
            print(sample[2], sample[3])
            imgs, idxs = self.video_reader(video_fp, sample[2]*30, sample[3]*30,
                                               (sample[3]-sample[2]) * fps * self.video_params['num_frames'])
        except:
            print(f"Warning: missing video file {video_fp}.")

        if self.transforms is not None:
            imgs = imgs.transpose(0, 1)  # [T, C, H, W] ---> [C, T, H, W]
            imgs = self.transforms(imgs)
            imgs = imgs.transpose(0, 1)  # recover

        meta_arr = {'video_uid': sample[0], 'clip_uid': sample[1], 'data': video_fp}
        data = {'video': imgs, 'meta' : meta_arr}
        return data

    def _get_text_feats(self, item):
        sample = self.metadata.iloc [item]
        text = self._get_caption(sample)
        meta_arr = {'video_uid': sample[0], 'clip_uid': sample[1], 'dataset': self.dataset_name}
        data = {'text': text, 'meta' : meta_arr}
        return data

    def __getitem__(self, item):
        if self.subsample == 'video':
            return self._get_video_feats(item)
        if self.subsample == 'text':
            return self._get_text_feats(item)

split = 'train'
kwargs = dict(
    dataset_name="Ego4d_NLQ",
    text_params={
        "input": "text"
    },
    video_params={
        "input_res": 224,
        "num_frames": 4,
        "loading": "lax"
    },
    data_dir="/vision/group/ego4d/v1/clips/",
    meta_dir="/vision/u/eatang/sdas/datasets/",
    tsfms=init_video_transform_dict()['train'],
    reader='decord_start_end',
    subsample='video',
    split=split,
)
dataset = SQDES(**kwargs)
print(len(dataset))
# for i in range(1000):
#     item = dataset[i]
#     # print(item.keys())
#     print(item)

Video Transform is used!
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here


  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_inde

here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
Video Transform is used!
278


  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_inde

In [35]:
dataset.metadata['query']

0      Next time I use my phone after arranging the d...
1      Next time I'm watering the flowers, please rem...
2      Next time I'm cleaning or wiping a table or ki...
3      Next time I'm walking up the stairs, please re...
4      Next time I'm arranging and organizing items, ...
                             ...                        
273    Next time I'm about to request and eat a food ...
274    Next time I'm conversing or interacting with s...
275    Next time I see someone serving food onto a pl...
276    Next time I see person Y drinking from a cup, ...
277    Next time I'm conversing with someone, after I...
Name: query, Length: 278, dtype: object

In [36]:
def dataset_loader(dataset_name,
                   text_params,
                   video_params,
                   data_dir,
                   meta_dir=None,
                   split='train',
                   tsfms=None,
                   cut=None,
                   subsample=1,
                   sliding_window_stride=-1,
                   reader='decord',
                   neg_param=None):
    kwargs = dict(
        dataset_name=dataset_name,
        text_params=text_params,
        video_params=video_params,
        data_dir=data_dir,
        meta_dir=meta_dir,
        split=split,
        tsfms=tsfms,
        cut=cut,
        subsample=subsample,
        sliding_window_stride=sliding_window_stride,
        reader=reader,
        neg_param=neg_param,
    )

    dataset = SQDES(**kwargs)

    return dataset

class TextVideoDataLoader(BaseDataLoaderExplicitSplit):
    def __init__(self,
                 dataset_name,
                 text_params,
                 video_params,
                 data_dir,
                 meta_dir=None,
                 split='train',
                 tsfm_params=None,
                 tsfm_split=None,
                 cut=None,
                 subsample=1,
                 sliding_window_stride=-1,
                 reader='decord',
                 neg_param=None,
                 batch_size=1,
                 num_workers=1,
                 shuffle=True):
        if tsfm_params is None:
            tsfm_params = {}
        if video_params['num_frames'] > 1:
            # video data can not do flip, crop aug
            tsfm_dict = init_video_transform_dict(**tsfm_params)
        else:
            tsfm_dict = init_transform_dict(**tsfm_params)
        if tsfm_split is None:
            tsfm_split = split
        tsfm = tsfm_dict[tsfm_split]
        dataset = dataset_loader(dataset_name, text_params, video_params, data_dir, meta_dir, split, tsfm, cut,
                                 subsample, sliding_window_stride, reader, neg_param)

        super().__init__(dataset, batch_size, shuffle, num_workers)
        self.dataset_name = dataset_name

In [26]:
loader = TextVideoDataLoader(
    dataset_name="Ego4D_NLQ",
    text_params={"input": "text"},
    video_params={"input_res": 224, "num_frames": 4, "loading": "lax"},
    data_dir="/vision/group/ego4d/v1/clips",
    meta_dir="/vision/u/eatang/sdas/datasets/",
    split="train",
    subsample="video",
    reader="decord_start_end",
    batch_size=32,
    num_workers=16,
    shuffle=True
)

Video Transform is used!
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here


  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_index=True)
  self.metadata = self.metadata.append(new, ignore_inde

In [38]:
dataset.__getitem__(0)["video"].shape

video_uid                        dd08bc58-b614-4ba7-b883-a213560621dd
clip_uid                         9df49083-577b-43f9-9874-6e4b21f104b4
video_start_time                                            347.62496
video_end_time                                                  349.0
query               Next time I use my phone after arranging the d...
Name: 0, dtype: object
347.62496 349.0


torch.Size([11, 3, 224, 224])

In [22]:
missing_clips = set()
here_count = 0
for clip_uid in dataset.metadata["clip_uid"]:
    if os.path.exists(os.path.join("/vision/group/ego4d/v1/clips/", clip_uid + ".mp4")):
        here_count += 1
        print("here")
    else:
        missing_clips.add(clip_uid)

here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here


In [31]:
"77be1058-8ecb-49b0-97ce-5acac6fd5a43" in list(dataset.metadata["clip_uid"])

True

In [33]:
 os.path.exists(os.path.join("/vision/group/ego4d/v1/clips/", "77be1058-8ecb-49b0-97ce-5acac6fd5a43" + ".mp4"))

True

In [21]:
for idx, batch in enumerate(loader):
    break
    

video_uid                        dddce8ac-09b0-4b13-b0fa-eb18e1f27b21
clip_uid                         77be1058-8ecb-49b0-97ce-5acac6fd5a43
video_start_time                                          1709.082923
video_end_time                                            1710.714333
query               Next time I use my phone, please remind me to ...
Name: 181, dtype: objectvideo_uid                        2c54abd0-aa4f-4b0a-9e78-c45aca156af0
clip_uid                         94bf098c-6430-438c-9c71-6fd4f02153c4
video_start_time                                           178.890215
video_end_time                                             183.194185
query               Next time I'm taking a photo or recording a vi...
Name: 90, dtype: objectvideo_uid           59dac266-4b43-43b1-9fe3-6013ade33eb6
clip_uid            fbafc354-121f-40f1-a553-bbae0d2f66ed
video_start_time                              724.469249
video_end_time                                728.469029
query                    

UnboundLocalError: Caught UnboundLocalError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/viscam/u/eatang/miniconda3/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "/viscam/u/eatang/miniconda3/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 58, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/viscam/u/eatang/miniconda3/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 58, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/tmp/eatang/ipykernel_466762/3532354571.py", line 93, in __getitem__
    return self._get_video_feats(item)
  File "/tmp/eatang/ipykernel_466762/3532354571.py", line 76, in _get_video_feats
    imgs = imgs.transpose(0, 1)  # [T, C, H, W] ---> [C, T, H, W]
UnboundLocalError: local variable 'imgs' referenced before assignment


video_uid           65d56a26-3556-491a-880a-c0fbb581f1bf
clip_uid            135c81f0-56cc-4195-844a-af3a562f3307
video_start_time                             1729.271021
video_end_time                               1735.295031
query                                               None
Name: 236, dtype: object
video_uid                        9d43acb7-9a94-4a80-a461-63f074c6f865
clip_uid                         77e05256-299f-4703-81f9-d3c1da63c70e
video_start_time                                           303.849385
video_end_time                                             348.225695
query               Next time I'm operating an ATM machine, please...
Name: 203, dtype: object
video_uid                        59dac266-4b43-43b1-9fe3-6013ade33eb6
clip_uid                         1c861004-cfd1-4497-8e32-5cb4ad1ff276
video_start_time                                           337.614499
video_end_time                                             393.924029
query               Next time I'm f

Traceback (most recent call last):
  File "/viscam/u/eatang/miniconda3/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/viscam/u/eatang/miniconda3/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/viscam/u/eatang/miniconda3/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/viscam/u/eatang/miniconda3/lib/python3.10/site-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/viscam/u/eatang/miniconda3/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 711, in start
    self.io_loop.start()
  File "/viscam/u/eatang/miniconda3/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 215, in start
    self.asyncio_loop.run_forever()
  File "/viscam/u/eatang/miniconda3/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
    self._run_once()
  File "/viscam/u/eat

In [51]:
dataset.

<__main__.NaturalLanguageQueries at 0x7f7a90f6e440>

In [9]:
import json

In [10]:
with open('/vision/u/eatang/sdas/datasets/train_v1.json', "r") as f:
    train_v1 = json.load(f)

In [32]:
for video in train_v1["videos"]:
    clip_idx = 0
    annotator_idx = 0
    label_idx = 0
    
    clip_uid = video["clips"][clip_idx]['clip_uid']
    start = video["clips"][clip_idx]['annotations'][annotator_idx]['labels'][0]['start_time']
    end = video["clips"][clip_idx]['annotations'][annotator_idx]['labels'][0]['end_time']
    query = video["clips"][clip_idx]['annotations'][annotator_idx]['labels'][0]['query']
    break

In [38]:
x = read_frames_decord_start_end(os.path.join('/vision/group/ego4d/v1/clips/', clip_uid + '.mp4'), start, end, 4)


In [48]:
query

{'event_has_occurred_before': True,
 'event': 'use_phone',
 'event_readable': '#c uses the phone.',
 'request': 'reminder to check emails.',
 'query': 'Next time I use my phone after arranging the documents and cleaning my work station, please remind me to check my emails.',
 'ans': 'Remember to check your emails.'}

In [49]:
x[1]

[346, 347, 347, 347, 348]

In [50]:
x[0][2]

tensor([[[0.2863, 0.2863, 0.2863,  ..., 0.3490, 0.3490, 0.3490],
         [0.2863, 0.2863, 0.2863,  ..., 0.3490, 0.3490, 0.3490],
         [0.2863, 0.2863, 0.2863,  ..., 0.3490, 0.3490, 0.3490],
         ...,
         [0.1922, 0.1922, 0.1922,  ..., 0.0706, 0.0706, 0.0706],
         [0.1922, 0.1922, 0.1922,  ..., 0.0745, 0.0706, 0.0706],
         [0.1922, 0.1922, 0.1922,  ..., 0.0784, 0.0784, 0.0745]],

        [[0.2510, 0.2510, 0.2510,  ..., 0.3529, 0.3529, 0.3529],
         [0.2510, 0.2510, 0.2510,  ..., 0.3529, 0.3529, 0.3529],
         [0.2510, 0.2510, 0.2510,  ..., 0.3529, 0.3529, 0.3529],
         ...,
         [0.1765, 0.1765, 0.1765,  ..., 0.0627, 0.0627, 0.0627],
         [0.1765, 0.1765, 0.1765,  ..., 0.0667, 0.0627, 0.0627],
         [0.1765, 0.1765, 0.1765,  ..., 0.0706, 0.0706, 0.0667]],

        [[0.2275, 0.2275, 0.2275,  ..., 0.3216, 0.3216, 0.3216],
         [0.2275, 0.2275, 0.2275,  ..., 0.3216, 0.3216, 0.3216],
         [0.2275, 0.2275, 0.2275,  ..., 0.3216, 0.3216, 0.