In [1]:
import sys
import copy
from glob import glob
import math
import os
import json 
import csv

import torch
from torch.utils.data import DataLoader

import nvvl

In [2]:
class NVVL():
    def __init__(self, num_frames, image_shape, is_cropped, crop_size, metadata,
                 batchsize=1, device_id=0, shuffle=False, distributed=False, fp16=False):
        self.batchsize = batchsize
        self.shuffle = shuffle
        self.distributed = distributed
        self.frames = frames
        self.device_id = device_id

        self.is_cropped = is_cropped
        self.crop_size = crop_size

        self.files = {}
        
        self.class_map = {}
        self.labels = {}
        
        fin = open(metadata, 'r')
        r = csv.reader(fin)
        for row in r:
            if row[1] not in self.class_map:
                self.class_map[row[1]] = int(row[2])
                
            for frame_no in range(int(row[3]),int(row[4])+1):
                self.labels[row[0]] = {frame_no:int(row[2])}
            
            if row[0] not in self.files:
                self.files[row[0]] = 1
            
        fin.close()
            
        self.class_map['background'] = len(self.class_map.keys())

        if fp16:
            tensor_type = 'half'
        else:
            tensor_type = 'float'
            
        self.image_shape = image_shape
        
        height = min(self.image_shape[0], self.crop_size[0])
        width = min(self.image_shape[1], self.crop_size[1])
        
        processing = {"input": nvvl.ProcessDesc(type=tensor_type, height=height, width=width,
                                               random_crop=self.is_cropped, random_flip=False,
                                               normalized=True, color_space="RGB", dimension_order="cfhw")}
        
        dataset = nvvl.VideoDataset(list(self.files.keys())[:100],
                                   sequence_length=self.frames,
                                   device_id=self.device_id,
                                   processing=processing, 
                                   get_label=self.get_label)
        
        self.loader = nvvl.VideoLoader(dataset, batch_size=self.batchsize, shuffle=self.shuffle, distributed=self.distributed) 
        
    def __len__(self):
        return len(self.loader)
    
    def __iter__(self):
        return iter(self.loader)
    
    def get_label(self, filename, frame_num, rand_changes):
        print (filename, frame_num)
        if frame_num in self.labels[filename]:
            print ('here')
            return self.labels[filename][frame_num]
        else:
            print ('not here')
            return self.class_map['background']

In [3]:
frames = 64
is_cropped = True
image_shape = [256, 256]
crop_size = [224, 224]
video_path = '../datasets/activity_net/processed_videos/'
train_csv = '../datasets/activity_net/train.csv'
val_csv = '../datasets/activity_net/val.csv'
batchsize = 1
shuffle = False
distributed = False
device_id = 0
fp16 = False
train_loader = NVVL(frames, image_shape, is_cropped, crop_size, train_csv,
                      batchsize, shuffle, distributed, device_id, fp16)
# val_loader = NVVL(frames, image_shape, is_cropped, crop_size, val_csv,
#                       batchsize, shuffle, distributed, device_id, fp16)

In [None]:
i = 0
train_iter = train_loader.__iter__()
t = next(train_iter)

# for v in val_loader:
#     print (v['input'].size())
#     break

../datasets/activity_net/processed_videos/v_sJFgo9H6zNo.mp4 0
not here
../datasets/activity_net/processed_videos/v_sJFgo9H6zNo.mp4 1
not here
../datasets/activity_net/processed_videos/v_sJFgo9H6zNo.mp4 2
not here
../datasets/activity_net/processed_videos/v_sJFgo9H6zNo.mp4 3
not here


In [None]:
t

In [9]:
train_batches = len(train_loader)
print (train_batches)

25454


[0;31mInit signature:[0m [0mnvvl[0m[0;34m.[0m[0mVideoDataset[0m[0;34m([0m[0mfilenames[0m[0;34m,[0m [0msequence_length[0m[0;34m,[0m [0mdevice_id[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m [0mget_label[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mprocessing[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mlog_level[0m[0;34m=[0m[0;34m'warn'[0m[0;34m)[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
VideoDataset

Parameters
----------
filenames : collection of strings
    list of video files to draw from

sequence_length : int
    how many frames are in each sample

device_id : int, optional
    GPU device to use (Default: 0)

get_label : callable, optional
    callable with signature:
        (filename : str, frame_num : int) : anything
    The returned value is simply passed as an output
    alongside any returned frames.
    If None, label returned is None. (Default: None)

processing : dict {string -> ProcessDesc}, optional
    Describes processing to be done on the

In [1]:
fin = open('../datasets/activity_net/dataset.stats', 'r')
info = {}
this = []
for line in fin.readlines():
    if 'Input' in line:
        this.append(line.split(' ')[-1][3:-3])
    if 'Stream' in line and 'Video' in line:
        splits = line.split(' ')
        for split in splits:
            if 'x' in split and ')' not in split:
                this.append(split)
                break 
                
        for i in range(len(splits)):
            if 'fps' in splits[i]:
                this.append(splits[i-1])
                break
                
        info[this[0]] = this[1:-1]
        this = []

In [20]:
list(info.keys())[:10]

['v_sJFgo9H6zNo.mp4',
 'v_V1zhqaGFY2A.mp4',
 'v_JDg--pjY5gg.mp4',
 'v_KsFid_YVsn0.mp4',
 'v_-TmWR_keSfI.mp4',
 'v_u2uoYvo8J5s.mp4',
 'v_a-6rpItrRSk.mp4',
 'v_--0edUL8zmA.mp4',
 'v_c_NlYvL96y0.mp4',
 'v_hHiPEAiYKv0.mp4']

In [22]:
diff_codecs = {}
diff_dims = {}
diff_fps = {}
for k in info.keys():
    req_info = info[k]
    if req_info[0] not in diff_codecs:
        diff_dims[req_info[0]] = 1
        
    if req_info[-1]  not in diff_codecs:
        diff_fps[req_info[-1]] = 1

In [58]:
videos = os.listdir('../datasets/activity_net/videos/')

In [65]:
fout = open('../datasets/activity_net/process_videos.sh', 'w')
i = 1
for video in videos:
    fout.write('ffmpeg -i videos/' + video +' -s 256x256 -c:a copy processed_videos/' + video + ' & \n')
    if i % 44 == 0:
        fout.write('wait\n')
    
    i += 1
fout.close()

In [4]:
nvvl.VideoDataset?

[0;31mInit signature:[0m [0mnvvl[0m[0;34m.[0m[0mVideoDataset[0m[0;34m([0m[0mfilenames[0m[0;34m,[0m [0msequence_length[0m[0;34m,[0m [0mdevice_id[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m [0mget_label[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mprocessing[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mlog_level[0m[0;34m=[0m[0;34m'warn'[0m[0;34m)[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
VideoDataset

Parameters
----------
filenames : collection of strings
    list of video files to draw from

sequence_length : int
    how many frames are in each sample

device_id : int, optional
    GPU device to use (Default: 0)

get_label : callable, optional
    callable with signature:
        (filename : str, frame_num : int) : anything
    The returned value is simply passed as an output
    alongside any returned frames.
    If None, label returned is None. (Default: None)

processing : dict {string -> ProcessDesc}, optional
    Describes processing to be done on the

In [1]:
import nvvl
def get_label(x,y,z):
    return 1
d = nvvl.VideoDataset(['../datasets/activity_net/processed_videos/v_-_gDSRlC1kg.mp4'], 3, processing={'a': nvvl.ProcessDesc()}, get_label=lambda x,y: (x,y))
fr = d[0]
print(type(fr))
print(fr['a'].shape)
print(fr['labels'])
for x in [50, 121, 31, 50]:
  print(d[x]['labels'])

TypeError: <lambda>() takes 2 positional arguments but 3 were given