In [1]:
import os

# Work from the scratch volume so the relative dataset paths defined below resolve there.
os.chdir('/scratch/sagarsj42')
# Set before `transformers` is imported so its download cache lands on the same volume.
os.environ['TRANSFORMERS_CACHE'] = '/scratch/sagarsj42'

In [2]:
import time
import concurrent.futures

import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import VideoMAEImageProcessor, VideoMAEConfig

import pytorchvideo.data
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    UniformTemporalSubsample,
    Normalize
)

from torchvision.transforms import (
    Compose,
    Lambda,
    Resize
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Root of the per-split metadata; files are read as <dir>/<split>/<filename>.
DATASET_INFO_DIR = './yt8m-clips-dataset-info'
# JSON-lines metadata files (loaded with pd.read_json(..., lines=True)).
CLIP_INFO_FILENAME = 'clip-info.jsonl'
VID_INFO_FILENAME = 'video-info.jsonl'
# Input clip directories, accessed as <dir>/<split>/...
AUDIO_CLIPS_DIR = './yt8m-audio-clips'
VIDEO_CLIPS_DIR = './yt8m-video-clips'
# Output directories that receive the saved .npy arrays.
AUDIO_FEATURES_DIR = './yt8m-audio-features'
VIDEO_FEATURES_DIR = './yt8m-video-features'

In [28]:
# Checkpoint id used to load the image processor and the model config.
model_key = 'MCG-NJU/videomae-base'
# Clip length handed to the pytorchvideo 'uniform' clip sampler (in seconds).
clip_duration = 8.0
# Split used by the exploratory cells; the save_* functions take their own `split` argument.
split = 'test'

In [29]:
def int16_to_float32(x):
    """Scale int16-range sample values down to float32.

    Args:
        x: NumPy array of sample values.

    Returns:
        ``x / 32767.0`` cast to ``np.float32``.
    """
    scaled = x / 32767.0
    return scaled.astype(np.float32)


def float32_to_int16(x):
    """Convert float samples to int16, saturating outside [-1, 1].

    Args:
        x: NumPy array of float sample values.

    Returns:
        The values clipped to [-1, 1], multiplied by 32767 and cast to
        ``np.int16``.
    """
    bounded = np.clip(x, -1., 1.)
    scaled = bounded * 32767.
    return scaled.astype(np.int16)


def print_video_info(video):
    """Print one line per entry of a video sample mapping.

    The 'video' entry is summarised by its ``.shape``; every other entry is
    printed as its raw value.

    Args:
        video: Mapping from field name to value, as yielded by the dataset.

    Returns:
        None.
    """
    for key in video:
        value = video[key]
        print(key, value.shape if key == 'video' else value)

In [37]:
# Per-video metadata for the selected split (one JSON object per line).
vid_df = pd.read_json(os.path.join(DATASET_INFO_DIR, split, VID_INFO_FILENAME), lines=True)

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
vid_df.info()

vid_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 461 entries, 0 to 460
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   vid          461 non-null    object 
 1   n_clips      461 non-null    int64  
 2   audio_dur    461 non-null    float64
 3   video_dur    461 non-null    float64
 4   split        461 non-null    object 
 5   labels       461 non-null    object 
 6   title        461 non-null    object 
 7   description  461 non-null    object 
 8   tags         461 non-null    object 
dtypes: float64(2), int64(1), object(6)
memory usage: 32.5+ KB
None


Unnamed: 0,vid,n_clips,audio_dur,video_dur,split,labels,title,description,tags
0,ZKBM2XCWfo8,28,224.816,224.69,test,"[Piano, Pianist, Musical keyboard, Electronic ...",星から降る金 /ミュージカル【モーツァルト！】より カラオケ ピアノ伴奏[フルート],フルーティストのmkharu2さんとコラボさせていただきました♪ ぜひご覧ください！http...,新規プロジェクト
1,gOwJkB7AJ6Y,55,439.287,439.21,test,"[Concert, Musician, Musical ensemble, Drum kit...",Bill Ward Band - It's Alright - live 1997,"Rearranged, but recognizable and quite good.","Bill Ward (Musical Artist),it's alright live"
2,SXbvwzurIRA,26,205.032,204.88,test,"[Musician, Choir]",Nederland Zingt: Ik bouw op U,Op onze God kunnen wij bouwen. Daar worden wij...,"Nederland Zingt,NZD,NZ,NeZi,Christelijke muzie..."
3,DerkdSwHqT0,26,209.2,209.2,test,[Music video],Plain White T’s – Pause (Official Lyrics video...,"Sing along to Pause, the brand new single from...","lipton,lipton ice tea,lipton iced tea,ice tea,..."
4,PMrgYXKfZC0,28,222.447,222.29,test,[Music video],EXO - HURT [FANMADE] MV,,


In [38]:
# Per-clip metadata for the selected split (one JSON object per line).
clip_df = pd.read_json(os.path.join(DATASET_INFO_DIR, split, CLIP_INFO_FILENAME), lines=True)

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
clip_df.info()

clip_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14806 entries, 0 to 14805
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   vid              14806 non-null  object 
 1   clip_no          14806 non-null  int64  
 2   audio_clip_name  14806 non-null  object 
 3   audio_clip_dur   14806 non-null  float64
 4   video_clip_name  14806 non-null  object 
 5   video_clip_dur   14806 non-null  float64
dtypes: float64(2), int64(1), object(3)
memory usage: 694.2+ KB
None


Unnamed: 0,vid,clip_no,audio_clip_name,audio_clip_dur,video_clip_name,video_clip_dur
0,ZKBM2XCWfo8,21,ZKBM2XCWfo8-audio-21.mp3,8.0,ZKBM2XCWfo8-video-21.mp4,8.01
1,ZKBM2XCWfo8,20,ZKBM2XCWfo8-audio-20.mp3,8.0,ZKBM2XCWfo8-video-20.mp4,8.01
2,ZKBM2XCWfo8,22,ZKBM2XCWfo8-audio-22.mp3,8.0,ZKBM2XCWfo8-video-22.mp4,8.01
3,ZKBM2XCWfo8,23,ZKBM2XCWfo8-audio-23.mp3,8.0,ZKBM2XCWfo8-video-23.mp4,8.01
4,ZKBM2XCWfo8,27,ZKBM2XCWfo8-audio-27.mp3,8.0,ZKBM2XCWfo8-video-27.mp4,8.01


In [8]:
# Load the preprocessing settings published with the checkpoint; its
# image_mean, image_std and size are reused to build the video transform.
image_processor = VideoMAEImageProcessor.from_pretrained(model_key)

# Bare expression: display the loaded settings.
image_processor

VideoMAEImageProcessor {
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "feature_extractor_type": "VideoMAEFeatureExtractor",
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "VideoMAEImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}

In [9]:
# Load only the model configuration (no weights); its num_frames value sets
# how many frames are sampled per clip.
model_config = VideoMAEConfig.from_pretrained(model_key)

# Bare expression: display the loaded configuration.
model_config

VideoMAEConfig {
  "architectures": [
    "VideoMAEForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.0,
  "decoder_hidden_size": 384,
  "decoder_intermediate_size": 1536,
  "decoder_num_attention_heads": 6,
  "decoder_num_hidden_layers": 4,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "videomae",
  "norm_pix_loss": true,
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_frames": 16,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.29.2",
  "tubelet_size": 2,
  "use_mean_pooling": false
}

In [10]:
# Normalisation statistics are taken straight from the pretrained image processor.
image_mean, image_std = image_processor.image_mean, image_processor.image_std

# Two layouts of `size` are handled: explicit 'height'/'width' entries, or a
# single 'shortest_edge' value that is then used for both sides.
if 'shortest_edge' not in image_processor.size:
    height = image_processor.size['height']
    width = image_processor.size['width']
else:
    height = width = image_processor.size['shortest_edge']
resize_to = height, width

# Number of frames per clip, as declared by the model configuration.
num_frames_to_sample = model_config.num_frames

In [11]:
# Preprocessing applied to the 'video' entry of every sample, in order:
# subsample to `num_frames_to_sample` frames, scale pixel values by 1/255,
# normalise with the processor's mean/std, then resize to `resize_to`.
video_transform = Compose(
    [
        ApplyTransformToKey(
            key='video',
            transform=Compose(
                [
                    UniformTemporalSubsample(num_frames_to_sample),
                    Lambda(lambda x: x/255.0),
                    Normalize(image_mean, image_std),
                    Resize(resize_to)
                ]
            )
        )
    ]
)

# Dataset over the clip files of the current split: audio decoding is off and
# clips are drawn by a 'uniform' sampler of length `clip_duration`.
video_dataset = pytorchvideo.data.Ucf101(
    data_path=os.path.join(VIDEO_CLIPS_DIR, split),
    clip_sampler=pytorchvideo.data.make_clip_sampler('uniform', clip_duration),
    decode_audio=False,
    transform=video_transform
)

# Bare expression: show the transform and how many videos the dataset found.
video_transform, video_dataset.num_videos

(Compose(
     <pytorchvideo.transforms.transforms.ApplyTransformToKey object at 0x7f38a01f1a90>
 ),
 16854)

In [12]:
# Pull a single sample from a fresh iterator over the dataset and list its fields.
sample = next(iter(video_dataset))
print_video_info(sample)

video torch.Size([3, 16, 224, 224])
video_name IAfT-CBJD7E-video-24.mp4
video_index 4398
clip_index 0
aug_index 0
label 131


In [13]:
%%timeit
# Timed body: build a fresh iterator and fetch (decode + transform) its first sample.
next(iter(video_dataset))

289 ms ± 33.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
# read each clip, dump librosa output
row = clip_df.iloc[0]
vid = row['vid']
clip_no = row['clip_no']
audio_clip_filename = row['audio_clip_name']
audio_clip_filepath = os.path.join(AUDIO_CLIPS_DIR, split, vid, audio_clip_filename)
audio_data, _ = librosa.load(audio_clip_filepath, sr=48000)

audio_data.shape

(384001,)

In [15]:
%%timeit
# Timed body: decode one audio clip at 48 kHz (uses `audio_clip_filepath` from the previous cell).
audio_data, _ = librosa.load(audio_clip_filepath, sr=48000)

19.1 ms ± 62.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
# Dry run of the video export on the first 11 samples (i = 0..10) of the current split.
# NOTE: `vid` and `video_features_filename` stay bound after the loop and are
# reused by the cells that reload the saved array.
os.makedirs(os.path.join(VIDEO_FEATURES_DIR, split), exist_ok=True)
for i, video_sample in enumerate(iter(video_dataset)):
    if i > 10:
        break
    video_clip_filename = video_sample['video_name']
    # The first 11 characters of the clip file name are taken as the video id.
    vid = video_clip_filename[:11]
    # Swap the first two tensor axes and store at half precision.
    video_frames = video_sample['video'].permute(1, 0, 2, 3).numpy().astype(np.float16)
    
    os.makedirs(os.path.join(VIDEO_FEATURES_DIR, split, vid), exist_ok=True)
    # Drop the 4-character extension and rename '-video-' to '-vidfeat-' for the output file.
    video_features_filename = video_clip_filename[:-4].replace('-video-', '-vidfeat-') + '.npy'
    np.save(os.path.join(VIDEO_FEATURES_DIR, split, vid, video_features_filename), video_frames)

In [17]:
# Reload the last array written by the dry-run loop (relies on `vid` and
# `video_features_filename` left over from it) and check its shape.
np.load(os.path.join(VIDEO_FEATURES_DIR, split, vid, video_features_filename)).shape

(16, 3, 224, 224)

In [18]:
%%timeit
# Timed body: read one saved feature file back from disk.
np.load(os.path.join(VIDEO_FEATURES_DIR, split, vid, video_features_filename))

616 µs ± 3.45 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [19]:
def save_video_features(split):
    """Export the preprocessed frames of every clip in one split as ``.npy`` files.

    A pytorchvideo dataset is built over ``VIDEO_CLIPS_DIR/<split>`` using the
    notebook-level ``video_transform`` and ``clip_duration``. Each sample is
    written to ``VIDEO_FEATURES_DIR/<split>/<vid>/``, where ``<vid>`` is the
    first 11 characters of the clip file name; the output file name is the
    clip name without its last four characters, with ``'-video-'`` replaced by
    ``'-vidfeat-'`` and ``'.npy'`` appended.

    Args:
        split: Name of the split sub-directory to process.

    Returns:
        None. A progress line is printed every 1000 samples.
    """
    dataset = pytorchvideo.data.Ucf101(
        data_path=os.path.join(VIDEO_CLIPS_DIR, split),
        clip_sampler=pytorchvideo.data.make_clip_sampler('uniform', clip_duration),
        decode_audio=False,
        transform=video_transform,
    )

    split_out_dir = os.path.join(VIDEO_FEATURES_DIR, split)
    os.makedirs(split_out_dir, exist_ok=True)

    for count, item in enumerate(dataset):
        if count > 0 and count % 1000 == 0:
            print(split, count, 'videos')

        clip_name = item['video_name']
        # Swap the first two tensor axes and store at half precision.
        frames = item['video'].permute(1, 0, 2, 3).numpy().astype(np.float16)

        vid_out_dir = os.path.join(split_out_dir, clip_name[:11])
        os.makedirs(vid_out_dir, exist_ok=True)

        out_name = clip_name[:-4].replace('-video-', '-vidfeat-') + '.npy'
        np.save(os.path.join(vid_out_dir, out_name), frames)

In [None]:
# Export video features for all splits concurrently, one worker thread per split.
start = time.time()
data_splits = ['dev', 'test', 'train1', 'train2', 'train3', 'train4', 'train5']
with concurrent.futures.ThreadPoolExecutor(max_workers=len(data_splits)) as pool:
    # Keep the futures in a list: a lazy generator handed to wait() is consumed
    # there, leaving no handle on the futures to inspect afterwards.
    futures = [pool.submit(save_video_features, current_split)
               for current_split in data_splits]
    concurrent.futures.wait(futures)
    for future in futures:
        # result() re-raises any exception from the worker; without this call
        # a failed split would pass silently.
        future.result()

print('Time taken:', time.time() - start, 's')

In [43]:
def save_audio_features(split, sample_rate=48000):
    """Decode every audio clip of one split and store the waveform as ``.npy``.

    Metadata is read from ``DATASET_INFO_DIR``; any split whose name contains
    ``'train'`` (e.g. ``'train1'``) uses the files stored under ``'train'``.
    Videos are selected by matching the ``split`` column of the video table,
    and their clips are looked up by ``vid`` in the clip table. Each clip is
    loaded from ``AUDIO_CLIPS_DIR/<split>/<vid>/`` and written to
    ``AUDIO_FEATURES_DIR/<split>/<vid>/`` under the clip name without its last
    four characters, with ``'-audio-'`` replaced by ``'-audfeat-'`` and
    ``'.npy'`` appended.

    Args:
        split: Name of the split to process.
        sample_rate: Target sampling rate passed to ``librosa.load``
            (defaults to 48000, the value previously hard-coded).

    Returns:
        None. The video count is printed first, then a progress line every
        50 videos.
    """
    # All train shards share the metadata stored under 'train'.
    info_split = 'train' if 'train' in split else split
    clip_df = pd.read_json(os.path.join(DATASET_INFO_DIR, info_split, CLIP_INFO_FILENAME), lines=True)
    vid_df = pd.read_json(os.path.join(DATASET_INFO_DIR, info_split, VID_INFO_FILENAME), lines=True)
    vids = vid_df[vid_df['split'] == split]['vid'].tolist()

    # Group clip names by video once, instead of re-filtering the whole clip
    # table for every video. sort=False keeps the table's row order.
    clips_by_vid = {
        vid: names.tolist()
        for vid, names in clip_df.groupby('vid', sort=False)['audio_clip_name']
    }

    split_out_dir = os.path.join(AUDIO_FEATURES_DIR, split)
    os.makedirs(split_out_dir, exist_ok=True)

    print(split, len(vids))

    for i, vid in enumerate(vids):
        if i > 0 and i % 50 == 0:
            print(split, i, 'audios')

        audio_filenames = clips_by_vid.get(vid, [])
        if not audio_filenames:
            # No clips listed for this video: nothing to write, no directory created.
            continue

        # The per-video directories do not change between clips, so build them once.
        vid_in_dir = os.path.join(AUDIO_CLIPS_DIR, split, vid)
        vid_out_dir = os.path.join(split_out_dir, vid)
        os.makedirs(vid_out_dir, exist_ok=True)

        for clip_file_name in audio_filenames:
            audio_data, _ = librosa.load(os.path.join(vid_in_dir, clip_file_name), sr=sample_rate)
            audio_features_filename = clip_file_name[:-4].replace('-audio-', '-audfeat-') + '.npy'
            np.save(os.path.join(vid_out_dir, audio_features_filename), audio_data)

    return

In [44]:
# Export audio features for all splits concurrently, one worker thread per split.
start = time.time()
data_splits = ['dev', 'test', 'train1', 'train2', 'train3', 'train4', 'train5']
with concurrent.futures.ThreadPoolExecutor(max_workers=len(data_splits)) as pool:
    # Keep the futures in a list: a lazy generator handed to wait() is consumed
    # there, leaving no handle on the futures to inspect afterwards.
    futures = [pool.submit(save_audio_features, current_split)
               for current_split in data_splits]
    concurrent.futures.wait(futures)
    for future in futures:
        # result() re-raises any exception from the worker; without this call
        # a failed split would pass silently.
        future.result()

print('Time taken:', time.time() - start, 's')

test 461
dev 499
train3 958
train1 903
train5 1039
train4 959
train2 972
train1 50 videos
train2 50 videos
train1 100 videos
train3 50 videos
train2 100 videos
train1 150 videos
train2 150 videos
train1 200 videos
train3 100 videos
train1 250 videos
train2 200 videos
train3 150 videos
train1 300 videos
train2 250 videos
train1 350 videos
train3 200 videos
train5 50 videos
train4 50 videos
train2 300 videos
train1 400 videos
train3 250 videos
train1 450 videos
train2 350 videos
train4 100 videos
train1 500 videos
train5 100 videos
train2 400 videos
train3 300 videos
train1 550 videos
train2 450 videos
train1 600 videos
train3 350 videos
train4 150 videos
train2 500 videos
train1 650 videos
train5 150 videos
train3 400 videos
train1 700 videos
train2 550 videos
train4 200 videos
train1 750 videos
train3 450 videos
train2 600 videos
train1 800 videos
train5 200 videos
train2 650 videos
train1 850 videos
train4 250 videos
train3 500 videos
train1 900 videos
train2 700 videos
train3 550 vid