# YouTubeDataset Examples


In [9]:
import torch
import torchvision.transforms as vTransforms

import torchtext
import torchtext.vocab
import torchtext.data


import numpy as np


from YouTubeDataset import YouTubeDataset
from YouTubeDataset import transforms as ytTransforms

# YouTubeDataset requires a Google Cloud API Key with YouTube Data API V3 access to
# download a dataset
#
# API Key from https://console.cloud.google.com/apis/credentials
# 

from MYAPIKEY import API_KEY

In [2]:
def show(o):
    """Print a dataset's field names and a summary of its first item or batch.

    Parameters
    ----------
    o : torch.utils.data.IterableDataset or torch.utils.data.DataLoader
        Object to preview. For a ``DataLoader`` the field names are read from
        ``o.dataset.fields`` and one batch is summarised; for an
        ``IterableDataset`` they are read from ``o.fields`` and one item is
        summarised. Any other object is ignored and nothing is printed.

    Returns
    -------
    None
        The summary is written to stdout. Values whose ``np.shape`` is empty
        (scalars, strings) are printed as-is; everything else is represented
        by its shape.
    """
    if isinstance(o, torch.utils.data.DataLoader):
        ds, dl = o.dataset, o
    elif isinstance(o, torch.utils.data.IterableDataset):
        ds = dl = o
    else:
        # Unsupported input: keep the original "do nothing" behaviour.
        return

    print(ds.fields)

    # Only the first item/batch is needed for a preview, so stop after one.
    for first in dl:
        for name, value in zip(ds.fields, first):
            shape = np.shape(value)
            if len(shape) == 0:
                print(name, value)
            else:
                print(name, shape)
        break

## Default Dataset

Video frames as images, timestamp


In [3]:
# `YouTubeDataset` and `API_KEY` are imported in the first cell of the notebook,
# so nothing needs to be re-imported here.
#
# Build the 'train' split of the 'SecretLocation' dataset under ./data.
# `download=True` together with `api_key`/`user_name` requests the videos of
# the given YouTube user; `splits` gives the train/test proportions.
ds0 = YouTubeDataset(
    'data', 'SecretLocation', 'train',
    download=True,
    api_key=API_KEY,
    user_name='thesecretlocation',
    splits={'train': 0.90, 'test': 0.1},
)

# Preview the field names and the first item.
show(ds0)

  9%|▊         | 4/46 [00:15<02:41,  3.85s/it]

pytube3 incompatibility, upgrade pytube3 to download: http://youtube.com/watch?v=E5E0Arr57YI Exception: KeyError, 'cipher'


 11%|█         | 5/46 [00:16<02:08,  3.14s/it]

pytube3 incompatibility, upgrade pytube3 to download: http://youtube.com/watch?v=wgl3nDpmXTQ Exception: KeyError, 'cipher'


 52%|█████▏    | 24/46 [01:25<01:24,  3.85s/it]

pytube3 incompatibility, upgrade pytube3 to download: http://youtube.com/watch?v=PAI8wBkxyw4 Exception: KeyError, 'cipher'


 54%|█████▍    | 25/46 [01:27<01:07,  3.23s/it]

pytube3 incompatibility, upgrade pytube3 to download: http://youtube.com/watch?v=G0FJHb6Cqn4 Exception: KeyError, 'cipher'


 65%|██████▌   | 30/46 [01:44<00:53,  3.33s/it]

pytube3 incompatibility, upgrade pytube3 to download: http://youtube.com/watch?v=y50B7DRNm70 Exception: KeyError, 'en'


 67%|██████▋   | 31/46 [01:46<00:43,  2.87s/it]

pytube3 incompatibility, upgrade pytube3 to download: http://youtube.com/watch?v=reQPwK774O0 Exception: KeyError, 'en'


 76%|███████▌  | 35/46 [01:58<00:31,  2.86s/it]

pytube3 incompatibility, upgrade pytube3 to download: http://youtube.com/watch?v=1BUJU3_ozaE Exception: KeyError, 'en'


 89%|████████▉ | 41/46 [02:18<00:14,  2.86s/it]

pytube3 incompatibility, upgrade pytube3 to download: http://youtube.com/watch?v=y50B7DRNm70 Exception: KeyError, 'en'


 91%|█████████▏| 42/46 [02:19<00:08,  2.17s/it]

pytube3 incompatibility, upgrade pytube3 to download: http://youtube.com/watch?v=itIZUupuHJ4 Exception: RegexMatchError, get_ytplayer_config: could not find match for config_patterns


100%|██████████| 46/46 [02:31<00:00,  3.23s/it]

VF_DATA (3, 360, 640)
TIME 0.0





### With Dataloader


In [4]:
# Wrap the default dataset in a DataLoader so items are collated into batches
# of 64, then preview the first batch.
ds0_loader = torch.utils.data.DataLoader(dataset=ds0, batch_size=64)
show(ds0_loader)


['VF_DATA', 'TIME']
VF_DATA torch.Size([64, 3, 318, 640])
TIME torch.Size([64])


## Audio Training Dataset

Training dataset for text-to-speech (TTS): the audio is aligned with the text using the caption timestamps.

timestamp, text, audio buffer

In [6]:
#
# Custom Embedding function to embed text strings with word vectors
#
class MyFastTextEmbedding():
    """Callable text transform that embeds a string with FastText word vectors.

    The text is tokenized, every token is looked up in the FastText vocabulary
    and the resulting ``(n_tokens, dim)`` matrix is zero-padded at the front of
    the token axis so that the output always has ``max_token_len`` rows.

    Parameters
    ----------
    max_token_len : int
        Number of rows (tokens) in the returned matrix.
    tokenizer : str
        Tokenizer name passed to ``torchtext.data.get_tokenizer``.
    language : str
        Language passed to both the tokenizer and ``torchtext.vocab.FastText``.
    """

    def __init__(self, max_token_len=32, tokenizer="basic_english", language="en"):
        self.tokenizer = torchtext.data.get_tokenizer(tokenizer, language=language)
        self.vocab = torchtext.vocab.FastText(language=language)
        self.max_token_len = max_token_len

    def __call__(self, text):
        """Return the ``(max_token_len, dim)`` embedding matrix for ``text``."""
        tokens = self.tokenizer(text)
        if not tokens:
            # An empty caption has no token vectors to stack, so the vocabulary
            # lookup cannot build a matrix; return the all-padding result.
            return torch.zeros(self.max_token_len, self.vocab.dim)

        vec = self.vocab.get_vecs_by_tokens(tokens, lower_case_backup=True)

        # Pad spec is (last-dim left, last-dim right, token-axis front, token-axis back).
        # With more than max_token_len tokens the front amount is negative,
        # which makes F.pad drop the leading rows instead of adding zeros.
        front_pad = self.max_token_len - vec.shape[0]
        return torch.nn.functional.pad(vec, (0, 0, front_pad, 0))

#
# Audio Transform to zero pad audio segment
#
class MyAudioTransform():
    """Callable audio transform that zero-pads a clip at the front.

    Parameters
    ----------
    max_audio_len : int
        Target length of the last (sample) axis. ``0`` (the default) or any
        non-positive value disables the transform and the audio is returned
        unchanged.
    """

    def __init__(self, max_audio_len=0):
        self.max_audio_len = max_audio_len

    def __call__(self, audio):
        """Return ``audio`` with its last axis front-padded to ``max_audio_len``."""
        if self.max_audio_len <= 0:
            # Without a target length the pad amount would be -len(audio),
            # which crops away every sample; treat it as "no padding".
            return audio

        # F.pad with a 2-tuple pads the last axis, so measure that same axis.
        # A clip longer than max_audio_len gives a negative amount, which
        # makes F.pad drop the leading samples instead of adding zeros.
        front_pad = self.max_audio_len - audio.shape[-1]
        return torch.nn.functional.pad(audio, (front_pad, 0))
    
# Audio/text dataset: each item carries a timestamp, the caption text and an
# audio buffer, keyed on the caption text field.
ds1 = YouTubeDataset(
    'data', 'SecretLocation', 'train',
    fields=[
        YouTubeDataset.F_TIME,
        YouTubeDataset.F_CC_TEXT,
        YouTubeDataset.F_AF_DATA,
    ],
    key=YouTubeDataset.F_CC_TEXT,
)

# Captions become fixed-size FastText matrices (64 token rows).
ds1.text_transform = MyFastTextEmbedding(max_token_len=64)

# Pad every audio segment to a common length.
# NOTE(review): presumably max_text_dur is in seconds and audio_sr in samples
# per second, making the product a sample count - confirm against YouTubeDataset.
ds1.audio_transform = MyAudioTransform(
    max_audio_len=int(ds1.max_text_dur * ds1.audio_sr),
)

show(ds1)


['TIME', 'CC_TEXT', 'AF_DATA']
TIME 0.0
CC_TEXT torch.Size([64, 300])
AF_DATA torch.Size([1, 1110879])


### With Dataloader

In [None]:
# Batch the audio/text dataset 64 items at a time and preview the first batch.
ds1_loader = torch.utils.data.DataLoader(dataset=ds1, batch_size=64)
show(ds1_loader)

## Video Captioning Dataset

Training Dataset for video captioning

Fields: timestamp, text, and video frames, with the video frames aligned to the text and padded.

In [7]:

# Video-captioning dataset: each item carries a timestamp, the caption text
# and video frame data, keyed on the caption text field.
ds2 = YouTubeDataset(
    'data', 'MeumGaming', 'train',
    fields=[
        YouTubeDataset.F_TIME,
        YouTubeDataset.F_CC_TEXT,
        YouTubeDataset.F_VF_DATA,
    ],
    key=YouTubeDataset.F_CC_TEXT,
)

show(ds2)

['TIME', 'CC_TEXT', 'VF_DATA']
TIME 0.06
CC_TEXT what's up guys me I'm getting here back
VF_DATA (3, 318, 640)


## Video, Audio, Text Dataset

timestamp, 5s video segment, audio, text aligned to video segment

In [5]:
# Video/audio/text dataset keyed on the video frames: each item carries a
# timestamp, a video clip, the matching audio and the caption text.
ds3 = YouTubeDataset(
    'data', 'MeumGaming', 'train',
    fields=[
        YouTubeDataset.F_TIME,
        YouTubeDataset.F_VF_DATA,
        YouTubeDataset.F_AF_DATA,
        YouTubeDataset.F_CC_TEXT,
    ],
    key=YouTubeDataset.F_VF_DATA,
    # 120 frames per clip; the section text calls this a 5 s segment,
    # i.e. presumably 24 frames per second - confirm against the source videos.
    video_cliplen=5 * 24,
)

show(ds3)

Files already downloaded
['TIME', 'VF_DATA', 'AF_DATA', 'CC_TEXT']
TIME 0.0
VF_DATA torch.Size([120, 3, 318, 640])
AF_DATA torch.Size([2, 220500])
CC_TEXT what's up guys me I'm getting here back
