In [1]:
import os
# Switch the working directory so the relative './...' paths used by this notebook resolve under it.
# NOTE(review): hard-coded, user-specific absolute path — adjust when running on another machine.
os.chdir('/scratch/sagarsj42')

In [10]:
import json
import warnings

import requests
import pandas as pd
import tensorflow as tf

In [24]:
# Input locations (relative to the current working directory).
data_dir = './yt8m'  # root folder; each split is read from the sub-folder named after it
yt8m_vocab_file = './yt8m-vocab.csv'  # vocabulary CSV loaded with pandas
select_labels_list = './select-music-labels.txt'  # text file holding one label name per line
# Output locations.
extracted_labels_data_folder = './yt8m-label-extracted'  # receives one '<split>.jsonl' file per split
extracted_features_data_folder = './yt8m-features-extracted'  # receives '<split>/<vid>.json' files
# Split handled by the single-split cells; note the multi-split loop further down rebinds this name.
split = 'dev'

In [11]:
# Install a global filter that ignores every Python warning from here on.
# NOTE(review): presumably meant to hide the warnings caused by the verify=False HTTP requests below;
# being a blanket filter it also hides unrelated warnings — consider narrowing it to one category.
warnings.filterwarnings('ignore')

In [4]:
# Read the selected label names, one per line.
# Blank lines (typically a trailing newline at the end of the file) are dropped and surrounding
# whitespace is stripped, so no empty or padded label reaches the vocabulary lookup.
with open(select_labels_list, 'r') as f:
    labels = [line.strip() for line in f if line.strip()]

In [5]:
# Load the YT8M vocabulary table from its CSV file.
yt8m_vocab_df = pd.read_csv(yt8m_vocab_file)

# DataFrame.info() prints its summary itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the cell output.
yt8m_vocab_df.info()

# Last expression of the cell: rich display of the first rows.
yt8m_vocab_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3862 entries, 0 to 3861
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Index             3862 non-null   int64 
 1   TrainVideoCount   3862 non-null   int64 
 2   KnowledgeGraphId  3862 non-null   object
 3   Name              3806 non-null   object
 4   WikiUrl           3806 non-null   object
 5   Vertical1         3862 non-null   object
 6   Vertical2         584 non-null    object
 7   Vertical3         32 non-null     object
 8   WikiDescription   3806 non-null   object
dtypes: int64(2), object(7)
memory usage: 271.7+ KB
None


Unnamed: 0,Index,TrainVideoCount,KnowledgeGraphId,Name,WikiUrl,Vertical1,Vertical2,Vertical3,WikiDescription
0,0,788288,/m/03bt1gh,Game,https://en.wikipedia.org/wiki/Game,Games,,,"A game is structured form of play, usually und..."
1,1,539945,/m/01mw1,Video game,https://en.wikipedia.org/wiki/Video_game,Games,,,A video game is an electronic game that involv...
2,2,415890,/m/07yv9,Vehicle,https://en.wikipedia.org/wiki/Vehicle,Autos & Vehicles,,,A vehicle is a mobile machine that transports ...
3,3,378135,/m/01jddz,Concert,https://en.wikipedia.org/wiki/Concert,Arts & Entertainment,,,A concert is a live music performance in front...
4,4,286532,/m/09jwl,Musician,https://en.wikipedia.org/wiki/Musician,Arts & Entertainment,,,A musician is a person who plays a musical ins...


In [6]:
# Build the mapping {vocabulary 'Index' value -> label name} for every selected label.
label_yt8m_indices = dict()
missing_labels = list()
for label in labels:
    matching_indices = yt8m_vocab_df.loc[yt8m_vocab_df['Name'] == label, 'Index']
    if matching_indices.empty:
        # Collect every unknown label so that a single run reports all of them.
        missing_labels.append(label)
        continue
    # As before, the first matching row wins; int() makes the key a plain Python int.
    label_yt8m_indices[int(matching_indices.iloc[0])] = label

if missing_labels:
    # Fail with the offending names instead of an anonymous IndexError from .iloc[0].
    raise ValueError(f'Labels not found in the vocabulary "Name" column: {missing_labels}')

len(label_yt8m_indices), label_yt8m_indices

(45,
 {3: 'Concert',
  4: 'Musician',
  9: 'Guitar',
  10: 'String instrument',
  13: 'Musical ensemble',
  14: 'Music video',
  28: 'Drum kit',
  31: 'Piano',
  33: 'Drum',
  34: 'Acoustic guitar',
  37: 'Electric guitar',
  38: 'Drummer',
  41: 'Choir',
  47: 'Pianist',
  54: 'Orchestra',
  57: 'Musical keyboard',
  61: 'Disc jockey',
  78: 'Snare drum',
  86: 'Cymbal',
  97: 'Violin',
  117: 'Electronic keyboard',
  154: 'Accordion',
  164: 'Marching band',
  168: 'Brass instrument',
  188: 'Saxophone',
  243: 'Viola',
  263: 'Diatonic button accordion',
  278: 'Cello',
  300: 'Flute',
  324: 'Music festival',
  330: 'Trumpet',
  339: 'Guitar Hero',
  348: 'Organ (music)',
  351: 'Fiddle',
  421: 'Guitar amplifier',
  539: 'Flamenco',
  550: 'Banjo',
  551: 'Harmonica',
  553: 'Drum stick',
  552: 'Quartet',
  565: 'Trombone',
  578: 'Mixtape',
  594: 'Tenor saxophone',
  585: 'Clarinet',
  671: 'Bagpipes'})

In [7]:
def _fetch_video_id(data_id, request_timeout=30, verify_tls=False):
    """Resolve a record id to a video id through the data.yt8m.org lookup URL.

    Args:
        data_id: Record id string read from the record's 'id' feature.
        request_timeout: Seconds to wait for the HTTP response before giving up.
        verify_tls: Forwarded to ``requests.get(verify=...)``.

    Returns:
        The extracted video id string, or None when the request fails, returns an
        HTTP error status, or the response body cannot be parsed.
    """
    # The lookup URL is built from the first two characters of the id and the full id.
    vid_req_url = f'https://data.yt8m.org/2/j/i/{data_id[:2]}/{data_id}.js'
    try:
        response = requests.get(vid_req_url, verify=verify_tls, timeout=request_timeout)
        response.raise_for_status()
        # Take the double-quoted value inside the second comma-separated field.
        # NOTE(review): presumably the payload looks like i("<id>","<video id>"); — confirm.
        return response.text.split(',')[1].split('"')[1]
    except (requests.RequestException, IndexError):
        # Network/HTTP failure or unexpected payload: the caller skips this record.
        # Unlike a bare `except:`, KeyboardInterrupt and programming errors still propagate.
        return None


def extract_video_info(tf_record_path, label_yt8m_indices, request_timeout=30, verify_tls=False):
    """Collect id, selected labels and mean features for the matching records of a TFRecord file.

    A record is kept only if at least one of its 'labels' values is a key of
    ``label_yt8m_indices`` and its video id can be resolved over HTTP.

    Args:
        tf_record_path: Path handed to ``tf.data.TFRecordDataset``.
        label_yt8m_indices: Mapping from label index to label name; only labels
            present in this mapping are reported.
        request_timeout: Seconds to wait for each id lookup request (without a
            timeout a stalled connection would block the loop indefinitely).
        verify_tls: Whether to verify the server certificate of the lookup request.
            NOTE(review): defaults to False to keep the existing behaviour; this
            disables TLS verification — pass True unless the environment prevents it.

    Returns:
        A list of dicts with the keys 'vid', 'labels' (label names), 'mean_rgb'
        and 'mean_audio' (lists of floats). Records whose id lookup fails are
        silently skipped.
    """
    video_infos = list()
    for raw_rec in tf.data.TFRecordDataset(tf_record_path):
        example = tf.train.Example()
        example.ParseFromString(raw_rec.numpy())
        features = example.features.feature

        # Keep only the labels of interest, translated to their names.
        label_values = [label_yt8m_indices[label]
                        for label in features['labels'].int64_list.value
                        if label in label_yt8m_indices]
        if not label_values:
            continue

        data_id = features['id'].bytes_list.value[0].decode()
        vid = _fetch_video_id(data_id, request_timeout, verify_tls)
        if vid is None:
            continue

        video_infos.append({
            'vid': vid,
            'labels': label_values,
            'mean_rgb': list(features['mean_rgb'].float_list.value),
            'mean_audio': list(features['mean_audio'].float_list.value)
        })

    return video_infos

In [18]:
# Number of directory entries in the data folder of the currently configured split.
len(os.listdir(os.path.join(data_dir, split)))

3845

In [20]:
# Trial run: extract the matching records from two record files of the current split.
split_dir = os.path.join(data_dir, split)
all_extracted_data = list()
# os.listdir() returns entries in arbitrary order; sorting makes the two-file sample reproducible.
for tf_record_filename in sorted(os.listdir(split_dir))[:2]:
    extracted_data = extract_video_info(os.path.join(split_dir, tf_record_filename), label_yt8m_indices)
    all_extracted_data.extend(extracted_data)

len(all_extracted_data), all_extracted_data[0].keys()

2023-06-04 01:10:09.326924: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]
2023-06-04 01:10:16.965360: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


(102, dict_keys(['vid', 'labels', 'mean_rgb', 'mean_audio']))

In [23]:
# Write the (vid, labels) pairs of the current split as JSON lines into '<split>.jsonl'.
os.makedirs(extracted_labels_data_folder, exist_ok=True)
label_data = [
    {'vid': record['vid'], 'labels': record['labels']}
    for record in all_extracted_data
]
# One JSON document per line, with no trailing newline after the last one.
label_data_jsonl = '\n'.join(json.dumps(entry) for entry in label_data)
labels_out_path = os.path.join(extracted_labels_data_folder, f'{split}.jsonl')
with open(labels_out_path, 'w') as out_file:
    out_file.write(label_data_jsonl)

In [25]:
# Write one '<vid>.json' file per extracted record into the features folder of the current split.
features_split_dir = os.path.join(extracted_features_data_folder, split)
os.makedirs(features_split_dir, exist_ok=True)
for info in all_extracted_data:
    vid = info['vid']
    # The whole record (vid, labels, mean_rgb, mean_audio) is dumped, as before; the former
    # mean_rgb / mean_audio locals were assigned but never read and have been removed.
    # NOTE(review): if only the two feature vectors were meant to be stored, change the payload here.
    with open(os.path.join(features_split_dir, f'{vid}.json'), 'w') as f:
        json.dump(info, f)

In [26]:
# Run extraction + export for every split (two record files per split).
# The labels folder does not depend on the split, so it is created once up front.
os.makedirs(extracted_labels_data_folder, exist_ok=True)

# NOTE: the loop variable deliberately keeps the name `split`, so the configured value is
# rebound here exactly as before (it is 'test' once the loop finishes).
for split in ['train', 'dev', 'test']:
    split_dir = os.path.join(data_dir, split)

    # 1) Extract matching records. os.listdir() order is arbitrary, so sort before taking
    #    the first two files to make the selection reproducible across runs and machines.
    all_extracted_data = list()
    for tf_record_filename in sorted(os.listdir(split_dir))[:2]:
        extracted_data = extract_video_info(os.path.join(split_dir, tf_record_filename),
                                            label_yt8m_indices)
        all_extracted_data.extend(extracted_data)

    # 2) Label index: one JSON line per record with its vid and label names.
    label_data = [{'vid': info['vid'], 'labels': info['labels']} for info in all_extracted_data]
    label_data_jsonl = '\n'.join([json.dumps(d) for d in label_data])
    with open(os.path.join(extracted_labels_data_folder, f'{split}.jsonl'), 'w') as f:
        f.write(label_data_jsonl)

    # 3) Per-video files: the whole record is dumped to '<split>/<vid>.json'.
    #    (The former mean_rgb / mean_audio locals were never read and have been removed.)
    features_split_dir = os.path.join(extracted_features_data_folder, split)
    os.makedirs(features_split_dir, exist_ok=True)
    for info in all_extracted_data:
        vid = info['vid']
        with open(os.path.join(features_split_dir, f'{vid}.json'), 'w') as f:
            json.dump(info, f)

2023-06-04 01:29:07.545995: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]
2023-06-04 01:32:28.385342: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]
2023-06-04 01:35:48.177806: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{n