In [8]:
import sys
sys.executable = '/home2/sagarsj42/miniconda3/envs/video/bin/python'

In [9]:
import os
os.chdir('/scratch/sagarsj42')

In [10]:
sys.executable

'/home2/sagarsj42/miniconda3/envs/video/bin/python'

In [37]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [12]:
DATASET_INFO_DIR = './yt8m-clips-dataset-info'
EMBEDS_DIR = 'zeroshot-embeds'

In [13]:
splits = ['train', 'dev', 'test']
media = ['text', 'audio', 'video']

In [14]:
clip_df = pd.read_json(os.path.join(DATASET_INFO_DIR, 'train', 'clip-info.jsonl'), lines=True)

print(clip_df.info())

clip_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58847 entries, 0 to 58846
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   vid              58847 non-null  object 
 1   clip_no          58847 non-null  int64  
 2   audio_clip_name  58847 non-null  object 
 3   audio_clip_dur   58847 non-null  float64
 4   video_clip_name  58847 non-null  object 
 5   video_clip_dur   58847 non-null  float64
dtypes: float64(2), int64(1), object(3)
memory usage: 2.7+ MB
None


Unnamed: 0,vid,clip_no,audio_clip_name,audio_clip_dur,video_clip_name,video_clip_dur
0,6YIzDW8HBPo,23,6YIzDW8HBPo-audio-23.mp3,8.0,6YIzDW8HBPo-video-23.mp4,8.01
1,6YIzDW8HBPo,9,6YIzDW8HBPo-audio-9.mp3,8.0,6YIzDW8HBPo-video-9.mp4,8.01
2,6YIzDW8HBPo,24,6YIzDW8HBPo-audio-24.mp3,8.0,6YIzDW8HBPo-video-24.mp4,8.01
3,6YIzDW8HBPo,15,6YIzDW8HBPo-audio-15.mp3,8.0,6YIzDW8HBPo-video-15.mp4,8.01
4,6YIzDW8HBPo,4,6YIzDW8HBPo-audio-4.mp3,8.0,6YIzDW8HBPo-video-4.mp4,8.01


In [15]:
vid_df = pd.read_json(os.path.join(DATASET_INFO_DIR, 'train', 'video-info.jsonl'), lines=True)

print(vid_df.info())

vid_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4831 entries, 0 to 4830
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   vid                4831 non-null   object 
 1   n_clips            4831 non-null   int64  
 2   n_sampled_clips    4831 non-null   int64  
 3   audio_dur          4831 non-null   float64
 4   sampled_audio_dur  4831 non-null   float64
 5   video_dur          4831 non-null   float64
 6   sampled_video_dur  4831 non-null   float64
 7   split              4831 non-null   object 
 8   labels             4831 non-null   object 
 9   title              4831 non-null   object 
 10  description        4831 non-null   object 
 11  tags               4831 non-null   object 
dtypes: float64(4), int64(2), object(6)
memory usage: 453.0+ KB
None


Unnamed: 0,vid,n_clips,n_sampled_clips,audio_dur,sampled_audio_dur,video_dur,sampled_video_dur,split,labels,title,description,tags
0,6YIzDW8HBPo,28,6,224.398,48.0,224.29,48.06,train1,[Music video],Sharper Than A Knife (Lyric Video) (Pete Hammo...,Dedicated to David Lynch. Animated by John von...,"parralox,sharper,than,knife,2012,pwl,saw,stock..."
1,4EYfkE6fczE,28,6,226.615,48.0,226.59,48.06,train1,[Music video],San Quinn - Realest Intro Ever,Music video by San Quinn performing Realest In...,"San Quinn,Realest Intro Ever,Realest,Intro,Eve..."
2,2aAIB9-0wrc,42,9,338.013,72.0,338.0,72.09,train1,"[Concert, Music video]",EU TENHO UMA ALIANÇA (DVD ALIANÇA) - André Val...,Gravação ao vivo do CD/DVD Aliança na Igreja B...,"EU TENHO UMA ALIANÇA,ALIANÇA,André Valadão,Igr..."
3,8kd4GNKcvc0,30,7,242.753,56.0,242.71,56.07,train1,"[Concert, Musician]",Big Time Rush - Worldwide,"Big Time Rush - Worldwide\nJingle Jam, Blue Cr...","Big,Time,Rush,Worldwide,Jingle,Jam,Rochester,N..."
4,3UoFEJ4e8KA,28,6,224.688,48.0,224.6,48.0,train1,[Concert],Hazama - Lagu Untukmu,GSA Bukit Tinggi Klang,"Hazama,Lagu,Untukmu,AF,GSA"


In [16]:
video_features_data = dict()
for split in splits:
    split_data = list()
    vid_df = pd.read_json(os.path.join(DATASET_INFO_DIR, split, 'video-info.jsonl'), lines=True)
    clip_df = pd.read_json(os.path.join(DATASET_INFO_DIR, split, 'clip-info.jsonl'), lines=True)
    for _, row in vid_df.iterrows():
        vid = row['vid']
        labels = row['labels']
        
        if split == 'train':
            n_clips = row['n_sampled_clips']
        else:
            n_clips = row['n_clips']
        clip_nos = clip_df[clip_df['vid'] == vid]['clip_no'].tolist()
        try:
            assert n_clips == len(clip_nos)
        except:
            print(f'Insufficient clips in {split} for {vid}: expected {n_clips}, found {len(clip_nos)}')
        
        audio_embeds = list()
        video_embeds = list()
        for clip_no in clip_nos:
            audio_file_name = f'{vid}-{clip_no}-audio-emb.npy'
            audio_embed = np.load(os.path.join(EMBEDS_DIR, split, 'audio', audio_file_name))
            audio_embeds.append(audio_embed)
            video_file_name = f'{vid}-{clip_no}-video-emb.npy'
            video_embed = np.load(os.path.join(EMBEDS_DIR, split, 'video', video_file_name))
            video_embeds.append(video_embed)
        audio_embeds = np.array(audio_embeds)
        video_embeds = np.array(video_embeds)
        split_data.append({
            'vid': vid,
            'audio_features': audio_embeds,
            'video_features': video_embeds,
            'labels': labels
        })
    video_features_data[split] = split_data

video_features_data.keys()

Insufficient clips in train for oKcBv8084ZU: expected 34, found 33
Insufficient clips in train for ukk0TpAkJpQ: expected 23, found 22
Insufficient clips in test for 9sMjAn5tP3U: expected 40, found 39
Insufficient clips in test for QyIOvRn2Z_8: expected 61, found 60
Insufficient clips in test for ZiAau4ESESs: expected 34, found 33
Insufficient clips in test for hlRpflYurPk: expected 25, found 24


dict_keys(['train', 'dev', 'test'])

In [17]:
len(video_features_data['train']), len(video_features_data['dev']), len(video_features_data['test'])

(4831, 499, 461)

In [18]:
video_features_data['train'][0]

{'audio_features': array([[ 0.6743951 , -0.36910027, -0.4471945 , ..., -0.12636545,
         -0.41669586,  0.11578765],
        [ 0.4688327 , -0.39710957, -0.162885  , ..., -0.12445029,
         -0.49513543, -0.14739208],
        [ 0.31135932, -0.5993597 ,  0.08894795, ..., -0.07330168,
         -0.5206213 , -0.23732568],
        [ 0.5557515 , -0.24827856,  0.279797  , ...,  0.06434721,
         -0.3691315 , -0.1484194 ],
        [ 0.7185022 , -0.40858406,  0.08637433, ..., -0.05709485,
         -0.36055908, -0.06420889],
        [ 0.15080181, -0.1680652 ,  0.39890742, ..., -0.21567212,
         -0.31063494, -0.46856382]], dtype=float32),
 'labels': ['Music video'],
 'vid': '6YIzDW8HBPo',
 'video_features': array([[ 0.38145015, -0.09980445,  0.3268809 , ..., -0.19327152,
         -0.00652518,  0.4238313 ],
        [ 0.9398173 ,  0.613344  ,  1.270302  , ..., -0.0383115 ,
         -0.41319394, -0.84817284],
        [ 0.97678685,  0.7646564 ,  1.1867974 , ...,  0.02679942,
         -0.44

In [19]:
video_features_data['train'][0]['video_features'].mean(axis=0).shape

(300,)

In [20]:
mean_data = dict()
for split in splits:
    split_data = video_features_data[split]
    all_audio = list()
    all_video = list()
    all_labels = list()
    for instance in split_data:
        all_audio.append(instance['audio_features'].mean(axis=0))
        all_video.append(instance['video_features'].mean(axis=0))
        all_labels.append(instance['labels'])
    all_audio = np.array(all_audio)
    all_video = np.array(all_video)
    mean_data[split] = {'audio': all_audio, 'video': all_video, 'labels': all_labels}

mean_data.keys()

dict_keys(['train', 'dev', 'test'])

In [21]:
mean_data['train'].keys()

dict_keys(['audio', 'video', 'labels'])

In [22]:
mean_data['train']['audio'].shape, mean_data['train']['video'].shape, len(mean_data['train']['labels'])

((4831, 300), (4831, 300), 4831)

In [23]:
all_labels = set()
[all_labels.update(labels) for labels in mean_data['train']['labels']]
len(all_labels)
label_enc = LabelEncoder()
label_enc.fit(list(all_labels))

LabelEncoder()

In [24]:
multi_lab_bin = MultiLabelBinarizer()
mean_data['train']['mult_lab'] = multi_lab_bin.fit_transform(
    [label_enc.transform(labels) for labels in mean_data['train']['labels']])

mean_data['train']['mult_lab'].shape

(4831, 45)

In [25]:
mean_data['dev']['mult_lab'] = multi_lab_bin.transform(
    [label_enc.transform(labels) for labels in mean_data['dev']['labels']])

mean_data['dev']['mult_lab'].shape

(499, 45)

In [26]:
mean_data['test']['mult_lab'] = multi_lab_bin.transform(
    [label_enc.transform(labels) for labels in mean_data['test']['labels']])

mean_data['test']['mult_lab'].shape

(461, 45)

In [27]:
x_train = np.hstack((mean_data['train']['audio'], mean_data['train']['video']))
x_dev = np.hstack((mean_data['dev']['audio'], mean_data['dev']['video']))
x_test = np.hstack((mean_data['test']['audio'], mean_data['test']['video']))

y_train = mean_data['train']['mult_lab']
y_dev = mean_data['dev']['mult_lab']
y_test = mean_data['test']['mult_lab']

x_train.shape, x_dev.shape, x_test.shape, y_train.shape, y_dev.shape, y_test.shape

((4831, 600), (499, 600), (461, 600), (4831, 45), (499, 45), (461, 45))

In [33]:
mlp = MLPClassifier()
mlp.fit(x_train, y_train)



MLPClassifier()

In [36]:
pred_train = mlp.predict(x_train)
pred_dev = mlp.predict(x_dev)
pred_test = mlp.predict(x_test)

pred_train.shape, pred_dev.shape, pred_test.shape

((4831, 45), (499, 45), (461, 45))

In [38]:
accuracy_score(y_true=y_train, y_pred=pred_train)

0.6131235768991927

In [39]:
accuracy_score(y_true=y_dev, y_pred=pred_dev)

0.5390781563126252

In [40]:
accuracy_score(y_true=y_test, y_pred=pred_test)

0.5184381778741866