In [1]:
import os
os.chdir('../../data')

In [3]:
import faiss
import numpy as np
import pandas as pd

In [27]:
DATASET_INFO_DIR = './yt8m-clips-dataset-info'
EMBEDS_DIR = 'weighted-contrastive-embeds'
EMB_SIZE = 300
RET_SIZE = 10000

In [28]:
splits = ['train', 'dev', 'test']
media = ['text', 'audio', 'video']

In [6]:
clip_df = pd.read_json(os.path.join(DATASET_INFO_DIR, 'train', 'clip-info.jsonl'), lines=True)

print(clip_df.info())

clip_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58847 entries, 0 to 58846
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   vid              58847 non-null  object 
 1   clip_no          58847 non-null  int64  
 2   audio_clip_name  58847 non-null  object 
 3   audio_clip_dur   58847 non-null  float64
 4   video_clip_name  58847 non-null  object 
 5   video_clip_dur   58847 non-null  float64
dtypes: float64(2), int64(1), object(3)
memory usage: 2.7+ MB
None


Unnamed: 0,vid,clip_no,audio_clip_name,audio_clip_dur,video_clip_name,video_clip_dur
0,6YIzDW8HBPo,23,6YIzDW8HBPo-audio-23.mp3,8.0,6YIzDW8HBPo-video-23.mp4,8.01
1,6YIzDW8HBPo,9,6YIzDW8HBPo-audio-9.mp3,8.0,6YIzDW8HBPo-video-9.mp4,8.01
2,6YIzDW8HBPo,24,6YIzDW8HBPo-audio-24.mp3,8.0,6YIzDW8HBPo-video-24.mp4,8.01
3,6YIzDW8HBPo,15,6YIzDW8HBPo-audio-15.mp3,8.0,6YIzDW8HBPo-video-15.mp4,8.01
4,6YIzDW8HBPo,4,6YIzDW8HBPo-audio-4.mp3,8.0,6YIzDW8HBPo-video-4.mp4,8.01


In [7]:
vid_df = pd.read_json(os.path.join(DATASET_INFO_DIR, 'train', 'video-info.jsonl'), lines=True)

print(vid_df.info())

vid_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4831 entries, 0 to 4830
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   vid                4831 non-null   object 
 1   n_clips            4831 non-null   int64  
 2   n_sampled_clips    4831 non-null   int64  
 3   audio_dur          4831 non-null   float64
 4   sampled_audio_dur  4831 non-null   float64
 5   video_dur          4831 non-null   float64
 6   sampled_video_dur  4831 non-null   float64
 7   split              4831 non-null   object 
 8   labels             4831 non-null   object 
 9   title              4831 non-null   object 
 10  description        4831 non-null   object 
 11  tags               4831 non-null   object 
dtypes: float64(4), int64(2), object(6)
memory usage: 453.0+ KB
None


Unnamed: 0,vid,n_clips,n_sampled_clips,audio_dur,sampled_audio_dur,video_dur,sampled_video_dur,split,labels,title,description,tags
0,6YIzDW8HBPo,28,6,224.398,48.0,224.29,48.06,train1,[Music video],Sharper Than A Knife (Lyric Video) (Pete Hammo...,Dedicated to David Lynch. Animated by John von...,"parralox,sharper,than,knife,2012,pwl,saw,stock..."
1,4EYfkE6fczE,28,6,226.615,48.0,226.59,48.06,train1,[Music video],San Quinn - Realest Intro Ever,Music video by San Quinn performing Realest In...,"San Quinn,Realest Intro Ever,Realest,Intro,Eve..."
2,2aAIB9-0wrc,42,9,338.013,72.0,338.0,72.09,train1,"[Concert, Music video]",EU TENHO UMA ALIANÇA (DVD ALIANÇA) - André Val...,Gravação ao vivo do CD/DVD Aliança na Igreja B...,"EU TENHO UMA ALIANÇA,ALIANÇA,André Valadão,Igr..."
3,8kd4GNKcvc0,30,7,242.753,56.0,242.71,56.07,train1,"[Concert, Musician]",Big Time Rush - Worldwide,"Big Time Rush - Worldwide\nJingle Jam, Blue Cr...","Big,Time,Rush,Worldwide,Jingle,Jam,Rochester,N..."
4,3UoFEJ4e8KA,28,6,224.688,48.0,224.6,48.0,train1,[Concert],Hazama - Lagu Untukmu,GSA Bukit Tinggi Klang,"Hazama,Lagu,Untukmu,AF,GSA"


In [16]:
video_features_data = dict()
for split in splits:
    split_data = list()
    vid_df = pd.read_json(os.path.join(DATASET_INFO_DIR, split, 'video-info.jsonl'), lines=True)
    clip_df = pd.read_json(os.path.join(DATASET_INFO_DIR, split, 'clip-info.jsonl'), lines=True)
    for _, row in vid_df.iterrows():
        vid = row['vid']
        labels = row['labels']
        
        if split == 'train':
            n_clips = row['n_sampled_clips']
        else:
            n_clips = row['n_clips']
        clip_nos = clip_df[clip_df['vid'] == vid]['clip_no'].tolist()
        try:
            assert n_clips == len(clip_nos)
        except:
            print(f'Insufficient clips in {split} for {vid}: expected {n_clips}, found {len(clip_nos)}')
        
        text_embeds = list()
        audio_embeds = list()
        video_embeds = list()
        for clip_no in clip_nos:
            text_file_name = f'{vid}-{clip_no}-text-emb.npy'
            text_embed = np.load(os.path.join(EMBEDS_DIR, split, 'text', text_file_name))
            text_embeds.append(text_embed)
            audio_file_name = f'{vid}-{clip_no}-audio-emb.npy'
            audio_embed = np.load(os.path.join(EMBEDS_DIR, split, 'audio', audio_file_name))
            audio_embeds.append(audio_embed)
            video_file_name = f'{vid}-{clip_no}-video-emb.npy'
            video_embed = np.load(os.path.join(EMBEDS_DIR, split, 'video', video_file_name))
            video_embeds.append(video_embed)
        text_embeds = np.array(text_embeds)
        audio_embeds = np.array(audio_embeds)
        video_embeds = np.array(video_embeds)
        split_data.append({
            'vid': vid,
            'text_features': text_embeds,
            'audio_features': audio_embeds,
            'video_features': video_embeds,
            'labels': labels
        })
    video_features_data[split] = split_data

video_features_data.keys()

Insufficient clips in train for oKcBv8084ZU: expected 34, found 33
Insufficient clips in train for ukk0TpAkJpQ: expected 23, found 22
Insufficient clips in test for 9sMjAn5tP3U: expected 40, found 39
Insufficient clips in test for QyIOvRn2Z_8: expected 61, found 60
Insufficient clips in test for ZiAau4ESESs: expected 34, found 33
Insufficient clips in test for hlRpflYurPk: expected 25, found 24


dict_keys(['train', 'dev', 'test'])

In [17]:
len(video_features_data['train']), len(video_features_data['dev']), len(video_features_data['test'])

(4831, 499, 461)

In [18]:
video_features_data['train'][0]

{'vid': '6YIzDW8HBPo',
 'text_features': array([[-5.92609011e-02, -3.96903604e-04, -1.27322361e-01, ...,
          3.30117270e-02, -2.39759728e-01,  3.06975424e-01],
        [ 3.18600982e-01, -3.68400812e-02,  8.04564506e-02, ...,
         -2.70390898e-01, -2.88113326e-01, -1.27779663e-01],
        [ 4.44685847e-01, -4.56447423e-01,  1.03591394e+00, ...,
          1.08903952e-01, -5.33871293e-01,  7.53110528e-01],
        [ 7.06050754e-01, -7.69041896e-01,  6.71283007e-01, ...,
         -1.04032755e-01, -2.13710591e-01,  2.42335960e-01],
        [ 2.70545334e-01, -7.62386769e-02,  3.18925202e-01, ...,
         -2.65933394e-01, -1.53988600e-01, -2.07858413e-01],
        [ 1.64108455e-01,  2.19373524e-01,  1.80868298e-01, ...,
         -3.42652909e-02, -4.24078107e-03,  3.15298736e-02]], dtype=float32),
 'audio_features': array([[-0.09538653,  0.07119057, -0.06264777, ...,  0.03291445,
          0.07796044,  0.01284727],
        [-0.28774795,  0.16270117, -0.04386227, ...,  0.10613745,
 

In [19]:
video_features_data['train'][0]['video_features'].mean(axis=0).shape

(300,)

In [20]:
mean_data = dict()
for split in splits:
    split_data = video_features_data[split]
    all_text = list()
    all_audio = list()
    all_video = list()
    all_labels = list()
    for instance in split_data:
        all_text.append(instance['text_features'].mean(axis=0))
        all_audio.append(instance['audio_features'].mean(axis=0))
        all_video.append(instance['video_features'].mean(axis=0))
        all_labels.append(instance['labels'])
    all_text = np.array(all_text)
    all_audio = np.array(all_audio)
    all_video = np.array(all_video)
    mean_data[split] = {'text': all_text, 'audio': all_audio, 'video': all_video, 'labels': all_labels}

mean_data.keys()

dict_keys(['train', 'dev', 'test'])

In [21]:
split = 'test'
mean_data[split].keys()

dict_keys(['text', 'audio', 'video', 'labels'])

In [22]:
mean_data[split]['text'].shape, mean_data[split]['audio'].shape, mean_data[split]['video'].shape

((461, 300), (461, 300), (461, 300))

In [25]:
m_indices = dict()
for m in media:
    index = faiss.IndexFlatIP(EMB_SIZE)
    index.add(mean_data[split][m])
    m_indices[m] = index

m_indices

{'text': <faiss.swigfaiss.IndexFlatIP; proxy of <Swig Object of type 'faiss::IndexFlatIP *' at 0x1314062a0> >,
 'audio': <faiss.swigfaiss.IndexFlatIP; proxy of <Swig Object of type 'faiss::IndexFlatIP *' at 0x131865510> >,
 'video': <faiss.swigfaiss.IndexFlatIP; proxy of <Swig Object of type 'faiss::IndexFlatIP *' at 0x1318651e0> >}

In [31]:
for i in range(len(media)):
    for j in range(i+1, len(media)):
        m_1 = media[i]
        m_2 = media[j]
        print(f'Retrieval: {m_1} to {m_2}')
        _, res = m_indices[media[j]].search(mean_data[split][media[i]], RET_SIZE)
        
        n_rows = res.shape[0]
        r_1 = 0
        r_5 = 0
        r_10 = 0
        ranks = list()
        for k in range(n_rows):
            search = res[k, :]
            try:
                pos = np.where(search == k)[0][0] + 1
            except IndexError:
                pos = RET_SIZE + 1
            if pos <= 1:
                r_1 += 1
            if pos <= 5:
                r_5 += 1
            if pos <= 10:
                r_10 += 1
            ranks.append(pos)
        ranks = np.array(ranks)
        mean_r = ranks.mean()
        median_r = np.median(ranks)
        r_1 = r_1 / n_rows * 100.0
        r_5 = r_5 / n_rows * 100.0
        r_10 = r_10 / n_rows * 100.0
        print(f'Recall @ 1: {r_1}, @ 5: {r_5}, @ 10: {r_10}')
        print(f'Mean rank: {mean_r}, median rank: {median_r}')
        print()

Retrieval: text to audio
Recall @ 1: 1.0845986984815619, @ 5: 0.21691973969631237, @ 10: 13.232104121475055
Mean rank: 106.56399132321042, median rank: 73.0

Retrieval: text to video
Recall @ 1: 1.0845986984815619, @ 5: 0.21691973969631237, @ 10: 11.279826464208242
Mean rank: 114.97180043383948, median rank: 83.0

Retrieval: audio to video
Recall @ 1: 2.386117136659436, @ 5: 0.21691973969631237, @ 10: 12.58134490238612
Mean rank: 90.1822125813449, median rank: 60.0

