## Импорты

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torchaudio

import yt_dlp
import subprocess

import os
import pickle
import gc
import json
import locale
import re
import tqdm.notebook as tqdm
from urllib.parse import urlparse
import requests
import math
import random

import av
from huggingface_hub import hf_hub_download
from typing import Callable

from transformers import VivitImageProcessor, VivitModel
from transformers import AutoImageProcessor, VideoMAEModel
from transformers import TimesformerConfig, TimesformerModel
from transformers import XCLIPProcessor, XCLIPModel
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from sentence_transformers import SentenceTransformer

import librosa
from moviepy.editor import VideoFileClip
import ast
import openunmix

from minio import Minio
from minio.error import S3Error

import hydra
import soundfile as sf
from omegaconf import OmegaConf

import logging
from typing import List

import numpy as np
from sklearn.cross_decomposition import PLSRegression, CCA
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from metric_learn import NCA
from hiclass import MultiLabelLocalClassifierPerParentNode

from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer

from catboost import CatBoostClassifier

locale.getpreferredencoding = lambda: "UTF-8"

logging.basicConfig(
    filename='vivit_inference.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

SEED = 42

## Вспомогательные объекты

In [2]:
class EmbeddingStorage:
    def __init__(self, labels=None, filenames=None, embeddings=None):
        """
        Initialize the EmbeddingStorage class.
        
        Args:
            labels (list or np.ndarray): An array of labels for the embeddings.
            filenames (list or np.ndarray): An array of filenames associated with the embeddings.
            embeddings (np.ndarray): A NumPy array containing all embeddings.
        """
        self.labels = np.array(labels) if labels is not None else np.array([])
        self.filenames = np.array(filenames) if filenames is not None else np.array([])
        self.embeddings = np.array(embeddings) if embeddings is not None else np.empty((0,))

    def add_embedding(self, label, filename, embedding):
        """
        Add a new embedding, along with its label and filename.
        
        Args:
            label (int or str): The label of the embedding.
            filename (str): The filename associated with the embedding.
            embedding (np.ndarray or torch.Tensor): The embedding to add (can be a NumPy array or Tensor).
        """
        if isinstance(embedding, np.ndarray):
            emb_array = embedding
        else:
            # Convert torch.Tensor to NumPy
            emb_array = embedding.cpu().numpy()
        
        # Append the new data
        self.labels = np.append(self.labels, label)
        self.filenames = np.append(self.filenames, filename)
        
        if self.embeddings.size == 0:
            self.embeddings = emb_array.reshape(1, -1)
        else:
            self.embeddings = np.vstack([self.embeddings, emb_array])

    def save_to_file(self, file_path):
        np.savez(file_path, labels=self.labels, filenames=self.filenames, embeddings=self.embeddings)

    @classmethod
    def load_from_file(cls, file_path):
        data = np.load(file_path, allow_pickle=True)
        return cls(labels=data['labels'], filenames=data['filenames'], embeddings=data['embeddings'])

    def get_embedding_by_filename(self, filename):
        if filename in self.filenames:
            idx = np.where(self.filenames == filename)[0][0]
            return self.embeddings[idx]
        else:
            return None

    def join_on_videoname(self, other_storage):
        # Find common filenames
        common_filenames = np.intersect1d(self.filenames, other_storage.filenames)
        
        # Initialize lists to store merged data
        merged_labels = []
        merged_filenames = []
        merged_embeddings = []
        
        for filename in common_filenames:
            # Get embeddings for the common filename from both storages
            idx_self = np.where(self.filenames == filename)[0][0]
            idx_other = np.where(other_storage.filenames == filename)[0][0]
            
            emb_self = self.embeddings[idx_self]
            emb_other = other_storage.embeddings[idx_other]
            
            # Store embeddings as a tuple
            merged_embedding = (emb_self, emb_other)
            
            # Get the label from the first storage (could be changed based on use case)
            merged_label = self.labels[idx_self]
            
            # Append to the merged data
            merged_labels.append(merged_label)
            merged_filenames.append(filename)
            merged_embeddings.append(merged_embedding)
        
        # Convert lists to numpy arrays
        merged_labels = np.array(merged_labels)
        merged_filenames = np.array(merged_filenames)
        merged_embeddings = np.array(merged_embeddings, dtype=object)
        
        # Return a new EmbeddingStorage instance with merged data
        return EmbeddingStorage(labels=merged_labels, filenames=merged_filenames, embeddings=merged_embeddings)

    def __len__(self):
        """
        Return the number of embeddings stored.
        """
        return len(self.labels)

    def __getitem__(self, idx):
        """
        Retrieve the label, filename, and embedding by index.
        
        Args:
            idx (int): The index of the embedding to retrieve.
        
        Returns:
            tuple: A tuple containing (label, filename, embedding).
        """
        if idx >= len(self.labels):
            raise IndexError("Index out of range")
        return self.labels[idx], self.filenames[idx], self.embeddings[idx]

    def __repr__(self):
        return f"EmbeddingStorage(labels={len(self.labels)}, filenames={len(self.filenames)}, embeddings_shape={self.embeddings.shape})"

In [3]:
def iou_metric(ground_truth, predictions):
    iou =  len(set.intersection(set(ground_truth), set(predictions)))
    iou = iou/(len(set(ground_truth).union(set(predictions))))
    return iou

def split_tags(tag_list):
    final_tag_list = []
    for tag in tag_list:
        tags = [tag.strip().lower() for tag in tag.split(":")]
        if len(tags) == 3:
            final_tag_list.append(tags[0])
            final_tag_list.append(tags[0] + ": " + tags[1])
            final_tag_list.append(tags[0]+ ": " + tags[1] + ": " + tags[2])
        elif len(tags) == 2:
            final_tag_list.append(tags[0])
            final_tag_list.append(tags[0] + ": " + tags[1])
        elif len(tags) == 1:
            final_tag_list.append(tags[0])
        else:
            print("NOT IMPLEMENTED!!!!", tag)
    return final_tag_list


def find_iou_for_sample_submission(pred_submission, true_submission):
    ground_truth_df = true_submission
    ground_truth_df["tags"] = ground_truth_df["tags"].apply(lambda l: l.split(', '))
    ground_truth_df["tags_split"] = ground_truth_df["tags"].apply(lambda l: split_tags(l))

    predictions_df = pred_submission
    # predictions_df["predicted_tags"] = predictions_df["predicted_tags"].apply(ast.literal_eval)
    predictions_df["predicted_tags_split"] = predictions_df["predicted_tags"].apply(lambda l: split_tags(l))
    iou=0
    counter = 0
    for i, row in ground_truth_df.iterrows():
        predicted_tags = predictions_df[predictions_df["video_id"]==row["video_id"]]["predicted_tags_split"].values[0]
        iou_temp=iou_metric(row['tags_split'], predicted_tags)
        iou+=iou_temp
        counter+=1

    return iou/counter


def create_tags_to_labels(taxonomy):
    tags = {}
    for i, row in tqdm.tqdm(taxonomy.iterrows()):
        if isinstance(row['Уровень 3 (iab)'], str):
            tags[row['Уровень 1 (iab)'].strip().lower()+ ": "+row['Уровень 2 (iab)'].strip().lower()+": "+row['Уровень 3 (iab)'].strip().lower()] = i
        elif isinstance(row['Уровень 2 (iab)'], str):
            tags[row['Уровень 1 (iab)'].strip().lower()+ ": "+row['Уровень 2 (iab)'].strip().lower()] = i
        elif isinstance(row['Уровень 1 (iab)'], str):
            tags[row['Уровень 1 (iab)'].strip().lower()] = i
    return tags

## Get description's embeddings

In [4]:
data = pd.read_csv("train_data_categories.csv", index_col=0)
taxonomy = pd.read_csv("IAB_tags.csv")

In [5]:
# model = SentenceTransformer('DeepPavlov/rubert-base-cased-sentence')
# model.save_pretrained('models/rubert')
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# model = SentenceTransformer('models/rubert/')
# model.to(device)
# model.eval()
# encoded_values = data['description'].apply(lambda l: model.encode(l, convert_to_tensor=True).cpu().numpy())
# rubert_emb = EmbeddingStorage(filenames=data.index, embeddings=np.vstack(encoded_values.values))
# rubert_emb.save_to_file('data/new_embeddings/description_emb.npz')

## Clear tags data

In [6]:
tags_to_labels = create_tags_to_labels(taxonomy)
labels_to_tags = {v: k for k, v in tags_to_labels.items()}

0it [00:00, ?it/s]

In [7]:
def assign_category_ids(data, tags_to_labels):
    category_ids = []
    
    for i, row in tqdm.tqdm(data.iterrows()):
        tags = row['tags'].split(', ')  # Split tags by comma
        split_tag_list = split_tags(tags)  # Split hierarchical tags

        # Convert the tags to category IDs
        valid_ids = [tags_to_labels[tag] for tag in split_tag_list if tag in tags_to_labels]

        if len(valid_ids) > 0:
            # If valid category IDs are found, use all of them
            category_ids.append(valid_ids)  # Append the list of valid IDs
        else:
            # If no valid tags, assign a random category
            print("Empty list", )
            category_ids.append([random.choice(list(tags_to_labels.values()))])  # Random category from available tags

    return category_ids


data['category_id'] = assign_category_ids(data, tags_to_labels)

0it [00:00, ?it/s]

Empty list
Empty list
Empty list
Empty list


In [8]:
whisper_emb = EmbeddingStorage.load_from_file('data/new_embeddings/whisper.npz')
rubert_emb = EmbeddingStorage.load_from_file('data/new_embeddings/description_emb.npz')
xclip_emb = EmbeddingStorage.load_from_file('data/new_embeddings/xclip_emb.npz')
xclip_emb.embeddings = xclip_emb.embeddings.mean(axis=1)

data['audio_emb'] = [None] * len(data)
data['text_emb'] = [None] * len(data)
data['video_emb'] = [None] * len(data)

In [9]:
for filename, row in data.iterrows():
    data.at[filename, 'video_emb'] = xclip_emb.get_embedding_by_filename(filename + '.mp4')
    data.at[filename, 'audio_emb'] = whisper_emb.get_embedding_by_filename(filename + '.mp4')
    data.at[filename, 'text_emb'] = rubert_emb.get_embedding_by_filename(filename)

Do the same for augmented embeddings

In [10]:
data_2 = pd.read_csv("train_data_categories.csv", index_col=0)
data_2['category_id'] = assign_category_ids(data_2, tags_to_labels)
data_2['audio_emb'] = [None] * len(data)
data_2['text_emb'] = [None] * len(data)
data_2['video_emb'] = [None] * len(data)

whisper_emb_augm = EmbeddingStorage.load_from_file('data/new_embeddings/augm_1/whisper.npz')
rubert_emb_augm = EmbeddingStorage.load_from_file('data/new_embeddings/description_emb.npz')
xclip_emb_augm = EmbeddingStorage.load_from_file('data/new_embeddings/augm_1/xclip_emb.npz')
xclip_emb_augm.embeddings = xclip_emb_augm.embeddings.mean(axis=1)

for filename, row in data_2.iterrows():
    data_2.at[filename, 'video_emb'] = xclip_emb_augm.get_embedding_by_filename(filename + '.mp4')
    data_2.at[filename, 'audio_emb'] = whisper_emb_augm.get_embedding_by_filename(filename + '.mp4')
    data_2.at[filename, 'text_emb'] = rubert_emb_augm.get_embedding_by_filename(filename)

0it [00:00, ?it/s]

Empty list
Empty list
Empty list
Empty list


In [11]:
X1 = np.vstack(data.audio_emb.values)
X2 = np.vstack(data.text_emb.values)
X3 = np.vstack(data.video_emb.values)
Y = data.category_id.values

X_a = np.vstack([X1, np.vstack(data_2.audio_emb.values)])
X_t = np.vstack([X2, np.vstack(data_2.text_emb.values)])
X_v = np.vstack([X3, np.vstack(data_2.video_emb.values)])
Y_ttl = np.hstack([Y, data_2.category_id.values])

## Experiment with models

In [12]:
def calc_iou_from_ids(y_true, y_pred, labels_to_tags):
    iou_scores = []
    for true_labels, pred_labels in zip(y_true, y_pred):
        true_labels_dec = split_tags([labels_to_tags[x] for x in true_labels])
        pred_labels_dec = split_tags([labels_to_tags[x] for x in pred_labels])
        
        iou_score = iou_metric(true_labels_dec, pred_labels_dec)
        iou_scores.append(iou_score)
    return np.mean(iou_scores)

### kNN

In [20]:
data_arr = [
    np.hstack([X1, X2, X3]),
    np.hstack([X1, X3]),
    np.hstack([X2, X3]),
    np.hstack([X1, X2]),
    X1,
    X3,
    X2
]

data_types = [
    'all',
    'audio + video',
    'video + text',
    'audio + text',
    'audio',
    'video',
    'text'
]

K_ARR = [1, 3, 5]

np.random.seed(SEED)
test_ratio = 0.2
sample_size = data.shape[0]
test_indices = np.random.choice(sample_size, int(test_ratio*sample_size), replace=False)
train_indices = np.array(list(set(np.arange(sample_size)) - set(test_indices)))

for K in K_ARR:
    print(f'==================== {K}-NN ====================')
    for data, modalities in zip(data_arr, data_types):
        X = data[train_indices]
        data_test = data[test_indices]
        
        mlb = MultiLabelBinarizer()
        mlb.fit(Y)
        Y_enc = mlb.transform(Y[train_indices])
        Y_test = Y[test_indices]
        
        # Step 4: Define the cross-validation process
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        iou_scores = []
        test_iou_scores = []
        
        for train_index, test_index in kf.split(X):
            # Split the data into training and test sets for this fold
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = Y_enc[train_index], Y_enc[test_index]
            
            # KNN classifier for multilabel classification
            knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='uniform')
            multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
            multi_knn.fit(X_train, y_train)
        
            # Decode the predictions back to original multilabel format
            y_pred = mlb.inverse_transform(multi_knn.predict(X_test))
            y_test_decoded = mlb.inverse_transform(y_test)
        
            # Calculate IoU for this fold
            fold_iou_score = calc_iou_from_ids(y_test_decoded, y_pred, labels_to_tags)
            iou_scores.append(fold_iou_score)

            Y_pred_test = mlb.inverse_transform(multi_knn.predict(data_test))
            test_iou_scores.append(calc_iou_from_ids(Y_test, Y_pred_test, labels_to_tags))
            
        average_iou = np.mean(iou_scores)
        test_iou = np.mean(test_iou_scores)
        print(f"Modalities: {modalities}, iou: {average_iou:.4f}, test_iou: {test_iou:.4f}")

Modalities: all, iou: 0.5667, test_iou: 0.5710
Modalities: audio + video, iou: 0.5064, test_iou: 0.5225
Modalities: video + text, iou: 0.5500, test_iou: 0.5720
Modalities: audio + text, iou: 0.5582, test_iou: 0.5600
Modalities: audio, iou: 0.4754, test_iou: 0.4946
Modalities: video, iou: 0.5841, test_iou: 0.5923
Modalities: text, iou: 0.5407, test_iou: 0.5582
Modalities: all, iou: 0.5732, test_iou: 0.5824
Modalities: audio + video, iou: 0.5098, test_iou: 0.5200
Modalities: video + text, iou: 0.5575, test_iou: 0.5727
Modalities: audio + text, iou: 0.5631, test_iou: 0.5751
Modalities: audio, iou: 0.4845, test_iou: 0.4973
Modalities: video, iou: 0.5995, test_iou: 0.6344
Modalities: text, iou: 0.5423, test_iou: 0.5605
Modalities: all, iou: 0.5447, test_iou: 0.5809
Modalities: audio + video, iou: 0.4803, test_iou: 0.5088
Modalities: video + text, iou: 0.5313, test_iou: 0.5750
Modalities: audio + text, iou: 0.5415, test_iou: 0.5805
Modalities: audio, iou: 0.4488, test_iou: 0.4657
Modalities:

Choose appropriate K and modality

In [None]:
data_arr = [
    np.hstack([X_a, X_v, X_t]),
    np.hstack([X_a, X_v]),
    np.hstack([X_v, X_t]),
    np.hstack([X_a, X_t]),
    X_a,
    X_v,
    X_t
]

data_types = [
    'all',
    'audio + video',
    'video + text',
    'audio + text',
    'audio',
    'video',
    'text'
]

K_ARR = [1, 2, 3, 4, 5, 6]

for K in K_ARR:
    print(f'==================== {K}-NN ====================')
    for data, modalities in zip(data_arr, data_types):
        X = data
        mlb = MultiLabelBinarizer()
        Y_enc = mlb.fit_transform(Y_ttl)  # Y is your category_id, encoded as multilabel
        
        # Step 4: Define the cross-validation process
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        iou_scores = []
        
        for train_index, test_index in kf.split(X):
            # Split the data into training and test sets for this fold
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = Y_enc[train_index], Y_enc[test_index]
            
            # KNN classifier for multilabel classification
            knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
            multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
            multi_knn.fit(X_train, y_train)
        
            # Predict using the KNN classifier
            y_pred_enc = multi_knn.predict(X_test)
        
            # Decode the predictions back to original multilabel format
            y_pred = mlb.inverse_transform(y_pred_enc)
            y_test_decoded = mlb.inverse_transform(y_test)
        
            # Calculate IoU for this fold
            fold_iou_score = calculate_iou(y_test_decoded, y_pred)
            iou_scores.append(fold_iou_score)
            
        # Step 5: Calculate the average IoU across all folds
        average_iou = np.mean(iou_scores)
        print(f"Modalities: {modalities}, iou: {average_iou:.4f}")

Modalities: all, iou: 0.9063
Modalities: audio + video, iou: 0.6996
Modalities: video + text, iou: 0.9080
Modalities: audio + text, iou: 0.9036
Modalities: audio, iou: 0.6249
Modalities: video, iou: 0.9212
Modalities: text, iou: 0.8949
Modalities: all, iou: 0.9063
Modalities: audio + video, iou: 0.6997
Modalities: video + text, iou: 0.9080
Modalities: audio + text, iou: 0.9037
Modalities: audio, iou: 0.6250
Modalities: video, iou: 0.9212
Modalities: text, iou: 0.8936
Modalities: all, iou: 0.9034
Modalities: audio + video, iou: 0.6316
Modalities: video + text, iou: 0.9065
Modalities: audio + text, iou: 0.9020
Modalities: audio, iou: 0.5754
Modalities: video, iou: 0.8950
Modalities: text, iou: 0.8971
Modalities: all, iou: 0.8943
Modalities: audio + video, iou: 0.6306
Modalities: video + text, iou: 0.9079
Modalities: audio + text, iou: 0.8937
Modalities: audio, iou: 0.5630
Modalities: video, iou: 0.8652
Modalities: text, iou: 0.8985
Modalities: all, iou: 0.8689
Modalities: audio + video, 

### ~~LogisticRegression~~

In [39]:
# data_arr = [
#     np.hstack([X_a, X_v, X_t]),
#     np.hstack([X_a, X_v]),
#     np.hstack([X_v, X_t]),
#     np.hstack([X_a, X_t]),
#     X_a,
#     X_v,
#     X_t
# ]

# data_types = [
#     'all',
#     'audio + video',
#     'video + text',
#     'audio + text',
#     'audio',
#     'video',
#     'text'
# ]

# C_ARR = [1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1, 3, 10]

# for C in C_ARR:
#     print(f'==================== LogisticRegression, C = {C} ====================')
#     for data, modalities in zip(data_arr, data_types):
#         X = data[train_indices]
#         data_test = data[test_indices]
        
#         mlb = MultiLabelBinarizer()
#         mlb.fit(Y_ttl)
#         Y_enc = mlb.transform(Y_ttl[train_indices])
#         Y_test = Y_ttl[test_indices]
        
#         # Step 4: Define the cross-validation process
#         kf = KFold(n_splits=5, shuffle=True, random_state=42)
#         iou_scores = []
        
#         for train_index, test_index in kf.split(X):
#             # Split the data into training and test sets for this fold
#             X_train, X_test = X[train_index], X[test_index]
#             y_train, y_test = Y_enc[train_index], Y_enc[test_index]

#             for i in range(y_train.shape[1]):
#                 unique_classes = np.unique(y_train[:, i])
#                 if len(unique_classes) < 2:
#                     print(f"Skipping fold for label {i} due to lack of class diversity.")
#                     continue  # Skip this fold if only one class is present
            
#             # KNN classifier for multilabel classification
#             model = LogisticRegression(C=C, penalty='l1', max_iter=500, random_state=SEED, solver='saga')
#             multi_model = MultiOutputClassifier(model, n_jobs=-1)
#             multi_model.fit(X_train, y_train)
        
#             # Decode the predictions back to original multilabel format
#             y_pred = mlb.inverse_transform(multi_model.predict(X_test))
#             y_test_decoded = mlb.inverse_transform(y_test)
        
#             # Calculate IoU for this fold
#             fold_iou_score = calc_iou_from_ids(y_test_decoded, y_pred, labels_to_tags)
#             iou_scores.append(fold_iou_score)

#         model = LogisticRegression(C=C, penalty='l1', max_iter=500, random_state=SEED, solver='saga')
#         multi_model = MultiOutputClassifier(model, n_jobs=-1)
#         multi_model.fit(X, Y_enc)
#         Y_pred_test = mlb.inverse_transform(multi_model.predict(data_test))
#         test_iou = calc_iou_from_ids(Y_test, Y_pred_test, labels_to_tags)
            
#         average_iou = np.mean(iou_scores)
#         print(f"Modalities: {modalities}, val_iou: {average_iou:.4f}, test_iou: {test_iou:.4f}")

### ~~CatBoostClassifier~~

In [34]:
# data_arr = [
#     np.hstack([X_a, X_v, X_t]),
#     np.hstack([X_a, X_v]),
#     np.hstack([X_v, X_t]),
#     np.hstack([X_a, X_t]),
#     X_a,
#     X_v,
#     X_t
# ]

# data_types = [
#     'all',
#     'audio + video',
#     'video + text',
#     'audio + text',
#     'audio',
#     'video',
#     'text'
# ]

# for data, modalities in zip(data_arr, data_types):
#     X = data[train_indices]
#     data_test = data[test_indices]
    
#     mlb = MultiLabelBinarizer()
#     mlb.fit(Y_ttl)
#     Y_enc = mlb.transform(Y_ttl[train_indices])
#     Y_test = Y_ttl[test_indices]
    
#     # Step 4: Define the cross-validation process
#     kf = KFold(n_splits=5, shuffle=True, random_state=42)
#     iou_scores = []
    
#     for train_index, test_index in kf.split(X):
#         # Split the data into training and test sets for this fold
#         X_train, X_test = X[train_index], X[test_index]
#         y_train, y_test = Y_enc[train_index], Y_enc[test_index]

#         # KNN classifier for multilabel classification
#         model = CatBoostClassifier(learning_rate=0.01, iterations=100)
#         multi_model = MultiOutputClassifier(model, n_jobs=-1)
#         multi_model.fit(X_train, y_train)
    
#         # Decode the predictions back to original multilabel format
#         y_pred = mlb.inverse_transform(multi_model.predict(X_test))
#         y_test_decoded = mlb.inverse_transform(y_test)
    
#         # Calculate IoU for this fold
#         fold_iou_score = calc_iou_from_ids(y_test_decoded, y_pred, labels_to_tags)
#         iou_scores.append(fold_iou_score)

#     model = CatBoostClassifier(learning_rate=0.01, iterations=100)
#     multi_model = MultiOutputClassifier(model, n_jobs=-1)
#     multi_model.fit(X, Y_enc)
#     Y_pred_test = mlb.inverse_transform(multi_model.predict(data_test))
#     test_iou = calc_iou_from_ids(Y_test, Y_pred_test, labels_to_tags)
        
#     average_iou = np.mean(iou_scores)
#     print(f"Modalities: {modalities}, val_iou: {average_iou:.4f}, test_iou: {test_iou:.4f}")

### Local Classifier Per Parent Node

In [13]:
def convert_ids_to_hierarchy(elem):
    # Split each tag by ': ' and convert it into a list
    tags = [labels_to_tags[ind] for ind in list(set(elem))]
    hierarchy = [tag.split(": ") for tag in tags]

    return hierarchy


def calc_iou_from_hierarchy(y_true, y_pred):
    iou_scores = []
    drop_empty_strings = lambda x: len(x) > 0
    
    for true_labels, pred_labels in zip(y_true, y_pred):
        true_labels_dec = [': '.join(list(filter(drop_empty_strings, elem))) for elem in true_labels]
        pred_labels_dec = [': '.join(list(filter(drop_empty_strings, elem))) for elem in pred_labels]

        iou_score = iou_metric(split_tags(true_labels_dec), split_tags(pred_labels_dec))
        iou_scores.append(iou_score)
    return np.mean(iou_scores)

#### kNN

In [133]:
data_arr = [
    np.hstack([X_a, X_v, X_t]),
    np.hstack([X_a, X_v]),
    np.hstack([X_v, X_t]),
    np.hstack([X_a, X_t]),
    X_a,
    X_v,
    X_t
]

data_types = [
    'all',
    'audio + video',
    'video + text',
    'audio + text',
    'audio',
    'video',
    'text'
]

K_ARR = [1, 3, 5, 7, 9]

np.random.seed(SEED)
test_ratio = 0.2
sample_size = X_a.shape[0]
test_indices = np.random.choice(sample_size, int(test_ratio*sample_size), replace=False)
train_indices = np.array(list(set(np.arange(sample_size)) - set(test_indices)))

Y_enc = list(map(convert_ids_to_hierarchy, Y_ttl[train_indices]))
Y_test = np.array(list(map(convert_ids_to_hierarchy, Y_ttl[test_indices])), dtype='object')

for K in K_ARR:
    print(f'==================== {K}-NN ====================')
    for data, modalities in zip(data_arr, data_types):
        X = data[train_indices]
        data_test = data[test_indices]
        
        # Step 4: Define the cross-validation process
        kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
        iou_scores = []
        
        for train_index, test_index in kf.split(X):
            # Split the data into training and test sets for this fold
            X_train, X_test = X[train_index], X[test_index]
            y_train = np.array([Y_enc[ind] for ind in train_index], dtype=object)
            y_test = np.array([Y_enc[ind] for ind in test_index], dtype=object)
            
            # KNN classifier for multilabel classification
            knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
            classifier = MultiLabelLocalClassifierPerParentNode(local_classifier=knn, n_jobs=-1)
            classifier.fit(X_train, y_train)
        
            # Decode the predictions back to original multilabel format
            y_pred = classifier.predict(X_test)
        
            # Calculate IoU for this fold
            fold_iou_score = calc_iou_from_hierarchy(y_test, y_pred)
            iou_scores.append(fold_iou_score)

        knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
        classifier = MultiLabelLocalClassifierPerParentNode(local_classifier=knn, n_jobs=-1)
        classifier.fit(X, np.array(Y_enc, dtype='object'))
        Y_pred_test = classifier.predict(data_test)
        test_iou = calc_iou_from_hierarchy(Y_test, Y_pred_test)
            
        average_iou = np.mean(iou_scores)
        print(f"Modalities: {modalities}, val_iou: {average_iou:.4f}, test_iou: {test_iou:.4f}")

Modalities: all, val_iou: 0.6313, test_iou: 0.6655
Modalities: audio + video, val_iou: 0.5256, test_iou: 0.5621
Modalities: video + text, val_iou: 0.6240, test_iou: 0.6665
Modalities: audio + text, val_iou: 0.6281, test_iou: 0.6673
Modalities: audio, val_iou: 0.4676, test_iou: 0.5147
Modalities: video, val_iou: 0.6340, test_iou: 0.6728
Modalities: text, val_iou: 0.6204, test_iou: 0.6685


#### LogisticRegression

In [24]:
# data_arr = [
#     np.hstack([X_a, X_v, X_t]),
#     np.hstack([X_a, X_v]),
#     np.hstack([X_v, X_t]),
#     np.hstack([X_a, X_t]),
#     X_a,
#     X_v,
#     X_t
# ]

# data_types = [
#     'all',
#     'audio + video',
#     'video + text',
#     'audio + text',
#     'audio',
#     'video',
#     'text'
# ]

# np.random.seed(SEED)
# test_ratio = 0.2
# sample_size = X_a.shape[0]
# test_indices = np.random.choice(sample_size, int(test_ratio*sample_size), replace=False)
# train_indices = np.array(list(set(np.arange(sample_size)) - set(test_indices)))

# Y_enc = list(map(convert_ids_to_hierarchy, Y_ttl[train_indices]))
# Y_test = np.array(list(map(convert_ids_to_hierarchy, Y_ttl[test_indices])), dtype='object')

# C_ARR = [1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1, 3, 10]

# for C in C_ARR:
#     print(f'==================== LogisticRegression, C = {C} ====================')
#     for data, modalities in zip(data_arr, data_types):
#         X = data[train_indices]
#         data_test = data[test_indices]
        
#         # Step 4: Define the cross-validation process
#         kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
#         iou_scores = []
        
#         for train_index, test_index in kf.split(X):
#             # Split the data into training and test sets for this fold
#             X_train, X_test = X[train_index], X[test_index]
#             y_train = np.array([Y_enc[ind] for ind in train_index], dtype=object)
#             y_test = np.array([Y_enc[ind] for ind in test_index], dtype=object)
            
#             # KNN classifier for multilabel classification
#             logreg = LogisticRegression(C=C, penalty='l1', max_iter=500, random_state=SEED, solver='saga')
#             classifier = MultiLabelLocalClassifierPerParentNode(local_classifier=logreg, n_jobs=-1)
#             classifier.fit(X_train, y_train)
        
#             # Decode the predictions back to original multilabel format
#             y_pred = classifier.predict(X_test)
        
#             # Calculate IoU for this fold
#             fold_iou_score = calc_iou_from_hierarchy(y_test, y_pred)
#             iou_scores.append(fold_iou_score)

#         logreg = LogisticRegression(C=C, penalty='l1', max_iter=500, random_state=SEED, solver='saga')
#         classifier = MultiLabelLocalClassifierPerParentNode(local_classifier=logreg, n_jobs=-1)
#         classifier.fit(X, np.array(Y_enc, dtype='object'))
#         Y_pred_test = classifier.predict(data_test)
#         test_iou = calc_iou_from_hierarchy(Y_test, Y_pred_test)
            
#         average_iou = np.mean(iou_scores)
#         print(f"Modalities: {modalities}, val_iou: {average_iou:.4f}, test_iou: {test_iou:.4f}")

#### CatBoostClassifier

In [25]:
# data_arr = [
#     np.hstack([X_a, X_v, X_t]),
#     np.hstack([X_a, X_v]),
#     np.hstack([X_v, X_t]),
#     np.hstack([X_a, X_t]),
#     X_a,
#     X_v,
#     X_t
# ]

# data_types = [
#     'all',
#     'audio + video',
#     'video + text',
#     'audio + text',
#     'audio',
#     'video',
#     'text'
# ]


# np.random.seed(SEED)
# test_ratio = 0.2
# sample_size = X_a.shape[0]
# test_indices = np.random.choice(sample_size, int(test_ratio*sample_size), replace=False)
# train_indices = np.array(list(set(np.arange(sample_size)) - set(test_indices)))

# Y_enc = list(map(convert_ids_to_hierarchy, Y_ttl[train_indices]))
# Y_test = np.array(list(map(convert_ids_to_hierarchy, Y_ttl[test_indices])), dtype='object')

# for data, modalities in zip(data_arr, data_types):
#     X = data[train_indices]
#     data_test = data[test_indices]
    
#     # Step 4: Define the cross-validation process
#     kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
#     iou_scores = []
    
#     for train_index, test_index in kf.split(X):
#         # Split the data into training and test sets for this fold
#         X_train, X_test = X[train_index], X[test_index]
#         y_train = np.array([Y_enc[ind] for ind in train_index], dtype=object)
#         y_test = np.array([Y_enc[ind] for ind in test_index], dtype=object)
        
#         # KNN classifier for multilabel classification
#         cb = CatBoostClassifier(learning_rate=0.01, iterations=100, verbose=False)
#         classifier = MultiLabelLocalClassifierPerParentNode(local_classifier=cb, n_jobs=-1)
#         classifier.fit(X_train, y_train)
    
#         # Decode the predictions back to original multilabel format
#         y_pred = classifier.predict(X_test)
    
#         # Calculate IoU for this fold
#         fold_iou_score = calc_iou_from_hierarchy(y_test, y_pred)
#         iou_scores.append(fold_iou_score)

#     cb = CatBoostClassifier(learning_rate=0.01, iterations=100, verbose=False)
#     classifier = MultiLabelLocalClassifierPerParentNode(local_classifier=cb, n_jobs=-1)
#     classifier.fit(X, np.array(Y_enc, dtype='object'))
#     Y_pred_test = classifier.predict(data_test)
#     test_iou = calc_iou_from_hierarchy(Y_test, Y_pred_test)
        
#     average_iou = np.mean(iou_scores)
#     print(f"Modalities: {modalities}, val_iou: {average_iou:.4f}, test_iou: {test_iou:.4f}")

## Embeddings from outer-product matrices

In [None]:
def compute_embedding_pytorch_chunked(X_a, X_v, X_t, chunk_size=64, device='cuda'):
    # Convert the inputs to float32 and move data to GPU if available
    X_a = torch.tensor(X_a, device=device, dtype=torch.float32)
    X_v = torch.tensor(X_v, device=device, dtype=torch.float32)
    X_t = torch.tensor(X_t, device=device, dtype=torch.float32)

    # Initialize a list to collect all embeddings
    embeddings = []

    # Function to calculate statistics for input arrays
    def compute_stats(arr):
        stats = []
        stats.append(torch.norm(arr, p=1, dim=1, keepdim=True))      # L1 norm
        stats.append(torch.norm(arr, p=2, dim=1, keepdim=True))      # L2 norm
        stats.append(torch.min(arr, dim=1, keepdim=True).values)     # Min
        stats.append(torch.max(arr, dim=1, keepdim=True).values)     # Max
        stats.append(torch.median(arr, dim=1, keepdim=True).values)  # Median
        stats.append(torch.mean(arr, dim=1, keepdim=True))           # Mean
        stats.append(torch.std(arr, dim=1, keepdim=True))            # Std
        return stats

    def compute_outer_product_stats(X_a_chunk, X_v_chunk, X_t_chunk):
        outer_products = [
            torch.bmm(X_a_chunk.unsqueeze(2), X_v_chunk.unsqueeze(1)),  # Outer product between X_a and X_v
            torch.bmm(X_a_chunk.unsqueeze(2), X_t_chunk.unsqueeze(1)),  # Outer product between X_a and X_t
            torch.bmm(X_v_chunk.unsqueeze(2), X_t_chunk.unsqueeze(1)),  # Outer product between X_v and X_t
        ]
        
        # Compute statistics for each outer product
        outer_prod_stats = []
        for op in outer_products:
            op_flat = op.view(op.shape[0], -1)  # Flatten each outer product for easier calculations
            outer_prod_stats.extend(compute_stats(op_flat))
        return outer_prod_stats

    # Process in chunks to avoid memory issues
    n = X_a.shape[0]
    for i in range(0, n, chunk_size):
        # Get the chunks of data
        X_a_chunk = X_a[i:i+chunk_size]
        X_v_chunk = X_v[i:i+chunk_size]
        X_t_chunk = X_t[i:i+chunk_size]

        # Compute statistics for input arrays
        input_stats = []
        for arr in [X_a_chunk, X_v_chunk, X_t_chunk]:
            input_stats.extend(compute_stats(arr))
        
        # Compute outer product statistics in chunks
        outer_prod_stats = compute_outer_product_stats(X_a_chunk, X_v_chunk, X_t_chunk)

        # Concatenate all stats into a single tensor and store
        chunk_embedding = torch.cat(input_stats + outer_prod_stats, dim=1)
        embeddings.append(chunk_embedding.cpu())  # Move back to CPU to reduce GPU memory usage

        # Free up memory
        del X_a_chunk, X_v_chunk, X_t_chunk, input_stats, outer_prod_stats, chunk_embedding
        torch.cuda.empty_cache()

    # Concatenate all chunks into the final embedding matrix
    return torch.cat(embeddings, dim=0).numpy()

In [None]:
embeddings = compute_embedding_pytorch_chunked(X_a, X_v, X_t)

### Naive multilabel classification

In [50]:
K_ARR = [1, 3, 5, 7, 9]

np.random.seed(SEED)
test_ratio = 0.2
sample_size = embeddings.shape[0]
test_indices = np.random.choice(sample_size, int(test_ratio*sample_size), replace=False)
train_indices = np.array(list(set(np.arange(sample_size)) - set(test_indices)))

for K in K_ARR:
    print(f'==================== {K}-NN ====================')
    X = embeddings[train_indices]
    data_test = embeddings[test_indices]
    
    mlb = MultiLabelBinarizer()
    mlb.fit(Y_ttl)
    Y_enc = mlb.transform(Y_ttl[train_indices])
    Y_test = Y_ttl[test_indices]
    
    # Step 4: Define the cross-validation process
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    iou_scores = []
    
    for train_index, test_index in kf.split(X):
        # Split the data into training and test sets for this fold
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y_enc[train_index], Y_enc[test_index]
        
        # KNN classifier for multilabel classification
        knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
        multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
        multi_knn.fit(X_train, y_train)
    
        # Decode the predictions back to original multilabel format
        y_pred = mlb.inverse_transform(multi_knn.predict(X_test))
        y_test_decoded = mlb.inverse_transform(y_test)
    
        # Calculate IoU for this fold
        fold_iou_score = calc_iou_from_ids(y_test_decoded, y_pred, labels_to_tags)
        iou_scores.append(fold_iou_score)

    knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
    multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
    multi_knn.fit(X, Y_enc)
    Y_pred_test = mlb.inverse_transform(multi_knn.predict(data_test))
    test_iou = calc_iou_from_ids(Y_test, Y_pred_test, labels_to_tags)

    average_iou = np.mean(iou_scores)
    print(f"iou: {average_iou:.4f}, test_iou: {test_iou:.4f}")

Modalities: all, iou: 0.2233, test_iou: 0.2099
Modalities: all, iou: 0.2259, test_iou: 0.2066
Modalities: all, iou: 0.2169, test_iou: 0.1978
Modalities: all, iou: 0.2041, test_iou: 0.1963
Modalities: all, iou: 0.2029, test_iou: 0.1914


### Hierarchical multilabel classification

#### kNN

In [52]:
K_ARR = [1, 2]

np.random.seed(SEED)
test_ratio = 0.2
sample_size = embeddings.shape[0]
test_indices = np.random.choice(sample_size, int(test_ratio*sample_size), replace=False)
train_indices = np.array(list(set(np.arange(sample_size)) - set(test_indices)))

Y_enc = list(map(convert_ids_to_hierarchy, Y_ttl[train_indices]))
Y_test = np.array(list(map(convert_ids_to_hierarchy, Y_ttl[test_indices])), dtype='object')

for K in K_ARR:
    print(f'==================== {K}-NN ====================')
    X = embeddings[train_indices]
    data_test = embeddings[test_indices]
    
    # Step 4: Define the cross-validation process
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    iou_scores = []
    
    for train_index, test_index in kf.split(X):
        # Split the data into training and test sets for this fold
        X_train, X_test = X[train_index], X[test_index]
        y_train = np.array([Y_enc[ind] for ind in train_index], dtype=object)
        y_test = np.array([Y_enc[ind] for ind in test_index], dtype=object)
        
        # KNN classifier for multilabel classification
        knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
        classifier = MultiLabelLocalClassifierPerParentNode(local_classifier=knn, n_jobs=-1)
        classifier.fit(X_train, y_train)
    
        # Decode the predictions back to original multilabel format
        y_pred = classifier.predict(X_test)
    
        # Calculate IoU for this fold
        fold_iou_score = calc_iou_from_hierarchy(y_test, y_pred)
        iou_scores.append(fold_iou_score)

    knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
    classifier = MultiLabelLocalClassifierPerParentNode(local_classifier=knn, n_jobs=-1)
    classifier.fit(X, np.array(Y_enc, dtype='object'))
    Y_pred_test = classifier.predict(data_test)
    test_iou = calc_iou_from_hierarchy(Y_test, Y_pred_test)
        
    average_iou = np.mean(iou_scores)
    print(f"Modalities: {modalities}, val_iou: {average_iou:.4f}, test_iou: {test_iou:.4f}")

Modalities: all, val_iou: 0.2092, test_iou: 0.1899
Modalities: all, val_iou: 0.1658, test_iou: 0.1600


#### LogisticRegression

In [55]:
np.random.seed(SEED)
test_ratio = 0.2
sample_size = embeddings.shape[0]
test_indices = np.random.choice(sample_size, int(test_ratio*sample_size), replace=False)
train_indices = np.array(list(set(np.arange(sample_size)) - set(test_indices)))

Y_enc = list(map(convert_ids_to_hierarchy, Y_ttl[train_indices]))
Y_test = np.array(list(map(convert_ids_to_hierarchy, Y_ttl[test_indices])), dtype='object')

C_ARR = [1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1, 3, 10]

for C in C_ARR:
    print(f'==================== LogisticRegression, C = {C} ====================')
    X = embeddings[train_indices]
    data_test = embeddings[test_indices]
    
    # Step 4: Define the cross-validation process
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    iou_scores = []
    
    for train_index, test_index in kf.split(X):
        # Split the data into training and test sets for this fold
        X_train, X_test = X[train_index], X[test_index]
        y_train = np.array([Y_enc[ind] for ind in train_index], dtype=object)
        y_test = np.array([Y_enc[ind] for ind in test_index], dtype=object)
        
        # KNN classifier for multilabel classification
        logreg = LogisticRegression(C=C, penalty='l2', max_iter=500, random_state=SEED)
        classifier = MultiLabelLocalClassifierPerParentNode(local_classifier=logreg, n_jobs=-1)
        classifier.fit(X_train, y_train)
    
        # Decode the predictions back to original multilabel format
        y_pred = classifier.predict(X_test)
    
        # Calculate IoU for this fold
        fold_iou_score = calc_iou_from_hierarchy(y_test, y_pred)
        iou_scores.append(fold_iou_score)

    logreg = LogisticRegression(C=C, penalty='l2', max_iter=500, random_state=SEED)
    classifier = MultiLabelLocalClassifierPerParentNode(local_classifier=logreg, n_jobs=-1)
    classifier.fit(X, np.array(Y_enc, dtype='object'))
    Y_pred_test = classifier.predict(data_test)
    test_iou = calc_iou_from_hierarchy(Y_test, Y_pred_test)
        
    average_iou = np.mean(iou_scores)
    print(f"Modalities: {modalities}, val_iou: {average_iou:.4f}, test_iou: {test_iou:.4f}")

Modalities: all, val_iou: 0.3374, test_iou: 0.3282
Modalities: all, val_iou: 0.3349, test_iou: 0.3334
Modalities: all, val_iou: 0.3379, test_iou: 0.3292
Modalities: all, val_iou: 0.3352, test_iou: 0.3260
Modalities: all, val_iou: 0.3362, test_iou: 0.3260
Modalities: all, val_iou: 0.3361, test_iou: 0.3276
Modalities: all, val_iou: 0.3350, test_iou: 0.3284
Modalities: all, val_iou: 0.3344, test_iou: 0.3310
Modalities: all, val_iou: 0.3364, test_iou: 0.3300


In [56]:
np.random.seed(SEED)
test_ratio = 0.2
sample_size = embeddings.shape[0]
test_indices = np.random.choice(sample_size, int(test_ratio*sample_size), replace=False)
train_indices = np.array(list(set(np.arange(sample_size)) - set(test_indices)))

Y_enc = list(map(convert_ids_to_hierarchy, Y_ttl[train_indices]))
Y_test = np.array(list(map(convert_ids_to_hierarchy, Y_ttl[test_indices])), dtype='object')

X = embeddings[train_indices]
data_test = embeddings[test_indices]

# Step 4: Define the cross-validation process
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
iou_scores = []

for train_index, test_index in kf.split(X):
    # Split the data into training and test sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train = np.array([Y_enc[ind] for ind in train_index], dtype=object)
    y_test = np.array([Y_enc[ind] for ind in test_index], dtype=object)
    
    # KNN classifier for multilabel classification
    cb = CatBoostClassifier(learning_rate=0.01, iterations=100, verbose=False)
    classifier = MultiLabelLocalClassifierPerParentNode(local_classifier=cb, n_jobs=-1)
    classifier.fit(X_train, y_train)

    # Decode the predictions back to original multilabel format
    y_pred = classifier.predict(X_test)

    # Calculate IoU for this fold
    fold_iou_score = calc_iou_from_hierarchy(y_test, y_pred)
    iou_scores.append(fold_iou_score)

cb = CatBoostClassifier(learning_rate=0.01, iterations=100, verbose=False)
classifier = MultiLabelLocalClassifierPerParentNode(local_classifier=cb, n_jobs=-1)
classifier.fit(X, np.array(Y_enc, dtype='object'))
Y_pred_test = classifier.predict(data_test)
test_iou = calc_iou_from_hierarchy(Y_test, Y_pred_test)
    
average_iou = np.mean(iou_scores)
print(f"Modalities: {modalities}, val_iou: {average_iou:.4f}, test_iou: {test_iou:.4f}")

Modalities: all, val_iou: 0.3586, test_iou: 0.3518


## PLS-CCA experiment

**Methodology**

1. CCA to each pair of domains + concatenation + embeddings
2. CCA to one pair + concat with original vector + embeddings

In [15]:
def run_cca(
        X1: np.ndarray,
        X2: np.ndarray,
        train_indices: np.ndarray,
        test_indices: np.ndarray,
        n_comp: int = 32,
        normalize: bool = True,
        algo: str = 'pls'
    ) -> dict:
    X1_train_scaled = X1[train_indices]
    X2_train_scaled = X2[train_indices]
    
    X1_test_scaled = X1[test_indices]
    X2_test_scaled = X2[test_indices]

    if algo == 'pls':
        pls = PLSRegression(n_components=n_comp, scale=normalize)
    elif algo == 'cca':
        pls = CCA(n_components=n_comp, scale=normalize)

    pls.fit(X1_train_scaled, X2_train_scaled)
    
    X1_latent_train, X2_latent_train = pls.transform(X1_train_scaled, X2_train_scaled)
    X1_latent_test, X2_latent_test = pls.transform(X1_test_scaled, X2_test_scaled)
    
    latent_train = np.hstack([X1_latent_train, X2_latent_train])
    latent_test = np.hstack([X1_latent_test, X2_latent_test])

    return pls, latent_train, latent_test


def concatenate_cca_embeddings(X_a, X_v, X_t, train_indices, test_indices, n_components=64, normalize=True, algo='pls'):
    """
    Apply CCA between each pair of domains (audio, video, text) and concatenate the embeddings.
    
    Args:
    X_a: Audio data
    X_v: Video data
    X_t: Text data
    n_components: Number of CCA components to use
    normalize: Whether to normalize the data
    
    Returns:
    Concatenated latent embeddings from CCA on all pairs.
    """
    # Run CCA on each pair of domains
    pls_av, train_a_v, test_a_v = run_cca(X_a, X_v, train_indices, test_indices, n_components, normalize, algo=algo)
    pls_at, train_a_t, test_a_t = run_cca(X_a, X_t, train_indices, test_indices, n_components, normalize, algo=algo)
    pls_vt, train_v_t, test_v_t = run_cca(X_v, X_t, train_indices, test_indices, n_components, normalize, algo=algo)
    
    # Concatenate the embeddings from each CCA pair
    latent_train = np.hstack([train_a_v, train_a_t, train_v_t])
    latent_test = np.hstack([test_a_v, test_a_t, test_v_t])
    
    return (pls_av, pls_at, pls_vt), latent_train, latent_test

### Method 1. Concatenation of 3 CCA

kNN, base

In [36]:
K = 3

np.random.seed(SEED)
test_ratio = 0.2
sample_size = Y.shape[0]
test_indices = np.random.choice(sample_size, int(test_ratio*sample_size), replace=False)
train_indices = np.array(list(set(np.arange(sample_size)) - set(test_indices)))

mlb = MultiLabelBinarizer()
mlb.fit(Y)
Y_enc = mlb.transform(Y[train_indices])
Y_test = Y[test_indices]
algo = 'pls'

for n_comp in [32, 48, 64]:
    print(f'==================== algo = {algo}, n_comp = {n_comp} ====================')
    kf = KFold(n_splits=4, shuffle=True, random_state=42)
    iou_scores = []
    
    for train_index, test_index in tqdm.tqdm(kf.split(train_indices)):
        # Split the data into training and test sets for this fold
        y_train, y_test = Y_enc[train_index], Y_enc[test_index]

        _, X_train, X_test = concatenate_cca_embeddings(X1, X3, X2, train_index, test_index, n_components=n_comp, algo=algo)
        
        # KNN classifier for multilabel classification
        knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
        multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
        multi_knn.fit(X_train, y_train)
    
        # Decode the predictions back to original multilabel format
        y_pred = mlb.inverse_transform(multi_knn.predict(X_test))
        y_test_decoded = mlb.inverse_transform(y_test)
    
        # Calculate IoU for this fold
        fold_iou_score = calc_iou_from_ids(y_test_decoded, y_pred, labels_to_tags)
        iou_scores.append(fold_iou_score)

    _, X, data_test = concatenate_cca_embeddings(X1, X3, X2, train_indices, test_indices, n_components=n_comp, algo=algo)
    
    knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
    multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
    multi_knn.fit(X, Y_enc)
    Y_pred_test = mlb.inverse_transform(multi_knn.predict(data_test))
    test_iou = calc_iou_from_ids(Y_test, Y_pred_test, labels_to_tags)

    average_iou = np.mean(iou_scores)
    print(f"iou: {average_iou:.4f}, test_iou: {test_iou:.4f}")



0it [00:00, ?it/s]

iou: 0.1589, test_iou: 0.6333


0it [00:00, ?it/s]

iou: 0.1621, test_iou: 0.6599


0it [00:00, ?it/s]

iou: 0.1563, test_iou: 0.6313


In [84]:
K = 3

np.random.seed(SEED)
test_ratio = 0.2
sample_size = embeddings.shape[0]
test_indices = np.random.choice(sample_size, int(test_ratio*sample_size), replace=False)
train_indices = np.array(list(set(np.arange(sample_size)) - set(test_indices)))

mlb = MultiLabelBinarizer()
mlb.fit(Y_ttl)
Y_enc = mlb.transform(Y_ttl[train_indices])
Y_test = Y_ttl[test_indices]
algo = 'pls'

for n_comp in [32, 48, 64]:
    print(f'==================== algo = {algo}, n_comp = {n_comp} ====================')
    kf = KFold(n_splits=4, shuffle=True, random_state=42)
    iou_scores = []
    
    for train_index, test_index in tqdm.tqdm(kf.split(train_indices)):
        # Split the data into training and test sets for this fold
        y_train, y_test = Y_enc[train_index], Y_enc[test_index]

        _, X_train, X_test = concatenate_cca_embeddings(X_a, X_v, X_t, train_index, test_index, n_components=n_comp, algo=algo)
        
        # KNN classifier for multilabel classification
        knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
        multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
        multi_knn.fit(X_train, y_train)
    
        # Decode the predictions back to original multilabel format
        y_pred = mlb.inverse_transform(multi_knn.predict(X_test))
        y_test_decoded = mlb.inverse_transform(y_test)
    
        # Calculate IoU for this fold
        fold_iou_score = calc_iou_from_ids(y_test_decoded, y_pred, labels_to_tags)
        iou_scores.append(fold_iou_score)

    _, X, data_test = concatenate_cca_embeddings(X_a, X_v, X_t, train_indices, test_indices, n_components=n_comp, algo=algo)
    
    knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
    multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
    multi_knn.fit(X, Y_enc)
    Y_pred_test = mlb.inverse_transform(multi_knn.predict(data_test))
    test_iou = calc_iou_from_ids(Y_test, Y_pred_test, labels_to_tags)

    average_iou = np.mean(iou_scores)
    print(f"iou: {average_iou:.4f}, test_iou: {test_iou:.4f}")



0it [00:00, ?it/s]

Modalities: all, iou: 0.1446, test_iou: 0.9081


0it [00:00, ?it/s]

Modalities: all, iou: 0.1362, test_iou: 0.9089


0it [00:00, ?it/s]

Modalities: all, iou: 0.1382, test_iou: 0.9098


0it [00:00, ?it/s]

Modalities: all, iou: 0.1422, test_iou: 0.7918


0it [00:00, ?it/s]

KeyboardInterrupt: 

knn, hierarchy

In [18]:
np.random.seed(SEED)
test_ratio = 0.2
sample_size = X_a.shape[0]
test_indices = np.random.choice(sample_size, int(test_ratio*sample_size), replace=False)
train_indices = np.array(list(set(np.arange(sample_size)) - set(test_indices)))

Y_enc = list(map(convert_ids_to_hierarchy, Y_ttl[train_indices]))
Y_test = np.array(list(map(convert_ids_to_hierarchy, Y_ttl[test_indices])), dtype='object')

K = 1
algo = 'pls'

for n_comp in [32, 48, 64]:
    print(f'==================== algo = {algo}, n_comp = {n_comp} ====================')
    kf = KFold(n_splits=4, shuffle=True, random_state=42)
    iou_scores = []
    
    for train_index, test_index in tqdm.tqdm(kf.split(train_indices)):
        y_train = np.array([Y_enc[ind] for ind in train_index], dtype=object)
        y_test = np.array([Y_enc[ind] for ind in test_index], dtype=object)

        _, X_train, X_test = concatenate_cca_embeddings(X_a, X_v, X_t, train_index, test_index, n_components=n_comp, algo=algo)
        
        # KNN classifier for multilabel classification
        knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
        classifier = MultiLabelLocalClassifierPerParentNode(local_classifier=knn, n_jobs=-1)
        classifier.fit(X_train, y_train)
    
        y_pred = classifier.predict(X_test)
    
        # Calculate IoU for this fold
        fold_iou_score = calc_iou_from_hierarchy(y_test, y_pred)
        iou_scores.append(fold_iou_score)

    _, X, data_test = concatenate_cca_embeddings(X_a, X_v, X_t, train_indices, test_indices, n_components=n_comp, algo=algo)
    
    knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
    classifier = MultiLabelLocalClassifierPerParentNode(local_classifier=knn, n_jobs=-1)
    classifier.fit(X, np.array(Y_enc, dtype='object'))
    Y_pred_test = classifier.predict(data_test)
    test_iou = calc_iou_from_hierarchy(Y_test, Y_pred_test)
    
    average_iou = np.mean(iou_scores)
    print(f"iou: {average_iou:.4f}, test_iou: {test_iou:.4f}")



0it [00:00, ?it/s]

iou: 0.1443, test_iou: 0.6680


0it [00:00, ?it/s]

iou: 0.1401, test_iou: 0.6687


0it [00:00, ?it/s]

iou: 0.1452, test_iou: 0.6722


In [28]:
for i in [12, 8, 14, 1, 0, 70]:
    print(Y_test[i], '===', Y_pred_test[i])

[['музыка и аудио'], ['массовая культура']] === [['массовая культура' 'отношения знаменитостей' '']]
[['хобби и интересы'], ['массовая культура']] === [['массовая культура' 'юмор и сатира' '']]
[['массовая культура', 'юмор и сатира'], ['массовая культура']] === [['массовая культура' 'юмор и сатира' '']]
[['спорт', 'борьба'], ['массовая культура'], ['спорт']] === [['массовая культура' 'юмор и сатира' '']]
[['транспорт'], ['транспорт', 'авторемонт'], ['личные финансы'], ['массовая культура'], ['карьера']] === [['карьера' '' '']]
[['религия и духовность'], ['религия и духовность', 'астрология']] === [['религия и духовность' 'астрология' '']]


In [29]:
for i in [12, 8, 14, 1, 0, 70]:
    print(y_test[i], '===', y_pred[i])

[['религия и духовность'], ['религия и духовность', 'астрология']] === [['хобби и интересы' 'декоративно-прикладное искусство' 'живопись']]
[['путешествия', 'направления путешествий'], ['путешествия', 'направления путешествий', 'азия'], ['путешествия']] === [['карьера' '' '']]
[['изобразительное искусство'], ['хобби и интересы'], ['хобби и интересы', 'декоративно-прикладное искусство']] === [['карьера' '' '']]
[['события и достопримечательности'], ['события и достопримечательности', 'исторические места и достопримечательности'], ['путешествия']] === [['карьера' '' '']]
[['события и достопримечательности'], ['события и достопримечательности', 'спортивные события'], ['массовая культура', 'юмор и сатира'], ['массовая культура']] === [['карьера' '' '']]
[['события и достопримечательности'], ['события и достопримечательности', 'исторические места и достопримечательности'], ['путешествия']] === [['фильмы и анимация' 'семейные и детские фильмы' '']]


### Method 2. 1 CCA with original features

In [30]:
def concatenate_cca_with_original(X_a, X_v, X_t, train_indices, test_indices, n_components=64, normalize=True, algo='pls'):
    """
    Apply CCA between each pair of domains (audio, video, text), concatenate the CCA features 
    and the original input features.
    
    Args:
    X_a: Audio data
    X_v: Video data
    X_t: Text data
    n_components: Number of CCA components to use
    normalize: Whether to normalize the data
    
    Returns:
    Concatenated latent embeddings from CCA on all pairs plus original features.
    """
    # Run CCA on each pair of domains
    pls_av, train_a_v, test_a_v = run_cca(X_a, X_v, train_indices, test_indices, n_components, normalize, algo=algo)
    train_t, text_t = X_t[train_indices], X_t[test_indices]
    
    # Concatenate CCA embeddings from each pair of domains
    cca_train = np.hstack([train_a_v, train_t])
    cca_test = np.hstack([test_a_v, text_t])
    
    return pls_av, cca_train, cca_test


kNN, base

In [33]:
K = 3

np.random.seed(SEED)
test_ratio = 0.2
sample_size = X_a.shape[0]
test_indices = np.random.choice(sample_size, int(test_ratio*sample_size), replace=False)
train_indices = np.array(list(set(np.arange(sample_size)) - set(test_indices)))

mlb = MultiLabelBinarizer()
mlb.fit(Y_ttl)
Y_enc = mlb.transform(Y_ttl[train_indices])
Y_test = Y_ttl[test_indices]

algo = 'pls'

for n_comp in [32, 48, 64]:
    print(f'==================== algo = {algo}, n_comp = {n_comp} ====================')
    kf = KFold(n_splits=4, shuffle=True, random_state=42)
    iou_scores = []
    
    for train_index, test_index in tqdm.tqdm(kf.split(train_indices)):
        # Split the data into training and test sets for this fold
        y_train, y_test = Y_enc[train_index], Y_enc[test_index]

        _, X_train, X_test = concatenate_cca_with_original(X_a, X_v, X_t, train_index, test_index, n_components=n_comp, algo=algo)
        
        # KNN classifier for multilabel classification
        knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
        multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
        multi_knn.fit(X_train, y_train)
    
        # Decode the predictions back to original multilabel format
        y_pred = mlb.inverse_transform(multi_knn.predict(X_test))
        y_test_decoded = mlb.inverse_transform(y_test)
    
        # Calculate IoU for this fold
        fold_iou_score = calc_iou_from_ids(y_test_decoded, y_pred, labels_to_tags)
        iou_scores.append(fold_iou_score)

    _, X, data_test = concatenate_cca_with_original(X_a, X_v, X_t, train_indices, test_indices, n_components=n_comp, algo=algo)
    
    knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
    multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
    multi_knn.fit(X, Y_enc)
    Y_pred_test = mlb.inverse_transform(multi_knn.predict(data_test))
    test_iou = calc_iou_from_ids(Y_test, Y_pred_test, labels_to_tags)

    average_iou = np.mean(iou_scores)
    print(f"val_iou: {average_iou:.4f}, test_iou: {test_iou:.4f}")



0it [00:00, ?it/s]

val_iou: 0.1438, test_iou: 0.9095


0it [00:00, ?it/s]

val_iou: 0.1427, test_iou: 0.9181


0it [00:00, ?it/s]

val_iou: 0.1389, test_iou: 0.9184


0it [00:00, ?it/s]

val_iou: 0.1432, test_iou: 0.9016


0it [00:00, ?it/s]

KeyboardInterrupt: 

In [37]:
K = 3

np.random.seed(SEED)
test_ratio = 0.2
sample_size = X_a.shape[0]
test_indices = np.random.choice(sample_size, int(test_ratio*sample_size), replace=False)
train_indices = np.array(list(set(np.arange(sample_size)) - set(test_indices)))

mlb = MultiLabelBinarizer()
mlb.fit(Y_ttl)
Y_enc = mlb.transform(Y_ttl[train_indices])
Y_test = Y_ttl[test_indices]

for algo in ('pls', 'cca'):
    for n_comp in [32, 48, 64]:
        print(f'==================== algo = {algo}, n_comp = {n_comp} ====================')
        kf = KFold(n_splits=4, shuffle=True, random_state=42)
        iou_scores = []
        
        for train_index, test_index in tqdm.tqdm(kf.split(train_indices)):
            # Split the data into training and test sets for this fold
            y_train, y_test = Y_enc[train_index], Y_enc[test_index]
    
            _, X_train, X_test = concatenate_cca_with_original(X_a, X_t, X_v, train_index, test_index, n_components=n_comp, algo=algo)
            
            # KNN classifier for multilabel classification
            knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
            multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
            multi_knn.fit(X_train, y_train)
        
            # Decode the predictions back to original multilabel format
            y_pred = mlb.inverse_transform(multi_knn.predict(X_test))
            y_test_decoded = mlb.inverse_transform(y_test)
        
            # Calculate IoU for this fold
            fold_iou_score = calc_iou_from_ids(y_test_decoded, y_pred, labels_to_tags)
            iou_scores.append(fold_iou_score)
    
        _, X, data_test = concatenate_cca_with_original(X_a, X_t, X_v, train_indices, test_indices, n_components=n_comp, algo=algo)
        
        knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
        multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
        multi_knn.fit(X, Y_enc)
        Y_pred_test = mlb.inverse_transform(multi_knn.predict(data_test))
        test_iou = calc_iou_from_ids(Y_test, Y_pred_test, labels_to_tags)
    
        average_iou = np.mean(iou_scores)
        print(f"val_iou: {average_iou:.4f}, test_iou: {test_iou:.4f}")



0it [00:00, ?it/s]

val_iou: 0.1378, test_iou: 0.8975


0it [00:00, ?it/s]

val_iou: 0.1386, test_iou: 0.9028


0it [00:00, ?it/s]

val_iou: 0.1424, test_iou: 0.9049


0it [00:00, ?it/s]

val_iou: 0.1424, test_iou: 0.8696


0it [00:00, ?it/s]

val_iou: 0.1428, test_iou: 0.8280


0it [00:00, ?it/s]

KeyboardInterrupt: 

knn, hierarchy

In [38]:
np.random.seed(SEED)
test_ratio = 0.2
sample_size = X_a.shape[0]
test_indices = np.random.choice(sample_size, int(test_ratio*sample_size), replace=False)
train_indices = np.array(list(set(np.arange(sample_size)) - set(test_indices)))

Y_enc = list(map(convert_ids_to_hierarchy, Y_ttl[train_indices]))
Y_test = np.array(list(map(convert_ids_to_hierarchy, Y_ttl[test_indices])), dtype='object')

K = 1
algo = 'pls'

for n_comp in [32, 48, 64]:
    print(f'==================== algo = {algo}, n_comp = {n_comp} ====================')
    kf = KFold(n_splits=4, shuffle=True, random_state=42)
    iou_scores = []
    
    for train_index, test_index in tqdm.tqdm(kf.split(train_indices)):
        y_train = np.array([Y_enc[ind] for ind in train_index], dtype=object)
        y_test = np.array([Y_enc[ind] for ind in test_index], dtype=object)

        _, X_train, X_test = concatenate_cca_with_original(X_a, X_v, X_t, train_index, test_index, n_components=n_comp, algo=algo)
        
        # KNN classifier for multilabel classification
        knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
        classifier = MultiLabelLocalClassifierPerParentNode(local_classifier=knn, n_jobs=-1)
        classifier.fit(X_train, y_train)
    
        y_pred = classifier.predict(X_test)
    
        # Calculate IoU for this fold
        fold_iou_score = calc_iou_from_hierarchy(y_test, y_pred)
        iou_scores.append(fold_iou_score)

    _, X, data_test = concatenate_cca_with_original(X_a, X_v, X_t, train_indices, test_indices, n_components=n_comp, algo=algo)
    
    knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
    classifier = MultiLabelLocalClassifierPerParentNode(local_classifier=knn, n_jobs=-1)
    classifier.fit(X, np.array(Y_enc, dtype='object'))
    Y_pred_test = classifier.predict(data_test)
    test_iou = calc_iou_from_hierarchy(Y_test, Y_pred_test)
    
    average_iou = np.mean(iou_scores)
    print(f"iou: {average_iou:.4f}, test_iou: {test_iou:.4f}")



0it [00:00, ?it/s]

iou: 0.1482, test_iou: 0.6733


0it [00:00, ?it/s]

iou: 0.1451, test_iou: 0.6698


0it [00:00, ?it/s]

iou: 0.1496, test_iou: 0.6762


In [39]:
np.random.seed(SEED)
test_ratio = 0.2
sample_size = X_a.shape[0]
test_indices = np.random.choice(sample_size, int(test_ratio*sample_size), replace=False)
train_indices = np.array(list(set(np.arange(sample_size)) - set(test_indices)))

Y_enc = list(map(convert_ids_to_hierarchy, Y_ttl[train_indices]))
Y_test = np.array(list(map(convert_ids_to_hierarchy, Y_ttl[test_indices])), dtype='object')

K = 1
algo = 'pls'

for n_comp in [32, 48, 64]:
    print(f'==================== algo = {algo}, n_comp = {n_comp} ====================')
    kf = KFold(n_splits=4, shuffle=True, random_state=42)
    iou_scores = []
    
    for train_index, test_index in tqdm.tqdm(kf.split(train_indices)):
        y_train = np.array([Y_enc[ind] for ind in train_index], dtype=object)
        y_test = np.array([Y_enc[ind] for ind in test_index], dtype=object)

        _, X_train, X_test = concatenate_cca_with_original(X_a, X_t, X_v, train_index, test_index, n_components=n_comp, algo=algo)
        
        # KNN classifier for multilabel classification
        knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
        classifier = MultiLabelLocalClassifierPerParentNode(local_classifier=knn, n_jobs=-1)
        classifier.fit(X_train, y_train)
    
        y_pred = classifier.predict(X_test)
    
        # Calculate IoU for this fold
        fold_iou_score = calc_iou_from_hierarchy(y_test, y_pred)
        iou_scores.append(fold_iou_score)

    _, X, data_test = concatenate_cca_with_original(X_a, X_t, X_v, train_indices, test_indices, n_components=n_comp, algo=algo)
    
    knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
    classifier = MultiLabelLocalClassifierPerParentNode(local_classifier=knn, n_jobs=-1)
    classifier.fit(X, np.array(Y_enc, dtype='object'))
    Y_pred_test = classifier.predict(data_test)
    test_iou = calc_iou_from_hierarchy(Y_test, Y_pred_test)
    
    average_iou = np.mean(iou_scores)
    print(f"iou: {average_iou:.4f}, test_iou: {test_iou:.4f}")



0it [00:00, ?it/s]

iou: 0.1404, test_iou: 0.6665


0it [00:00, ?it/s]

iou: 0.1401, test_iou: 0.6683


0it [00:00, ?it/s]

iou: 0.1463, test_iou: 0.6729


### 1 CCA with original features and cross-product embeddings

In [41]:
def concat_cca_and_cross_prod_emb(X_a, X_v, X_t, embeddings, train_indices, test_indices, n_components=64, normalize=True, algo='pls'):
    """
    Apply CCA between each pair of domains (audio, video, text), concatenate the CCA features 
    and the original input features.
    
    Args:
    X_a: Audio data
    X_v: Video data
    X_t: Text data
    n_components: Number of CCA components to use
    normalize: Whether to normalize the data
    
    Returns:
    Concatenated latent embeddings from CCA on all pairs plus original features.
    """
    # Run CCA on each pair of domains
    pls_av, train_a_v, test_a_v = run_cca(X_a, X_v, train_indices, test_indices, n_components, normalize, algo=algo)
    train_t, text_t = X_t[train_indices], X_t[test_indices]
    
    # Concatenate CCA embeddings from each pair of domains
    cca_train = np.hstack([train_a_v, train_t, embeddings[train_indices]])
    cca_test = np.hstack([test_a_v, text_t, embeddings[test_indices]])
    
    return pls_av, cca_train, cca_test


In [42]:
embeddings = compute_embedding_pytorch_chunked(X_a, X_v, X_t)

knn, base

In [43]:
K = 3

np.random.seed(SEED)
test_ratio = 0.2
sample_size = X_a.shape[0]
test_indices = np.random.choice(sample_size, int(test_ratio*sample_size), replace=False)
train_indices = np.array(list(set(np.arange(sample_size)) - set(test_indices)))

mlb = MultiLabelBinarizer()
mlb.fit(Y_ttl)
Y_enc = mlb.transform(Y_ttl[train_indices])
Y_test = Y_ttl[test_indices]

algo = 'pls'

for n_comp in [32, 48, 64]:
    print(f'==================== algo = {algo}, n_comp = {n_comp} ====================')
    kf = KFold(n_splits=4, shuffle=True, random_state=42)
    iou_scores = []
    
    for train_index, test_index in tqdm.tqdm(kf.split(train_indices)):
        # Split the data into training and test sets for this fold
        y_train, y_test = Y_enc[train_index], Y_enc[test_index]

        _, X_train, X_test = concat_cca_and_cross_prod_emb(X_a, X_v, X_t, embeddings, train_index, test_index, n_components=n_comp, algo=algo)
        
        # KNN classifier for multilabel classification
        knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
        multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
        multi_knn.fit(X_train, y_train)
    
        # Decode the predictions back to original multilabel format
        y_pred = mlb.inverse_transform(multi_knn.predict(X_test))
        y_test_decoded = mlb.inverse_transform(y_test)
    
        # Calculate IoU for this fold
        fold_iou_score = calc_iou_from_ids(y_test_decoded, y_pred, labels_to_tags)
        iou_scores.append(fold_iou_score)

    _, X, data_test = concat_cca_and_cross_prod_emb(X_a, X_v, X_t, embeddings, train_indices, test_indices, n_components=n_comp, algo=algo)
    
    knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
    multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
    multi_knn.fit(X, Y_enc)
    Y_pred_test = mlb.inverse_transform(multi_knn.predict(data_test))
    test_iou = calc_iou_from_ids(Y_test, Y_pred_test, labels_to_tags)

    average_iou = np.mean(iou_scores)
    print(f"val_iou: {average_iou:.4f}, test_iou: {test_iou:.4f}")



0it [00:00, ?it/s]

val_iou: 0.1509, test_iou: 0.3655


0it [00:00, ?it/s]

val_iou: 0.1474, test_iou: 0.3643


0it [00:00, ?it/s]

val_iou: 0.1463, test_iou: 0.3502


knn, hierarchy

In [45]:
np.random.seed(SEED)
test_ratio = 0.2
sample_size = X_a.shape[0]
test_indices = np.random.choice(sample_size, int(test_ratio*sample_size), replace=False)
train_indices = np.array(list(set(np.arange(sample_size)) - set(test_indices)))

Y_enc = list(map(convert_ids_to_hierarchy, Y_ttl[train_indices]))
Y_test = np.array(list(map(convert_ids_to_hierarchy, Y_ttl[test_indices])), dtype='object')

K = 1
algo = 'pls'

for n_comp in [32, 48, 64]:
    print(f'==================== algo = {algo}, n_comp = {n_comp} ====================')
    kf = KFold(n_splits=4, shuffle=True, random_state=42)
    iou_scores = []
    
    for train_index, test_index in tqdm.tqdm(kf.split(train_indices)):
        y_train = np.array([Y_enc[ind] for ind in train_index], dtype=object)
        y_test = np.array([Y_enc[ind] for ind in test_index], dtype=object)

        _, X_train, X_test = concat_cca_and_cross_prod_emb(X_a, X_v, X_t, embeddings, train_index, test_index, n_components=n_comp, algo=algo)
        
        # KNN classifier for multilabel classification
        knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
        classifier = MultiLabelLocalClassifierPerParentNode(local_classifier=knn, n_jobs=-1)
        classifier.fit(X_train, y_train)
    
        y_pred = classifier.predict(X_test)
    
        # Calculate IoU for this fold
        fold_iou_score = calc_iou_from_hierarchy(y_test, y_pred)
        iou_scores.append(fold_iou_score)

    _, X, data_test = concat_cca_and_cross_prod_emb(X_a, X_v, X_t, embeddings, train_indices, test_indices, n_components=n_comp, algo=algo)
    
    knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
    classifier = MultiLabelLocalClassifierPerParentNode(local_classifier=knn, n_jobs=-1)
    classifier.fit(X, np.array(Y_enc, dtype='object'))
    Y_pred_test = classifier.predict(data_test)
    test_iou = calc_iou_from_hierarchy(Y_test, Y_pred_test)
    
    average_iou = np.mean(iou_scores)
    print(f"iou: {average_iou:.4f}, test_iou: {test_iou:.4f}")



0it [00:00, ?it/s]

iou: 0.1521, test_iou: 0.3688


0it [00:00, ?it/s]

iou: 0.1490, test_iou: 0.3729


0it [00:00, ?it/s]

iou: 0.1468, test_iou: 0.3773


## Final solution

0. 1-NN ONLY on video

1. Concatenation of 3 PLS, kNN naive

In [46]:
# K = 3

# np.random.seed(SEED)
# test_ratio = 0.2
# sample_size = embeddings.shape[0]
# test_indices = np.random.choice(sample_size, int(test_ratio*sample_size), replace=False)
# train_indices = np.array(list(set(np.arange(sample_size)) - set(test_indices)))

# mlb = MultiLabelBinarizer()
# mlb.fit(Y_ttl)
# Y_enc = mlb.transform(Y_ttl[train_indices])
# Y_test = Y_ttl[test_indices]
# algo = 'pls'

# for n_comp in [32, 48, 64]:
#     print(f'==================== algo = {algo}, n_comp = {n_comp} ====================')
#     kf = KFold(n_splits=4, shuffle=True, random_state=42)
#     iou_scores = []
    
#     for train_index, test_index in tqdm.tqdm(kf.split(train_indices)):
#         # Split the data into training and test sets for this fold
#         y_train, y_test = Y_enc[train_index], Y_enc[test_index]

#         _, X_train, X_test = concatenate_cca_embeddings(X_a, X_v, X_t, train_index, test_index, n_components=n_comp, algo=algo)
        
#         # KNN classifier for multilabel classification
#         knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
#         multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
#         multi_knn.fit(X_train, y_train)
    
#         # Decode the predictions back to original multilabel format
#         y_pred = mlb.inverse_transform(multi_knn.predict(X_test))
#         y_test_decoded = mlb.inverse_transform(y_test)
    
#         # Calculate IoU for this fold
#         fold_iou_score = calc_iou_from_ids(y_test_decoded, y_pred, labels_to_tags)
#         iou_scores.append(fold_iou_score)

#     _, X, data_test = concatenate_cca_embeddings(X_a, X_v, X_t, train_indices, test_indices, n_components=n_comp, algo=algo)
    
#     knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
#     multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
#     multi_knn.fit(X, Y_enc)
#     Y_pred_test = mlb.inverse_transform(multi_knn.predict(data_test))
#     test_iou = calc_iou_from_ids(Y_test, Y_pred_test, labels_to_tags)

#     average_iou = np.mean(iou_scores)
#     print(f"iou: {average_iou:.4f}, test_iou: {test_iou:.4f}")

2. 1 PLS with original features, kNN naive

In [None]:
# K = 3

# np.random.seed(SEED)
# test_ratio = 0.2
# sample_size = X_a.shape[0]
# test_indices = np.random.choice(sample_size, int(test_ratio*sample_size), replace=False)
# train_indices = np.array(list(set(np.arange(sample_size)) - set(test_indices)))

# mlb = MultiLabelBinarizer()
# mlb.fit(Y_ttl)
# Y_enc = mlb.transform(Y_ttl[train_indices])
# Y_test = Y_ttl[test_indices]

# algo = 'pls'

# for n_comp in [32, 48, 64]:
#     print(f'==================== algo = {algo}, n_comp = {n_comp} ====================')
#     kf = KFold(n_splits=4, shuffle=True, random_state=42)
#     iou_scores = []
    
#     for train_index, test_index in tqdm.tqdm(kf.split(train_indices)):
#         # Split the data into training and test sets for this fold
#         y_train, y_test = Y_enc[train_index], Y_enc[test_index]

#         _, X_train, X_test = concatenate_cca_with_original(X_a, X_v, X_t, train_index, test_index, n_components=n_comp, algo=algo)
        
#         # KNN classifier for multilabel classification
#         knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
#         multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
#         multi_knn.fit(X_train, y_train)
    
#         # Decode the predictions back to original multilabel format
#         y_pred = mlb.inverse_transform(multi_knn.predict(X_test))
#         y_test_decoded = mlb.inverse_transform(y_test)
    
#         # Calculate IoU for this fold
#         fold_iou_score = calc_iou_from_ids(y_test_decoded, y_pred, labels_to_tags)
#         iou_scores.append(fold_iou_score)

#     _, X, data_test = concatenate_cca_with_original(X_a, X_v, X_t, train_indices, test_indices, n_components=n_comp, algo=algo)
    
#     knn = KNeighborsClassifier(n_neighbors=K, p=1, weights='distance')
#     multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
#     multi_knn.fit(X, Y_enc)
#     Y_pred_test = mlb.inverse_transform(multi_knn.predict(data_test))
#     test_iou = calc_iou_from_ids(Y_test, Y_pred_test, labels_to_tags)

#     average_iou = np.mean(iou_scores)
#     print(f"val_iou: {average_iou:.4f}, test_iou: {test_iou:.4f}")