## Импорты

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torchaudio

import yt_dlp
import subprocess

import os
import pickle
import gc
import json
import locale
import re
import tqdm.notebook as tqdm
from urllib.parse import urlparse
import requests
import math
import random

import av
from huggingface_hub import hf_hub_download
from typing import Callable

from transformers import VivitImageProcessor, VivitModel
from transformers import AutoImageProcessor, VideoMAEModel
from transformers import TimesformerConfig, TimesformerModel
from transformers import XCLIPProcessor, XCLIPModel
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from sentence_transformers import SentenceTransformer

import librosa
from moviepy.editor import VideoFileClip
import ast
import openunmix

from minio import Minio
from minio.error import S3Error

import hydra
import soundfile as sf
from omegaconf import OmegaConf

import logging
from typing import List

import numpy as np
from sklearn.cross_decomposition import PLSRegression, CCA
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from metric_learn import NCA

from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer

locale.getpreferredencoding = lambda: "UTF-8"

logging.basicConfig(
    filename='vivit_inference.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

## Вспомогательные объекты

In [2]:
class EmbeddingStorage:
    def __init__(self, labels=None, filenames=None, embeddings=None):
        """
        Initialize the EmbeddingStorage class.
        
        Args:
            labels (list or np.ndarray): An array of labels for the embeddings.
            filenames (list or np.ndarray): An array of filenames associated with the embeddings.
            embeddings (np.ndarray): A NumPy array containing all embeddings.
        """
        self.labels = np.array(labels) if labels is not None else np.array([])
        self.filenames = np.array(filenames) if filenames is not None else np.array([])
        self.embeddings = np.array(embeddings) if embeddings is not None else np.empty((0,))

    def add_embedding(self, label, filename, embedding):
        """
        Add a new embedding, along with its label and filename.
        
        Args:
            label (int or str): The label of the embedding.
            filename (str): The filename associated with the embedding.
            embedding (np.ndarray or torch.Tensor): The embedding to add (can be a NumPy array or Tensor).
        """
        if isinstance(embedding, np.ndarray):
            emb_array = embedding
        else:
            # Convert torch.Tensor to NumPy
            emb_array = embedding.cpu().numpy()
        
        # Append the new data
        self.labels = np.append(self.labels, label)
        self.filenames = np.append(self.filenames, filename)
        
        if self.embeddings.size == 0:
            self.embeddings = emb_array.reshape(1, -1)
        else:
            self.embeddings = np.vstack([self.embeddings, emb_array])

    def save_to_file(self, file_path):
        np.savez(file_path, labels=self.labels, filenames=self.filenames, embeddings=self.embeddings)

    @classmethod
    def load_from_file(cls, file_path):
        data = np.load(file_path, allow_pickle=True)
        return cls(labels=data['labels'], filenames=data['filenames'], embeddings=data['embeddings'])

    def get_embedding_by_filename(self, filename):
        if filename in self.filenames:
            idx = np.where(self.filenames == filename)[0][0]
            return self.embeddings[idx]
        else:
            return None

    def join_on_videoname(self, other_storage):
        # Find common filenames
        common_filenames = np.intersect1d(self.filenames, other_storage.filenames)
        
        # Initialize lists to store merged data
        merged_labels = []
        merged_filenames = []
        merged_embeddings = []
        
        for filename in common_filenames:
            # Get embeddings for the common filename from both storages
            idx_self = np.where(self.filenames == filename)[0][0]
            idx_other = np.where(other_storage.filenames == filename)[0][0]
            
            emb_self = self.embeddings[idx_self]
            emb_other = other_storage.embeddings[idx_other]
            
            # Store embeddings as a tuple
            merged_embedding = (emb_self, emb_other)
            
            # Get the label from the first storage (could be changed based on use case)
            merged_label = self.labels[idx_self]
            
            # Append to the merged data
            merged_labels.append(merged_label)
            merged_filenames.append(filename)
            merged_embeddings.append(merged_embedding)
        
        # Convert lists to numpy arrays
        merged_labels = np.array(merged_labels)
        merged_filenames = np.array(merged_filenames)
        merged_embeddings = np.array(merged_embeddings, dtype=object)
        
        # Return a new EmbeddingStorage instance with merged data
        return EmbeddingStorage(labels=merged_labels, filenames=merged_filenames, embeddings=merged_embeddings)

    def __len__(self):
        """
        Return the number of embeddings stored.
        """
        return len(self.labels)

    def __getitem__(self, idx):
        """
        Retrieve the label, filename, and embedding by index.
        
        Args:
            idx (int): The index of the embedding to retrieve.
        
        Returns:
            tuple: A tuple containing (label, filename, embedding).
        """
        if idx >= len(self.labels):
            raise IndexError("Index out of range")
        return self.labels[idx], self.filenames[idx], self.embeddings[idx]

    def __repr__(self):
        return f"EmbeddingStorage(labels={len(self.labels)}, filenames={len(self.filenames)}, embeddings_shape={self.embeddings.shape})"

In [3]:
def iou_metric(ground_truth, predictions):
    iou =  len(set.intersection(set(ground_truth), set(predictions)))
    iou = iou/(len(set(ground_truth).union(set(predictions))))
    print(iou, ground_truth, predictions)
    return iou

def split_tags(tag_list):
    final_tag_list = []
    for tag in tag_list:
        tags = [tag.strip().lower() for tag in tag.split(":")]
        if len(tags) == 3:
            final_tag_list.append(tags[0])
            final_tag_list.append(tags[0] + ": " + tags[1])
            final_tag_list.append(tags[0]+ ": " + tags[1] + ": " + tags[2])
        elif len(tags) == 2:
            final_tag_list.append(tags[0])
            final_tag_list.append(tags[0] + ": " + tags[1])
        elif len(tags) == 1:
            final_tag_list.append(tags[0])
        else:
            print("NOT IMPLEMENTED!!!!", tag)
    return final_tag_list


def find_iou_for_sample_submission(pred_submission, true_submission):
    ground_truth_df = true_submission
    ground_truth_df["tags"] = ground_truth_df["tags"].apply(lambda l: l.split(', '))
    ground_truth_df["tags_split"] = ground_truth_df["tags"].apply(lambda l: split_tags(l))

    predictions_df = pred_submission
    # predictions_df["predicted_tags"] = predictions_df["predicted_tags"].apply(ast.literal_eval)
    predictions_df["predicted_tags_split"] = predictions_df["predicted_tags"].apply(lambda l: split_tags(l))
    iou=0
    counter = 0
    for i, row in ground_truth_df.iterrows():
        predicted_tags = predictions_df[predictions_df["video_id"]==row["video_id"]]["predicted_tags_split"].values[0]
        iou_temp=iou_metric(row['tags_split'], predicted_tags)
        iou+=iou_temp
        counter+=1

    return iou/counter


def create_tags_to_labels(taxonomy):
    tags = {}
    for i, row in tqdm.tqdm(taxonomy.iterrows()):
        if isinstance(row['Уровень 3 (iab)'], str):
            tags[row['Уровень 1 (iab)'].strip().lower()+ ": "+row['Уровень 2 (iab)'].strip().lower()+": "+row['Уровень 3 (iab)'].strip().lower()] = i
        elif isinstance(row['Уровень 2 (iab)'], str):
            tags[row['Уровень 1 (iab)'].strip().lower()+ ": "+row['Уровень 2 (iab)'].strip().lower()] = i
        elif isinstance(row['Уровень 1 (iab)'], str):
            tags[row['Уровень 1 (iab)'].strip().lower()] = i
    return tags

## Get description's embeddings

In [4]:
data = pd.read_csv("train_data_categories.csv", index_col=0)
taxonomy = pd.read_csv("IAB_tags.csv")

In [28]:
np.vstack(encoded_values.values)

array([[-0.422692  , -1.0444894 ,  0.24923438, ..., -0.17118785,
        -0.5136193 ,  1.7617834 ],
       [-1.1715692 , -1.151326  ,  0.23327589, ...,  0.13894735,
        -0.9368831 ,  0.80480146],
       [-0.9565279 , -0.4599986 ,  0.59576195, ..., -0.51299   ,
        -0.5534946 ,  0.46441787],
       ...,
       [-0.86119896, -1.0143728 ,  0.04060396, ...,  0.477794  ,
        -0.55605567,  1.7086926 ],
       [-0.75098073, -1.5116149 ,  0.28733295, ...,  1.100044  ,
         0.19477315,  0.7473654 ],
       [-0.8942005 , -1.4155393 ,  0.55456275, ...,  0.84573764,
        -0.42715627,  0.37667847]], dtype=float32)

In [30]:
# model = SentenceTransformer('DeepPavlov/rubert-base-cased-sentence')
# model.save_pretrained('models/rubert')
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# model = SentenceTransformer('models/rubert/')
# model.to(device)
# model.eval()
# encoded_values = data['description'].apply(lambda l: model.encode(l, convert_to_tensor=True).cpu().numpy())
rubert_emb = EmbeddingStorage(filenames=data.index, embeddings=np.vstack(encoded_values.values))
rubert_emb.save_to_file('data/new_embeddings/description_emb.npz')

## Clear tags data

In [6]:
tags_to_labels = create_tags_to_labels(taxonomy)
labels_to_tags = {v: k for k, v in tags_to_labels.items()}

0it [00:00, ?it/s]

In [7]:
def assign_category_ids(data, tags_to_labels):
    category_ids = []
    
    for i, row in tqdm.tqdm(data.iterrows()):
        tags = row['tags'].split(', ')  # Split tags by comma
        split_tag_list = split_tags(tags)  # Split hierarchical tags

        # Convert the tags to category IDs
        valid_ids = [tags_to_labels[tag] for tag in split_tag_list if tag in tags_to_labels]

        if len(valid_ids) > 0:
            # If valid category IDs are found, use all of them
            category_ids.append(valid_ids)  # Append the list of valid IDs
        else:
            # If no valid tags, assign a random category
            print("Empty list", )
            category_ids.append([random.choice(list(tags_to_labels.values()))])  # Random category from available tags

    return category_ids


data['category_id'] = assign_category_ids(data, tags_to_labels)

0it [00:00, ?it/s]

Empty list
Empty list
Empty list
Empty list


In [8]:
data['category_id']

video_id
9007f33c8347924ffa12f922da2a179d              [398, 406]
9012707c45233bd601dead57bc9e2eca    [137, 162, 398, 406]
e01d6ebabbc27e323fa1b7c581e9b96a    [398, 403, 398, 404]
a00b145242be3ebc3b311455e94917af      [0, 447, 480, 398]
b01a682bf4dfcc09f1e8fac5bc18785a              [398, 406]
                                            ...         
5fe16aa2869667bc1519e32a4c536b26              [398, 403]
4ffa5fbb2a410aa841659d8890ae5e3f              [196, 202]
3fc81df4bfe121ce2bc33dd581f5efeb                   [447]
efe0b4139ef82ec270b9e2fe0216214e      [40, 47, 313, 112]
fff1ef66d848bc8987ac5126f05b053b              [313, 344]
Name: category_id, Length: 1049, dtype: object

In [55]:
# whisper_emb = EmbeddingStorage.load_from_file('data/new_embeddings/whisper.npz')
# rubert_emb = EmbeddingStorage.load_from_file('data/new_embeddings/description_emb.npz')
# xclip_emb = EmbeddingStorage.load_from_file('data/new_embeddings/xclip_emb.npz')
# xclip_emb.embeddings = xclip_emb.embeddings.mean(axis=1)

data['audio_emb'] = [None] * len(data)
data['text_emb'] = [None] * len(data)
data['video_emb'] = [None] * len(data)

In [61]:
for filename, row in data.iterrows():
    data.at[filename, 'video_emb'] = xclip_emb.get_embedding_by_filename(filename + '.mp4')
    data.at[filename, 'audio_emb'] = whisper_emb.get_embedding_by_filename(filename + '.mp4')
    data.at[filename, 'text_emb'] = rubert_emb.get_embedding_by_filename(filename)

In [68]:
X1 = np.vstack(data.audio_emb.values)
X2 = np.vstack(data.text_emb.values)
X3 = np.vstack(data.video_emb.values)
Y = data.category_id.values

All

In [73]:
X = np.hstack([X1, X2, X3])  # Concatenate audio, text, and video embeddings

# Step 2: Convert Y to a binary matrix using MultiLabelBinarizer
mlb = MultiLabelBinarizer()
Y_enc = mlb.fit_transform(Y)  # Y is your category_id, encoded as multilabel

# Step 3: Define IoU (Jaccard Similarity) for multilabel classification
def calculate_iou(y_true, y_pred):
    iou_scores = []
    for true_labels, pred_labels in zip(y_true, y_pred):
        true_set = set(true_labels)
        pred_set = set(pred_labels)
        intersection = len(true_set.intersection(pred_set))
        union = len(true_set.union(pred_set))
        if union == 0:
            iou_scores.append(1.0)
        else:
            iou_scores.append(intersection / union)
    return np.mean(iou_scores)

# Step 4: Define the cross-validation process
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
iou_scores = []

for train_index, test_index in kf.split(X):
    print(f"Training fold {fold}...")
    
    # Split the data into training and test sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y_enc[train_index], Y_enc[test_index]
    
    # KNN classifier for multilabel classification
    knn = KNeighborsClassifier(n_neighbors=3)
    multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
    multi_knn.fit(X_train, y_train)

    # Predict using the KNN classifier
    y_pred_enc = multi_knn.predict(X_test)

    # Decode the predictions back to original multilabel format
    y_pred = mlb.inverse_transform(y_pred_enc)
    y_test_decoded = mlb.inverse_transform(y_test)

    # Calculate IoU for this fold
    fold_iou_score = calculate_iou(y_test_decoded, y_pred)
    iou_scores.append(fold_iou_score)
    
    print(f"Fold {fold} IoU score: {fold_iou_score:.4f}")
    fold += 1

# Step 5: Calculate the average IoU across all folds
average_iou = np.mean(iou_scores)
print(f"Average IoU score across all 5 folds: {average_iou:.4f}")

Training fold 1...
Fold 1 IoU score: 0.5821
Training fold 2...
Fold 2 IoU score: 0.5749
Training fold 3...
Fold 3 IoU score: 0.5654
Training fold 4...
Fold 4 IoU score: 0.5720
Training fold 5...
Fold 5 IoU score: 0.5589
Average IoU score across all 5 folds: 0.5707


Audio and text

In [74]:
X = np.hstack([X1, X2])  # Concatenate audio, text, and video embeddings

# Step 2: Convert Y to a binary matrix using MultiLabelBinarizer
mlb = MultiLabelBinarizer()
Y_enc = mlb.fit_transform(Y)  # Y is your category_id, encoded as multilabel

# Step 3: Define IoU (Jaccard Similarity) for multilabel classification
def calculate_iou(y_true, y_pred):
    iou_scores = []
    for true_labels, pred_labels in zip(y_true, y_pred):
        true_set = set(true_labels)
        pred_set = set(pred_labels)
        intersection = len(true_set.intersection(pred_set))
        union = len(true_set.union(pred_set))
        if union == 0:
            iou_scores.append(1.0)
        else:
            iou_scores.append(intersection / union)
    return np.mean(iou_scores)

# Step 4: Define the cross-validation process
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
iou_scores = []

for train_index, test_index in kf.split(X):
    print(f"Training fold {fold}...")
    
    # Split the data into training and test sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y_enc[train_index], Y_enc[test_index]
    
    # KNN classifier for multilabel classification
    knn = KNeighborsClassifier(n_neighbors=3)
    multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
    multi_knn.fit(X_train, y_train)

    # Predict using the KNN classifier
    y_pred_enc = multi_knn.predict(X_test)

    # Decode the predictions back to original multilabel format
    y_pred = mlb.inverse_transform(y_pred_enc)
    y_test_decoded = mlb.inverse_transform(y_test)

    # Calculate IoU for this fold
    fold_iou_score = calculate_iou(y_test_decoded, y_pred)
    iou_scores.append(fold_iou_score)
    
    print(f"Fold {fold} IoU score: {fold_iou_score:.4f}")
    fold += 1

# Step 5: Calculate the average IoU across all folds
average_iou = np.mean(iou_scores)
print(f"Average IoU score across all 5 folds: {average_iou:.4f}")

Training fold 1...
Fold 1 IoU score: 0.5821
Training fold 2...
Fold 2 IoU score: 0.5761
Training fold 3...
Fold 3 IoU score: 0.5654
Training fold 4...
Fold 4 IoU score: 0.5720
Training fold 5...
Fold 5 IoU score: 0.5589
Average IoU score across all 5 folds: 0.5709


Video and text

In [75]:
X = np.hstack([X2, X3])  # Concatenate audio, text, and video embeddings

# Step 2: Convert Y to a binary matrix using MultiLabelBinarizer
mlb = MultiLabelBinarizer()
Y_enc = mlb.fit_transform(Y)  # Y is your category_id, encoded as multilabel

# Step 3: Define IoU (Jaccard Similarity) for multilabel classification
def calculate_iou(y_true, y_pred):
    iou_scores = []
    for true_labels, pred_labels in zip(y_true, y_pred):
        true_set = set(true_labels)
        pred_set = set(pred_labels)
        intersection = len(true_set.intersection(pred_set))
        union = len(true_set.union(pred_set))
        if union == 0:
            iou_scores.append(1.0)
        else:
            iou_scores.append(intersection / union)
    return np.mean(iou_scores)

# Step 4: Define the cross-validation process
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
iou_scores = []

for train_index, test_index in kf.split(X):
    print(f"Training fold {fold}...")
    
    # Split the data into training and test sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y_enc[train_index], Y_enc[test_index]
    
    # KNN classifier for multilabel classification
    knn = KNeighborsClassifier(n_neighbors=3)
    multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
    multi_knn.fit(X_train, y_train)

    # Predict using the KNN classifier
    y_pred_enc = multi_knn.predict(X_test)

    # Decode the predictions back to original multilabel format
    y_pred = mlb.inverse_transform(y_pred_enc)
    y_test_decoded = mlb.inverse_transform(y_test)

    # Calculate IoU for this fold
    fold_iou_score = calculate_iou(y_test_decoded, y_pred)
    iou_scores.append(fold_iou_score)
    
    print(f"Fold {fold} IoU score: {fold_iou_score:.4f}")
    fold += 1

# Step 5: Calculate the average IoU across all folds
average_iou = np.mean(iou_scores)
print(f"Average IoU score across all 5 folds: {average_iou:.4f}")

Training fold 1...
Fold 1 IoU score: 0.5785
Training fold 2...
Fold 2 IoU score: 0.5813
Training fold 3...
Fold 3 IoU score: 0.5674
Training fold 4...
Fold 4 IoU score: 0.5722
Training fold 5...
Fold 5 IoU score: 0.5190
Average IoU score across all 5 folds: 0.5637


Audio and video

In [76]:
X = np.hstack([X1, X3])  # Concatenate audio, text, and video embeddings

# Step 2: Convert Y to a binary matrix using MultiLabelBinarizer
mlb = MultiLabelBinarizer()
Y_enc = mlb.fit_transform(Y)  # Y is your category_id, encoded as multilabel

# Step 3: Define IoU (Jaccard Similarity) for multilabel classification
def calculate_iou(y_true, y_pred):
    iou_scores = []
    for true_labels, pred_labels in zip(y_true, y_pred):
        true_set = set(true_labels)
        pred_set = set(pred_labels)
        intersection = len(true_set.intersection(pred_set))
        union = len(true_set.union(pred_set))
        if union == 0:
            iou_scores.append(1.0)
        else:
            iou_scores.append(intersection / union)
    return np.mean(iou_scores)

# Step 4: Define the cross-validation process
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
iou_scores = []

for train_index, test_index in kf.split(X):
    print(f"Training fold {fold}...")
    
    # Split the data into training and test sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y_enc[train_index], Y_enc[test_index]
    
    # KNN classifier for multilabel classification
    knn = KNeighborsClassifier(n_neighbors=3)
    multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
    multi_knn.fit(X_train, y_train)

    # Predict using the KNN classifier
    y_pred_enc = multi_knn.predict(X_test)

    # Decode the predictions back to original multilabel format
    y_pred = mlb.inverse_transform(y_pred_enc)
    y_test_decoded = mlb.inverse_transform(y_test)

    # Calculate IoU for this fold
    fold_iou_score = calculate_iou(y_test_decoded, y_pred)
    iou_scores.append(fold_iou_score)
    
    print(f"Fold {fold} IoU score: {fold_iou_score:.4f}")
    fold += 1

# Step 5: Calculate the average IoU across all folds
average_iou = np.mean(iou_scores)
print(f"Average IoU score across all 5 folds: {average_iou:.4f}")

Training fold 1...
Fold 1 IoU score: 0.4688
Training fold 2...
Fold 2 IoU score: 0.5100
Training fold 3...
Fold 3 IoU score: 0.4698
Training fold 4...
Fold 4 IoU score: 0.4796
Training fold 5...
Fold 5 IoU score: 0.4600
Average IoU score across all 5 folds: 0.4776


Audio

In [77]:
X = X1  # Concatenate audio, text, and video embeddings

# Step 2: Convert Y to a binary matrix using MultiLabelBinarizer
mlb = MultiLabelBinarizer()
Y_enc = mlb.fit_transform(Y)  # Y is your category_id, encoded as multilabel

# Step 3: Define IoU (Jaccard Similarity) for multilabel classification
def calculate_iou(y_true, y_pred):
    iou_scores = []
    for true_labels, pred_labels in zip(y_true, y_pred):
        true_set = set(true_labels)
        pred_set = set(pred_labels)
        intersection = len(true_set.intersection(pred_set))
        union = len(true_set.union(pred_set))
        if union == 0:
            iou_scores.append(1.0)
        else:
            iou_scores.append(intersection / union)
    return np.mean(iou_scores)

# Step 4: Define the cross-validation process
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
iou_scores = []

for train_index, test_index in kf.split(X):
    print(f"Training fold {fold}...")
    
    # Split the data into training and test sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y_enc[train_index], Y_enc[test_index]
    
    # KNN classifier for multilabel classification
    knn = KNeighborsClassifier(n_neighbors=3)
    multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
    multi_knn.fit(X_train, y_train)

    # Predict using the KNN classifier
    y_pred_enc = multi_knn.predict(X_test)

    # Decode the predictions back to original multilabel format
    y_pred = mlb.inverse_transform(y_pred_enc)
    y_test_decoded = mlb.inverse_transform(y_test)

    # Calculate IoU for this fold
    fold_iou_score = calculate_iou(y_test_decoded, y_pred)
    iou_scores.append(fold_iou_score)
    
    print(f"Fold {fold} IoU score: {fold_iou_score:.4f}")
    fold += 1

# Step 5: Calculate the average IoU across all folds
average_iou = np.mean(iou_scores)
print(f"Average IoU score across all 5 folds: {average_iou:.4f}")

Training fold 1...
Fold 1 IoU score: 0.4577
Training fold 2...
Fold 2 IoU score: 0.4789
Training fold 3...
Fold 3 IoU score: 0.4540
Training fold 4...
Fold 4 IoU score: 0.4648
Training fold 5...
Fold 5 IoU score: 0.4323
Average IoU score across all 5 folds: 0.4575


Text

In [78]:
X = X2 # Concatenate audio, text, and video embeddings

# Step 2: Convert Y to a binary matrix using MultiLabelBinarizer
mlb = MultiLabelBinarizer()
Y_enc = mlb.fit_transform(Y)  # Y is your category_id, encoded as multilabel

# Step 3: Define IoU (Jaccard Similarity) for multilabel classification
def calculate_iou(y_true, y_pred):
    iou_scores = []
    for true_labels, pred_labels in zip(y_true, y_pred):
        true_set = set(true_labels)
        pred_set = set(pred_labels)
        intersection = len(true_set.intersection(pred_set))
        union = len(true_set.union(pred_set))
        if union == 0:
            iou_scores.append(1.0)
        else:
            iou_scores.append(intersection / union)
    return np.mean(iou_scores)

# Step 4: Define the cross-validation process
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
iou_scores = []

for train_index, test_index in kf.split(X):
    print(f"Training fold {fold}...")
    
    # Split the data into training and test sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y_enc[train_index], Y_enc[test_index]
    
    # KNN classifier for multilabel classification
    knn = KNeighborsClassifier(n_neighbors=3)
    multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
    multi_knn.fit(X_train, y_train)

    # Predict using the KNN classifier
    y_pred_enc = multi_knn.predict(X_test)

    # Decode the predictions back to original multilabel format
    y_pred = mlb.inverse_transform(y_pred_enc)
    y_test_decoded = mlb.inverse_transform(y_test)

    # Calculate IoU for this fold
    fold_iou_score = calculate_iou(y_test_decoded, y_pred)
    iou_scores.append(fold_iou_score)
    
    print(f"Fold {fold} IoU score: {fold_iou_score:.4f}")
    fold += 1

# Step 5: Calculate the average IoU across all folds
average_iou = np.mean(iou_scores)
print(f"Average IoU score across all 5 folds: {average_iou:.4f}")

Training fold 1...
Fold 1 IoU score: 0.5809
Training fold 2...
Fold 2 IoU score: 0.5870
Training fold 3...
Fold 3 IoU score: 0.5653
Training fold 4...
Fold 4 IoU score: 0.5770
Training fold 5...
Fold 5 IoU score: 0.5151
Average IoU score across all 5 folds: 0.5651


Video

In [79]:
X = X3 # Concatenate audio, text, and video embeddings

# Step 2: Convert Y to a binary matrix using MultiLabelBinarizer
mlb = MultiLabelBinarizer()
Y_enc = mlb.fit_transform(Y)  # Y is your category_id, encoded as multilabel

# Step 3: Define IoU (Jaccard Similarity) for multilabel classification
def calculate_iou(y_true, y_pred):
    iou_scores = []
    for true_labels, pred_labels in zip(y_true, y_pred):
        true_set = set(true_labels)
        pred_set = set(pred_labels)
        intersection = len(true_set.intersection(pred_set))
        union = len(true_set.union(pred_set))
        if union == 0:
            iou_scores.append(1.0)
        else:
            iou_scores.append(intersection / union)
    return np.mean(iou_scores)

# Step 4: Define the cross-validation process
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
iou_scores = []

for train_index, test_index in kf.split(X):
    print(f"Training fold {fold}...")
    
    # Split the data into training and test sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y_enc[train_index], Y_enc[test_index]
    
    # KNN classifier for multilabel classification
    knn = KNeighborsClassifier(n_neighbors=3)
    multi_knn = MultiOutputClassifier(knn, n_jobs=-1)
    multi_knn.fit(X_train, y_train)

    # Predict using the KNN classifier
    y_pred_enc = multi_knn.predict(X_test)

    # Decode the predictions back to original multilabel format
    y_pred = mlb.inverse_transform(y_pred_enc)
    y_test_decoded = mlb.inverse_transform(y_test)

    # Calculate IoU for this fold
    fold_iou_score = calculate_iou(y_test_decoded, y_pred)
    iou_scores.append(fold_iou_score)
    
    print(f"Fold {fold} IoU score: {fold_iou_score:.4f}")
    fold += 1

# Step 5: Calculate the average IoU across all folds
average_iou = np.mean(iou_scores)
print(f"Average IoU score across all 5 folds: {average_iou:.4f}")

Training fold 1...
Fold 1 IoU score: 0.6485
Training fold 2...
Fold 2 IoU score: 0.6692
Training fold 3...
Fold 3 IoU score: 0.6248
Training fold 4...
Fold 4 IoU score: 0.5852
Training fold 5...
Fold 5 IoU score: 0.5880
Average IoU score across all 5 folds: 0.6231
