In [1]:
from tqdm.notebook import tqdm
import pickle
import torch
from torch import nn
from utils import *
from sklearn.metrics import *
import pandas as pd
import torch.nn.functional as F
import numpy as np


In [2]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (Restart & Run All) on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load datasets

In [3]:
# Deserialize the held-out test split.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only open files that come from a trusted source.
with open("test_data.pkl", mode="rb") as dataset_file:
    test_dataset = pickle.load(dataset_file)

In [4]:
# Cross-entropy criterion handed to the evaluator below.
loss_fn = torch.nn.CrossEntropyLoss()

In [5]:
# Checkpoint locations for the two serialized models.
evs_path = "weights/evs_model.pth"
main_model_path = "weights/main_model.pth"

# Load both checkpoints onto the selected device (EVS model first, then the
# main model).
# NOTE(review): torch.load unpickles the file contents; only load checkpoints
# from a trusted source.
evs, main_model = (
    torch.load(checkpoint, map_location=device)
    for checkpoint in (evs_path, main_model_path)
)

In [6]:

class Evaluate:
    """Run the models over a dataset in batches and report aspect/complaint metrics.

    Both models are switched to evaluation mode on construction. ``evs_model``
    is applied to each sample's padded video items; ``main_model`` consumes the
    padded audio features (optionally summed with the EVS output) and produces
    per-aspect class scores.

    Args:
        main_model: Model called as ``main_model(data, attention_mask=mask)``.
        evs_model: Model called with the unpacked result of ``pad(...)`` on the
            video items of a single sample.
        device: Device forwarded to ``pad`` and used for the labels in the loss.
        loss_function: Callable ``loss(logits, targets)`` returning a tensor.
        multi_model: When true, the padded audio and video tensors are added
            together before being fed to ``main_model``; otherwise only the
            audio tensor is used.
        batch_size: Number of dataset items processed per forward pass.
        num_aspects: Number of aspect slots per sample (previously the
            hard-coded ``5``).
        num_classes: Number of classes per aspect slot (previously the
            hard-coded ``3``).

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """

    def __init__(
        self,
        main_model,
        evs_model,
        device,
        loss_function,
        multi_model=False,
        batch_size=12,
        num_aspects=5,
        num_classes=3,
    ):
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        self.main_model = main_model.eval()
        self.evs_model = evs_model.eval()
        self.device = device
        self.multi_model = multi_model
        self.loss = loss_function
        self.batch_size = batch_size
        self.num_aspects = num_aspects
        self.num_classes = num_classes

    def eval(self, dataset):
        """Evaluate ``dataset`` and return summary metrics plus per-video results.

        Each dataset item is indexed as ``item[0]`` (a sequence of
        ``(video, audio)`` pairs), ``item[1]`` (the label tensor) and
        ``item[-1]`` (the video identifier). ``dataset`` must support ``len``
        and slicing.

        Labels and predictions are thresholded for reporting: a value ``>= 1``
        counts as a positive aspect and a value ``>= 2`` as a positive
        complaint.

        Returns:
            A tuple ``(aspect_f1, complaint_f1, mean_loss, frame)`` where the
            F1 scores are micro-averaged, ``mean_loss`` is the mean of the
            per-batch losses, and ``frame`` is a ``pandas.DataFrame`` with one
            row per video holding the real/predicted indicator lists.

        Raises:
            ValueError: If ``dataset`` is empty.
        """
        if len(dataset) == 0:
            raise ValueError("Cannot evaluate an empty dataset")

        batch_size = self.batch_size
        real = []
        pred = []
        video_ids = []
        losses = []
        with torch.no_grad():
            for start in range(0, len(dataset), batch_size):
                # Slice once per batch and reuse it for inputs, labels and ids.
                samples = dataset[start : start + batch_size]
                batch = [sample[0] for sample in samples]
                labels = torch.stack([sample[1] for sample in samples])
                video_ids.extend(sample[-1] for sample in samples)

                audio_data = [
                    torch.stack([pair[1] for pair in sample]).squeeze(1)
                    for sample in batch
                ]
                video_data = [[pair[0] for pair in sample] for sample in batch]
                # A video entry may itself be a list of tensors; stack those.
                video_data = [
                    [
                        torch.stack(item).squeeze(1) if type(item) is list else item
                        for item in sample
                    ]
                    for sample in video_data
                ]
                # Encode every sample's video items with the EVS model.
                video_data = [
                    self.evs_model(*pad(sample, self.evs_model, device=self.device))
                    for sample in video_data
                ]
                # Only the audio mask is passed on to the main model below.
                video_data, _video_mask = pad(
                    video_data, self.main_model, device=self.device
                )
                audio_data, a_mask = pad(
                    audio_data, self.main_model, device=self.device
                )
                if self.multi_model:
                    final_data = audio_data + video_data
                else:
                    final_data = audio_data
                label_output = self.main_model(final_data, attention_mask=a_mask)

                losses.append(
                    self.loss(
                        label_output.view(-1, self.num_classes),
                        labels.to(self.device).view(-1),
                    )
                )
                pred.append(
                    torch.argmax(
                        label_output.view(-1, self.num_aspects, self.num_classes),
                        dim=-1,
                    )
                )
                real.append(labels)

        real, pred = torch.cat(real).cpu(), torch.cat(pred).cpu()

        return (
            f1_score(real >= 1, pred >= 1, average="micro"),
            f1_score(real >= 2, pred >= 2, average="micro"),
            # NOTE: this is a mean over batches, so a smaller final batch
            # weighs as much as a full one.
            torch.mean(torch.stack(losses)).item(),
            pd.DataFrame(
                {
                    "Video ID": video_ids,
                    "Aspects real": (real >= 1).to(torch.long).tolist(),
                    "Aspects pred": (pred >= 1).to(torch.long).tolist(),
                    "Complaint real": (real >= 2).to(torch.long).tolist(),
                    "Complaint pred": (pred >= 2).to(torch.long).tolist(),
                }
            )
        )


In [7]:
# Evaluate the test split with audio and video inputs combined
# (multi_model=True), then display the aspect-level micro-F1.
evaluator = Evaluate(
    main_model,
    evs_model=evs,
    device=device,
    loss_function=loss_fn,
    multi_model=True,
)
(f1_aspect, f1_complaint, loss_cal, data) = evaluator.eval(test_dataset)
f1_aspect

0.6025641025641025

In [8]:
# Work on an independent copy so the evaluator's result frame stays untouched.
df = data.copy(deep=True)
df

Unnamed: 0,Video ID,Aspects real,Aspects pred,Complaint real,Complaint pred
0,51e4YIHE5sU.mp4,"[0, 1, 0, 0, 0]","[0, 0, 0, 1, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0]"
1,FI65fJQ6bEM.mp4,"[0, 0, 0, 1, 0]","[0, 0, 0, 1, 0]","[0, 0, 0, 1, 0]","[0, 0, 0, 0, 0]"
2,1705732629717258718.mp4,"[0, 0, 0, 0, 1]","[1, 0, 0, 1, 0]","[0, 0, 0, 0, 1]","[0, 0, 0, 0, 0]"
3,1705745405131051310.mp4,"[0, 0, 0, 1, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0]"
4,yrgnxaAOuws.mp4,"[0, 1, 0, 0, 0]","[0, 0, 0, 0, 0]","[0, 1, 0, 0, 0]","[0, 0, 0, 0, 0]"
...,...,...,...,...,...
59,k4gZTf2LVdU.mp4,"[1, 0, 1, 1, 0]","[0, 0, 1, 1, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0]"
60,J8-pMPud7gE.mp4,"[0, 1, 1, 1, 0]","[0, 0, 1, 1, 0]","[0, 1, 0, 1, 0]","[0, 0, 0, 0, 0]"
61,b6PolFcMVpI.mp4,"[0, 0, 1, 1, 1]","[0, 0, 1, 1, 0]","[0, 0, 0, 1, 0]","[0, 0, 0, 0, 0]"
62,1468611338192797702.mp4,"[0, 0, 0, 0, 1]","[0, 0, 0, 0, 1]","[0, 0, 0, 0, 1]","[0, 0, 0, 0, 1]"


In [9]:
def expand_list_columns(frame):
    """Return a new frame with every list-valued column split into scalar columns.

    A column is treated as list-valued when its first element is a ``list``.
    Such a column ``name`` is replaced by ``name_1`` ... ``name_k`` (one column
    per list position). Columns that are not list-valued keep their relative
    order and come first, followed by the expanded columns in the order their
    source columns appeared.

    The first element is read positionally (``.iloc[0]``) so the check does not
    depend on the index containing the label ``0``. An empty frame is returned
    unchanged (as a copy), and ``frame`` itself is never modified.
    """
    list_cols = [
        col
        for col in frame.columns
        if len(frame) > 0 and isinstance(frame[col].iloc[0], list)
    ]
    expanded_blocks = []
    for col in list_cols:
        expanded_array = np.array(frame[col].tolist())
        category_columns = [f'{col}_{i+1}' for i in range(expanded_array.shape[1])]
        expanded_blocks.append(
            pd.DataFrame(expanded_array, index=frame.index, columns=category_columns)
        )
    return pd.concat([frame.drop(columns=list_cols)] + expanded_blocks, axis=1)


df = expand_list_columns(df)
df.columns

Index(['Video ID', 'Aspects real_1', 'Aspects real_2', 'Aspects real_3',
       'Aspects real_4', 'Aspects real_5', 'Aspects pred_1', 'Aspects pred_2',
       'Aspects pred_3', 'Aspects pred_4', 'Aspects pred_5',
       'Complaint real_1', 'Complaint real_2', 'Complaint real_3',
       'Complaint real_4', 'Complaint real_5', 'Complaint pred_1',
       'Complaint pred_2', 'Complaint pred_3', 'Complaint pred_4',
       'Complaint pred_5'],
      dtype='object')

In [10]:
def _label_group_metrics(df, column_prefix, key_prefix, num_labels):
    """Compute the metric dict for each ``{column_prefix} real_i`` / ``pred_i`` pair."""
    results = {}
    for i in range(1, num_labels + 1):
        y_true = df[f'{column_prefix} real_{i}'].values
        y_pred = df[f'{column_prefix} pred_{i}'].values
        results[f'{key_prefix}_{i}'] = {
            'Hamming Loss': hamming_loss(y_true, y_pred),
            'Accuracy': accuracy_score(y_true, y_pred),
            'Micro-F1 Score': f1_score(y_true, y_pred, average='micro'),
            'Macro-F1 Score': f1_score(y_true, y_pred, average='macro'),
        }
    return results


def calculate_metrics(df, num_labels=5):
    """Compute per-label metrics for the expanded aspect and complaint columns.

    Args:
        df: Frame holding the columns ``Aspects real_i``, ``Aspects pred_i``,
            ``Complaint real_i`` and ``Complaint pred_i`` for
            ``i = 1 .. num_labels``.
        num_labels: Number of label columns per group (defaults to the 5 the
            notebook produces).

    Returns:
        A tuple ``(aspects, complaints)`` of dicts keyed ``Aspect_i`` and
        ``Complaint_i``. Each value maps ``'Hamming Loss'``, ``'Accuracy'``,
        ``'Micro-F1 Score'`` and ``'Macro-F1 Score'`` to the metric value.

    NOTE(review): each pair is scored as a single 1-D label column, and the
    notebook's printed results show Micro-F1 identical to Accuracy for every
    label — confirm that this is the intended per-label F1.
    """
    metrics_results_aspects = _label_group_metrics(df, 'Aspects', 'Aspect', num_labels)
    metrics_results_complaint = _label_group_metrics(
        df, 'Complaint', 'Complaint', num_labels
    )
    return metrics_results_aspects, metrics_results_complaint



# Print results for Aspects and Complaint
def print_metrics(metrics_results, title):
    """Print a titled report: one header per entry, then its tab-indented metrics.

    Args:
        metrics_results: Mapping of entry name to a mapping of metric name to
            value.
        title: Label inserted into the report's first line.
    """
    report = [f"Metrics Results for {title}:"]
    for entry_name, entry_metrics in metrics_results.items():
        report.append(f"{entry_name}:")
        report.extend(
            f"\t{metric_name}: {value}"
            for metric_name, value in entry_metrics.items()
        )
    print("\n".join(report))

# Compute the per-label metric tables from the expanded frame ...
(metrics_results_aspects, metrics_results_complaint) = calculate_metrics(df)
# ... and print one report per label group.
print_metrics(metrics_results=metrics_results_aspects, title="Aspects")
print_metrics(metrics_results=metrics_results_complaint, title="Complaint")


Metrics Results for Aspects:
Aspect_1:
	Hamming Loss: 0.234375
	Accuracy: 0.765625
	Micro-F1 Score: 0.765625
	Macro-F1 Score: 0.5727636849132176
Aspect_2:
	Hamming Loss: 0.1875
	Accuracy: 0.8125
	Micro-F1 Score: 0.8125
	Macro-F1 Score: 0.518796992481203
Aspect_3:
	Hamming Loss: 0.078125
	Accuracy: 0.921875
	Micro-F1 Score: 0.921875
	Macro-F1 Score: 0.8885405781957506
Aspect_4:
	Hamming Loss: 0.28125
	Accuracy: 0.71875
	Micro-F1 Score: 0.71875
	Macro-F1 Score: 0.708502024291498
Aspect_5:
	Hamming Loss: 0.1875
	Accuracy: 0.8125
	Micro-F1 Score: 0.8125
	Macro-F1 Score: 0.5714285714285714
Metrics Results for Complaint:
Complaint_1:
	Hamming Loss: 0.0625
	Accuracy: 0.9375
	Micro-F1 Score: 0.9375
	Macro-F1 Score: 0.4838709677419355
Complaint_2:
	Hamming Loss: 0.09375
	Accuracy: 0.90625
	Micro-F1 Score: 0.90625
	Macro-F1 Score: 0.6
Complaint_3:
	Hamming Loss: 0.0
	Accuracy: 1.0
	Micro-F1 Score: 1.0
	Macro-F1 Score: 1.0
Complaint_4:
	Hamming Loss: 0.265625
	Accuracy: 0.734375
	Micro-F1 Score: 