<a href="https://colab.research.google.com/github/suhanasuffer/CV_Lesion/blob/main/CV_Proj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading dataset and libraries

In [None]:
from google.colab import drive
import os, zipfile, shutil

# STEP 1: Mount Drive
drive.mount('/content/drive')

# STEP 2: Set paths
ZIP_PATH = "/content/drive/My Drive/ZippedBackups/Processed_Kvasir_labeled_images.zip"
EXTRACTED_DIR = "/content/drive/My Drive/ZippedBackups/Kvasir_raw"
MERGED_OUTPUT_DIR = "/content/drive/My Drive/ZippedBackups/Kvasir_merged"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import shutil

# ==== CONFIG ====
source_root = "/content/Drive/My Drive/ZippedBackups/Processes_Kvasir_labeled_images"     # Root folder with class subfolders
target_folder = "/contetn/Drive/My Drive/ZippedBackups/all_frames"         # Flat output folder
os.makedirs(target_folder, exist_ok=True)

In [None]:
import pandas as pd
import torch
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os

In [None]:
# --- Configuration ---
METADATA_PATH = '/content/drive/MyDrive/CV_Backup/metadata.csv'
IMAGE_BASE_PATH = '/content/drive/My Drive/ZippedBackups/Kvasir_merged' # Base path where images are stored
FEATURE_EXTRACTOR_PATH = '/content/drive/MyDrive/CV_Backup/feature_extractor_densenet121.pth'
IMG_SIZE = 224
BATCH_SIZE = 32
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# --- Load Metadata ---
import pandas as pd
try:
    df_meta = pd.read_csv(METADATA_PATH, delimiter=';')
    print(f"Successfully loaded metadata from {METADATA_PATH}")
    print("Metadata columns:", df_meta.columns.tolist())
    print("First 5 rows:\n", df_meta.head())
except FileNotFoundError:
    print(f"Error: Metadata file not found at {METADATA_PATH}")
    # Exit or raise error if metadata is essential
    exit()
except Exception as e:
    print(f"Error loading metadata: {e}")
    exit()

Successfully loaded metadata from /content/drive/MyDrive/CV_Backup/metadata.csv
Metadata columns: ['filename', 'video_id', 'frame_number', 'finding_category', 'finding_class', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4']
First 5 rows:
                      filename          video_id  frame_number  \
0  0728084c8da942d9_22803.jpg  0728084c8da942d9         22803   
1  0728084c8da942d9_22804.jpg  0728084c8da942d9         22804   
2  0728084c8da942d9_22805.jpg  0728084c8da942d9         22805   
3  0728084c8da942d9_22806.jpg  0728084c8da942d9         22806   
4  0728084c8da942d9_22807.jpg  0728084c8da942d9         22807   

  finding_category        finding_class  x1  y1  x2  y2  x3  y3  x4  y4  
0          Luminal  Normal clean mucosa NaN NaN NaN NaN NaN NaN NaN NaN  
1          Luminal  Normal clean mucosa NaN NaN NaN NaN NaN NaN NaN NaN  
2          Luminal  Normal clean mucosa NaN NaN NaN NaN NaN NaN NaN NaN  
3          Luminal  Normal clean mucosa NaN NaN NaN NaN NaN NaN NaN NaN  


In [None]:
df_meta = df_meta.drop(columns=['x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4'])
df_meta.head()

Unnamed: 0,filename,video_id,frame_number,finding_category,finding_class
0,0728084c8da942d9_22803.jpg,0728084c8da942d9,22803,Luminal,Normal clean mucosa
1,0728084c8da942d9_22804.jpg,0728084c8da942d9,22804,Luminal,Normal clean mucosa
2,0728084c8da942d9_22805.jpg,0728084c8da942d9,22805,Luminal,Normal clean mucosa
3,0728084c8da942d9_22806.jpg,0728084c8da942d9,22806,Luminal,Normal clean mucosa
4,0728084c8da942d9_22807.jpg,0728084c8da942d9,22807,Luminal,Normal clean mucosa


In [None]:
# --- Define Image Transformations ---
preprocess = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
])

In [None]:
import numpy as np
# --- Inspect Labels Before Mapping ---
print("\nInspecting unique labels found in metadata:")
label_column_name = 'finding_class'

if label_column_name not in df_meta.columns:
    print(f"Error: Label column '{label_column_name}' not found in metadata!")
    print(f"Available columns: {df_meta.columns.tolist()}")
    exit()

unique_labels = df_meta[label_column_name].unique()
print(f"Unique values in '{label_column_name}': {np.sort(unique_labels)}")


Inspecting unique labels found in metadata:
Unique values in 'finding_class': ['Ampulla of Vater' 'Angiectasia' 'Blood - fresh' 'Blood - hematin'
 'Erosion' 'Erythema' 'Foreign Body' 'Ileocecal valve' 'Lymphangiectasia'
 'Normal clean mucosa' 'Polyp' 'Pylorus' 'Reduced Mucosal View' 'Ulcer']


In [None]:
# --- Define the Label Mapping ---

LABEL_MAP = {
    'Normal clean mucosa': 0,
    'Ampulla of Vater': 1,
    'Angiectasia': 2,
    'Blood - fresh': 3,
    'Blood - hematin': 3,
    'Erosion': 4,
    'Erythema': 5,
    'Foreign Body':6,
    'Ileocecal valve':7,
    'Lymphangiectasia': 8,
    'Polyp':9,
    'Pylorus':10,
    'Reduced Mucosal View': 11,
    'Ulcer':12

}
NUM_CLASSES = len(LABEL_MAP)
print(f"\nDefined LABEL_MAP: {LABEL_MAP}")
print(f"Number of classes: {NUM_CLASSES}")


Defined LABEL_MAP: {'Normal clean mucosa': 0, 'Ampulla of Vater': 1, 'Angiectasia': 2, 'Blood - fresh': 3, 'Blood - hematin': 3, 'Erosion': 4, 'Erythema': 5, 'Foreign Body': 6, 'Ileocecal valve': 7, 'Lymphangiectasia': 8, 'Polyp': 9, 'Pylorus': 10, 'Reduced Mucosal View': 11, 'Ulcer': 12}
Number of classes: 14


In [None]:
# --- Apply Label Mapping ---
df_meta['label_idx'] = df_meta[label_column_name].map(LABEL_MAP)

# --- Check for Unmapped Labels ---
unmapped_count = df_meta['label_idx'].isnull().sum()
if unmapped_count > 0:
    print(f"\nWarning: {unmapped_count} rows have labels not found in LABEL_MAP.")
    unmapped_values = df_meta[df_meta['label_idx'].isnull()][label_column_name].unique()
    print(f"Labels without mapping: {unmapped_values}")
    print("Please update LABEL_MAP or handle these rows.")
else:
    print("\nAll labels successfully mapped.")


All labels successfully mapped.


In [None]:
# Convert label_idx to integer type
df_meta['label_idx'] = df_meta['label_idx'].astype(int)

# --- Ensure correct data types for sorting ---
df_meta['frame_number'] = df_meta['frame_number'].astype(int)

# --- Sort by video and frame order ---
df_meta = df_meta.sort_values(by=['video_id', 'frame_number']).reset_index(drop=True)
print("\nMetadata sorted by video_id and frame_number.")


Metadata sorted by video_id and frame_number.


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# ---Custom PyTorch Dataset ---
class KvasirFrameDataset(Dataset):
    def __init__(self, dataframe, image_base_path, transform=None):
        self.dataframe = dataframe
        self.transform = transform
        self.image_base_path = image_base_path
        # --- Pre-check base path existence ---
        if not os.path.isdir(self.image_base_path):
             print(f"ERROR: IMAGE_BASE_PATH '{self.image_base_path}' does not exist or is not a directory.")

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        row = self.dataframe.iloc[idx]

        img_filename = row['filename']
        img_path = os.path.join(self.image_base_path, img_filename)

        try:
            image = Image.open(img_path).convert('RGB')
        except FileNotFoundError:
            print(f"Warning: Image not found at {img_path}. Returning black image.")
            image = Image.new('RGB', (IMG_SIZE, IMG_SIZE), color='black')
        except Exception as e:
             print(f"Warning: Error loading {img_path}: {e}. Returning black image.")
             image = Image.new('RGB', (IMG_SIZE, IMG_SIZE), color='black')

        label = row['label_idx']
        video_id = row['video_id']
        frame_number = row['frame_number']

        if self.transform:
            image = self.transform(image)

        return image, label, video_id, frame_number

In [None]:
# Instantiate the dataset
full_dataset = KvasirFrameDataset(df_meta, IMAGE_BASE_PATH, transform=preprocess)
# Create a DataLoader (shuffle=False crucial for sequential processing)
frame_dataloader = DataLoader(full_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

print(f"\nDataset and DataLoader created. Number of frames: {len(full_dataset)}")


Dataset and DataLoader created. Number of frames: 47248




# Feature extraction

In [None]:
import torch.nn as nn
import torchvision.models as models

def load_feature_extractor(model_path, device):
    model = models.densenet121(weights=None)

    num_ftrs = model.classifier.in_features #no. of features

    model.classifier = nn.Identity() # Output features directly, no classification layer

    try:
         model.load_state_dict(torch.load(model_path, map_location=device))
         print("Successfully loaded model state_dict.")
    except RuntimeError:
         print("Loading full state_dict failed. Trying to load with strict=False.")
         try:
            state_dict = torch.load(model_path, map_location=device)
            model.load_state_dict(state_dict, strict=False)
            print("Loaded state_dict with strict=False. Check for missing/unexpected keys.")
         except Exception as e:
            print(f"Error loading state_dict even with strict=False: {e}")
            print("Could not load feature extractor weights. Please check the .pth file and model architecture.")
            return None

    model = model.to(device)
    model.eval()
    print(f"Feature extractor loaded on {device}. Output feature dimension: {num_ftrs}")
    return model, num_ftrs

In [None]:
feature_extractor, feature_dim = load_feature_extractor(FEATURE_EXTRACTOR_PATH, DEVICE)

if feature_extractor is None:
     raise ValueError("Failed to load feature extractor model.")

Successfully loaded model state_dict.
Feature extractor loaded on cpu. Output feature dimension: 1024


In [17]:
import numpy as np
from collections import defaultdict

video_features = defaultdict(lambda: {'features': [], 'labels': [], 'frame_ids': []})

print("Starting feature extraction...")
with torch.no_grad():
    for batch_idx, (images, labels, video_ids, frame_ids) in enumerate(frame_dataloader):
        images = images.to(DEVICE)
        features = feature_extractor(images)

        # Move features to CPU and store them per video
        features_cpu = features.cpu().numpy()
        labels_cpu = labels.numpy()
        video_ids_cpu = video_ids.numpy() if not isinstance(video_ids[0], str) else video_ids
        frame_ids_cpu = frame_ids.numpy()

        for i in range(len(images)):
            v_id = video_ids_cpu[i]
            f_id = frame_ids_cpu[i]
            lbl = labels_cpu[i]
            feat = features_cpu[i]

            video_features[v_id]['features'].append(feat)
            video_features[v_id]['labels'].append(lbl)
            video_features[v_id]['frame_ids'].append(f_id)

        if (batch_idx + 1) % 50 == 0:
             print(f"Processed batch {batch_idx + 1}/{len(frame_dataloader)}")

print("Feature extraction complete.")

Starting feature extraction...

Processed batch 50/1477
Processed batch 100/1477
Processed batch 150/1477
Processed batch 200/1477
Processed batch 250/1477
Processed batch 300/1477
Processed batch 350/1477
Processed batch 400/1477
Processed batch 450/1477
Processed batch 500/1477
Processed batch 550/1477
Processed batch 600/1477
Processed batch 650/1477
Processed batch 700/1477
Processed batch 750/1477
Processed batch 800/1477
Processed batch 850/1477
Processed batch 900/1477
Processed batch 950/1477
Processed batch 1000/1477
Processed batch 1050/1477
Processed batch 1100/1477
Processed batch 1150/1477
Processed batch 1200/1477
Processed batch 1250/1477
Processed batch 1300/1477
Processed batch 1350/1477
Processed batch 1400/1477
Processed batch 1450/1477
Feature extraction complete.


In [28]:
# Convert lists of features/labels/frame_ids to numpy arrays for easier handling
for v_id in video_features:
    video_features[v_id]['features'] = np.array(video_features[v_id]['features'])
    video_features[v_id]['labels'] = np.array(video_features[v_id]['labels'])
    video_features[v_id]['frame_ids'] = np.array(video_features[v_id]['frame_ids'])
    # Ensure sorting just in case dataloader ordering wasn't perfect (though shuffle=False should guarantee it)
    sort_indices = np.argsort(video_features[v_id]['frame_ids'])
    video_features[v_id]['features'] = video_features[v_id]['features'][sort_indices]
    video_features[v_id]['labels'] = video_features[v_id]['labels'][sort_indices]
    video_features[v_id]['frame_ids'] = video_features[v_id]['frame_ids'][sort_indices]

# Selective Frame Sampling

In [20]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import numpy as np

def selective_frame_sampling(features, frame_ids, method='cosine', threshold=0.95):

    if len(features) <= 1:
        return np.arange(len(features)), features, frame_ids

    selected_indices = [0]
    selected_features_list = [features[0]]
    selected_frame_ids_list = [frame_ids[0]]
    last_selected_feature = features[0]

    for i in range(1, len(features)):
        current_feature = features[i]
        keep_frame = False

        if method == 'cosine':
            sim = cosine_similarity(last_selected_feature.reshape(1, -1), current_feature.reshape(1, -1))[0, 0]
            if sim < threshold:
                keep_frame = True
        elif method == 'euclidean':
            dist = euclidean_distances(last_selected_feature.reshape(1, -1), current_feature.reshape(1, -1))[0, 0]
            if dist > threshold:
                keep_frame = True
        else:
            raise ValueError("Method must be 'cosine' or 'euclidean'")

        if keep_frame:
            selected_indices.append(i)
            selected_features_list.append(current_feature)
            selected_frame_ids_list.append(frame_ids[i])
            last_selected_feature = current_feature

    selected_features = np.array(selected_features_list)
    selected_frame_ids = np.array(selected_frame_ids_list)

    return np.array(selected_indices), selected_features, selected_frame_ids

In [22]:
# --- Example Usage ---
video_id_to_process = list(video_features.keys())[0] # Get the first video ID
original_feats = video_features[video_id_to_process]['features']
original_fids = video_features[video_id_to_process]['frame_ids']

print(f"Video {video_id_to_process}: Original frames = {len(original_feats)}")

# # Sample using Cosine Similarity (keep if similarity < 0.95)
sel_indices_cos, sel_feats_cos, sel_fids_cos = selective_frame_sampling(
    original_feats, original_fids, method='cosine', threshold=0.95
)
print(f"Cosine Sampling (thresh=0.98): Selected frames = {len(sel_feats_cos)}")

# # Sample using Euclidean Distance (keep if distance > 5.0 - threshold depends heavily on feature scale)
sel_indices_euc, sel_feats_euc, sel_fids_euc = selective_frame_sampling(
    original_feats, original_fids, method='euclidean', threshold=5.0
)
print(f"Euclidean Sampling (thresh=5.0): Selected frames = {len(sel_feats_euc)}")

Video 04a78ef00c5245e0: Original frames = 1292
Cosine Sampling (thresh=0.98): Selected frames = 611
Euclidean Sampling (thresh=5.0): Selected frames = 1150


# LSTM model definition

In [23]:
import torch.nn as nn

class LesionLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, num_classes, dropout=0.5):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers,
                            batch_first=True, dropout=dropout if num_layers > 1 else 0,
                            bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)

        # Apply dropout
        out = self.dropout(lstm_out)
        out = self.fc(out.reshape(-1, out.shape[2]))
        out = out.view(x.size(0), x.size(1), -1)

        return out

In [24]:
# --- Hyperparameters for LSTM ---
HIDDEN_DIM = 256
LSTM_LAYERS = 2
DROPOUT = 0.5

In [25]:
# --- Instantiate the temporal model ---
temporal_model = LesionLSTM(feature_dim, HIDDEN_DIM, LSTM_LAYERS, NUM_CLASSES, DROPOUT).to(DEVICE)

print(temporal_model)

LesionLSTM(
  (lstm): LSTM(1024, 256, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=14, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


# Dataset

In [26]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class VideoFeatureDataset(Dataset):
    def __init__(self, video_feature_dict, use_sampling=False, sampling_params=None):
        self.video_ids = list(video_feature_dict.keys())
        self.video_data = video_feature_dict
        self.use_sampling = use_sampling
        self.sampling_params = sampling_params if sampling_params else {}

    def __len__(self):
        return len(self.video_ids)

    def __getitem__(self, idx):
        video_id = self.video_ids[idx]
        data = self.video_data[video_id]
        features = data['features']
        labels = data['labels']
        frame_ids = data['frame_ids']

        if self.use_sampling:
            _, sampled_features, sampled_frame_ids = selective_frame_sampling(
                features, frame_ids, **self.sampling_params
            )
            frame_id_to_label = {fid: lbl for fid, lbl in zip(frame_ids, labels)}
            sampled_labels = np.array([frame_id_to_label[fid] for fid in sampled_frame_ids])

            features = sampled_features
            labels = sampled_labels
            frame_ids = sampled_frame_ids

        # Convert numpy arrays to tensors
        features_tensor = torch.tensor(features, dtype=torch.float32)
        labels_tensor = torch.tensor(labels, dtype=torch.long)
        frame_ids_tensor = torch.tensor(frame_ids, dtype=torch.long)

        return features_tensor, labels_tensor, frame_ids_tensor, video_id

In [29]:
# ... video_features now holds numpy arrays ...

print("\nChecking for empty video sequences...")
feature_dim = 1024
non_empty_video_features = {}
empty_video_ids = []
total_videos = len(video_features)

for v_id, data in video_features.items():
    features_array = data['features']
    # Check if the array is actually empty or has zero frames
    if features_array.shape[0] > 0:
         # Optional but good: Check if feature dimension is correct
         if len(features_array.shape) == 2 and features_array.shape[1] == feature_dim:
              non_empty_video_features[v_id] = data
         else:
              print(f"WARNING: Video {v_id} has unexpected feature shape {features_array.shape}. Skipping.")
              empty_video_ids.append(v_id)
    else:
        print(f"WARNING: Video {v_id} has 0 frames/features after extraction. Skipping.")
        empty_video_ids.append(v_id)

print(f"Removed {len(empty_video_ids)} videos with empty or malformed features out of {total_videos}.")
print(f"Proceeding with {len(non_empty_video_features)} videos.")


Checking for empty video sequences...
Removed 1 videos with empty or malformed features out of 44.
Proceeding with 43 videos.


# Collate function


In [30]:
# --- Collate function for padding sequences ---
def collate_fn(batch):
    # batch is a list of tuples: [(feats1, labels1, fids1, vid1), (feats2, labels2, fids2, vid2), ...]
    features_list, labels_list, frame_ids_list, video_ids = zip(*batch)

    # Pad sequences: batch_first=True means output shape (batch_size, max_seq_len, feature_dim)
    features_padded = pad_sequence(features_list, batch_first=True, padding_value=0.0)
    # Pad labels: Use a value like -100 for padding for CrossEntropyLoss ignore_index
    labels_padded = pad_sequence(labels_list, batch_first=True, padding_value=-100)
    # Pad frame_ids (optional, but useful for tracking)
    frame_ids_padded = pad_sequence(frame_ids_list, batch_first=True, padding_value=-1) # Use -1 for padding ID

    # Also return sequence lengths for potential use with pack_padded_sequence (optimizes RNN computation)
    lengths = torch.tensor([len(f) for f in features_list])

    return features_padded, labels_padded, frame_ids_padded, lengths, video_ids

# Train epoch definition

In [71]:
import torch.optim as optim

# --- Training Hyperparameters ---
LEARNING_RATE = 0.001
NUM_EPOCHS = 20
criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = optim.Adam(temporal_model.parameters(), lr=LEARNING_RATE)

In [72]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    total_samples = 0

    for features, labels, _, lengths, _ in dataloader: # Ignore frame_ids, video_ids for loss calc
        features, labels = features.to(device), labels.to(device)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(features) # Output: (batch, seq_len, num_classes)

        # Reshape for CrossEntropyLoss: (batch * seq_len, num_classes)
        # Labels: (batch, seq_len) -> (batch * seq_len)
        loss = criterion(outputs.view(-1, NUM_CLASSES), labels.view(-1))

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * features.size(0) # Loss per batch * batch size
        total_samples += features.size(0) # Count number of videos processed

    epoch_loss = running_loss / total_samples
    return epoch_loss

# Evaluation definiton

In [73]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix
import time

def evaluate_model(model, dataloader, device, num_classes, frame_rate=30):
    model.eval()
    all_preds = []
    all_labels = []
    all_video_ids = []
    all_frame_ids = []
    all_probs = []

    # Store results per video for easier analysis and timestamp generation
    results_per_video = defaultdict(lambda: {'preds': [], 'labels': [], 'frame_ids': [], 'probs': []})


    with torch.no_grad():
        for features, labels, frame_ids_padded, lengths, video_ids_batch in dataloader:
            features = features.to(device)
            outputs = model(features) # (batch, seq_len, num_classes)
            probs = torch.softmax(outputs, dim=2) # Get probabilities
            preds = torch.argmax(probs, dim=2)    # Get predicted class index

            # Move results to CPU for processing
            preds_cpu = preds.cpu().numpy()
            probs_cpu = probs.cpu().numpy()
            labels_cpu = labels.numpy()
            frame_ids_cpu = frame_ids_padded.numpy()

            # Process results sequence by sequence in the batch
            for i in range(features.size(0)): # Iterate through batch items
                seq_len = lengths[i].item() # Get original sequence length before padding
                video_id = video_ids_batch[i]

                # Get valid (non-padded) predictions, labels, frame_ids, probs for this sequence
                valid_preds = preds_cpu[i, :seq_len]
                valid_labels = labels_cpu[i, :seq_len]
                valid_frame_ids = frame_ids_cpu[i, :seq_len]
                valid_probs = probs_cpu[i, :seq_len, :]

                # Append to overall lists (for flat metrics)
                all_preds.extend(valid_preds)
                all_labels.extend(valid_labels)
                all_frame_ids.extend(valid_frame_ids) # Keep track of frame_id for each prediction
                all_video_ids.extend([video_id] * seq_len) # Keep track of video_id
                all_probs.extend(valid_probs)

                # Store results grouped by video
                results_per_video[video_id]['preds'].extend(valid_preds)
                results_per_video[video_id]['labels'].extend(valid_labels)
                results_per_video[video_id]['frame_ids'].extend(valid_frame_ids)
                results_per_video[video_id]['probs'].extend(valid_probs)


    # --- Calculate Frame-Level Metrics ---
    valid_indices = [i for i, lbl in enumerate(all_labels) if lbl >= 0]
    all_preds = np.array(all_preds)[valid_indices]
    all_labels = np.array(all_labels)[valid_indices]
    all_probs = np.array(all_probs)[valid_indices]
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted', zero_division=0)
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(all_labels, all_preds, average='micro', zero_division=0)

    try:
        roc_auc = roc_auc_score(all_labels, all_probs, multi_class='ovr', average='weighted')
    except ValueError as e:
         print(f"Could not calculate ROC AUC: {e}. Check if all classes are present in predictions/labels.")
         roc_auc = float('nan')

    print("\n--- Frame-Level Metrics ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Weighted Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")
    print(f"Micro Precision: {precision_micro:.4f}, Recall: {recall_micro:.4f}, F1-Score: {f1_micro:.4f}")
    print(f"Weighted ROC AUC: {roc_auc:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(all_labels, all_preds))

    # --- Identify Lesion Timestamps ---
    detected_lesions = [] # List of tuples: (video_id, frame_id, timestamp_sec, predicted_class_index)
    inv_label_map = {v: k for k, v in LABEL_MAP.items()}

    for video_id, data in results_per_video.items():
        frame_ids = data['frame_ids']
        preds = data['preds']
        for i in range(len(preds)):
            pred_class_idx = preds[i]
            # Check if the prediction is NOT the 'negative' class (assuming index 0)
            if pred_class_idx != LABEL_MAP['Normal clean mucosa']:
                frame_id = frame_ids[i]
                timestamp_sec = frame_id / frame_rate
                predicted_class_name = inv_label_map.get(pred_class_idx, 'Unknown')
                detected_lesions.append({
                    'video_id': video_id,
                    'frame_id': frame_id,
                    'timestamp_sec': timestamp_sec,
                    'predicted_class_idx': pred_class_idx,
                    'predicted_class_name': predicted_class_name
                })

    print(f"\n--- Detected Lesions ---")
    for detection in detected_lesions[:15]:
         print(f"  Video: {detection['video_id']}, Frame: {detection['frame_id']}, "
               f"Time: {detection['timestamp_sec']:.2f}s, Class: {detection['predicted_class_name']}")
    if not detected_lesions:
         print("  No lesions detected (or 'negative' class ID is incorrect).")

    return detected_lesions, results_per_video

RRR

In [74]:
def calculate_rrr(total_original_frames, total_sampled_frames):
     if total_original_frames == 0:
         return 0.0
     return 1.0 - (total_sampled_frames / total_original_frames)

LDR

In [75]:
from collections import defaultdict

negative_label_index = LABEL_MAP.get('Normal clean mucosa')

# Mark a video as True (positive) if at least one of its frames has a lesion label
ground_truth_video_labels = defaultdict(bool)
for _, row in df_meta.iterrows():
    if row['label_idx'] != negative_label_index:
        ground_truth_video_labels[row['video_id']] = True

ground_truth_video_labels = dict(ground_truth_video_labels)


In [76]:
import numpy as np
from collections import defaultdict # Make sure it's imported

def calculate_ldr(results_per_video, ground_truth_video_labels, label_map):
    gt_positive_videos = {vid for vid, has_lesion in ground_truth_video_labels.items() if has_lesion}
    if not gt_positive_videos:
        print("Warning: No ground truth positive videos found for LDR calculation.")
        return 0.0

    detected_gt_positive_videos = 0
    negative_class_index = label_map.get('Normal clean mucosa', label_map.get('Normal clean mucosa', 0))

    for video_id in gt_positive_videos:
        if video_id in results_per_video:
            # Check if any frame in this GT positive video was predicted as a lesion
            predicted_classes = results_per_video[video_id]['preds']
            if any(pred_class != negative_class_index for pred_class in predicted_classes):
                detected_gt_positive_videos += 1

    ldr = detected_gt_positive_videos / len(gt_positive_videos)
    return ldr

# Visualizations definition

In [97]:
import cv2
import matplotlib.pyplot as plt
import pandas as pd
import os

# --- Helper function to get original frame ---
def get_original_frame(video_id, frame_number, base_path, metadata_df):
    row = metadata_df[(metadata_df['video_id'] == video_id) & (metadata_df['frame_number'] == frame_number)]
    if row.empty:
        print(f"Warning: Could not find metadata for {video_id}, frame {frame_number}")
        return None

    img_filename = row.iloc[0]['filename']
    video_id_str = str(video_id)
    img_path = os.path.join(base_path, img_filename)

    frame = cv2.imread(img_path)
    if frame is None:
        print(f"Warning: Failed to read image at {img_path}")
    return frame

In [98]:
# --- Function to draw predictions on a frame ---
def draw_prediction_on_frame(frame, frame_number, timestamp_sec, pred_class_idx, pred_class_name, is_lesion):
    if frame is None:
        return None

    # Text settings
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.6
    thickness = 1
    text_color = (0, 0, 255) if is_lesion else (0, 255, 0) # Red for lesion, Green for normal
    bg_color = (255, 255, 255)

    # Timestamp text
    ts_text = f"Frame: {frame_number} | Time: {timestamp_sec:.2f}s"
    (ts_w, ts_h), _ = cv2.getTextSize(ts_text, font, font_scale, thickness)
    cv2.rectangle(frame, (5, 5), (5 + ts_w, 5 + ts_h + 5), bg_color, -1)
    cv2.putText(frame, ts_text, (5, 5 + ts_h), font, font_scale, (0,0,0), thickness, cv2.LINE_AA)

    # Prediction text
    pred_text = f"Pred: {pred_class_name} ({pred_class_idx})"
    (pred_w, pred_h), _ = cv2.getTextSize(pred_text, font, font_scale, thickness)
    cv2.rectangle(frame, (5, 15 + ts_h), (5 + pred_w, 15 + ts_h + pred_h + 5), bg_color, -1)
    cv2.putText(frame, pred_text, (5, 15 + ts_h + pred_h), font, font_scale, text_color, thickness, cv2.LINE_AA)

    return frame

In [79]:
def visualize_detections(detected_lesions_list, base_image_path, metadata_df, output_dir="visualization", frame_rate=30):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print(f"\nGenerating visualizations for {len(detected_lesions_list)} detected lesion frames...")

    DISPLAY_NAME_MAP = {
         0: 'Normal clean mucosa',
         1: 'Ampulla of Vater',
         2: 'Angiectasia',
         3: 'Bleeding',
         4: 'Bleeding',
         5: 'Erosion' ,
         6: 'Erythema',
         7: 'Foreign Body',
         8: 'Ileocecal valve',
         9: 'Lymphangiectasia',
         10: 'Polyp',
         11: 'Pylorus',
         12: 'Reduced Mucosal View',
         13: 'Ulcer'

    }
    count = 0
    for detection in detected_lesions_list:
        video_id = detection['video_id']
        frame_number = detection['frame_id']
        pred_class_idx = detection['predicted_class_idx']
        timestamp_sec = detection['timestamp_sec']

        pred_class_name = DISPLAY_NAME_MAP.get(pred_class_idx, f"Unknown ({pred_class_idx})")

        original_frame = get_original_frame(video_id, frame_number, base_image_path, metadata_df)
        if original_frame is not None:
            annotated_frame = draw_prediction_on_frame(
                original_frame.copy(),
                frame_number,
                timestamp_sec,
                pred_class_idx,
                pred_class_name,
                is_lesion=True )

            # Save the annotated frame
            output_filename = f"detection_{video_id}_frame_{frame_number}_{pred_class_name}.jpg"
            output_path = os.path.join(output_dir, output_filename)
            cv2.imwrite(output_path, annotated_frame)
            count += 1
            if count % 50 == 0: # Print progress
                 print(f"  Saved {count} annotated frames...")

    print(f"Finished saving {count} annotated lesion frames to '{output_dir}'.")

In [99]:
# --- Generate Summary Report ---
def create_summary_report(detected_lesions_list, output_csv="detection_summary_cv.csv"):
    if not detected_lesions_list:
        print("No lesions detected to create a summary report.")
        return

    # Use the same display name mapping as in visualization
    DISPLAY_NAME_MAP = {
         0: 'Normal clean mucosa', 1: 'Ampulla of Vater', 2: 'Angiectasia', 3: 'Bleeding', 4: 'Bleeding', 5: 'Erosion' , 6: 'Erythema',
         7: 'Foreign Body', 8: 'Ileocecal valve', 9: 'Lymphangiectasia', 10: 'Polyp', 11: 'Pylorus', 12: 'Reduced Mucosal View', 13: 'Ulcer'
    }

    report_data = []
    for detection in detected_lesions_list:
         report_entry = detection.copy() # Start with existing data
         pred_idx = report_entry['predicted_class_idx']
         report_entry['predicted_class_name'] = DISPLAY_NAME_MAP.get(pred_idx, f"Unknown ({pred_idx})")
         report_data.append(report_entry)


    df_report = pd.DataFrame(report_data)
    df_report = df_report[['video_id', 'frame_id', 'timestamp_sec', 'predicted_class_idx', 'predicted_class_name']]
    df_report.to_csv(output_csv, index=False)
    print(f"Saved detection summary report to '{output_csv}'.")

# Dataset and DataLoader

# Without Sampling

In [81]:
# Dataset without sampling
temporal_dataset_no_sampling = VideoFeatureDataset(non_empty_video_features, use_sampling=False)

In [82]:
# DataLoaders
# Adjust batch size for temporal model based on sequence lengths and memory
TEMPORAL_BATCH_SIZE = 8
dataloader_no_sampling = DataLoader(
    temporal_dataset_no_sampling,
    batch_size=TEMPORAL_BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn
)

print(f"Temporal Dataloader (No Sampling): {len(dataloader_no_sampling)} batches")

Temporal Dataloader (No Sampling): 6 batches


In [83]:
# --- Training Loop ---
print("Starting Temporal Model Training...")

train_dataloader = dataloader_no_sampling

for epoch in range(NUM_EPOCHS):
    epoch_loss = train_epoch(temporal_model, train_dataloader, criterion, optimizer, DEVICE)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {epoch_loss:.4f}")

print("Training finished.")

# Save the trained temporal model
torch.save(temporal_model.state_dict(), 'temporal_model_trained_wo_sampling.pth')

Starting Temporal Model Training...
Epoch 1/20, Loss: 0.7590
Epoch 2/20, Loss: 0.6740
Epoch 3/20, Loss: 0.5580
Epoch 4/20, Loss: 0.5526
Epoch 5/20, Loss: 0.5072
Epoch 6/20, Loss: 0.4610
Epoch 7/20, Loss: 0.4586
Epoch 8/20, Loss: 0.5301
Epoch 9/20, Loss: 0.5010
Epoch 10/20, Loss: 0.4216
Epoch 11/20, Loss: 0.4049
Epoch 12/20, Loss: 0.4230
Epoch 13/20, Loss: 0.3652
Epoch 14/20, Loss: 0.3808
Epoch 15/20, Loss: 0.3641
Epoch 16/20, Loss: 0.2978
Epoch 17/20, Loss: 0.3266
Epoch 18/20, Loss: 0.3769
Epoch 19/20, Loss: 0.3308
Epoch 20/20, Loss: 0.3026
Training finished.


In [55]:
from google.colab import files
files.download('temporal_model_trained_wo_sampling.pth')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [87]:
# --- Run Evaluation ---
print("Evaluating without Selective Sampling")

# Need a dataloader with shuffle=False for consistent evaluation order if needed
eval_dataloader_no_sampling = DataLoader(
    temporal_dataset_no_sampling, batch_size=TEMPORAL_BATCH_SIZE, shuffle=False, collate_fn=collate_fn
)
detections_no_sampling, results_per_video_no_sampling = evaluate_model(temporal_model, eval_dataloader_no_sampling, DEVICE, NUM_CLASSES)

Evaluating without Selective Sampling
Could not calculate ROC AUC: Number of classes in y_true not equal to the number of columns in 'y_score'. Check if all classes are present in predictions/labels.

--- Frame-Level Metrics ---
Accuracy: 0.9303
Weighted Precision: 0.9169, Recall: 0.9303, F1-Score: 0.9188
Micro Precision: 0.9303, Recall: 0.9303, F1-Score: 0.9303
Weighted ROC AUC: nan
Confusion Matrix:
 [[34168     0     0     0     0     0     0    94     0     0    68     8
      0]
 [    0     0     0     0     0     0     0     0     0     0    10     0
      0]
 [  123     0   582     0     0     0    19    41     0     0    90     0
     11]
 [   17     0     8   337     0     0    30    27     0     0    39     0
      0]
 [  312     0     0     0     0     0     0    84     0     0    98    13
      0]
 [  146     0     0     0     0     0     0     9     0     0     0     4
      0]
 [  213     0     0     0     0     0   502    37     0     0    19     0
      5]
 [  190     0

# With Sampling

Cosine

In [85]:
# Dataset with sampling (using cosine similarity)
sampling_config = {'method': 'cosine', 'threshold': 0.95}
temporal_dataset_with_sampling = VideoFeatureDataset(non_empty_video_features, use_sampling=True, sampling_params=sampling_config)

In [49]:
dataloader_with_sampling = DataLoader(
    temporal_dataset_with_sampling,
    batch_size=TEMPORAL_BATCH_SIZE,
    shuffle=True, # Shuffle videos for training
    collate_fn=collate_fn
)
print(f"Temporal Dataloader (With Sampling): {len(dataloader_with_sampling)} batches")

Temporal Dataloader (With Sampling): 6 batches


In [50]:
# --- Training Loop ---
print("Starting Temporal Model Training (with sampling)...")
train_dataloader = dataloader_with_sampling

for epoch in range(NUM_EPOCHS):
    epoch_loss = train_epoch(temporal_model, train_dataloader, criterion, optimizer, DEVICE)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {epoch_loss:.4f}")
print("Training finished.")

Starting Temporal Model Training (with sampling)...
Epoch 1/20, Loss: 0.8831
Epoch 2/20, Loss: 0.9144
Epoch 3/20, Loss: 0.8462
Epoch 4/20, Loss: 0.8274
Epoch 5/20, Loss: 0.8674
Epoch 6/20, Loss: 0.8523
Epoch 7/20, Loss: 0.8589
Epoch 8/20, Loss: 0.6967
Epoch 9/20, Loss: 0.6772
Epoch 10/20, Loss: 0.6216
Epoch 11/20, Loss: 0.7294
Epoch 12/20, Loss: 0.7785
Epoch 13/20, Loss: 0.8091
Epoch 14/20, Loss: 0.7051
Epoch 15/20, Loss: 0.6549
Epoch 16/20, Loss: 0.6808
Epoch 17/20, Loss: 0.7135
Epoch 18/20, Loss: 0.6325
Epoch 19/20, Loss: 0.6531
Epoch 20/20, Loss: 0.6555
Training finished.


In [57]:
torch.save(temporal_model.state_dict(), 'temporal_model_cosine_sampling95.pth')

In [58]:
from google.colab import files
files.download('temporal_model_cosine_sampling95.pth')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [88]:

print(f"Evaluating with Selective Sampling ({sampling_config['method']}, thresh={sampling_config['threshold']})")
eval_dataloader_with_sampling = DataLoader(
    temporal_dataset_with_sampling, batch_size=TEMPORAL_BATCH_SIZE, shuffle=False, collate_fn=collate_fn
)
detections_with_sampling, results_per_video_with_sampling = evaluate_model(temporal_model, eval_dataloader_with_sampling, DEVICE, NUM_CLASSES)

Evaluating with Selective Sampling (cosine, thresh=0.95)
Could not calculate ROC AUC: Number of classes in y_true not equal to the number of columns in 'y_score'. Check if all classes are present in predictions/labels.

--- Frame-Level Metrics ---
Accuracy: 0.8985
Weighted Precision: 0.8926, Recall: 0.8985, F1-Score: 0.8877
Micro Precision: 0.8985, Recall: 0.8985, F1-Score: 0.8985
Weighted ROC AUC: nan
Confusion Matrix:
 [[11711     0     0     0     0     0     0   108     0     0   244    15
      0]
 [    0     0     0     0     0     0     0     0     0     0    10     0
      0]
 [   38     0   133     2     0     0     0    37     3     0    49     0
      7]
 [    0     0     0   130     0     0     0    22     0     0    26     0
      0]
 [  116     0     0     0     0     0     0    54     0     0    43     0
      4]
 [   42     0     0     0     0     0     0     0    13     0     0     4
      0]
 [   27     0     0     0     0     0   331    38     0     0    15     0
   

# RRR

In [89]:
for vid, data in video_features.items():
    features = data['features']
    frame_ids = data['frame_ids']

    _, sampled_features, sampled_frame_ids = selective_frame_sampling(
        features, frame_ids,
        method='cosine', threshold=0.95
    )
    print(f"Video {vid}: selected {len(sampled_frame_ids)} / {len(frame_ids)} frames")


Video 04a78ef00c5245e0: selected 611 / 1292 frames
Video 0531325b64674948: selected 360 / 1191 frames
Video 0728084c8da942d9: selected 599 / 1499 frames
Video 07c1fa15a20a4398: selected 214 / 678 frames
Video 131368cc17e44240: selected 823 / 2034 frames
Video 2fc3db471f9d44c0: selected 49 / 127 frames
Video 39960e5e099a45ca: selected 93 / 237 frames
Video 3ada4222967f421d: selected 405 / 1002 frames
Video 3c8d5f0b90d7475d: selected 550 / 1704 frames
Video 4560e83f9afc4685: selected 283 / 780 frames
Video 48579eec79784294: selected 604 / 1379 frames
Video 495f16498db34d3c: selected 182 / 323 frames
Video 4aebc5cb2d4847aa: selected 64 / 90 frames
Video 5bb1d3cc7dc64cec: selected 343 / 788 frames
Video 5e59c7fdb16c4228: selected 385 / 2394 frames
Video 5e9beaf4e66142c8: selected 80 / 464 frames
Video 64440803f87b4843: selected 764 / 2159 frames
Video 6cb700585c4f4070: selected 275 / 884 frames
Video 7a47e8eacea04e64: selected 289 / 436 frames
Video 7ad22d50ebaf4596: selected 312 / 740 fra

In [90]:
total_selected_frames = 0
for vid, data in video_features.items():
    features = data['features']
    frame_ids = data['frame_ids']

    _, sampled_features, _ = selective_frame_sampling(
        features, frame_ids,
        method='cosine', threshold=0.95
    )
    total_selected_frames += len(sampled_features)

print(f"Total selected frames after cosine sampling: {total_selected_frames}")


Total selected frames after cosine sampling: 17033


In [92]:
total_original_frames = len(df_meta)
rrr = calculate_rrr(total_original_frames, total_selected_frames)
print(f"Redundancy Reduction Ratio (RRR) for sampling run: {rrr:.4f}")

Redundancy Reduction Ratio (RRR) for sampling run: 0.6395


In [93]:
print(f"Total original frames: {total_original_frames}")
print(f"Total sampled frames: {total_selected_frames}")


Total original frames: 47248
Total sampled frames: 17033


# Visualization

In [96]:
print(detections_with_sampling[0])


{'video_id': '04a78ef00c5245e0', 'frame_id': np.int64(887), 'timestamp_sec': np.float64(29.566666666666666), 'predicted_class_idx': np.int64(2), 'predicted_class_name': 'Angiectasia'}


In [100]:
visualize_detections(detections_with_sampling, IMAGE_BASE_PATH, df_meta)


Generating visualizations for 4927 detected lesion frames...
  Saved 50 annotated frames...
  Saved 100 annotated frames...
  Saved 150 annotated frames...
  Saved 200 annotated frames...
  Saved 250 annotated frames...
  Saved 300 annotated frames...
  Saved 350 annotated frames...
  Saved 400 annotated frames...
  Saved 450 annotated frames...
  Saved 500 annotated frames...
  Saved 550 annotated frames...
  Saved 600 annotated frames...
  Saved 650 annotated frames...
  Saved 700 annotated frames...
  Saved 750 annotated frames...
  Saved 800 annotated frames...
  Saved 850 annotated frames...
  Saved 900 annotated frames...
  Saved 950 annotated frames...
  Saved 1000 annotated frames...
  Saved 1050 annotated frames...
  Saved 1100 annotated frames...
  Saved 1150 annotated frames...
  Saved 1200 annotated frames...
  Saved 1250 annotated frames...
  Saved 1300 annotated frames...
  Saved 1350 annotated frames...
  Saved 1400 annotated frames...
  Saved 1450 annotated frames...
 

# LDR

In [68]:
ground_truth_video_labels = defaultdict(bool)
negative_label_index = LABEL_MAP.get('Normal clean mucosa', LABEL_MAP.get('Normal clean mucosa', 0))
for index, row in df_meta.iterrows():
    if row['label_idx'] != negative_label_index:
         ground_truth_video_labels[row['video_id']] = True

ground_truth_video_labels = dict(ground_truth_video_labels)

print(f"\nCalculating LDR for evaluation run...")


Calculating LDR for evaluation run...


In [94]:
ldr_no_sampling = calculate_ldr(results_per_video_no_sampling, ground_truth_video_labels, LABEL_MAP)
print(f"LDR (No Sampling): {ldr_no_sampling:.4f}")

ldr_value = calculate_ldr(results_per_video_with_sampling, ground_truth_video_labels, LABEL_MAP)
print(f"LDR (with sampling): {ldr_value:.4f}")

LDR (No Sampling): 1.0000
LDR (with sampling): 1.0000


# With sampling- euclidean

In [101]:
sampling_config = {'method': 'euclidean', 'threshold': 0.5}
temporal_dataset_w_sampling = VideoFeatureDataset(non_empty_video_features, use_sampling=True, sampling_params=sampling_config)

In [102]:
dataloader_w_sampling = DataLoader(
    temporal_dataset_w_sampling,
    batch_size=TEMPORAL_BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn
)
print(f"Temporal Dataloader (With Sampling): {len(dataloader_w_sampling)} batches")

Temporal Dataloader (With Sampling): 6 batches


In [103]:
# --- Training Loop ---
print("Starting Temporal Model Training (with sampling)...")
train_dataloader = dataloader_w_sampling

for epoch in range(NUM_EPOCHS):
    epoch_loss = train_epoch(temporal_model, train_dataloader, criterion, optimizer, DEVICE)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {epoch_loss:.4f}")
print("Training finished.")

Starting Temporal Model Training (with sampling)...
Epoch 1/20, Loss: 0.3094
Epoch 2/20, Loss: 0.2694
Epoch 3/20, Loss: 0.2456
Epoch 4/20, Loss: 0.2719
Epoch 5/20, Loss: 0.2458
Epoch 6/20, Loss: 0.2315
Epoch 7/20, Loss: 0.2505
Epoch 8/20, Loss: 0.1942
Epoch 9/20, Loss: 0.1891
Epoch 10/20, Loss: 0.1904
Epoch 11/20, Loss: 0.1937
Epoch 12/20, Loss: 0.2139
Epoch 13/20, Loss: 0.1772
Epoch 14/20, Loss: 0.1823
Epoch 15/20, Loss: 0.1779
Epoch 16/20, Loss: 0.1903
Epoch 17/20, Loss: 0.4146
Epoch 18/20, Loss: 0.3338
Epoch 19/20, Loss: 0.3624
Epoch 20/20, Loss: 0.3760
Training finished.


In [104]:
torch.save(temporal_model.state_dict(), 'temporal_model_ed_sampling05.pth')

In [106]:
from google.colab import files
files.download('temporal_model_ed_sampling05.pth')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [107]:
print(f"Evaluating with Selective Sampling ({sampling_config['method']}, thresh={sampling_config['threshold']})")
eval_dataloader_w_sampling = DataLoader(
    temporal_dataset_w_sampling, batch_size=TEMPORAL_BATCH_SIZE, shuffle=False, collate_fn=collate_fn
)
detections_w_sampling, results_per_video_w_sampling = evaluate_model(temporal_model, eval_dataloader_w_sampling, DEVICE, NUM_CLASSES)

Evaluating with Selective Sampling (euclidean, thresh=0.5)
Could not calculate ROC AUC: Number of classes in y_true not equal to the number of columns in 'y_score'. Check if all classes are present in predictions/labels.

--- Frame-Level Metrics ---
Accuracy: 0.9084
Weighted Precision: 0.8828, Recall: 0.9084, F1-Score: 0.8831
Micro Precision: 0.9084, Recall: 0.9084, F1-Score: 0.9084
Weighted ROC AUC: nan
Confusion Matrix:
 [[34194     0     0     0     0     0     0    38     0     0    53    53
      0]
 [    0     0     0     0     0     0     0     0     0     0    10     0
      0]
 [  594     0    15     0     0     0     0    40   108     0   101     0
      8]
 [   39     0     0     0     0     0     0   182     0     0    44     0
    193]
 [  341     0     0     0     0     0     0    82     0     0    69    14
      0]
 [  149     0     0     0     0     0     0     0     0     0     0    10
      0]
 [  268     0     0     0     0     0   382    34     0     0    89     0
 

RRR

In [109]:
total_selected_frames = 0
for vid, data in video_features.items():
    features = data['features']
    frame_ids = data['frame_ids']

    _, sampled_features, _ = selective_frame_sampling(
        features, frame_ids,
        method='euclidean', threshold=0.5
    )
    total_selected_frames += len(sampled_features)

print(f"Total selected frames after euclidean sampling: {total_selected_frames}")

Total selected frames after euclidean sampling: 47229


In [110]:
total_original_frames = len(df_meta)
rrr = calculate_rrr(total_original_frames, total_selected_frames)
print(f"Redundancy Reduction Ratio (RRR) for sampling run: {rrr:.4f}")

Redundancy Reduction Ratio (RRR) for sampling run: 0.0004


In [111]:
print(f"Total original frames: {total_original_frames}")
print(f"Total sampled frames: {total_selected_frames}")


Total original frames: 47248
Total sampled frames: 47229


LDR

In [112]:
ground_truth_video_labels = defaultdict(bool)
negative_label_index = LABEL_MAP.get('Normal clean mucosa', LABEL_MAP.get('Normal clean mucosa', 0))
for index, row in df_meta.iterrows():
    if row['label_idx'] != negative_label_index:
         ground_truth_video_labels[row['video_id']] = True

ground_truth_video_labels = dict(ground_truth_video_labels)

In [113]:
ldr_value = calculate_ldr(results_per_video_w_sampling, ground_truth_video_labels, LABEL_MAP)
print(f"LDR (with sampling): {ldr_value:.4f}")

LDR (with sampling): 1.0000


Visualization and Summary

In [114]:
visualize_detections(detections_w_sampling, IMAGE_BASE_PATH, df_meta)
create_summary_report(detections_w_sampling)


Generating visualizations for 10713 detected lesion frames...
  Saved 50 annotated frames...
  Saved 100 annotated frames...
  Saved 150 annotated frames...
  Saved 200 annotated frames...
  Saved 250 annotated frames...
  Saved 300 annotated frames...
  Saved 350 annotated frames...
  Saved 400 annotated frames...
  Saved 450 annotated frames...
  Saved 500 annotated frames...
  Saved 550 annotated frames...
  Saved 600 annotated frames...
  Saved 650 annotated frames...
  Saved 700 annotated frames...
  Saved 750 annotated frames...
  Saved 800 annotated frames...
  Saved 850 annotated frames...
  Saved 900 annotated frames...
  Saved 950 annotated frames...
  Saved 1000 annotated frames...
  Saved 1050 annotated frames...
  Saved 1100 annotated frames...
  Saved 1150 annotated frames...
  Saved 1200 annotated frames...
  Saved 1250 annotated frames...
  Saved 1300 annotated frames...
  Saved 1350 annotated frames...
  Saved 1400 annotated frames...
  Saved 1450 annotated frames...


In [115]:
source_folder = '/content/visualization'
destination_folder = '/content/drive/My Drive/ZippedBackups/visualizationCosineEuclidean'

shutil.copytree(source_folder, destination_folder)

'/content/drive/My Drive/ZippedBackups/visualizationCosineEuclidean'