In [1]:
# Import cell
'''
 * Copyright (c) 2022, salesforce.com, inc.
 * All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
 * By Junnan Li
'''
import argparse
import os
import ruamel.yaml
import numpy as np
import random
import time
import datetime
import json
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torch.distributed as dist
from torch.utils.data import DataLoader

from models.blip_tsf_retrieval import blip_tsf_retrieval
import utils
from data.video_dataset import VideoDataset
from tqdm import tqdm



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Report CUDA availability and the allocator state around a cache flush.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    # Print memory stats before and after clearing.
    # The "before" reading must be taken before empty_cache() runs; otherwise
    # both readings describe the same post-flush state.
    print(f"GPU Memory before clearing: {torch.cuda.memory_allocated()/1024**2:.2f}MB")
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    # NOTE(review): memory_allocated() counts memory held by live tensors, while
    # empty_cache() releases unused cached blocks; torch.cuda.memory_reserved()
    # may be the more informative number to compare here - confirm intent.
    print(f"GPU Memory after clearing: {torch.cuda.memory_allocated()/1024**2:.2f}MB")

True
GPU Memory before clearing: 0.00MB
GPU Memory after clearing: 0.00MB


In [3]:
# Evaluation function
@torch.no_grad()
def evaluation(model, data_loader, tokenizer, device, config):
    """Compute video->text and text->video retrieval score matrices.

    Text and video embeddings are computed first and compared with a dot
    product. For every query, the ``config['k_test']`` most similar candidates
    (fewer if there are not that many) are re-scored with
    ``model.text_encoder`` followed by ``model.itm_head``; the value stored is
    column 1 of the ``itm_head`` output plus the embedding similarity. Every
    entry that is not re-scored keeps the filler value -100.0.

    Args:
        model: Exposes ``text_encoder``, ``text_proj``, ``visual_encoder``,
            ``vision_proj`` and ``itm_head``.
        data_loader: Iterable of ``(video, video_id)`` batches whose
            ``dataset`` has a ``text`` sequence of captions.
        tokenizer: Callable tokenizer that also provides
            ``additional_special_tokens_ids``.
        device: Device on which embeddings and scores are computed.
        config: Mapping that provides ``'k_test'``.

    Returns:
        Tuple ``(score_matrix_v2t, score_matrix_t2v)`` of NumPy arrays with
        shapes ``(num_videos, num_texts)`` and ``(num_texts, num_videos)``.
    """
    # test
    model.eval()

    metric_logger = utils.MetricLogger(delimiter="  ")
    header = 'Evaluation:'

    texts = data_loader.dataset.text
    num_text = len(texts)
    # len(data_loader) would be the number of batches; the dataset length is
    # the number of videos.
    num_videos = len(data_loader.dataset)
    print(f'Starting evaluation with {num_text} texts and {num_videos} videos...')

    print('Computing features for evaluation...')
    start_time = time.time()

    text_bs = 256
    text_ids = []
    text_embeds = []
    text_atts = []

    print('Computing text features...')
    for i in tqdm(range(0, num_text, text_bs), desc="Processing text batches"):
        text = texts[i: min(num_text, i+text_bs)]
        text_input = tokenizer(text, padding='max_length', truncation=True, max_length=35, return_tensors="pt").to(device)
        text_output = model.text_encoder(text_input.input_ids, attention_mask = text_input.attention_mask, mode='text')
        # Embedding is taken from the first token position of the encoder output.
        text_embed = F.normalize(model.text_proj(text_output.last_hidden_state[:,0,:]), dim=-1)
        text_embeds.append(text_embed)
        text_ids.append(text_input.input_ids)
        text_atts.append(text_input.attention_mask)

    text_embeds = torch.cat(text_embeds,dim=0)
    text_ids = torch.cat(text_ids,dim=0)
    text_atts = torch.cat(text_atts,dim=0)
    # Overwrite the first token id of every caption before the cross-attention
    # pass - presumably swaps in the encoder marker token; TODO confirm.
    text_ids[:,0] = tokenizer.additional_special_tokens_ids[0]

    print('Computing video features...')
    video_feats = []
    video_embeds = []
    for video, video_id in tqdm(data_loader, desc="Processing videos"):
        B,N,C,W,H = video.size()

        # Permute dimensions to: [B, C, N, H, W]
        # NOTE(review): this also swaps the last two axes; if the loader already
        # yields (..., H, W) the frames end up transposed - confirm the layout.
        video = video.permute(0, 2, 1, 4, 3)

        video = video.to(device,non_blocking=True)
        video_feat = model.visual_encoder(video)
        video_embed = model.vision_proj(video_feat[:,0,:])
        # NOTE(review): this view assumes the projected features have B*N rows
        # (one per frame) - verify against the visual encoder's output shape.
        video_embed = video_embed.view(B,N,-1).mean(dim=1)
        video_embed = F.normalize(video_embed,dim=-1)

        video_feat = video_feat.view(B,-1,video_feat.shape[-1])
        # Token features are parked on the CPU and moved back per query below.
        video_feats.append(video_feat.cpu())
        video_embeds.append(video_embed)

    video_feats = torch.cat(video_feats,dim=0)
    video_embeds = torch.cat(video_embeds,dim=0)

    print(f"video_feats shape: {video_feats.shape}")
    print(f"video_embeds shape: {video_embeds.shape}")
    print(f"text_embeds shape: {text_embeds.shape}")

    k_test = config['k_test']

    sims_matrix = video_embeds @ text_embeds.t()
    # Size the score matrix from the similarity matrix so the function also
    # works when the number of videos differs from the number of captions.
    score_matrix_v2t = torch.full(tuple(sims_matrix.shape), -100.0, device=device)

    num_tasks = utils.get_world_size()
    rank = utils.get_rank()
    # Each rank handles one contiguous slice of the query rows.
    step = sims_matrix.size(0)//num_tasks + 1
    start = rank*step
    end = min(sims_matrix.size(0),start+step)

    # topk raises if k exceeds the number of candidates, so clamp it.
    k_v2t = min(k_test, sims_matrix.size(1))
    for i,sims in enumerate(metric_logger.log_every(sims_matrix[start:end], 50, header)):
        topk_sim, topk_idx = sims.topk(k=k_v2t, dim=0)

        # Same video features repeated once per candidate caption.
        encoder_output = video_feats[start+i].repeat(k_v2t,1,1).to(device,non_blocking=True)
        encoder_att = torch.ones(encoder_output.size()[:-1],dtype=torch.long).to(device,non_blocking=True)
        output = model.text_encoder(text_ids[topk_idx],
                                    attention_mask = text_atts[topk_idx],
                                    encoder_hidden_states = encoder_output,
                                    encoder_attention_mask = encoder_att,
                                    return_dict = True,
                                   )
        score = model.itm_head(output.last_hidden_state[:,0,:])[:,1]
        score_matrix_v2t[start+i,topk_idx] = score + topk_sim

    sims_matrix = sims_matrix.t()
    score_matrix_t2v = torch.full(tuple(sims_matrix.shape), -100.0, device=device)

    step = sims_matrix.size(0)//num_tasks + 1
    start = rank*step
    end = min(sims_matrix.size(0),start+step)

    k_t2v = min(k_test, sims_matrix.size(1))
    for i,sims in enumerate(metric_logger.log_every(sims_matrix[start:end], 50, header)):

        topk_sim, topk_idx = sims.topk(k=k_t2v, dim=0)
        # video_feats lives on the CPU, so gather from it with CPU indices.
        encoder_output = video_feats[topk_idx.cpu()].to(device,non_blocking=True)
        encoder_att = torch.ones(encoder_output.size()[:-1],dtype=torch.long).to(device,non_blocking=True)
        output = model.text_encoder(text_ids[start+i].repeat(k_t2v,1),
                                    attention_mask = text_atts[start+i].repeat(k_t2v,1),
                                    encoder_hidden_states = encoder_output,
                                    encoder_attention_mask = encoder_att,
                                    return_dict = True,
                                   )
        score = model.itm_head(output.last_hidden_state[:,0,:])[:,1]
        score_matrix_t2v[start+i,topk_idx] = score + topk_sim

    # Check the real process-group state instead of a module-level `args`
    # object, which is not a parameter of this function.
    if dist.is_available() and dist.is_initialized():
        dist.barrier()
        dist.all_reduce(score_matrix_v2t, op=dist.ReduceOp.SUM)
        dist.all_reduce(score_matrix_t2v, op=dist.ReduceOp.SUM)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Evaluation time {}'.format(total_time_str))

    return score_matrix_v2t.cpu().numpy(), score_matrix_t2v.cpu().numpy()



# Metric calculation function
@torch.no_grad()
def itm_eval(scores_v2t, scores_t2v, txt2vmg, vid2txt):
    """Turn retrieval score matrices into recall@K statistics.

    Args:
        scores_v2t: 2-D array; row ``i`` holds the scores of every text for
            video ``i`` (higher is better).
        scores_t2v: 2-D array; row ``j`` holds the scores of every video for
            text ``j`` (higher is better).
        txt2vmg: Indexable by text index; gives the ground-truth video index,
            or a collection of acceptable video indices.
        vid2txt: Indexable by video index; gives the ground-truth text index,
            or a collection of acceptable text indices.

    Returns:
        Dict with recall@1/5/10 and their mean for both directions
        (``txt_*`` for video->text, ``vid_*`` for text->video), the median
        1-based text->video rank (``vid_mdR``) and the overall mean
        (``r_mean``). Recalls are percentages.
    """

    def _best_ranks(score_matrix, ground_truth):
        # 0-based rank of the best-placed ground-truth item for each query row.
        ranks = np.zeros(score_matrix.shape[0])
        for index, score in enumerate(score_matrix):
            inds = np.argsort(score)[::-1]
            # Accept a single index as well as a collection of indices.
            targets = np.atleast_1d(ground_truth[index])
            ranks[index] = min(np.where(inds == target)[0][0] for target in targets)
        return ranks

    def _recall_at(ranks, k):
        # Percentage of queries whose ground truth is ranked within the top k.
        return 100.0 * len(np.where(ranks < k)[0]) / len(ranks)

    #Video->Text
    ranks = _best_ranks(scores_v2t, vid2txt)
    tr1 = _recall_at(ranks, 1)
    tr5 = _recall_at(ranks, 5)
    tr10 = _recall_at(ranks, 10)

    #Text->Video
    ranks = _best_ranks(scores_t2v, txt2vmg)

    # Ranks are 0-based; the median is reported 1-based.
    mdR = np.median(ranks+1)

    vr1 = _recall_at(ranks, 1)
    vr5 = _recall_at(ranks, 5)
    vr10 = _recall_at(ranks, 10)

    tr_mean = (tr1 + tr5 + tr10) / 3
    vr_mean = (vr1 + vr5 + vr10) / 3
    r_mean = (tr_mean + vr_mean) / 2

    eval_result =  {'txt_r1': tr1,
                    'txt_r5': tr5,
                    'txt_r10': tr10,
                    'txt_r_mean': tr_mean,
                    'vid_r1': vr1,
                    'vid_r5': vr5,
                    'vid_r10': vr10,
                    'vid_r_mean': vr_mean,
                    'vid_mdR': mdR,
                    'r_mean': r_mean}
    return eval_result



# Main execution
def main(args, config):
    """Build the test dataset and model, run retrieval evaluation, log metrics.

    Args:
        args: Object providing ``device``, ``seed``, ``distributed``, ``gpu``
            and ``output_dir``; it is also handed to
            ``utils.init_distributed_mode``.
        config: Mapping providing ``video_root``, ``ann_root``,
            ``num_frm_test``, ``image_size``, ``batch_size``, ``pretrained``
            and ``vit`` (plus whatever ``evaluation`` reads).

    Side effects:
        Prints diagnostics and, on the main process, appends one JSON line
        with the metrics to ``<args.output_dir>/test_result.txt``.
    """
    utils.init_distributed_mode(args)

    device = torch.device(args.device)
    print(f"Using device: {device}")

    # fix the seed for reproducibility (offset by the process rank)
    rank_seed = args.seed + utils.get_rank()
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(rank_seed)
    cudnn.benchmark = True

    #### Dataset ####
    print("Creating retrieval dataset")
    video_root = config['video_root']
    test_dataset = VideoDataset(
        video_root,
        config['ann_root'],
        num_frm=config['num_frm_test'],
        max_img_size=config['image_size'],
        frm_sampling_strategy='uniform',
    )

    # Diagnostics: compare what the dataset reports with what is on disk.
    print("\nDataset Statistics:")
    print(f"Video directory: {video_root}")
    print(f"Number of videos found: {len(test_dataset)}")
    print(f"Number of videos in annotation: {len(test_dataset.video2txt)}")

    # List a few video paths to verify
    mp4_files = [name for name in os.listdir(video_root) if name.endswith('.mp4')]
    print(f"\nTotal MP4 files in directory: {len(mp4_files)}")
    print("First few videos in directory:", mp4_files[:5])

    batch_size = config['batch_size']
    print(f"Batch size: {batch_size}")
    loader_options = dict(
        batch_size=batch_size,
        num_workers=1,
        pin_memory=True,
        drop_last=False,
        shuffle=False,
    )
    test_loader = DataLoader(test_dataset, **loader_options)

    #### Model ####
    print("Creating model")
    model = blip_tsf_retrieval(
        pretrained=config['pretrained'],
        eval=True,
        image_size=config['image_size'],
        vit=config['vit'],
    ).to(device)

    # Evaluation always runs on the bare module, with or without the DDP wrapper.
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    else:
        model_without_ddp = model

    score_v2t, score_t2v = evaluation(
        model_without_ddp, test_loader, model_without_ddp.tokenizer, device, config
    )

    if not utils.is_main_process():
        return

    test_result = itm_eval(
        score_v2t, score_t2v, test_loader.dataset.txt2video, test_loader.dataset.video2txt
    )
    print(test_result)

    log_stats = {f'{key}': value for key, value in test_result.items()}
    result_path = os.path.join(args.output_dir, "test_result.txt")
    with open(result_path, "a") as result_file:
        result_file.write(json.dumps(log_stats) + "\n")

    

In [4]:
# Configuration cell - replace argparse
config_path = './configs/retrieval_msrvtt_test.yaml'
yaml = ruamel.yaml.YAML(typ='safe')  # Create a YAML object with safe loading
with open(config_path, 'r') as f:
    config = yaml.load(f)

# Define args as a simple namespace object
class Args:
    """Stand-in for the argparse namespace when running inside a notebook."""

    def __init__(self):
        self.config = config_path
        self.output_dir = 'output/Retrieval_msrvtt'
        # Fall back to the CPU when no CUDA device is available.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.seed = 42
        self.world_size = 1
        self.dist_url = 'env://'
        self.distributed = False  # Changed to False for notebook usage
        self.gpu = 0

args = Args()

# Create output directory
Path(args.output_dir).mkdir(parents=True, exist_ok=True)
# Keep a copy of the loaded config next to the results. The context manager
# makes sure the file is flushed and closed instead of leaking the handle.
with open(os.path.join(args.output_dir, 'config.yaml'), 'w') as f:
    yaml.dump(config, f)

In [5]:
def update_config(config_path='./configs/retrieval_msrvtt_test.yaml',
                  ann_root='annotation/data/refined_msrvtt_ret'):
    """Point ``ann_root`` in the YAML config file at ``ann_root`` and verify paths.

    The file is rewritten only when the stored value actually differs. Only
    the file on disk is changed; a config object that was loaded from the same
    file earlier is not refreshed by this function.

    Args:
        config_path: Path of the YAML config file to edit in place.
        ann_root: Annotation root directory to store under ``ann_root``.
    """
    # Load the current config
    yaml = ruamel.yaml.YAML()

    with open(config_path, 'r') as f:
        config = yaml.load(f)

    # Update the annotation path, touching the file only if something changes.
    old_path = config.get('ann_root')
    if old_path != ann_root:
        config['ann_root'] = ann_root

        # Save the updated config
        with open(config_path, 'w') as f:
            yaml.dump(config, f)

        print("Config updated:")
        print(f"- Changed ann_root from '{old_path}' to '{config['ann_root']}'")
    else:
        print("Config unchanged:")
        print(f"- ann_root is already '{old_path}'")

    print("\nVerification:")
    print(f"- Annotation directory exists: {os.path.exists(config['ann_root'])}")
    print(f"- Video directory exists: {os.path.exists(config['video_root'])}")
    # Listing a missing directory would raise right after reporting that it
    # does not exist, so check first.
    txt_dir = os.path.join(config['ann_root'], 'txt')
    if os.path.isdir(txt_dir):
        print(f"- Found annotation files: {os.listdir(txt_dir)}")
    else:
        print(f"- Annotation text directory not found: {txt_dir}")

update_config()

Config updated:
- Changed ann_root from 'annotation/data/refined_msrvtt_ret' to 'annotation/data/refined_msrvtt_ret'

Verification:
- Annotation directory exists: True
- Video directory exists: True
- Found annotation files: ['val.jsonl', 'test.jsonl', 'train_with_duplicates.jsonl', 'train.jsonl']


In [6]:
def calculate_batch_info(batch_size=64, txt_dir='./annotation/data/msrvtt_ret/txt', avg_video_size_mb=50):
    """Print per-split video, caption and batch counts for the annotation files.

    Reads ``train.jsonl``, ``val.jsonl`` and ``test.jsonl`` from ``txt_dir``
    (missing files are skipped). Each non-blank line must be a JSON object
    with a ``clip_name`` key.

    The defaults are fixed values, not read from the loaded config; pass the
    batch size and annotation directory of the run explicitly to describe it.

    Args:
        batch_size: Number of videos per batch; must be positive.
        txt_dir: Directory containing the ``*.jsonl`` annotation files.
        avg_video_size_mb: Rough per-video size used for the memory estimate.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    import json
    from pathlib import Path

    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    txt_dir = Path(txt_dir)

    print("Batch calculations for each split:")
    print(f"Batch size: {batch_size}")
    print("-" * 50)

    # Sum of the per-split unique counts: a clip that appears in two splits
    # is counted once for each split.
    total_videos = 0
    for split in ['train.jsonl', 'val.jsonl', 'test.jsonl']:
        file_path = txt_dir / split
        if not file_path.exists():
            continue

        with open(file_path) as f:
            # Skip blank lines (e.g. a trailing newline), which are not valid JSON.
            records = [json.loads(line) for line in f if line.strip()]

        unique_videos = len({record['clip_name'] for record in records})
        num_batches = (unique_videos + batch_size - 1) // batch_size  # ceiling division
        total_videos += unique_videos

        print(f"\n{split}:")
        print(f"- Unique videos: {unique_videos}")
        print(f"- Number of batches: {num_batches}")
        print(f"- Total captions: {len(records)}")

    print(f"\nTotal across all splits:")
    print(f"- Total unique videos: {total_videos}")
    print(f"- Total batches if processing all splits: {(total_videos + batch_size - 1) // batch_size}")

    # Memory usage estimate (rough calculation)
    print(f"\nEstimated memory usage per batch:")
    print(f"- Batch size: {batch_size} videos")
    print(f"- Approximate memory per batch: {batch_size * avg_video_size_mb}MB")

calculate_batch_info()

Batch calculations for each split:
Batch size: 64
--------------------------------------------------

train.jsonl:
- Unique videos: 7010
- Number of batches: 110
- Total captions: 140200

val.jsonl:
- Unique videos: 1000
- Number of batches: 16
- Total captions: 1000

test.jsonl:
- Unique videos: 1000
- Number of batches: 16
- Total captions: 1000

Total across all splits:
- Total unique videos: 9010
- Total batches if processing all splits: 141

Estimated memory usage per batch:
- Batch size: 64 videos
- Approximate memory per batch: 3200MB


In [7]:
# Main execution
# Runs the retrieval evaluation with the module-level `args` and `config`
# objects; neither is defined in this cell, so both must already exist
# when it executes.
if __name__ == '__main__':
    main(args, config)

Not using distributed mode
Using device: cuda
Creating retrieval dataset
Using downloaded and verified file: annotation/data/refined_msrvtt_ret/msrvtt_test.jsonl

Dataset Statistics:
Video directory: ./video_data
Number of videos found: 1000
Number of videos in annotation: 1000

Total MP4 files in directory: 10000
First few videos in directory: ['video6664.mp4', 'video9704.mp4', 'video8582.mp4', 'video2885.mp4', 'video4403.mp4']
Batch size: 8
Creating model
Trainable parameter: visual_encoder.model.cls_token
Trainable parameter: visual_encoder.model.pos_embed
Trainable parameter: visual_encoder.model.time_embed
Trainable parameter: visual_encoder.model.patch_embed.proj.weight
Trainable parameter: visual_encoder.model.patch_embed.proj.bias
Trainable parameter: visual_encoder.model.blocks.0.norm1.weight
Trainable parameter: visual_encoder.model.blocks.0.norm1.bias
Trainable parameter: visual_encoder.model.blocks.0.attn.qkv.weight
Trainable parameter: visual_encoder.model.blocks.0.attn.qk

Processing text batches: 100%|██████████| 4/4 [00:00<00:00,  5.67it/s]


Computing video features...


Processing videos:  34%|███▎      | 42/125 [01:01<02:01,  1.46s/it]


KeyboardInterrupt: 