In [1]:
from datasets import load_dataset
# Load the ActivityNetQA dataset by its Hugging Face Hub identifier.
dataset = load_dataset("lmms-lab/ActivityNetQA")

In [2]:
# Inspect the first record of the 'test' split to see which fields are available.
dataset['test'][0]

{'video_name': '1QIUV7WYKXg',
 'question_id': 'v_1QIUV7WYKXg_3',
 'question': 'is the athlete wearing trousers',
 'answer': 'no',
 'type': '3'}

In [3]:
import torch

import os

class ActivityQaDataset(torch.utils.data.Dataset):
    """Question/answer samples restricted to videos that are present on disk.

    Every source record must provide the keys ``video_name``, ``question``,
    ``answer`` and ``type``. A record is kept only when a file named
    ``v_<video_name><ext>`` exists in ``video_dir`` for one of ``VIDEO_FORMATS``.

    Args:
        dataset: Iterable of record dicts (for example one split of a
            Hugging Face dataset).
        question_prompt: Text appended after the capitalised question and "? ".
        video_dir: Directory containing the video files. ``None`` (the default)
            selects ``DEFAULT_VIDEO_DIR``.
    """

    DEFAULT_VIDEO_DIR = "/home/jovyan/shares/SR004.nfs2/chekalina/LaVIT/VideoLaVIT/eval/activitynet_qa/videos/all_test"
    # Extensions are probed in this order; the first existing file wins.
    VIDEO_FORMATS = ('.mp4', '.avi', '.mov', '.mkv')

    def __init__(self, dataset, question_prompt = " The answer you give MUST be \"Yes\" or \"No\".", video_dir = None):
        self.question_prompt = question_prompt
        self.video_dir = self.DEFAULT_VIDEO_DIR if video_dir is None else video_dir
        self.data_list = []

        for elem in dataset:
            for fmt in self.VIDEO_FORMATS:
                full_video_path = os.path.join(self.video_dir, "v_" + elem['video_name'] + fmt)
                if os.path.exists(full_video_path):
                    self.data_list.append({
                        'full_video_path': full_video_path,
                        'question': elem['question'],
                        'answer': elem['answer'],
                        'type': elem['type'],
                    })
                    # Stop at the first match: a video stored in several
                    # container formats must not yield the same question twice.
                    break

    def __len__(self):
        """Return the number of records whose video file was found at construction time."""
        return len(self.data_list)

    def __getitem__(self, idx):
        """Return the formatted sample at ``idx``.

        Returns:
            A dict with keys ``question`` (capitalised question + "? " + prompt),
            ``video_path``, ``type`` and ``answer`` (capitalised), or ``None``
            when the video file has disappeared since the dataset was built.
        """
        data_item = self.data_list[idx]
        video_path = data_item['full_video_path']

        # The file was present in __init__, but re-check in case it was removed since.
        if not os.path.exists(video_path):
            print(f"Warning: Video file not found at {video_path}, skipping this item.")
            return None

        return {
            'question': data_item['question'].capitalize() + "? " + self.question_prompt,
            'video_path': video_path,
            'type': data_item['type'],
            'answer': data_item['answer'].capitalize(),
        }

In [4]:
# Build the evaluation set from the test split, replacing the default yes/no instruction with a single-word one.
ds = ActivityQaDataset(dataset['test'], question_prompt = "Answer the question using a single word.")

In [5]:
# Spot-check one formatted sample (prompted question, video path, type, capitalised answer).
ds[4]

{'question': 'Is the person in white a man? Answer the question using a single word.',
 'video_path': '/home/jovyan/shares/SR004.nfs2/chekalina/LaVIT/VideoLaVIT/eval/activitynet_qa/videos/all_test/v_mV07bEBkIcM.mp4',
 'type': '3',
 'answer': 'Yes'}

## Inference

In [6]:
import os
import sys
# Add the parent directory to sys.path (once) so that `models`, imported below, can be found —
# presumably it lives one level above this notebook; verify against the repo layout.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from models import build_model

Please 'pip install apex'
Please 'pip install apex'
Please 'pip install apex'
Please 'pip install apex'


In [7]:
import os
import sys
import torch
import numpy as np
import random
# Same sys.path setup as the previous cell; the membership check makes it a no-op
# when the parent directory has already been appended.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from models import build_model

# Checkpoint directory handed to build_model.
# Alternative location kept for reference: "/home/jinyang06/models/VideoLaVIT-v1/language_model_sft"
model_path = "/home/jovyan/shares/SR004.nfs2/chekalina/LaVIT/VideoLaVIT/models/language_model_sft"
model_dtype='bf16'

max_video_clips = 16
device_id = 0
torch.cuda.set_device(device_id)
device = torch.device('cuda')

# Seed every RNG in play. The torch seed matters most here: generation below is
# run with nucleus sampling, so without it repeated runs give different answers.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# For Multi-Modal Understanding
runner = build_model(model_path=model_path, model_dtype=model_dtype, understanding=True, 
        device_id=device_id, use_xformers=False, max_video_clips=max_video_clips,)

Loading Video LaVIT Model Weight from /home/jovyan/shares/SR004.nfs2/chekalina/LaVIT/VideoLaVIT/models/language_model_sft, model precision: bf16
Not used {}


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of the model checkpoint at /home/jovyan/shares/SR004.nfs2/chekalina/LaVIT/VideoLaVIT/models/language_model_sft were not used when initializing VideoLaVITLlamaForCausalLM: ['model.motion_tokenizer.quantize.cluster_size', 'model.motion_tokenizer.quantize.embedding.cluster_size', 'model.motion_tokenizer.quantize.embedding.embed_avg', 'model.motion_tokenizer.quantize.embedding.initted']
- This IS expected if you are initializing VideoLaVITLlamaForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VideoLaVITLlamaForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The Visual Vocab Size is 16384
The llama tokenizer vocab size is 32000
The maximal clip number is 16


In [8]:
# Smoke test: run the model on one hand-picked video/question pair and show
# the first element of what the runner returns.
video_path = '/home/jovyan/shares/SR004.nfs2/chekalina/LaVIT/VideoLaVIT/eval/activitynet_qa/videos/all_test/v_mV07bEBkIcM.mp4'
prompt = 'Is the person in white a man? Answer the question using a single word.'

output = runner(
    {"video": video_path, "text_input": prompt},
    length_penalty=1,
    use_nucleus_sampling=True,
    num_beams=1,
    max_length=512,
    temperature=1.0,
)[0]
print(output)



No


## Validation:

In [9]:
import transformers
# Slow (non-fast) LLaMA tokenizer loaded from the model directory, padding on the left.
tokenizer = transformers.LlamaTokenizer.from_pretrained(
    model_path,
    use_fast=False,
    padding_side='left',
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

In [10]:
from functools import partial
import transformers
from tqdm import tqdm
import pickle

def collate_fn(batches, tokenizer):
    """Regroup a list of dataset samples into parallel per-field lists.

    Args:
        batches: Sequence of sample dicts; each must provide the keys
            'question', 'video_path' and 'answer'.
        tokenizer: Accepted so the function can be bound via ``partial``;
            it is not used by this function.

    Returns:
        Tuple ``(questions, video_paths, answers)`` of lists, in sample order.
    """
    # NOTE(review): samples also carry a 'type' field that is not forwarded;
    # add it to the return value if per-type metrics are needed downstream.
    questions = [sample['question'] for sample in batches]
    video_path = [sample['video_path'] for sample in batches]
    answer = [sample['answer'] for sample in batches]

    return questions, video_path, answer

# One sample per batch, served by a single worker process; the tokenizer is
# bound into the collate function ahead of time.
dataloader = torch.utils.data.DataLoader(
    ds,
    batch_size=1,
    num_workers=1,
    pin_memory=True,
    drop_last=False,
    collate_fn=partial(collate_fn, tokenizer=tokenizer),
)

# Pull a single batch to sanity-check what the loader yields.
iterator = iter(dataloader)
first_batch = next(iterator)

question, video_path, answer = first_batch[0], first_batch[1], first_batch[2]

print(question, answer, video_path, sep="\n")

['Is the athlete wearing trousers? Answer the question using a single word.']
['No']
['/home/jovyan/shares/SR004.nfs2/chekalina/LaVIT/VideoLaVIT/eval/activitynet_qa/videos/all_test/v_1QIUV7WYKXg.mp4']


In [None]:
y_pred = []
y_real = []


def _dump_results(predictions, references):
    """Overwrite the two result pickles with everything collected so far."""
    with open('y_pred_actnet.pkl', 'wb') as f:
        pickle.dump(predictions, f)
    with open('y_real_actnet.pkl', 'wb') as f:
        pickle.dump(references, f)


progress_bar = tqdm(
        dataloader, total=len(dataloader), desc="Epoch 1"
    )

try:
    for step, batch in enumerate(progress_bar, start=1):
        question = batch[0]
        video_path = batch[1]
        answer = batch[2]

        # Run inference on the single sample held by this batch.
        outputs = runner({"video": video_path[0], "text_input": question[0]}, length_penalty=1, \
            use_nucleus_sampling=True, num_beams=1, max_length=512, temperature=1.0)[0]

        y_pred.append(outputs.strip())
        y_real.append(answer[0].strip())

        # Every 100 steps (starting with the first): checkpoint to disk and log one example.
        if ((step - 1)%100 == 0):
            _dump_results(y_pred, y_real)
            print("="*20 + "question" + "="*20)
            print (question[0])
            print("="*20 + "output" + "="*20)
            print(outputs)
            print("="*20 + "real answers" + "="*20)
            print(answer[0], flush = True)
finally:
    # Always persist what has been gathered: without this, results produced after
    # the last 100-step checkpoint are lost when the loop ends or is interrupted.
    _dump_results(y_pred, y_real)

Epoch 1:   0%|          | 0/7990 [00:00<?, ?it/s]

Is the athlete wearing trousers? Answer the question using a single word.
No
No


Epoch 1:   3%|▎         | 200/7990 [35:25<30:31:56, 14.11s/it]

How about the dance in the video? Answer the question using a single word.
She swings her arms and legs from side to side as she dances along the water while birds land on the sidewalk beside her.
Good looking


Epoch 1:   4%|▍         | 300/7990 [57:10<28:16:42, 13.24s/it]

How many people are playing games in the video? Answer the question using a single word.
2
4


Epoch 1:   6%|▋         | 500/7990 [1:40:25<24:37:22, 11.83s/it]

How many people are there in the video? Answer the question using a single word.
1
1


Epoch 1:   8%|▊         | 600/7990 [1:56:43<12:45:07,  6.21s/it]

How many women are there in the video? Answer the question using a single word.
There is one woman in the video.
2


Epoch 1:  10%|█         | 800/7990 [2:35:25<25:24:57, 12.73s/it]

Is the person in the video indoors? Answer the question using a single word.
Yes
Yes


Epoch 1:  11%|█▏        | 900/7990 [2:53:35<16:23:17,  8.32s/it]

Does the person in blue have long hair? Answer the question using a single word.
Yes
No


Epoch 1:  13%|█▎        | 1000/7990 [3:13:06<20:45:00, 10.69s/it]

Is the person in the video wearing a white clothes? Answer the question using a single word.
Yes
Yes


Epoch 1:  14%|█▍        | 1100/7990 [3:31:55<12:56:57,  6.77s/it]

Is the woman in the video wearing a white dress? Answer the question using a single word.
No
Yes


Epoch 1:  15%|█▍        | 1198/7990 [3:48:47<11:22:03,  6.03s/it]

In [None]:
# NOTE(review): looks like leftover scratch — `f` is not read in any visible cell and is
# rebound by the `with open(...) as f` blocks elsewhere in this notebook; confirm and remove.
f = 8

In [None]:
# Iterate over the loader again, keeping only the question text of each
# single-sample batch, then pickle the resulting list.
questions = []
progress_bar = tqdm(dataloader, total=len(dataloader), desc=f"Epoch 1")

for step, batch in enumerate(progress_bar, start=1):
    question = batch[0]
    questions.append(question[0])

with open('questions_actnet.pkl', 'wb') as f:
    pickle.dump(questions, f)