In [1]:
import torch
from torch.utils.data import DataLoader
from functools import partial
import transformers
from tqdm import tqdm
import pickle

In [2]:
import pickle

# Restore the prediction / reference lists that the inference loop checkpoints
# to disk. NOTE: pickle.load can execute arbitrary code, so only open files
# that were written by this notebook itself.
with open('y_pred_msvdqa.pkl', 'rb') as pred_file:
    y_pred = pickle.load(pred_file)
with open('y_real_msvdqa.pkl', 'rb') as real_file:
    y_real = pickle.load(real_file)

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [7]:
# How many predictions are currently held in y_pred.
len(y_pred)

800

In [6]:
# Peek at the first 30 entries of the restored reference answers.
y_real[:30]

['No',
 'Yes',
 'No',
 'No',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'Yes']

In [1]:
import os
import sys

# Put the parent of the current working directory on sys.path so that
# `models` can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from models import build_model

Please 'pip install apex'
Please 'pip install apex'
Please 'pip install apex'
Please 'pip install apex'


In [3]:
import json

# Read the QA annotation records, then report how many were loaded.
with open('val_qa.json') as qa_file:
    dataset = json.load(qa_file)
print(len(dataset))

6415


In [4]:
# Inspect one QA record (the second entry) to see which fields are available.
dataset[1]

{'answer': 'man',
 'id': 30934,
 'question': 'who pours a seasoning liquid from a plastic container over chicken pieces placed in a plastic pouch?',
 'video_id': 1201}

In [5]:
from collections import defaultdict

# Lookup table: numeric video id as a string (e.g. '1201') -> clip name.
# A plain dict is used deliberately: `defaultdict()` without a default factory
# behaves exactly like a dict (missing keys still raise KeyError) but suggests
# otherwise to the reader.
id2video_mapper = {}

# Each non-empty line is expected to hold two whitespace-separated tokens,
# "<clip_name> vid<N>"; the part after 'vid' becomes the lookup key.
with open("youtube_mapping.txt") as file:
    for line in file:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on the tuple unpack below.
            continue
        key, value = line.split()
        value = value.split('vid')[1]
        id2video_mapper[value] = key


In [6]:

class MSVDQaDataset(torch.utils.data.Dataset):
    """Video question-answering records exposed as a torch ``Dataset``.

    Each sample pairs a question (with ``question_prompt`` appended) with the
    path of the clip it refers to and the capitalized reference answer.

    Args:
        module_path: Root directory; clips are looked up under
            ``<module_path>/eval/YouTubeClips``.
        dataset: Iterable of dicts with ``'video_id'``, ``'question'`` and
            ``'answer'`` keys.
        mapper: Mapping from ``str(video_id)`` to the clip file name without
            the ``.avi`` extension.
        task_type: Stored on the instance as ``self.task_type``; not otherwise
            used by this class.
        question_prompt: Text appended to every question.

    Raises:
        KeyError: If a record's video id has no entry in ``mapper``.
    """

    def __init__(self, module_path, dataset, mapper, task_type = 'qa', question_prompt = " Answer the question using a single word or a short phrase with multiple words."):
        self.data_list = []
        self.question_prompt = question_prompt
        self.task_type = task_type
        self.movie_dir = module_path

        # os.path.join avoids doubled separators when module_path already
        # ends with one.
        clip_dir = os.path.join(module_path, 'eval', 'YouTubeClips')

        for elem in dataset:
            video_key = str(elem['video_id'])
            try:
                clip_name = mapper[video_key]
            except KeyError:
                raise KeyError(
                    f"video_id {video_key} has no entry in the id-to-clip mapper"
                ) from None
            self.data_list.append({
                'full_video_path': os.path.join(clip_dir, clip_name + ".avi"),
                'question': elem['question'],
                'answer': elem['answer'],
            })

    def __len__(self):
        """Return the number of QA records."""
        return len(self.data_list)

    def __getitem__(self, idx):
        """Return the sample at ``idx``, or ``None`` if its clip is missing.

        Returns:
            A dict with ``'question'``, ``'video_path'`` and ``'answer'`` keys,
            or ``None`` when the video file does not exist on disk. Consumers
            (for example a collate function) have to filter out ``None``.
        """
        data_item = self.data_list[idx]
        video_path = data_item['full_video_path']

        if not os.path.exists(video_path):
            print(f"Warning: Video file not found at {video_path}, skipping this item.")
            return None

        return {
            'question': data_item['question'] + self.question_prompt,
            'video_path': video_path,
            'answer': data_item['answer'].capitalize(),
        }
    

In [7]:
# Build the evaluation dataset from the loaded QA records and the id -> clip mapping.
msvdq = MSVDQaDataset(
    module_path=module_path,
    dataset=dataset,
    mapper=id2video_mapper,
    task_type='qa',
)

## Inference

In [8]:
import os
import sys
import torch
import numpy as np
import random

# Make the parent of the current working directory importable so that
# `models` resolves.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from models import build_model

# Local checkpoint directory of the language model; adjust for your environment.
model_path = "/home/jovyan/shares/SR004.nfs2/chekalina/LaVIT/VideoLaVIT/models/language_model_sft"
model_dtype='bf16'

max_video_clips = 16
device_id = 0
torch.cuda.set_device(device_id)
device = torch.device('cuda')

# Seed every RNG in play. The torch seed matters here because generation in
# this notebook is requested with use_nucleus_sampling=True, which presumably
# draws from torch's RNG (TODO confirm inside `runner`); without it two runs
# of the evaluation can produce different answers and a different accuracy.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# For Multi-Modal Understanding
runner = build_model(
    model_path=model_path,
    model_dtype=model_dtype,
    understanding=True,
    device_id=device_id,
    use_xformers=False,
    max_video_clips=max_video_clips,
)

Loading Video LaVIT Model Weight from /home/jovyan/shares/SR004.nfs2/chekalina/LaVIT/VideoLaVIT/models/language_model_sft, model precision: bf16
Not used {}


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of the model checkpoint at /home/jovyan/shares/SR004.nfs2/chekalina/LaVIT/VideoLaVIT/models/language_model_sft were not used when initializing VideoLaVITLlamaForCausalLM: ['model.motion_tokenizer.quantize.cluster_size', 'model.motion_tokenizer.quantize.embedding.embed_avg', 'model.motion_tokenizer.quantize.embedding.initted', 'model.motion_tokenizer.quantize.embedding.cluster_size']
- This IS expected if you are initializing VideoLaVITLlamaForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VideoLaVITLlamaForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The Visual Vocab Size is 16384
The llama tokenizer vocab size is 32000
The maximal clip number is 16


In [9]:
# Smoke test: run the model on one clip / question pair and show its answer.
video_path = '/home/jovyan/shares/SR004.nfs2/chekalina/LaVIT/VideoLaVIT/eval/YouTubeClips/bQJQGoJF7_k_162_169.avi'
prompt = "who pours marinade in a bag of chicken? Answer the question using a single word or phrase."

sample_request = {"video": video_path, "text_input": prompt}
output = runner(
    sample_request,
    length_penalty=1,
    use_nucleus_sampling=True,
    num_beams=1,
    max_length=512,
    temperature=1.0,
)[0]  # the runner returns a sequence; keep its first element
print(output)



Man


In [10]:
# Slow (non-fast) LLaMA tokenizer loaded from the model directory, padding on the left.
tokenizer = transformers.LlamaTokenizer.from_pretrained(
    model_path,
    use_fast=False,
    padding_side='left',
)
# Reuse the unknown token as the padding token
# (presumably because the tokenizer defines no pad token of its own - TODO confirm).
tokenizer.pad_token = tokenizer.unk_token

In [11]:
def collate_fn(batches, tokenizer):
    """Regroup a list of dataset samples into three parallel lists.

    Args:
        batches: Samples to combine. Each is a dict with ``'question'``,
            ``'video_path'`` and ``'answer'`` keys, or ``None`` for a sample
            that must be skipped (the dataset yields ``None`` when a video
            file is missing).
        tokenizer: Accepted for interface compatibility; not used here.

    Returns:
        ``(questions, video_path, answer)``: three lists of equal length.
        They are empty when every sample in the batch was ``None``.
    """
    # Drop placeholder samples up front; indexing None would raise TypeError.
    samples = [sample for sample in batches if sample is not None]

    questions = [sample['question'] for sample in samples]
    video_path = [sample['video_path'] for sample in samples]
    answer = [sample['answer'] for sample in samples]

    return questions, video_path, answer

# One sample per batch, loaded by a single worker process; samples are
# regrouped by collate_fn with the tokenizer bound as its second argument.
dataloader = torch.utils.data.DataLoader(
    dataset=msvdq,
    batch_size=1,
    num_workers=1,
    pin_memory=True,
    drop_last=False,
    collate_fn=partial(collate_fn, tokenizer=tokenizer),
)

# Sanity check: pull the first three batches and show their contents.
iterator = iter(dataloader)
for preview_idx in range(3):
    first_batch = next(iterator)

    question = first_batch[0]
    video_path = first_batch[1]
    answer = first_batch[2]

    print(question)
    print(answer)
    print(video_path)


['who pours liquid from a plastic container into a ziploc bag containing meat pieces? Answer the question using a single word or a short phrase with multiple words.']
['Someone']
['/home/jovyan/shares/SR004.nfs2/chekalina/LaVIT/VideoLaVIT/eval/YouTubeClips/bQJQGoJF7_k_162_169.avi']
['who pours a seasoning liquid from a plastic container over chicken pieces placed in a plastic pouch? Answer the question using a single word or a short phrase with multiple words.']
['Man']
['/home/jovyan/shares/SR004.nfs2/chekalina/LaVIT/VideoLaVIT/eval/YouTubeClips/bQJQGoJF7_k_162_169.avi']
['who pours marinade in a bag of chicken? Answer the question using a single word or a short phrase with multiple words.']
['Person']
['/home/jovyan/shares/SR004.nfs2/chekalina/LaVIT/VideoLaVIT/eval/YouTubeClips/bQJQGoJF7_k_162_169.avi']


In [None]:
def _save_checkpoint(predictions, references):
    """Pickle the prediction / reference lists collected so far."""
    with open('y_pred_msvdqa.pkl', 'wb') as f:
        pickle.dump(predictions, f)
    with open('y_real_msvdqa.pkl', 'wb') as f:
        pickle.dump(references, f)


# Model answers and the matching reference answers, index-aligned.
y_pred = []
y_real = []

progress_bar = tqdm(
        dataloader, total=len(dataloader), desc="Epoch 1"
    )

for step, batch in enumerate(progress_bar, start=1):
    question = batch[0]
    video_path = batch[1]
    answer = batch[2]

    if not question:
        # Guard against an empty batch so the [0] indexing below cannot fail.
        continue

    # Run inference on the single sample in this batch.
    outputs = runner({"video": video_path[0], "text_input": question[0]}, length_penalty=1,
        use_nucleus_sampling=True, num_beams=1, max_length=512, temperature=1.0)[0]

    y_pred.append(outputs.strip())
    y_real.append(answer[0].strip())
    if (step%100 == 0):
        # Periodic checkpoint plus a sample printout to monitor answer quality.
        _save_checkpoint(y_pred, y_real)
        print("="*20 + "question" + "="*20)
        print (question[0])
        print("="*20 + "output" + "="*20)
        print(outputs)
        print("="*20 + "real answers" + "="*20)
        print(answer[0], flush = True)

# Final save: without it, results produced after the last multiple of 100
# steps would never reach the pickle files.
_save_checkpoint(y_pred, y_real)

Epoch 1:   2%|▏         | 99/6415 [04:26<2:27:03,  1.40s/it]

who made a great catch? Answer the question using a single word or a short phrase with multiple words.
Man
Baseball


Epoch 1:  16%|█▌        | 999/6415 [39:59<2:57:50,  1.97s/it]

who is stroking a baby beaver? Answer the question using a single word or a short phrase with multiple words.
Man
Woman


Epoch 1:  17%|█▋        | 1099/6415 [42:42<2:04:05,  1.40s/it]

what is a male flight attendant closing the door of? Answer the question using a single word or a short phrase with multiple words.
Bathroom
Plane


Epoch 1:  19%|█▊        | 1199/6415 [46:53<3:45:56,  2.60s/it] 

what is eating a trail of food? Answer the question using a single word or a short phrase with multiple words.
Hamster
Hamster


Epoch 1:  20%|██        | 1299/6415 [50:21<3:20:52,  2.36s/it]

who plays the piano? Answer the question using a single word or a short phrase with multiple words.
Boy
Man


Epoch 1:  22%|██▏       | 1399/6415 [55:16<6:40:03,  4.79s/it]

what is a fox walking slowly in? Answer the question using a single word or a short phrase with multiple words.
Snow
Ice


Epoch 1:  23%|██▎       | 1499/6415 [59:24<2:30:23,  1.84s/it]

what is a kid playing? Answer the question using a single word or a short phrase with multiple words.
Flute
Instrument


Epoch 1:  25%|██▍       | 1599/6415 [1:01:45<2:10:22,  1.62s/it]

what is a person on a wooded path? Answer the question using a single word or a short phrase with multiple words.
Unknown
Jogging


Epoch 1:  26%|██▋       | 1699/6415 [1:05:10<1:58:40,  1.51s/it]

what does a man push another man into? Answer the question using a single word or a short phrase with multiple words.
Yes
Ocean


Epoch 1:  28%|██▊       | 1799/6415 [1:08:23<2:21:06,  1.83s/it]

what is a person slicing? Answer the question using a single word or a short phrase with multiple words.
Tomato
Tomato


Epoch 1:  30%|██▉       | 1899/6415 [1:13:08<1:34:01,  1.25s/it]

what is the person making a new file folder on? Answer the question using a single word or a short phrase with multiple words.
Computer
Computer


Epoch 1:  31%|███       | 1999/6415 [1:16:36<3:06:26,  2.53s/it]

who pours a milkshake into two glasses? Answer the question using a single word or a short phrase with multiple words.
Boy
Someone


Epoch 1:  33%|███▎      | 2099/6415 [1:19:56<1:20:41,  1.12s/it]

what is a man mixing in a bowl? Answer the question using a single word or a short phrase with multiple words.
Flour
Dough


Epoch 1:  34%|███▍      | 2199/6415 [1:23:25<2:37:32,  2.24s/it]

who is making a cup of coffee? Answer the question using a single word or a short phrase with multiple words.
Woman
Woman


Epoch 1:  36%|███▌      | 2299/6415 [1:29:33<6:24:37,  5.61s/it]

who used a stick to pound a tool into a block of wood? Answer the question using a single word or a short phrase with multiple words.
Man
Man


Epoch 1:  37%|███▋      | 2399/6415 [1:35:41<2:22:26,  2.13s/it] 

who is a bunny playing with? Answer the question using a single word or a short phrase with multiple words.
People
Person


Epoch 1:  39%|███▉      | 2499/6415 [1:40:30<3:51:22,  3.54s/it]

who is playing piano and singing? Answer the question using a single word or a short phrase with multiple words.
Child
Boy


Epoch 1:  41%|████      | 2599/6415 [1:45:30<2:06:20,  1.99s/it]

who rode his motorcycle through the water at the shore? Answer the question using a single word or a short phrase with multiple words.
Man
Man


Epoch 1:  42%|████▏     | 2699/6415 [1:49:54<2:12:36,  2.14s/it]

what is a cat doing? Answer the question using a single word or a short phrase with multiple words.
Sleeping
Jump


Epoch 1:  44%|████▎     | 2799/6415 [1:55:06<4:06:16,  4.09s/it]

who drys off a woman? Answer the question using a single word or a short phrase with multiple words.
Man
Man


Epoch 1:  45%|████▌     | 2899/6415 [2:01:55<1:15:58,  1.30s/it]

who is cleaning a garden? Answer the question using a single word or a short phrase with multiple words.
Woman
Woman


Epoch 1:  47%|████▋     | 2999/6415 [2:05:08<2:42:23,  2.85s/it]

who does aerobic exercise? Answer the question using a single word or a short phrase with multiple words.
Lady
Woman


Epoch 1:  48%|████▊     | 3099/6415 [2:07:43<1:02:52,  1.14s/it]

who are singing? Answer the question using a single word or a short phrase with multiple words.
People
Church


Epoch 1:  50%|████▉     | 3199/6415 [2:12:39<2:22:09,  2.65s/it]

what is playing with the dog? Answer the question using a single word or a short phrase with multiple words.
Cat
Duck


Epoch 1:  51%|█████▏    | 3299/6415 [2:16:14<1:07:06,  1.29s/it]

who is making a ball of dough? Answer the question using a single word or a short phrase with multiple words.
You
Person


Epoch 1:  53%|█████▎    | 3399/6415 [2:19:24<1:52:16,  2.23s/it]

what do two guys play? Answer the question using a single word or a short phrase with multiple words.
Table tennis
Tenni


Epoch 1:  55%|█████▍    | 3499/6415 [2:27:15<8:38:50, 10.68s/it]

what is a man doing? Answer the question using a single word or a short phrase with multiple words.
Grating lemon
Squeeze


Epoch 1:  56%|█████▌    | 3599/6415 [2:31:58<1:28:04,  1.88s/it]

what are ants eating? Answer the question using a single word or a short phrase with multiple words.
Fruit
Food


Epoch 1:  58%|█████▊    | 3699/6415 [2:36:29<2:00:14,  2.66s/it]

who takes a piece of pepperoni pizza out of a pizza box? Answer the question using a single word or a short phrase with multiple words.
No one
Man


Epoch 1:  59%|█████▉    | 3799/6415 [2:41:18<1:19:16,  1.82s/it]

what is the man driving? Answer the question using a single word or a short phrase with multiple words.
Car
Car


Epoch 1:  61%|██████    | 3899/6415 [2:44:24<1:37:59,  2.34s/it]

what is a woman poking a potato with? Answer the question using a single word or a short phrase with multiple words.
A knife
Fork


Epoch 1:  62%|██████▏   | 3999/6415 [2:49:39<1:19:47,  1.98s/it]

what is the boy doing? Answer the question using a single word or a short phrase with multiple words.
He is playing a violin
Play


Epoch 1:  64%|██████▍   | 4099/6415 [2:52:56<1:32:24,  2.39s/it]

what is a car spinning in? Answer the question using a single word or a short phrase with multiple words.
Smoke
Lot


Epoch 1:  65%|██████▌   | 4199/6415 [2:56:06<38:17,  1.04s/it]  

what is a man doing? Answer the question using a single word or a short phrase with multiple words.
He is going to heat his soup in microwave
Use


Epoch 1:  67%|██████▋   | 4299/6415 [2:58:49<53:03,  1.50s/it]  

what are people doing? Answer the question using a single word or a short phrase with multiple words.
People are spinning their bodies rapidly. The women are performing a type of dance with a rod, while the men are dancing around them.
Dance


Epoch 1:  69%|██████▊   | 4399/6415 [3:02:17<1:32:41,  2.76s/it]

what is a man placing in a cup containing tea and a woman? Answer the question using a single word or a short phrase with multiple words.
Sugar
Sugar


Epoch 1:  70%|███████   | 4499/6415 [3:05:26<56:06,  1.76s/it]  

who is pouring olive oil into a pan? Answer the question using a single word or a short phrase with multiple words.
Man
Someone


Epoch 1:  72%|███████▏  | 4599/6415 [3:08:11<1:40:40,  3.33s/it]

what is the cook doing? Answer the question using a single word or a short phrase with multiple words.
The video shows the cook pouring a brown liquid into a small black pot with food in it and then covering it.
Pmy


Epoch 1:  73%|███████▎  | 4699/6415 [3:11:21<1:02:12,  2.18s/it]

who is removing food from a pot? Answer the question using a single word or a short phrase with multiple words.
Woman
Man


Epoch 1:  75%|███████▍  | 4799/6415 [3:15:24<58:18,  2.16s/it]  

what is a woman seated by a lake pulled at by a black gloved hand? Answer the question using a single word or a short phrase with multiple words.
Grabbed
Ankle


Epoch 1:  76%|███████▋  | 4899/6415 [3:19:00<1:12:45,  2.88s/it]

what is karate kicking a person? Answer the question using a single word or a short phrase with multiple words.
Karate kicking is done with one leg and a swift movement, using body muscles for power and accuracy, and it's usually done in self-defense in combat or martial arts settings. In the video, we can see the man kicks the man lying on the floor, which is done for self defense by the person who is on the ground facing down.
Monkey


Epoch 1:  78%|███████▊  | 4999/6415 [3:24:44<37:20,  1.58s/it]  

who is washing his pet rat? Answer the question using a single word or a short phrase with multiple words.
Man
Person


Epoch 1:  79%|███████▉  | 5099/6415 [3:27:27<53:56,  2.46s/it]  

what does a man play? Answer the question using a single word or a short phrase with multiple words.
Piano
Keyboard


Epoch 1:  81%|████████  | 5199/6415 [3:30:58<31:30,  1.56s/it]  

what is a man playing while seated? Answer the question using a single word or a short phrase with multiple words.
Guitar
Guitar


Epoch 1:  83%|████████▎ | 5299/6415 [3:34:35<35:18,  1.90s/it]  

what are two players playing? Answer the question using a single word or a short phrase with multiple words.
Ping pong
Tenni


Epoch 1:  84%|████████▍ | 5399/6415 [3:39:21<34:18,  2.03s/it]  

what are two men doing? Answer the question using a single word or a short phrase with multiple words.
Fighting
Struggle


Epoch 1:  86%|████████▌ | 5499/6415 [3:41:50<31:46,  2.08s/it]

who plays with a plastic tea set? Answer the question using a single word or a short phrase with multiple words.
Toddler
Toddler


Epoch 1:  87%|████████▋ | 5599/6415 [3:43:54<13:11,  1.03it/s]

what is someone doing? Answer the question using a single word or a short phrase with multiple words.
There are two people, one person is petting a tabby cat that is laying on their lap and another person is gently petting the cat's head.
Pet


Epoch 1:  89%|████████▉ | 5699/6415 [3:47:26<35:40,  2.99s/it]

what is a man spooning onto a tray? Answer the question using a single word or a short phrase with multiple words.
Pancake
Pancake


Epoch 1:  90%|█████████ | 5799/6415 [3:50:30<10:14,  1.00it/s]

who is dancing outside? Answer the question using a single word or a short phrase with multiple words.
Girl
Woman


Epoch 1:  92%|█████████▏| 5899/6415 [3:52:32<06:57,  1.24it/s]

what is someone doing? Answer the question using a single word or a short phrase with multiple words.
Cutting
Cut


Epoch 1:  95%|█████████▌| 6099/6415 [3:56:42<13:34,  2.58s/it]

what is someone kneeding? Answer the question using a single word or a short phrase with multiple words.
Dough
Dough


Epoch 1:  97%|█████████▋| 6199/6415 [3:58:36<02:52,  1.25it/s]

what does a man cut a chicken with? Answer the question using a single word or a short phrase with multiple words.
Gun
Axe


Epoch 1:  98%|█████████▊| 6299/6415 [4:00:39<01:48,  1.07it/s]

what is putting a lizard into a box? Answer the question using a single word or a short phrase with multiple words.
Person
Someone


Epoch 1: 100%|█████████▉| 6399/6415 [4:03:05<00:17,  1.08s/it]

what is a man sliding across with a shopping cart? Answer the question using a single word or a short phrase with multiple words.
No idea
Floor


Epoch 1: 100%|██████████| 6415/6415 [4:03:26<00:00,  2.28s/it]


In [13]:
# Preview the first 30 model predictions.
y_pred[:30]

['Man',
 'Guy',
 'Man',
 'Chef',
 'Guy',
 'Man',
 'Chef',
 'Chef',
 'Butcher',
 'Chef',
 'Unknown',
 'Marinade',
 'Woman',
 'Man',
 'Sauce',
 'Person',
 'A cook pours sauce on slices of meat.',
 'Raw chicken',
 'Bag',
 'Plastic bag',
 "The video doesn't show what the marinade is made from.",
 'Chicken',
 'Tray',
 'Spices',
 'Sauce',
 'Sauce',
 'Seasoning',
 'Seasoning',
 'Chicken',
 'Ziplock']

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [14]:
# Preview the first 30 reference answers, for side-by-side comparison with the predictions.
y_real[:30]

['Someone',
 'Man',
 'Person',
 'Man',
 'Man',
 'Man',
 'Man',
 'Cook',
 'Man',
 'Person',
 'Person',
 'Seasoning',
 'Man',
 'Man',
 'Sauce',
 'Man',
 'Marinade',
 'Chicken',
 'Bag',
 'Bag',
 'Bowl',
 'Chicken',
 'Bag',
 'Marinade',
 'Sauce',
 'Marinade',
 'Marinade',
 'Sauce',
 'Meat',
 'Bag']

In [15]:
# Exact-match accuracy: a prediction counts only if it is string-identical to
# the reference (case-, whitespace- and article-sensitive, so e.g. 'A knife'
# would not match 'Knife').
if len(y_real) != len(y_pred):
    # zip() would silently truncate to the shorter list and score misaligned data.
    raise ValueError(
        f"y_real has {len(y_real)} items but y_pred has {len(y_pred)}; "
        "they must be aligned one-to-one"
    )
if not y_real:
    raise ValueError("no predictions to score")

result = [1 if i == j else 0 for i, j in zip(y_real, y_pred)]

print("VideoLAVIT_MSVDQA, accuracy is: " + str(sum(result) / len(result)))

VideoLAVIT_MSVDQA, accuracy is: 0.27030397505845677


## Evaluation by GPT-4

In [None]:
import os

import openai

# Read the API key from the environment so no credential is ever stored in the
# notebook (or leaked through version control / shared outputs).
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")
openai.api_key = api_key

# Define the test cases
test_cases = [
    {"input": "What is the capital of France?", "expected_output": "Paris"},
    {"input": "What is 2 + 2?", "expected_output": "4"},
    {"input": "Explain photosynthesis briefly.", "expected_output": "Photosynthesis is the process by which plants convert sunlight into energy."},
]

# Validate the model.
# NOTE(review): `openai.ChatCompletion` looks like the pre-1.0 SDK interface -
# confirm the installed `openai` version still provides it.
for test in test_cases:
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "user", "content": test["input"]}]
    )
    output = response['choices'][0]['message']['content'].strip()
    print(f"Input: {test['input']}")
    print(f"Expected: {test['expected_output']}")
    print(f"Output: {output}")
    # "Match" is a strict string equality, so a correct but differently
    # worded reply is reported as False.
    print(f"Match: {output == test['expected_output']}")
    print("-" * 30)