In [None]:
import os

from huggingface_hub import login

# Never store an access token in the notebook itself: read it from the
# HF_TOKEN environment variable. When the variable is unset, `login` receives
# None and falls back to asking for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
import os
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, random_split
from transformers import Trainer, TrainingArguments
from transformers import VideoLlavaForConditionalGeneration, VideoLlavaProcessor, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import av
from sklearn.model_selection import train_test_split



def create_and_prepare_model(model_name):
    """Load a 4-bit quantized Video-LLaVA checkpoint and attach LoRA adapters.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
    Returns:
        The PEFT-wrapped model produced by ``get_peft_model``.
    """
    # NF4 4-bit weights, with float16 used as the compute dtype.
    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
    base_model = VideoLlavaForConditionalGeneration.from_pretrained(
        model_name,
        quantization_config=nf4_config,
        device_map="auto",
    )
    base_model = prepare_model_for_kbit_training(base_model)

    # Adapter hyper-parameters. Full candidate list of target modules:
    # ['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
    lora_settings = dict(
        r=64,
        lora_alpha=16,
        target_modules=["gate_proj", "o_proj", "k_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    return get_peft_model(base_model, LoraConfig(**lora_settings))


# Helper function to check if video file exists
def is_video_available(video_path):
    """Return True only when `video_path` points to an existing regular file.

    A directory that happens to carry the expected name is rejected, because
    it cannot be opened as a video later on.
    """
    return os.path.isfile(video_path)

def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.

    Frames come back in decode order. An index that is listed several times
    yields that frame several times, and indices past the last decodable frame
    are filled by repeating the last collected frame, so the result always
    holds ``len(indices)`` frames.

    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`Sequence[int]`): Frame indices to decode (a list or a NumPy integer array).
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    Raises:
        ValueError: If `indices` is empty or none of the requested frames could be decoded.
    '''
    wanted = [int(i) for i in indices]
    if not wanted:
        raise ValueError("indices must contain at least one frame index")

    # Number of copies requested for each frame index.
    repeats = {}
    for idx in wanted:
        repeats[idx] = repeats.get(idx, 0) + 1
    last_wanted = max(wanted)

    frames = []
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > last_wanted:
            break
        # Frames that were not requested have a repeat count of zero.
        frames.extend([frame] * repeats.get(i, 0))

    if not frames:
        raise ValueError("no requested frame could be decoded from the container")

    # The stream may hold fewer frames than requested; pad with the last one.
    frames.extend([frames[-1]] * (len(wanted) - len(frames)))
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

class VideoTextDataset(Dataset):
    """Video/text pairs encoded for causal-LM fine-tuning.

    Each item is the processor output for the chat prompt followed by the
    target text from the ``Text`` column, plus ``labels`` cloned from
    ``input_ids`` with padded positions set to -100.
    """

    def __init__(self, video_dir, df, num_frames=8, max_length=512):
        """
        Args:
            video_dir: Directory holding the ``<ID>.mp4`` files.
            df: DataFrame with at least the ``ID`` and ``Text`` columns.
            num_frames: Number of frames sampled from every video.
            max_length: Length the tokenized text is padded/truncated to.
        """
        self.video_dir = video_dir
        self.df = df
        self.processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
        self.num_frames = num_frames
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the encoded prompt, video frames and labels for row `idx`."""
        row = self.df.iloc[idx]
        video_file = row['ID'] + '.mp4'
        label = row['Text']

        video_path = os.path.join(self.video_dir, video_file)
        # The context manager closes the container even if decoding fails.
        with av.open(video_path) as container:
            total_frames = container.streams.video[0].frames
            indices = np.linspace(0, total_frames - 1, self.num_frames, dtype=int)
            video = read_video_pyav(container, list(indices))

        # The answer has to be part of the input sequence: a causal LM is
        # trained to predict each token from the tokens before it, so labels
        # tokenized separately from the prompt would not line up with input_ids.
        prompt = f"USER: <video>What complaint is conveyed by the user in the video? ASSISTANT: {label}"

        # NOTE(review): truncation to max_length also covers the <video>
        # placeholder tokens; confirm the limit is large enough for the
        # installed transformers version.
        inputs = self.processor(text=prompt, videos=video, return_tensors="pt", padding="max_length", max_length=self.max_length, truncation=True)

        # Remove the extra batch dimension added by the processor
        for k, v in list(inputs.items()):
            inputs[k] = v.squeeze(0)

        # Labels mirror the inputs; -100 excludes padded positions from the loss.
        labels = inputs['input_ids'].clone()
        labels[inputs['attention_mask'] == 0] = -100
        inputs['labels'] = labels

        return inputs

    def collate_fn(self, batch):
        """Stack every field of the items in `batch` along a new leading batch dimension.

        All keys of the first item are stacked, so the collator works with
        whatever pixel-value key the processor emits.
        """
        return {
            key: torch.stack([item[key] for item in batch])
            for key in batch[0].keys()
        }

def load_and_filter_dataset(csv_file, video_dir):
    """Read the annotation CSV and keep only rows whose video file is present.

    A ``video_path`` column (``<video_dir>/<ID>.mp4``) is added to the frame;
    the temporary ``exists`` flag used for filtering is removed again.

    Args:
        csv_file: Path of the CSV file; it must contain an ``ID`` column.
        video_dir: Directory that holds the ``.mp4`` files.
    Returns:
        A new DataFrame restricted to rows with an available video.
    """
    def _path_for(video_id):
        return os.path.join(video_dir, video_id+'.mp4')

    df = pd.read_csv(csv_file)
    df['video_path'] = df['ID'].apply(_path_for)
    df['exists'] = df['video_path'].apply(is_video_available)

    # Select the rows flagged as present, then discard the helper flag.
    filtered_df = df[df['exists']].drop(columns=['exists'])

    print(f"Loaded {len(df)} entries, {len(filtered_df)} valid video entries after filtering.")
    return filtered_df

def load_and_split_data(csv_file, video_dir, train_size=0.8, val_size=0.1, test_size=0.1):
    """Load the filtered dataset and cut it into train/validation/test frames.

    The test share is split off first; the validation share is then taken from
    the remainder, rescaled by ``val_size / (train_size + val_size)``. Both
    splits use ``random_state=42``.

    Returns:
        Tuple ``(train_df, val_df, test_df)``.
    """
    usable_df = load_and_filter_dataset(csv_file, video_dir)

    # Stage 1: hold out the test rows.
    train_val_df, test_df = train_test_split(usable_df, test_size=test_size, random_state=42)

    # Stage 2: divide what is left between training and validation.
    val_fraction = val_size/(train_size + val_size)
    train_df, val_df = train_test_split(train_val_df, test_size=val_fraction, random_state=42)

    print(f"Dataset splits: Train {len(train_df)}, Validation {len(val_df)}, Test {len(test_df)}")
    return train_df, val_df, test_df



In [None]:
# Checkpoint to fine-tune; the quantized, LoRA-wrapped model is built from it.
model_name = "LanguageBind/Video-LLaVA-7B-hf"
model = create_and_prepare_model(model_name=model_name)

In [None]:
# Locations of the video clips and of the annotation table
video_dir = '/videos'
csv_file = '/com-vid.csv'

# Split the usable rows, then wrap every split in its own Dataset object
train_df, val_df, test_df = load_and_split_data(csv_file, video_dir)
train_dataset, val_dataset, test_dataset = (
    VideoTextDataset(video_dir, split_df)
    for split_df in (train_df, val_df, test_df)
)


In [None]:
# Define training arguments
training_args = TrainingArguments(
    # Output location and checkpoint retention
    output_dir="./New_video_llava_qlora",
    save_steps=20,
    save_total_limit=1,
    push_to_hub=False,
    # Batch sizes and optimisation schedule
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=2e-4,
    optim='paged_adamw_32bit',
    fp16=False,
    # Evaluation and logging cadence
    eval_strategy="steps",
    eval_steps=10,
    logging_steps=1,
    metric_for_best_model="eval_loss",
    load_best_model_at_end=False,
    # Keep every dataset field so the collator receives all of them
    remove_unused_columns=False,
)


In [None]:
# Initialize Trainer
def _stack_examples(examples):
    """Stack each field of the dataset items along a new leading batch dimension."""
    return {key: torch.stack([example[key] for example in examples]) for key in examples[0]}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=_stack_examples,
)


In [None]:
# Run the fine-tuning loop before exporting: without this call the adapter
# written below would still hold its untrained initial weights.
trainer.train()

# Persist the trained adapter weights.
trainer.model.save_pretrained('/results')

In [3]:
# Generation: load the processor of the Video-LLaVA checkpoint; the cells below
# use it to encode the prompt and video frames and to decode generated tokens.


processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
print("Generating predictions...")

In [None]:
import av
import numpy as np
def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.

    Frames come back in decode order. An index that is listed several times
    yields that frame several times, and indices past the last decodable frame
    are filled by repeating the last collected frame, so the result always
    holds ``len(indices)`` frames.

    Args:
        container (av.container.input.InputContainer): PyAV container.
        indices (Sequence[int]): Frame indices to decode (a list or a NumPy integer array).
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    Raises:
        ValueError: If `indices` is empty or none of the requested frames could be decoded.
    '''
    wanted = [int(i) for i in indices]
    if not wanted:
        raise ValueError("indices must contain at least one frame index")

    # Number of copies requested for each frame index.
    repeats = {}
    for idx in wanted:
        repeats[idx] = repeats.get(idx, 0) + 1
    last_wanted = max(wanted)

    frames = []
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > last_wanted:
            break
        # Frames that were not requested have a repeat count of zero.
        frames.extend([frame] * repeats.get(i, 0))

    if not frames:
        raise ValueError("no requested frame could be decoded from the container")

    # The stream may hold fewer frames than requested; pad with the last one.
    frames.extend([frames[-1]] * (len(wanted) - len(frames)))
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

num_frames=8
max_length=512
video_dir = '/videos'


def get_inputs(row):
    """Encode the complaint prompt and the sampled frames of one video for generation.

    Args:
        row: Record with an ``ID`` entry naming the ``<ID>.mp4`` file inside
            ``video_dir``. No label column is needed at inference time.
    Returns:
        The processor output for the prompt and the video frames.
    """
    video_path = os.path.join(video_dir, row['ID'] + '.mp4')

    # The context manager closes the container even if decoding fails.
    with av.open(video_path) as container:
        total_frames = container.streams.video[0].frames
        indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
        video = read_video_pyav(container, list(indices))

    prompt = "USER: <video>What complaint is conveyed by the user in the video? ASSISTANT:"

    # NOTE(review): the single prompt is padded to max_length; confirm the
    # tokenizer pads on the left, otherwise generation continues after padding.
    return processor(text=prompt, videos=video, return_tensors="pt", padding="max_length", max_length=max_length, truncation=True)

import pandas as pd

df = pd.read_csv('/test.csv')

prediction_list=[]

# Switch to inference mode so dropout (LoRA dropout included) is disabled.
model.eval()

for idx, row in df.iterrows():
    # The encoded inputs are created on the CPU; move them to the model's
    # device before generating.
    inputs = get_inputs(row).to(model.device)
    out = model.generate(**inputs, max_new_tokens=100)

    prediction = processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(row['ID'], prediction)
    prediction_list.append(prediction)

In [4]:
# Shell escape: run `gpustat` through `srun` and show its output below.
!srun gpustat

dgx01                     Tue Aug 26 15:48:22 2025  535.183.06
[0] NVIDIA A100-SXM4-80GB | 30°C,  ?? % |     1 / 81920 MB |
[1] NVIDIA A100-SXM4-80GB | 29°C,  ?? % |    87 / 81920 MB |
[2] NVIDIA A100-SXM4-80GB | 30°C,  ?? % |    87 / 81920 MB |
[3] NVIDIA A100-SXM4-80GB | 31°C,  ?? % |    87 / 81920 MB |
[4] NVIDIA A100-SXM4-80GB | 34°C,  ?? % | 52225 / 81920 MB | 65070044(52212M)
[5] NVIDIA A100-SXM4-80GB | 32°C,  ?? % |  3837 / 81920 MB | sarmistha(3736M)
[6] NVIDIA A100-SXM4-80GB | 57°C,  ?? % | 40483 / 81920 MB | yap(10004M) praphan(30454M)
[7] NVIDIA A100-SXM4-80GB | 38°C,  ?? % | 79086 / 81920 MB | sarmistha(31640M) sarmistha(33026M) sarmistha(13114M)
