In [1]:
from gmflow.gmflow.gmflow import GMFlow

ModuleNotFoundError: No module named 'gmflow'

!gpustat

In [None]:
CREATE THE DATASET

In [2]:
import torch
import torchvision.transforms as transforms
import cv2 as cv
import pandas as pd
import os
import numpy as np
from gmflow.gmflow.gmflow import GMFlow
from datasets import Dataset, DatasetDict, Features, Image, Value, ClassLabel
from huggingface_hub import HfApi, HfFolder
from tqdm import tqdm
class VideoProcessor:
    def __init__(self, device):
        self.device = device
        self.model = None
        self.dataset = None

    def load_models(self, gm_loc: str) -> None:
        """
        Load GMFlow model for frame extraction.

        Args:
            gm_loc (str): Path to the model checkpoint for GMFlow.
        """
        self.model = GMFlow(
            feature_channels=128,
            num_scales=1,
            upsample_factor=8,
            num_head=1,
            attention_type="swin",
            ffn_dim_expansion=4,
            num_transformer_layers=6,
        ).to(self.device)
        print("Loading GMFlow")
        checkpoint = torch.load(gm_loc, map_location=self.device)
        weights = checkpoint["model"] if "model" in checkpoint else checkpoint
        self.model.load_state_dict(weights, strict=True)
        self.model.eval()

    def extract_relevant_frames_with_gmflow(self, video_loc, initial_frames_to_skip=4, initial_velocity_threshold=10, max_frames=3, min_frames=1):
        """
        Extract relevant frames based on optical flow using GMFlow with dynamic frames_to_skip and velocity_threshold.
    
        Args:
            video_loc (str): Path to the video file.
            initial_frames_to_skip (int): Initial number of frames to skip between each comparison.
            initial_velocity_threshold (float): Initial threshold to decide if the frame has significant motion.
            max_frames (int): Maximum number of frames to extract.
            min_frames (int): Minimum number of frames to extract.
    
        Returns:
            List of tuples with relevant frames (images) and the frame number.
        """
        video = cv.VideoCapture(video_loc)
        good_frames = []
        
        total_frames = int(video.get(cv.CAP_PROP_FRAME_COUNT))
        count = 0
        
        frames_to_skip = initial_frames_to_skip
        velocity_threshold = initial_velocity_threshold
    
        while True:
            ret1, frame1 = video.read()  
            for _ in range(frames_to_skip):
                ret2, frame2 = video.read()  
            
            if not ret1 or not ret2:
                break
            
            # Resize and prepare frames for GMFlow
            prev_frame = torch.from_numpy(cv.resize(frame1, (320, 160)).astype(np.float32)).permute(2, 0, 1).unsqueeze(0).to(self.device)
            next_frame = torch.from_numpy(cv.resize(frame2, (320, 160)).astype(np.float32)).permute(2, 0, 1).unsqueeze(0).to(self.device)
    
            with torch.no_grad():
                flow_results = self.model(prev_frame, next_frame, attn_splits_list=[2],
                                          corr_radius_list=[-1],
                                          prop_radius_list=[-1],
                                          pred_bidir_flow=False)
                flow = flow_results["flow_preds"][-1]
                velocity = torch.sqrt(flow[:, 0]**2 + flow[:, 1]**2)
                mean_velocity = velocity.mean().item()
            
            if mean_velocity > velocity_threshold:
                good_frames.append((count, frame2))  
            
            count += frames_to_skip
    
            if len(good_frames) < min_frames and frames_to_skip > 1:
                frames_to_skip -= 1
            elif len(good_frames) > max_frames and frames_to_skip < initial_frames_to_skip:
                frames_to_skip += 1
    
            # Adjust velocity threshold to fine-tune frame selection
            if len(good_frames) < min_frames:
                velocity_threshold = max(velocity_threshold - 1, 1) 
            elif len(good_frames) > max_frames:
                velocity_threshold += 1
    
        video.release()
        
        if len(good_frames) < min_frames:
            video.set(cv.CAP_PROP_POS_FRAMES, total_frames - 1)
            ret, last_frame = video.read()
            if ret:
                good_frames.append((total_frames - 1, last_frame))
        
        return good_frames[:max_frames]


    def process_video_dataset(self, csv_file, videos_dir):
        """
        Process the videos from the dataset and prepare relevant frames.

        Args:
            csv_file (str): CSV file containing the video names, questions, and answers.
            videos_dir (str): Directory containing the video files.
        
        Returns:
            Hugging Face dataset dictionary for uploading.
        """
        df = pd.read_csv(csv_file)
        data = []
        
        for index, row in tqdm(df.iterrows()):
            video_name = row['ID']+".mp4"
            video_path = os.path.join(videos_dir, video_name)

            if not os.path.exists(video_path):
                print(f"Video file {video_path} does not exist, skipping.")
                continue
            # print(f"Processing video: {video_name}")
            relevant_frames = self.extract_relevant_frames_with_gmflow(video_path)
            # print(len(relevant_frames))
            resize_transform = transforms.Resize((360,360))
            for frame_num, frame in relevant_frames:
                question = "<image>\nWhat complaint is conveyed by the user in this image?"
                answer = row['Label']
                if isinstance(frame, np.ndarray):
                    frame = PILImage.fromarray(cv.cvtColor(frame, cv.COLOR_BGR2RGB)) 
                frame=resize_transform(frame)
                f2=Image()
                frame=f2.encode_example(frame)
                data.append({
                    "image": frame,  
                    "ID": video_name+str(frame_num),
                    "question": question,
                    "answer": answer
                })
        features = Features({
            "image": Image(), 
            "ID": Value("string"),
            "question": Value("string"),
            "answer": Value("string")
        })
        return Dataset.from_pandas(pd.DataFrame(data), features=features)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



ModuleNotFoundError: No module named 'gmflow'

In [8]:
from PIL import Image as PILImage
video_processor = VideoProcessor("cuda")
gmflow_checkpoint = "gmflow/pretrained/gmflow_sintel-0c07dcb3.pth"  # Path to the GMFlow checkpoint
video_processor.load_models(gmflow_checkpoint)
csv_file = '/home/sarmistha/Testing2/Copy of New_200_text - Sheet1.csv'
videos_dir = '/home/sarmistha/Research/videos'
dataset = video_processor.process_video_dataset(csv_file, videos_dir)


Loading GMFlow


168it [09:58,  3.56s/it]


In [9]:
hf_repo_id = "cerelac2/consumer-complaint-vqa"
hf_token = "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" 
dataset.push_to_hub(hf_repo_id)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/411 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/cerelac2/consumer-complaint-vqa/commit/def97b9531b5e73c50f67bf180449857d2b1d527', commit_message='Upload dataset', commit_description='', oid='def97b9531b5e73c50f67bf180449857d2b1d527', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
!gpustat

In [8]:
from datasets import load_dataset, DatasetDict, Dataset
from tqdm import tqdm  # Import tqdm for the progress bar

# Load ds1 and ds2 datasets
ds1 = load_dataset("zera09/Video_complaint")
ds2 = load_dataset("cerelac2/consumer-complaint-vqa")

# Extract unique video names from ds1 train and test
train_video_names = set(row['video'] for row in ds1['train'])
test_video_names = set(row['video'] for row in ds1['test'])

# Initialize lists to store new train and test samples
new_train = []
new_test = []

# Function to check if an ID matches any video name in a set
def matches_video(video_names, id):
    return any(id.startswith(video) for video in video_names)

# Process ds2 splits all at once, filtering based on video names
print("Filtering ds2 for train split matches...")
for row in tqdm(ds2['train'], desc="Processing ds2"):
    if matches_video(train_video_names, row['ID']):
        new_train.append(row)
    elif matches_video(test_video_names, row['ID']):
        new_test.append(row)
for row in tqdm(ds2['test'], desc="Processing ds2"):
    if matches_video(train_video_names, row['ID']):
        new_train.append(row)
    elif matches_video(test_video_names, row['ID']):
        new_test.append(row)
for row in tqdm(ds2['validation'], desc="Processing ds2"):
    if matches_video(train_video_names, row['ID']):
        new_train.append(row)
    elif matches_video(test_video_names, row['ID']):
        new_test.append(row)
# Convert lists to Dataset format
new_train_dataset = Dataset.from_list(new_train)
new_test_dataset = Dataset.from_list(new_test)

# Create a new DatasetDict
new_dataset = DatasetDict({
    "train": new_train_dataset,
    "test": new_test_dataset
})

# Push new dataset to the Hugging Face Hub


Filtering ds2 for train split matches...


Processing ds2: 100%|██████████| 1280/1280 [00:07<00:00, 179.95it/s]
Processing ds2: 100%|██████████| 160/160 [00:00<00:00, 182.45it/s]
Processing ds2: 100%|██████████| 160/160 [00:01<00:00, 137.34it/s]


In [9]:
new_dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'ID', 'question', 'answer', 'emotion', 'sentiment', 'sarcasm'],
        num_rows: 1355
    })
    test: Dataset({
        features: ['image', 'ID', 'question', 'answer', 'emotion', 'sentiment', 'sarcasm'],
        num_rows: 245
    })
})

In [12]:
from huggingface_hub import notebook_login
# notebook_login()
new_dataset.push_to_hub("cerelac2/consumer-complaint-vqa")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1355 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/245 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/cerelac2/consumer-complaint-vqa/commit/441c28d7446789a3e57e3384e47e868b5a45045a', commit_message='Upload dataset', commit_description='', oid='441c28d7446789a3e57e3384e47e868b5a45045a', pr_url=None, pr_revision=None, pr_num=None)