# Analyzing the Qualcomm dataset

This is a very large dataset with more than a thousand labeled classes.

In [None]:
import json
from collections import Counter
import re

# Load the annotations that the rest of the analysis works on.
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of depending on the locale's default.
with open("fine_grained_labels.json", "r", encoding="utf-8") as f:
    data = json.load(f)

## Get top labels

In [None]:
# Flatten all labels: every item carries a list under "labels", so collect
# the entries of all items into one flat list.
all_labels = [label for item in data for label in item["labels"]]

# Count occurrences
label_counts = Counter(all_labels)

# Print the 100 most common classes
for label, count in label_counts.most_common(100):
    print(f"{label}: {count}")

fire hydrant - head straight: 3377
fire hydrant - knee on ground: 3343
fire hydrant - arms shoulder width: 3043
squats - shoulder-width: 2794
side plank - elbow below shoulder: 2639
fire hydrant - no obvious issue: 2161
fire hydrant (standing) - head straight: 2148
raised leg circles (clockwise) - clockwise: 2006
squats - no obvious issue: 1839
downward dog - heels off the floor: 1804
squat kick - speed=0.40 rps: 1794
side plank - both knees on floor: 1643
squats - 90 degrees: 1641
fire hydrant (standing) - shin not parallel: 1539
pushups (on knees) - wide: 1396
raised leg circles (clockwise) - 70 degres: 1365
fire hydrant (standing) - torso too high: 1339
quad stretch (right) - getting into position: 1274
pushups (on knees) - 90 degrees: 1265
puddle jump - normal width: 1264
alternating forward lunges - no obvious issue: 1263
air jump rope - no obvious issue: 1261
toe touch - legs straight: 1209
alternating forward lunges - normal speed: 1207
quick feet - shoulder-width: 1194
squat ki

In [None]:
# Report the number of distinct labels, then every label seen more than
# 100 times, and finally how many labels passed that threshold.
print(len(label_counts))

frequent_labels = [(name, total) for name, total in label_counts.items() if total > 100]
for label, count in frequent_labels:
    print('Label: ', label, ' Count: ', count)

counter = len(frequent_labels)
print(counter)

1851
Label:  elbow plank - stopping early  Count:  386
Label:  elbow plank - arms facing inward and legs and hips on the floor  Count:  361
Label:  elbow plank - arms too wide  Count:  411
Label:  elbow plank - legs and hips on the floor  Count:  463
Label:  elbow plank - shaking  Count:  328
Label:  elbow plank - arms facing inward and on knees  Count:  359
Label:  alternating v ups - no reach  Count:  300
Label:  alternating v ups - rom=1  Count:  702
Label:  criss-cross (feet on the floor) - feet on the floor  Count:  584
Label:  alternating v ups - touching wrong foot  Count:  294
Label:  alternating v ups - rom=5  Count:  713
Label:  alternating v ups - butt off the ground  Count:  289
Label:  alternating v ups - rom=4  Count:  553
Label:  alternating v ups - stopping early  Count:  236
Label:  alternating v ups - legs too low  Count:  288
Label:  child pose - hands as far as possible  Count:  253
Label:  criss-cross (feet on the floor) - torso_rotation=4  Count:  172
Label:  the 

In [None]:
tempo_keywords = ['slow', 'fast', 'tempo', 'controlled', 'jerky', 'shaking', 'explosive', 'pause', 'unstable', 'swing', 'bounce']

# Keep every clip for which at least one descriptive label contains at least
# one of the tempo keywords (case-insensitive substring match).
tempo_related = []
for item in data:
    lowered_descriptions = (text.lower() for text in item["labels_descriptive"])
    if any(kw in text for text in lowered_descriptions for kw in tempo_keywords):
        tempo_related.append(item)

print(f"Found {len(tempo_related)} tempo-related clips.")

# Optional: print some examples
for item in tempo_related[:5]:
    print(item["video_path"], "-", item["labels_descriptive"])


Found 30417 tempo-related clips.
./00000004.mp4 - ['elbow plank - User is shaking']
./00000021.mp4 - ['alternating v ups - User is going as fast as possible', 'alternating v ups - rom=5']
./00000041.mp4 - ['the hundred (feet on the floor) - head up - hunched', 'the hundred (feet on the floor) - User is moving their arms as fast as possible']
./00000043.mp4 - ["the hundred (feet on the floor) - User's head is up", 'the hundred (feet on the floor) - User is moving their arms as fast as possible']
./00000052.mp4 - ["the hundred (table-top position) - User's head is up", "the hundred (table-top position) - User's legs are bent 90 degrees", 'the hundred (table-top position) - User is moving their arms as fast as possible']


In [None]:
def _is_pushup_label(label):
    """Return True when *label* names a push-up exercise.

    A plain ``"pushup" in label`` test also accepts labels such as
    "burpee (no pushup) - ...", which mention the word only to say the
    exercise contains no push-up. Those labels are rejected here.
    """
    lowered = label.lower()
    return "pushup" in lowered and "no pushup" not in lowered


# Clips that carry at least one genuine push-up label
pushup_clips = [item for item in data if any(_is_pushup_label(label) for label in item["labels"])]
print(f"Push-up clips: {len(pushup_clips)}")

# You can then filter those with tempo cues
pushup_with_tempo = [item for item in pushup_clips if any(
    any(kw in desc.lower() for kw in tempo_keywords) for desc in item["labels_descriptive"]
)]

for item in pushup_with_tempo[:5]:
    print(item["video_path"], "-", item["labels_descriptive"])


Push-up clips: 19526
./00001222.mp4 - ["burpee (no pushup) - User isn't stepping their feet back", 'burpee (no pushup) - User is going as fast as possible']
./00003041.mp4 - ['pushups - User is going fast', 'pushups - Push-ups are too shallow', "pushups - User has good arm placement that's shoulder width apart"]
./00003542.mp4 - ['spider man pushup - User is going too fast', 'spider man pushup - rom=1']
./00004152.mp4 - ['burpee (no pushup) - User has good form', 'burpee (no pushup) - User is going as fast as possible']
./00004902.mp4 - ["burpee (no pushup) - User's hips are too high", 'burpee (no pushup) - User is going as fast as possible']


In [None]:
def _clip_number(item):
    """Return the numeric id encoded in the file name of item["video_path"]."""
    file_name = item["video_path"].split("/")[-1]
    return int(file_name.split(".")[0])


# Filter items by ID >= 228000
subset = [item for item in data if _clip_number(item) >= 228000]

print(f"Subset size from 228000 onwards: {len(subset)}")

# Gather every label of the subset into one flat list before counting
all_labels_subset = []
for item in subset:
    all_labels_subset.extend(item["labels"])

label_counts_subset = Counter(all_labels_subset)

print("\nTop labels in subset:")
for label, count in label_counts_subset.most_common(30):
    print(f"{label}: {count}")

Subset size from 228000 onwards: 70089

Top labels in subset:
squats - shoulder-width: 932
squat kick - speed=0.40 rps: 801
fire hydrant - knee on ground: 756
fire hydrant - head straight: 740
fire hydrant - arms shoulder width: 676
squats - no obvious issue: 569
squats - 90 degrees: 529
fire hydrant (standing) - head straight: 527
squat kick - not lifting knees: 510
pushups (on knees) - wide: 489
fire hydrant - no obvious issue: 472
alternating forward lunges - normal speed: 466
alternating forward lunges - no obvious issue: 418
alternating forward lunges - normal width: 402
pushups (on knees) - 90 degrees: 399
squat jump - no obvious issue: 398
raised leg circles (clockwise) - clockwise: 393
fire hydrant (standing) - torso too high: 392
jumping lunges - medium steps: 384
fire hydrant (standing) - shin not parallel: 383
quad stretch (right) - getting into position: 378
jumping lunges - speed=0.80 rps: 354
puddle jump - normal width: 344
quick feet - shoulder-width: 341
pushups - wide:

## Get subset from the Qualcomm dataset

In [None]:
import json

# Load the QEVD annotations.
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of depending on the locale's default.
with open("fine_grained_labels.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Phrases that flag each kind of problem in a descriptive label
tempo_keywords = ["too fast", "too slow"]
form_keywords = ["sloppy form", "not straight"]
rom_keywords = ["low range of motion"]

# Function to check if any keywords exist in a text
def matches_keywords(text, keywords):
    """Return True if any keyword occurs in *text*, ignoring case.

    Args:
        text: String to search.
        keywords: Iterable of substrings to look for.

    Returns:
        True when at least one keyword is a substring of *text* after both
        have been lower-cased; False otherwise (including when *keywords*
        is empty).
    """
    # Lower-case the text once instead of once per keyword.
    haystack = text.lower()
    # Lower-casing the keyword as well lets mixed-case keywords match.
    return any(kw.lower() in haystack for kw in keywords)

# Filter videos
bad_tempo, bad_form, bad_rom = [], [], []

# Buckets are tried in this order and the first match wins, so every clip
# lands in at most one list (tempo takes precedence over form, form over rom).
keyword_buckets = (
    (tempo_keywords, bad_tempo),
    (form_keywords, bad_form),
    (rom_keywords, bad_rom),
)

for item in data:
    labels_text = " ".join(item["labels_descriptive"]).lower()
    for bucket_keywords, bucket in keyword_buckets:
        if matches_keywords(labels_text, bucket_keywords):
            bucket.append(item)
            break

print(f"Bad Tempo: {len(bad_tempo)}")
print(f"Bad Form: {len(bad_form)}")
print(f"Bad Range of Motion: {len(bad_rom)}")


# Testing InternVideo (Bad videos only)

In [None]:
# Install dependencies
!pip install -q decord transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m112.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
import matplotlib.pyplot as plt
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# model setting
model_path = 'OpenGVLab/InternVideo2_5_Chat_8B'

# NOTE(review): trust_remote_code executes code downloaded from the Hub; the
# download log recommends pinning a revision to avoid silently picking up new
# versions of that code.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Load the weights directly in bfloat16. Without torch_dtype the checkpoint is
# first materialised in float32, and the former .half() step additionally
# round-tripped every weight through float16 before the final bfloat16 cast.
# The trailing .to(torch.bfloat16) keeps the previous end state, in which
# every floating-point parameter and buffer is bfloat16.
model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
).cuda().to(torch.bfloat16)

# Per-channel normalisation statistics used by build_transform
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Build the preprocessing pipeline applied to every image tile.

    The pipeline converts non-RGB images to RGB, resizes them to
    ``input_size`` x ``input_size`` with bicubic interpolation, converts them
    to a tensor and normalises with IMAGENET_MEAN / IMAGENET_STD.
    """
    def _ensure_rgb(img):
        return img if img.mode == "RGB" else img.convert("RGB")

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)


def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the (cols, rows) grid whose aspect ratio is closest to the image's.

    Candidates are scanned in the given order. A strictly closer candidate
    always replaces the current choice; a candidate that ties on distance
    replaces it only when the image area exceeds half the area of that
    candidate's grid (image_size * image_size * cols * rows). Returns (1, 1)
    when ``target_ratios`` is empty.
    """
    pixel_area = width * height
    chosen = (1, 1)
    smallest_gap = float("inf")
    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
            continue
        # Tie on distance: prefer this grid only for sufficiently large images.
        if gap == smallest_gap and pixel_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            chosen = candidate
    return chosen


def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
    """Resize *image* to a grid of square tiles and return the tiles.

    The grid is the (cols, rows) layout, with a tile count between ``min_num``
    and ``max_num``, that find_closest_aspect_ratio selects for the image.
    The image is resized to exactly fill that grid and cut into
    ``image_size`` x ``image_size`` tiles in row-major order. When
    ``use_thumbnail`` is set and more than one tile was produced, a resized
    copy of the whole image is appended as an extra tile.
    """
    src_width, src_height = image.size
    src_aspect = src_width / src_height

    # Every (cols, rows) layout whose tile count lies in [min_num, max_num],
    # ordered by tile count.
    grid_candidates = set(
        (cols, rows)
        for n in range(min_num, max_num + 1)
        for cols in range(1, n + 1)
        for rows in range(1, n + 1)
        if cols * rows <= max_num and cols * rows >= min_num
    )
    grid_candidates = sorted(grid_candidates, key=lambda grid: grid[0] * grid[1])

    best_grid = find_closest_aspect_ratio(src_aspect, grid_candidates, src_width, src_height, image_size)

    canvas_width = image_size * best_grid[0]
    canvas_height = image_size * best_grid[1]
    tile_count = best_grid[0] * best_grid[1]

    resized = image.resize((canvas_width, canvas_height))
    tiles_per_row = canvas_width // image_size
    tiles = []
    for tile_idx in range(tile_count):
        row, col = divmod(tile_idx, tiles_per_row)
        crop_box = (
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size,
        )
        tiles.append(resized.crop(crop_box))
    assert len(tiles) == tile_count

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles


def load_image(image, input_size=448, max_num=6):
    """Tile *image* (thumbnail included) and return the stacked tile tensors."""
    to_tensor = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([to_tensor(tile) for tile in tiles])


def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
    """Return ``num_segments`` frame indices spread evenly over a time window.

    ``bound`` is an optional (start, end) pair that is multiplied by ``fps``
    to obtain frame numbers; a falsy ``bound`` selects the widest possible
    window. The window is clamped to [first_idx, max_frame] and split into
    equal segments; each index is the segment's rounded start plus half a
    segment, truncated to an integer.
    """
    start, end = (bound[0], bound[1]) if bound else (-100000, 100000)
    window_start = max(first_idx, round(start * fps))
    window_end = min(round(end * fps), max_frame)
    segment_len = float(window_end - window_start) / num_segments
    segment_offsets = np.round(segment_len * np.arange(num_segments))
    return (window_start + (segment_len / 2) + segment_offsets).astype(int)

def get_num_frames_by_duration(duration):
    """Return a frame count derived from *duration*, clamped to [128, 512].

    The count is 4 times the number of whole 4-unit segments in
    ``duration`` (or 4 when there is no whole segment) before clamping.
    """
    frames_per_segment = 4
    whole_segments = int(duration // frames_per_segment)
    wanted = frames_per_segment * whole_segments if whole_segments else frames_per_segment
    return max(128, min(512, wanted))

def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32, get_frame_by_duration = False):
    """Sample frames from a video and return them as preprocessed tiles.

    Frame indices come from get_index; when ``get_frame_by_duration`` is set,
    ``num_segments`` is replaced by get_num_frames_by_duration of the clip
    length. Every sampled frame is tiled with dynamic_preprocess and
    transformed with build_transform.

    Returns:
        (pixel_values, num_patches_list): the tile tensors of all frames
        concatenated along the first dimension, and the number of tiles
        contributed by each frame.
    """
    reader = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    last_frame = len(reader) - 1
    fps = float(reader.get_avg_fps())

    to_tensor = build_transform(input_size=input_size)
    if get_frame_by_duration:
        num_segments = get_num_frames_by_duration(last_frame / fps)

    per_frame_tiles = []
    for frame_index in get_index(bound, fps, last_frame, first_idx=0, num_segments=num_segments):
        frame = Image.fromarray(reader[frame_index].asnumpy()).convert("RGB")
        tiles = dynamic_preprocess(frame, image_size=input_size, use_thumbnail=True, max_num=max_num)
        per_frame_tiles.append(torch.stack([to_tensor(tile) for tile in tiles]))

    num_patches_list = [stacked.shape[0] for stacked in per_frame_tiles]
    pixel_values = torch.cat(per_frame_tiles)
    return pixel_values, num_patches_list

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

tokenization_internlm2.py:   0%|          | 0.00/8.79k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVideo2_5_Chat_8B:
- tokenization_internlm2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer.model:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/5.58k [00:00<?, ?B/s]

configuration_internvl_chat.py:   0%|          | 0.00/4.03k [00:00<?, ?B/s]

configuration_intern_vit.py:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVideo2_5_Chat_8B:
- configuration_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


configuration_internlm2.py:   0%|          | 0.00/7.00k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVideo2_5_Chat_8B:
- configuration_internlm2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVideo2_5_Chat_8B:
- configuration_internvl_chat.py
- configuration_intern_vit.py
- configuration_internlm2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_internvl_chat_hico2.py:   0%|          | 0.00/19.4k [00:00<?, ?B/s]

modeling_internlm2.py:   0%|          | 0.00/61.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVideo2_5_Chat_8B:
- modeling_internlm2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


conversation.py:   0%|          | 0.00/15.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVideo2_5_Chat_8B:
- conversation.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_intern_vit.py:   0%|          | 0.00/18.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVideo2_5_Chat_8B:
- modeling_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVideo2_5_Chat_8B:
- modeling_internvl_chat_hico2.py
- modeling_internlm2.py
- conversation.py
- modeling_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


FlashAttention2 is not installed.




model.safetensors.index.json:   0%|          | 0.00/51.2k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.03G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

InternLM2ForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

In [None]:
import os
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!ls /content/drive/MyDrive/VR/Visual_Recognition/subset

bad_form  bad_tempo		  selected_bad_rom.json
bad_rom   selected_bad_form.json  selected_bad_tempo.json


In [None]:
!ls /content/drive/MyDrive/VR/Visual_Recognition/good_bad_videos

bad_form_250   good_rom_250		   selected_bad_tempo_250.json
bad_rom_250    good_tempo_250		   selected_good_form_250.json
bad_tempo_250  selected_bad_form_250.json  selected_good_rom_250.json
good_form_250  selected_bad_rom_250.json   selected_good_tempo_250.json


In [None]:
import csv

# Setup: where the video subsets live and which ground-truth reason each
# sub-folder stands for.
root_folder = "/content/drive/MyDrive/VR/Visual_Recognition/subset"
categories = {"bad_tempo": "tempo", "bad_form": "form", "bad_rom": "rom"}

# Sanity check: report whether each expected sub-folder is present
for subfolder in categories:
    folder_path = os.path.join(root_folder, subfolder)
    status = "✅ Folder exists" if os.path.isdir(folder_path) else "❌ Folder NOT found"
    print(f"{status}: {folder_path}")

✅ Folder exists: /content/drive/MyDrive/VR/Visual_Recognition/subset/bad_tempo
✅ Folder exists: /content/drive/MyDrive/VR/Visual_Recognition/subset/bad_form
✅ Folder exists: /content/drive/MyDrive/VR/Visual_Recognition/subset/bad_rom


## Zero-Shot Prompt

In [None]:
csv_path = "exercise_eval_results.csv"
header = ["filename", "predicted_label", "predicted_reason", "ground_truth_label", "ground_truth_reason", "correct"]

# Generation Config
# NOTE(review): with do_sample=False decoding is presumably greedy, so
# temperature / top_p should have no effect — confirm against the
# transformers generation documentation.
max_num_frames = 512
generation_config = dict(
    do_sample=False,
    temperature=0.0,
    max_new_tokens=512,
    top_p=0.1,
    num_beams=1
)

# Words that identify each failure reason on the second answer line
_REASON_CUES = {
    "tempo": {"tempo"},
    "form": {"form"},
    "rom": {"rom", "range", "motion"},
}


def _parse_model_answer(output):
    """Turn the model's reply into (predicted_label, predicted_reason, explanation).

    The reply is expected to hold up to three non-empty lines: a Yes/No
    verdict, the failure reason, and a one-sentence explanation.

    Matching is done on whole words, so "rom" no longer matches inside
    "from" and "no" no longer matches inside "not". A line that names
    several options at once (for example the echoed template
    "Tempo / Form / ROM" or "Yes / No") is ambiguous and is reported as
    unclear instead of being credited to whichever option is tested first.
    """
    import re  # local import so this cell does not depend on an earlier cell

    lines = [line.strip() for line in output.strip().splitlines() if line.strip()]
    if not lines:
        return "Unclear", "unclear", ""

    verdict_words = set(re.findall(r"[a-z]+", lines[0].lower()))
    said_yes = "yes" in verdict_words
    said_no = "no" in verdict_words
    if said_yes == said_no:
        # Neither word present, or both ("Yes / No" echoed back).
        return "Unclear", "unclear", ""
    if said_yes:
        return "Good", "none", ""

    predicted_reason = "unclear"
    if len(lines) >= 2:
        reason_words = set(re.findall(r"[a-z]+", lines[1].lower()))
        mentioned = [name for name, cues in _REASON_CUES.items() if reason_words & cues]
        if len(mentioned) == 1:
            predicted_reason = mentioned[0]
    explanation = lines[2] if len(lines) >= 3 else ""
    return "Bad", predicted_reason, explanation


# CSV Setup
with open(csv_path, mode='w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)

    for subfolder, ground_truth_reason in categories.items():
        folder_path = os.path.join(root_folder, subfolder)
        # sorted() makes the evaluation order, and so the CSV row order, reproducible.
        for filename in sorted(os.listdir(folder_path)):
            if filename.endswith(".mp4"):
                video_path = os.path.join(folder_path, filename)
                print(f"\nEvaluating: {subfolder}/{filename}")
                try:
                    with torch.no_grad():
                        pixel_values, num_patches_list = load_video(video_path, num_segments=64, max_num=1)
                        pixel_values = pixel_values.to(torch.bfloat16).to(model.device)
                        video_prefix = "".join([f"Frame{i+1}: <image>\n" for i in range(len(num_patches_list))])

                        # The prompt text, including its leading whitespace, is sent to the model verbatim.
                        prompt = f"""We are evaluating whether a video clip of an exercise is being performed correctly for hypertrophy-focused training.
                        There are three main criteria to assess:

                        - **Tempo**: This refers to the speed of movement during each phase of the exercise. For optimal hypertrophy, the eccentric (lowering) phase should be slow and controlled, while the concentric (lifting) phase should be fast and explosive.

                        - **Form**: This includes the alignment, posture, and technique of the movement. Proper form means the exercise is performed with the correct joint positioning, body alignment, and grip/stance. Common issues include flaring elbows, improper back posture, or hands/feet placed too wide or narrow.

                        - **Range of Motion (ROM)**: This measures whether the exercise goes through the full intended motion. A proper ROM means the movement starts from a full stretch and ends in a full contraction, without cutting the motion short or stopping midway.

                        Now, assess the following:
                        **Is this exercise being done correctly?**
                        Respond with only the following format:
                        1. Yes / No
                        2. (If No) tempo / form / rom
                        3. Briefly explain the issue in one sentence.
                        """

                        question = video_prefix + prompt
                        output, _ = model.chat(
                            tokenizer, pixel_values, question, generation_config,
                            num_patches_list=num_patches_list, history=None, return_history=True
                        )
                        print("Model Output: \n", output)

                        predicted_label, predicted_reason, explanation = _parse_model_answer(output)

                        print("Predicted Label:", predicted_label)
                        print("Predicted Reason:", predicted_reason)
                        print("Explanation:", explanation)
                        print("Ground Truth Reason:", ground_truth_reason)

                        writer.writerow([
                            filename,
                            predicted_label,
                            predicted_reason,
                            "Bad",
                            ground_truth_reason,
                            predicted_label == "Bad" and predicted_reason == ground_truth_reason
                        ])
                        # Flush per row so an interrupted run keeps the results gathered so far.
                        f.flush()
                except Exception as e:
                    # Best effort: report the failing clip and continue with the next one.
                    print(f"Error processing {filename}: {e}")



Evaluating: bad_tempo/00012249.mp4
Model Output: 
 1. No
2. Form
3. The person's elbows are flaring out too much, which can put unnecessary stress on the joints and muscles.
Predicted Label: Bad
Predicted Reason: form
Explanation: 3. The person's elbows are flaring out too much, which can put unnecessary stress on the joints and muscles.
Ground Truth Reason: tempo

Evaluating: bad_tempo/00011556.mp4
Model Output: 
 1. No
2. Form
3. The person's back is arched, which can lead to improper form and potential strain.
Predicted Label: Bad
Predicted Reason: form
Explanation: 3. The person's back is arched, which can lead to improper form and potential strain.
Ground Truth Reason: tempo

Evaluating: bad_tempo/00016532.mp4
Model Output: 
 1. No
2. Tempo / Form / ROM
3. The exercise lacks proper form, with the person's lower back arching excessively and the movement not being controlled.
Predicted Label: Bad
Predicted Reason: tempo
Explanation: 3. The exercise lacks proper form, with the perso

## Chain of Thought Prompt

In [None]:
csv_path = "exercise_eval_results_part2.csv"
header = ["filename", "predicted_label", "predicted_reason", "ground_truth_label", "ground_truth_reason", "correct"]

# Generation Config
# NOTE(review): with do_sample=False decoding is presumably greedy, so
# temperature / top_p should have no effect — confirm against the
# transformers generation documentation.
max_num_frames = 512
generation_config = dict(
    do_sample=False,
    temperature=0.0,
    max_new_tokens=512,
    top_p=0.1,
    num_beams=1
)

# Words that identify each failure reason on the second answer line
_REASON_CUES = {
    "tempo": {"tempo"},
    "form": {"form"},
    "rom": {"rom", "range", "motion"},
}


def _parse_model_answer(output):
    """Turn the model's reply into (predicted_label, predicted_reason, explanation).

    The reply is expected to hold up to three non-empty lines: a Yes/No
    verdict, the failure reason, and a one-sentence explanation.

    Matching is done on whole words, so "rom" no longer matches inside
    "from" and "no" no longer matches inside "not". A line that names
    several options at once (for example the echoed template
    "tempo / form / rom" or "Yes / No") is ambiguous and is reported as
    unclear instead of being credited to whichever option is tested first.
    """
    import re  # local import so this cell does not depend on an earlier cell

    lines = [line.strip() for line in output.strip().splitlines() if line.strip()]
    if not lines:
        return "Unclear", "unclear", ""

    verdict_words = set(re.findall(r"[a-z]+", lines[0].lower()))
    said_yes = "yes" in verdict_words
    said_no = "no" in verdict_words
    if said_yes == said_no:
        # Neither word present, or both ("Yes / No" echoed back).
        return "Unclear", "unclear", ""
    if said_yes:
        return "Good", "none", ""

    predicted_reason = "unclear"
    if len(lines) >= 2:
        reason_words = set(re.findall(r"[a-z]+", lines[1].lower()))
        mentioned = [name for name, cues in _REASON_CUES.items() if reason_words & cues]
        if len(mentioned) == 1:
            predicted_reason = mentioned[0]
    explanation = lines[2] if len(lines) >= 3 else ""
    return "Bad", predicted_reason, explanation


# CSV Setup
with open(csv_path, mode='w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)

    for subfolder, ground_truth_reason in categories.items():
        folder_path = os.path.join(root_folder, subfolder)
        # sorted() makes the evaluation order, and so the CSV row order, reproducible.
        for filename in sorted(os.listdir(folder_path)):
            if filename.endswith(".mp4"):
                video_path = os.path.join(folder_path, filename)
                print(f"\nEvaluating: {subfolder}/{filename}")
                try:
                    with torch.no_grad():
                        pixel_values, num_patches_list = load_video(video_path, num_segments=64, max_num=1)
                        pixel_values = pixel_values.to(torch.bfloat16).to(model.device)
                        video_prefix = "".join([f"Frame{i+1}: <image>\n" for i in range(len(num_patches_list))])

                        # The prompt text, including its leading whitespace, is sent to the model verbatim.
                        prompt = f"""

                        We are evaluating whether a video clip of an exercise is being performed correctly for hypertrophy training.

                        Step-by-step:
                        1. Analyze the **tempo**. Is the eccentric phase slow and the concentric phase explosive?
                        2. Analyze the **form**. Are the joints aligned properly? Are the hands in correct position?
                        3. Analyze the **range of motion**. Does the movement go from full extension to full contraction?

                        Now, assess the following:
                        **Is this exercise being done correctly?**
                        Respond with only the following format:
                        1. Yes / No
                        2. (If No) tempo / form / rom
                        3. Briefly explain the issue in one sentence.
                        """

                        question = video_prefix + prompt
                        output, _ = model.chat(
                            tokenizer, pixel_values, question, generation_config,
                            num_patches_list=num_patches_list, history=None, return_history=True
                        )
                        print("Model Output: \n", output)

                        predicted_label, predicted_reason, explanation = _parse_model_answer(output)

                        print("Predicted Label:", predicted_label)
                        print("Predicted Reason:", predicted_reason)
                        print("Explanation:", explanation)
                        print("Ground Truth Reason:", ground_truth_reason)

                        writer.writerow([
                            filename,
                            predicted_label,
                            predicted_reason,
                            "Bad",
                            ground_truth_reason,
                            predicted_label == "Bad" and predicted_reason == ground_truth_reason
                        ])
                        # Flush per row so an interrupted run keeps the results gathered so far.
                        f.flush()
                except Exception as e:
                    # Best effort: report the failing clip and continue with the next one.
                    print(f"Error processing {filename}: {e}")



Evaluating: bad_tempo/00012249.mp4




Model Output: 
 1. No
2. Form
3. The person's back is arched, and the hands are not aligned properly.
Predicted Label: Bad
Predicted Reason: form
Explanation: 3. The person's back is arched, and the hands are not aligned properly.
Ground Truth Reason: tempo

Evaluating: bad_tempo/00011556.mp4
Model Output: 
 1. No
2. tempo / form / rom
3. The exercise lacks a clear eccentric and concentric phase, and the form is not optimal with the hands positioned too far apart.
Predicted Label: Bad
Predicted Reason: tempo
Explanation: 3. The exercise lacks a clear eccentric and concentric phase, and the form is not optimal with the hands positioned too far apart.
Ground Truth Reason: tempo

Evaluating: bad_tempo/00016532.mp4
Model Output: 
 1. No
2. Form
3. The person's lower back is not maintaining contact with the floor during the exercise, which is important for proper form.
Predicted Label: Bad
Predicted Reason: form
Explanation: 3. The person's lower back is not maintaining contact with the flo

## Few-Shot Prompt

In [None]:
import os
import csv
import torch

# Output file for the few-shot run and the column names of its rows
csv_path = "exercise_eval_results_part3.csv"
header = [
    "filename",
    "predicted_label",
    "predicted_reason",
    "ground_truth_label",
    "ground_truth_reason",
    "correct",
]

# Generation Config
generation_config = {
    "do_sample": False,
    "temperature": 0.0,
    "max_new_tokens": 512,
    "top_p": 0.1,
    "num_beams": 1,
}

def get_few_shot_examples():
    """Return three text-only demonstrations (tempo, form, rom) for the prompt.

    Each demonstration shows a short step-by-step analysis followed by an
    "Answer:" section in the 3-line layout the response parser expects
    (verdict, reason keyword, one-line explanation).

    The demonstrations deliberately contain no ``<image>`` placeholders: no
    pixel data is supplied for them, only for the clip under evaluation.
    NOTE(review): this assumes model.chat() fills ``<image>`` markers in order,
    one per entry of num_patches_list (InternVL-style chat). Under that
    behaviour, placeholders in the examples would consume the first frames of
    the real clip and leave its last placeholders unfilled — confirm against
    the model's chat implementation.

    Returns:
        str: The demonstrations, separated by ``---`` lines and ending with a
        ``---`` line and a newline.
    """
    return """\
Step-by-step analysis:
1. The eccentric phase is too fast and the concentric is slow.
2. Form is correct.
3. Range of motion is complete.

Answer:
1. No
2. tempo
3. Poor tempo — slow concentric, fast eccentric.

---

Step-by-step analysis:
1. Tempo is appropriate.
2. Form is incorrect — elbows flare and back arches.
3. Full range of motion is performed.

Answer:
1. No
2. form
3. Incorrect form due to flared elbows and lumbar extension.

---

Step-by-step analysis:
1. Tempo is appropriate.
2. Form is clean.
3. The person cuts the movement short — doesn’t reach full extension.

Answer:
1. No
2. rom
3. Incomplete range of motion, rep ends early.
---
"""

def _response_words(line):
    """Return the lowercase alphabetic words in *line*.

    Digits and punctuation act as separators, so "1. No" gives ["no"] and
    "2. tempo / form / rom" gives ["tempo", "form", "rom"]. Matching on whole
    words avoids substring hits such as "no" in "normal" or "rom" in "from".
    """
    return "".join(ch if ch.isalpha() else " " for ch in line.lower()).split()


def _parse_few_shot_response(output):
    """Convert a raw model response into (label, reason, explanation).

    Args:
        output: Text returned by the model.

    Returns:
        tuple[str, str, str]: ``predicted_label`` ("Good", "Bad" or "Unclear"),
        ``predicted_reason`` ("none", "tempo", "form", "rom" or "unclear") and
        the explanation line (empty unless the verdict is "Bad" and a third
        line exists).
    """
    lines = [line.strip() for line in output.strip().splitlines() if line.strip()]

    # The few-shot demonstrations place the verdict under an "Answer:" heading.
    # When the response copies that layout, parse only what follows the last
    # such heading instead of treating the analysis as the verdict.
    for position in range(len(lines) - 1, -1, -1):
        if _response_words(lines[position]) == ["answer"]:
            lines = lines[position + 1:]
            break

    if not lines:
        return "Unclear", "unclear", ""

    verdict_words = _response_words(lines[0])
    if "yes" in verdict_words:
        return "Good", "none", ""
    if "no" not in verdict_words:
        return "Unclear", "unclear", ""

    reason = "unclear"
    if len(lines) >= 2:
        reason_words = _response_words(lines[1])
        # Same priority as before: tempo, then form, then rom/range.
        if "tempo" in reason_words:
            reason = "tempo"
        elif "form" in reason_words:
            reason = "form"
        elif "rom" in reason_words or "range" in reason_words:
            reason = "rom"

    explanation = lines[2] if len(lines) >= 3 else ""
    return "Bad", reason, explanation


# Task instructions (CoT steps + answer format). The text has no placeholders,
# so it is built once instead of on every loop iteration.
prompt = """
We are evaluating whether a video clip of an exercise is being performed correctly for hypertrophy training.

Please follow these steps:
1. Evaluate the **tempo**. Is the eccentric phase slow and the concentric explosive?
2. Evaluate the **form**. Are joints aligned and posture stable?
3. Evaluate the **range of motion**. Is the motion full from start to end?

Use this format:
1. Yes / No
2. tempo / form / rom (if No)
3. Brief explanation of the issue.

Now analyze the next video:
"""

few_shot_examples = get_few_shot_examples()

# CSV Setup: one row per clip; every clip in `categories` has ground-truth label "Bad".
with open(csv_path, mode='w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)

    for subfolder, ground_truth_reason in categories.items():
        folder_path = os.path.join(root_folder, subfolder)
        # Sorted so repeated runs visit (and log) the clips in the same order.
        for filename in sorted(os.listdir(folder_path)):
            if not filename.endswith(".mp4"):
                continue

            video_path = os.path.join(folder_path, filename)
            print(f"\nEvaluating: {subfolder}/{filename}")
            try:
                with torch.no_grad():
                    pixel_values, num_patches_list = load_video(video_path, num_segments=64, max_num=1)
                    pixel_values = pixel_values.to(torch.bfloat16).to(model.device)

                    # One "FrameN: <image>" line per entry of num_patches_list.
                    video_prefix = "".join(f"Frame{i+1}: <image>\n" for i in range(len(num_patches_list)))

                    # Prompt with CoT + few-shot
                    full_prompt = few_shot_examples + video_prefix + prompt

                    output, _ = model.chat(
                        tokenizer, pixel_values, full_prompt, generation_config,
                        num_patches_list=num_patches_list, history=None, return_history=True
                    )

                print("Model Output: \n", output)

                predicted_label, predicted_reason, explanation = _parse_few_shot_response(output)

                print("Predicted Label:", predicted_label)
                print("Predicted Reason:", predicted_reason)
                print("Explanation:", explanation)
                print("Ground Truth Reason:", ground_truth_reason)

                writer.writerow([
                    filename,
                    predicted_label,
                    predicted_reason,
                    "Bad",
                    ground_truth_reason,
                    predicted_label == "Bad" and predicted_reason == ground_truth_reason
                ])
                # Push the row to disk now so an interrupted run keeps its results.
                f.flush()
            except Exception as e:
                # Best-effort batch run: report the failure and move on to the next clip.
                print(f"Error processing {filename}: {e}")



Evaluating: bad_tempo/00012249.mp4
Model Output: 
 1. No
2. form
3. Incorrect form due to flared elbows and lumbar extension.
Predicted Label: Bad
Predicted Reason: form
Explanation: 3. Incorrect form due to flared elbows and lumbar extension.
Ground Truth Reason: tempo

Evaluating: bad_tempo/00011556.mp4
Model Output: 
 1. No
2. form
3. Incorrect form due to flared elbows and lumbar extension.
Predicted Label: Bad
Predicted Reason: form
Explanation: 3. Incorrect form due to flared elbows and lumbar extension.
Ground Truth Reason: tempo

Evaluating: bad_tempo/00016532.mp4
Model Output: 
 1. No
2. tempo
3. Poor tempo — slow concentric, fast eccentric.
Predicted Label: Bad
Predicted Reason: tempo
Explanation: 3. Poor tempo — slow concentric, fast eccentric.
Ground Truth Reason: tempo

Evaluating: bad_tempo/00023161.mp4
Model Output: 
 1. No
2. tempo
3. Fast eccentric, slow concentric.
Predicted Label: Bad
Predicted Reason: tempo
Explanation: 3. Fast eccentric, slow concentric.
Ground Tr

# Testing InternVideo (Good and Bad videos)

In [None]:
import os
import csv
import json
import torch

# Dataset location and CSV column layout for the good-vs-bad evaluation.
root_folder = "/content/drive/MyDrive/VR/Visual_Recognition/good_bad_videos"
header = [
    "filename",
    "predicted_label",
    "ground_truth_label",
    "explanation",
    "correct",
]

# Decoding settings passed to model.chat(); sampling is switched off.
# NOTE(review): with do_sample=False, temperature/top_p are presumably ignored
# by the decoder — confirm against the model's generation code.
generation_config = {
    "do_sample": False,
    "temperature": 0.2,
    "max_new_tokens": 512,
    "top_p": 0.1,
    "num_beams": 1,
}

def ensure_csv_with_header(csv_path):
    """Create *csv_path* holding only the header row, unless the path already exists.

    An existing path is left untouched, so rows appended by earlier runs are kept.
    The column names come from the module-level ``header`` list.
    """
    if os.path.exists(csv_path):
        return

    with open(csv_path, mode='w', newline='') as out:
        csv.writer(out).writerow(header)

def _parse_yes_no_response(output):
    """Split a two-line "verdict / explanation" response into (label, explanation).

    The verdict is read from the first non-empty line using whole-word
    matching, so words that merely contain "no" or "yes" (e.g. "normal",
    "not sure", "enough") yield "Unclear" instead of a false "Bad"/"Good".
    "yes" takes precedence when both words appear.

    Args:
        output: Text returned by the model.

    Returns:
        tuple[str, str]: The label ("Good", "Bad" or "Unclear") and the second
        non-empty line of the response ("" when there is none).
    """
    lines = [line.strip() for line in output.strip().splitlines() if line.strip()]
    if not lines:
        return "Unclear", ""

    # Digits and punctuation separate words: "1. No" -> ["no"].
    verdict_words = "".join(ch if ch.isalpha() else " " for ch in lines[0].lower()).split()
    if "yes" in verdict_words:
        label = "Good"
    elif "no" in verdict_words:
        label = "Bad"
    else:
        label = "Unclear"

    explanation = lines[1] if len(lines) >= 2 else ""
    return label, explanation


def evaluate_category(reasoning_category, prompt):
    """Evaluate the bad and good example sets of one reasoning category.

    For each of the two sets, the video list is read from
    ``selected_<bad|good>_<category>_250.json`` and the clips from the folder
    ``<bad|good>_<category>_250`` (both under ``root_folder``). Every clip is
    sent to the model with *prompt*, and one row per clip is appended to
    ``exercise_eval_results_<category>.csv`` as soon as it is available.

    Args:
        reasoning_category: Category name used to build the folder, JSON and
            CSV names (the calls in this notebook use "tempo", "form", "rom").
        prompt: Instruction text appended after the per-frame ``<image>``
            placeholders.

    Returns:
        None. Results are printed and appended to the CSV file. A set whose
        JSON file or folder is missing is skipped, as is any clip that is
        missing on disk; a clip that raises is reported and skipped.
    """
    print(f"\n=== Evaluating Category: {reasoning_category.upper()} ===")

    csv_path = f"exercise_eval_results_{reasoning_category}.csv"
    ensure_csv_with_header(csv_path)

    cases = [
        ("bad", f"bad_{reasoning_category}_250", f"selected_bad_{reasoning_category}_250.json"),
        ("good", f"good_{reasoning_category}_250", f"selected_good_{reasoning_category}_250.json"),
    ]

    for label_type, folder_name, json_file in cases:
        ground_truth_label = "Bad" if label_type == "bad" else "Good"

        json_path = os.path.join(root_folder, json_file)
        folder_path = os.path.join(root_folder, folder_name)

        if not os.path.exists(json_path) or not os.path.exists(folder_path):
            print(f"Skipping: {json_path} or {folder_path} not found.")
            continue

        with open(json_path, "r") as f:
            items = json.load(f)

        # Sort by file name so every run processes the clips in the same order.
        items.sort(key=lambda item: os.path.basename(item["video_path"]))
        total = len(items)

        for idx, item in enumerate(items, start=1):
            filename = os.path.basename(item["video_path"])
            video_path = os.path.join(folder_path, filename)

            if not os.path.exists(video_path):
                print(f"File not found: {video_path}")
                continue

            print(f"\n[{idx}/{total}] Evaluating {label_type.upper()} video: {filename}")

            try:
                with torch.no_grad():
                    pixel_values, num_patches_list = load_video(video_path, num_segments=64, max_num=1)
                    pixel_values = pixel_values.to(torch.bfloat16).to(model.device)
                    # One "FrameN: <image>" line per entry of num_patches_list.
                    video_prefix = "".join(f"Frame{i+1}: <image>\n" for i in range(len(num_patches_list)))

                    question = video_prefix + prompt
                    output, _ = model.chat(
                        tokenizer, pixel_values, question, generation_config,
                        num_patches_list=num_patches_list, history=None, return_history=True
                    )

                predicted_label, explanation = _parse_yes_no_response(output)
                correct = (predicted_label == ground_truth_label)

                print(f"Predicted: {predicted_label} | GT: {ground_truth_label} | Correct: {correct}")
                print("Explanation:", explanation)

                # Append and close per clip so an interrupted run keeps its rows.
                with open(csv_path, mode='a', newline='') as f:
                    writer = csv.writer(f)
                    writer.writerow([
                        filename,
                        predicted_label,
                        ground_truth_label,
                        explanation,
                        correct
                    ])

            except Exception as e:
                # Best-effort batch run: report the failure and continue with the next clip.
                print(f"Error processing {filename}: {e}")


In [None]:
# Tempo run: instruction text for the model, then evaluation of the
# bad/good "tempo" sets. The literal has no placeholders, so a plain
# string is used.
prompt = """

You are a highly experienced personal trainer. Your task is to evaluate the **tempo** of the exercise shown in this video clip.

Focus only on the **overall speed of each repetition**.

Guidelines:
- The perfect repetition should be balanced (1-2 seconds).
- A **very very fast** repetition (less than 1 second total) may indicate rushing or poor control.
- A **very very slow** repetition (more than 3–4 seconds) may indicate inefficiency or loss of intensity.
- Ignore all other aspects like form or range of motion.

Note:
These videos are performed by amateurs, not professionals, but also trying to improve the efficiency of the workout.
Only call out clear tempo issues (e.g., obviously too fast or too slow).
Do not penalize minor imperfections in speed or rhythm.

Now answer the following:

**Is this exercise being performed at a correct and balanced tempo?**

Respond using only the following format:
1. Yes / No
2. One-sentence explanation describing your reasoning.
"""

evaluate_category(reasoning_category="tempo", prompt=prompt)


=== Evaluating Category: TEMPO ===

[1/250] Evaluating BAD video: 00001649.mp4
Predicted: Bad | GT: Bad | Correct: True
Explanation: 2. The exercise is being performed at a very slow tempo, which may indicate inefficiency or loss of intensity.

[2/250] Evaluating BAD video: 00001716.mp4
Predicted: Good | GT: Bad | Correct: False
Explanation: 2. The exercise is being performed at a balanced tempo, with each repetition lasting approximately 1-2 seconds.

[3/250] Evaluating BAD video: 00001724.mp4
Predicted: Bad | GT: Bad | Correct: True
Explanation: 2. The exercise is being performed at a very slow tempo, which may indicate inefficiency or loss of intensity.

[4/250] Evaluating BAD video: 00002281.mp4
Predicted: Bad | GT: Bad | Correct: True
Explanation: 2. The exercise is being performed at a very slow tempo, with each repetition taking more than 3-4 seconds.

[5/250] Evaluating BAD video: 00002408.mp4
Predicted: Good | GT: Bad | Correct: False
Explanation: 2. The exercise is being per

In [None]:
# Form run: instruction text for the model, then evaluation of the
# bad/good "form" sets. The literal has no placeholders, so a plain
# string is used.
prompt = """

You are a highly experienced personal trainer. Your task is to evaluate the **form** of the exercise shown in this video clip.

Focus only on **body alignment and control** during the movement.

Guidelines:
- Watch for **sloppy form**, such as lack of control or inconsistent motion.
- Check for **poor posture**, like a rounded back, knees caving in, or wrists bending awkwardly.
- Watch for **exaggerated movements**, such as knees bending too much, arms locking out too much, or overextension too much.
- Ignore aspects like tempo or range of motion (e.g., how far the person goes — unless it's clearly due to bad form).

Note:
These videos are performed by **amateurs**, not professionals, but they are trying to exercise safely and effectively.
Only flag **obvious form problems** that may reduce safety or efficiency.
Do not penalize small imperfections.

Now answer the following:

**Is this exercise being performed with correct and safe form?**

Respond using only the following format:
1. Yes / No
2. One-sentence explanation describing your reasoning.
"""

evaluate_category(reasoning_category="form", prompt=prompt)


=== Evaluating Category: FORM ===

[1/250] Evaluating BAD video: 00001051.mp4
Predicted: Bad | GT: Bad | Correct: True
Explanation: 2. The person's back is rounded, and their arms are not positioned correctly, which can lead to poor posture and potential strain.

[2/250] Evaluating BAD video: 00001087.mp4
Predicted: Bad | GT: Bad | Correct: True
Explanation: 2. The person's back is rounded, and their head is not in a neutral position, which can lead to poor posture and potential strain on the neck and back.

[3/250] Evaluating BAD video: 00001655.mp4
Predicted: Bad | GT: Bad | Correct: True
Explanation: 2. The person's back is rounded, and their knees are not aligned properly, which can lead to poor posture and potential injury.

[4/250] Evaluating BAD video: 00002436.mp4
Predicted: Bad | GT: Bad | Correct: True
Explanation: 2. The person's form appears to be exaggerated, with their knees bending too much and their arms moving in an inconsistent manner.

[5/250] Evaluating BAD video: 

In [None]:
# Range-of-motion run: instruction text for the model, then evaluation of
# the bad/good "rom" sets. The literal has no placeholders, so a plain
# string is used.
prompt = """

You are a highly experienced personal trainer. Your task is to evaluate the **range of motion** (ROM) of the exercise shown in this video clip.

Focus only on **how much the limbs or body move** during each repetition.

Guidelines:
- Good ROM means completing the movement fully — from start to finish — without cutting it short.
- Flag any **clearly limited motion**, such as not going deep enough in a squat or a push up.
- Ignore all other aspects like form (e.g., posture) or tempo (e.g., speed of the movement).

Note:
These videos are performed by **amateurs**, not professionals, but they are trying to exercise safely and effectively.
Only call out **obvious issues in range of motion** (clearly too short or excessively exaggerated).
Do not penalize small errors. Be lenient! If you see something coming a bit short of range of motion, that's fine. Keep it as good ROM. Only mark bad range of motion when something is CLEARLY wrong.

Now answer the following:

**Is this exercise being performed with a correct and effective range of motion?**

Respond using only the following format:
1. Yes / No
2. One-sentence explanation describing your reasoning.
"""

evaluate_category(reasoning_category="rom", prompt=prompt)


=== Evaluating Category: ROM ===

[1/250] Evaluating BAD video: 00000069.mp4
Predicted: Bad | GT: Bad | Correct: True
Explanation: 2. The person is not extending their legs fully during the exercise, which limits the range of motion.

[2/250] Evaluating BAD video: 00000581.mp4
Predicted: Bad | GT: Bad | Correct: True
Explanation: 2. The range of motion appears limited, particularly in the upward movement of the legs. The legs do not reach a full extension during the exercise.

[3/250] Evaluating BAD video: 00000999.mp4
Predicted: Bad | GT: Bad | Correct: True
Explanation: 2. The person in the video is not moving their limbs or body enough to complete the exercise fully, indicating a limited range of motion.

[4/250] Evaluating BAD video: 00001497.mp4
Predicted: Bad | GT: Bad | Correct: True
Explanation: 2. The range of motion appears to be limited, with the legs not extending fully during the exercise.

[5/250] Evaluating BAD video: 00002117.mp4
Predicted: Bad | GT: Bad | Correct: Tru