# Convert CVAT Video annotations to AVA format

### Unzip File
Make sure to set "path_to_zip" as the path to the zip file exported from CVAT

In [1]:
def unzip_file(path_to_zip, extract_to="./"):
    """Extract every member of a zip archive into a directory.

    Args:
        path_to_zip: Path of the zip archive to read.
        extract_to: Directory that receives the extracted members
            (defaults to the current working directory).

    Returns:
        None.
    """
    # Imported locally so this cell works before the notebook's import cell runs.
    from zipfile import ZipFile

    with ZipFile(path_to_zip, mode='r') as archive:
        archive.extractall(path=extract_to)

# Path of the zip archive exported from CVAT; change this to point at your export.
path_to_zip = "./Escape_Room_Small.zip"
# Extract the archive into the current working directory.
unzip_file(path_to_zip, extract_to="./")


After unzipping, you should have a file named "annotations.xml". Rename it to something appropriate and set `PATH_TO_XML` to its path in the cell below.

### Imports and Setup
Make sure to set `PATH_TO_XML` and `SAVE_FILE_PATH`.  
`NAME_TO_ID` maps the names of the actions to the id as described in `Action_Detection/data/actions_list.txt`  
You can adjust this if you need to but it must be consistent with the `actions_list.txt`  


In [2]:
import numpy as np
import os 
import xmltodict
import pprint

# Report the number of CPU cores visible to this process (informational only here).
cores = os.cpu_count()
print(f"Number of cores: {cores}")


# Input: the CVAT annotation XML to convert.
PATH_TO_XML = "./escape_room_small_annotations.xml" #### SET XML FILE PATH HERE
# Output: the text file that receives the converted annotation rows.
SAVE_FILE_PATH = "./ava_annotations.txt" #### SET SAVE FILE PATH HERE FOR THE CONVERTED ANNOTATIONS
# Maps each action name (as it appears in the CVAT attribute values) to the
# integer action id written to the output file.
NAME_TO_ID = {
    "walk": 0,
    "sit": 1,
    "stand": 2,
    "bend/bow (at the waist)": 3,
    "run/jog": 4,
    "hand wave": 5,
    "get up": 6,
    "paired(standing together with one other person)": 7,
    "huddle (standing together with 2 or more people)": 8,
    "lift/pick up": 9,
    "carry/hold (an object)": 10,
    "point to (an object)": 11,
    "write": 12,
    "read": 13,
    "put down": 14,
    "watch (object)": 15,
    "talk to": 16,
    "listen": 17,
    "watch (person)": 18,
    "gesture (a person)": 19,
    "give/serve (an object) to (a person)": 20,
    "take (an object) from (a person)": 21,
    "face to face": 22,
    "hi 5": 23,
    # NOTE(review): id 24 is not assigned to any action (ids jump from 23 to 25).
    # Confirm against actions_list.txt whether this gap is intentional.
    "laugh":25,
    "smile":26,
    "stressed":27,
    "annoyed/frustrated":28,
    "take (an object) from (robot)":29,
    "give/serve (an object) to (robot)":30,
    "talk to (robot)":31,
    "listen to (robot)":32,
    "watch (robot)":33,
}


Number of cores: 32


### Convert annotations to AVA format

#### Load XML file

In [3]:
def get_xml_dic(xml_path):
    """Read the XML file at `xml_path` and return the result of `xmltodict.parse`.

    Args:
        xml_path: Path of the XML file to load.

    Returns:
        The nested mapping produced by `xmltodict.parse` for the file's text.
    """
    with open(xml_path) as xml_file:
        xml_text = xml_file.read()
    return xmltodict.parse(xml_text)


# Parse the export and keep only the content under the root 'annotations' key.
annotations = get_xml_dic(PATH_TO_XML)['annotations']


#### Meta Data

In [4]:
# Lookup tables between CVAT task ids and video (task) names.
video_name_to_task_id = {}
task_id_to_video_name = {}
# Frame size used to normalise box coordinates (value of the last task read).
height = None
width = None


def _as_list(node):
    """Return `node` as a list: None -> [], a list is returned as is, anything else is wrapped."""
    if node is None:
        return []
    return node if isinstance(node, list) else [node]


# xmltodict yields a dict (not a list) when an element occurs only once, so a
# project with a single task must be wrapped before iterating.
tasks = _as_list(annotations['meta']['project']['tasks']['task'])
observed_sizes = set()

for task in tasks:
    task_id = int(task['id'])
    video_name = task['name']
    video_name_to_task_id[video_name] = task_id
    task_id_to_video_name[task_id] = video_name
    height = int(task['original_size']['height'])
    width = int(task['original_size']['width'])
    observed_sizes.add((width, height))

print("Number of videos: ", len(video_name_to_task_id))
assert height is not None and width is not None
if len(observed_sizes) > 1:
    # A single width/height pair is kept, so differing sizes would make the
    # normalised coordinates wrong for some videos.
    print("WARNING: tasks have different frame sizes:", sorted(observed_sizes))
print("height: ", height)
print("width: ", width)
# Same single-element caveat applies to tracks; a missing 'track' key counts as 0.
print("tracks: ", len(_as_list(annotations.get('track'))))


Number of videos:  2
height:  360
width:  640
tracks:  6


In [6]:
# Only boxes whose frame index is a non-zero multiple of this value are exported.
# NOTE(review): presumably one key frame per second of 30 fps video - confirm.
KEYFRAME_INTERVAL = 30


def _as_list(node):
    """Return `node` as a list: None -> [], a list is returned as is, anything else is wrapped.

    Defined in this cell so the cell is self-contained. xmltodict yields a
    dict (not a list) when an element occurs only once.
    """
    if node is None:
        return []
    return node if isinstance(node, list) else [node]


def iter_ava_rows(tracks):
    """Yield one output row per (exported box, action attribute) pair.

    Args:
        tracks: The parsed `track` element(s): a list, a single dict, or None.

    Yields:
        Strings of the form
        ``video_name,frame_num,xtl,ytl,xbr,ybr,action_id,track_id`` with the
        box corners divided by the global `width` / `height` and rounded to
        three decimals.

    Raises:
        KeyError: If an attribute value is not a key of `NAME_TO_ID`, or a
            track references a task id missing from `task_id_to_video_name`.
    """
    for track in _as_list(tracks):
        track_id = track['@id']
        video_name = task_id_to_video_name[int(track['@task_id'])]
        for box in _as_list(track.get('box')):
            frame_num = int(box["@frame"])
            # Frame 0 is always skipped, as are frames off the key-frame grid.
            if frame_num == 0 or frame_num % KEYFRAME_INTERVAL != 0:
                continue
            # '@outside' other than '0' marks the box as outside the frame.
            if box['@outside'] != '0':
                continue
            xtl = round(float(box['@xtl']) / width, 3)
            ytl = round(float(box['@ytl']) / height, 3)
            xbr = round(float(box['@xbr']) / width, 3)
            ybr = round(float(box['@ybr']) / height, 3)
            for attribute in _as_list(box.get('attribute')):
                if attribute['@name'] == "Person ID":
                    continue
                # An attribute with empty text has no '#text' key at all.
                action_name = attribute.get('#text')
                if action_name is None or action_name == "None":
                    continue
                if action_name not in NAME_TO_ID:
                    raise KeyError(
                        f"Unknown action {action_name!r} (track {track_id}, "
                        f"frame {frame_num}); add it to NAME_TO_ID"
                    )
                action_id = NAME_TO_ID[action_name]
                yield f"{video_name},{frame_num},{xtl},{ytl},{xbr},{ybr},{action_id},{track_id}"


# Explicit UTF-8 so video names are written identically on every platform.
with open(SAVE_FILE_PATH, "w", encoding="utf-8") as f:
    for row_str in iter_ava_rows(annotations.get('track')):
        f.write(row_str + "\n")

print("Done")
    

Done


### Extracting Frames
We need to extract frames from all the videos. You can use the following script to extract frames and place them in the proper AVA format. 

In [1]:
import os 
# Folder that holds the source videos.
video_folder = "./all_videos"
# Full path of every file directly inside video_folder whose name ends with '.mp4'.
# Note: the frame-extraction cell further down assigns video_folder and video_paths again.
video_paths = [os.path.join(video_folder, f) for f in os.listdir(video_folder) if f.endswith('.mp4')]


In [3]:
!pip install opencv-python



In [4]:
import os
import cv2
import multiprocessing

# Input folder with the source videos and output root for the extracted frames.
video_folder = 'all_videos'
output_folder = 'frames'

# Create the output root up front; a pre-existing folder is not an error.
os.makedirs(output_folder, exist_ok=True)

def extract_frames(video_path, output_folder):
    """Write every frame of a video as a JPEG into its own sub-folder.

    Frames are saved as ``<output_folder>/<video_name>/<video_name>_<NNNNNN>.jpg``
    where ``video_name`` is the file name without its extension and the
    six-digit counter starts at 1.

    Args:
        video_path: Path of the video file to read.
        output_folder: Root folder under which the per-video folder is created.

    Returns:
        None. Problems opening the video or writing a frame are reported with
        a printed warning instead of an exception.
    """
    video_name = os.path.splitext(os.path.basename(video_path))[0]
    video_output_folder = os.path.join(output_folder, video_name)
    os.makedirs(video_output_folder, exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    frame_count = 0

    try:
        if not cap.isOpened():
            # Do not report success for a video that could not be read at all.
            print(f"WARNING: could not open {video_path}; no frames extracted")
            return

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # Increment first so file names are 1-based.
            frame_count += 1
            frame_filename = f"{video_name}_{frame_count:06d}.jpg"
            frame_filepath = os.path.join(video_output_folder, frame_filename)
            # cv2.imwrite signals failure through its return value, not an exception.
            if not cv2.imwrite(frame_filepath, frame):
                print(f"WARNING: failed to write {frame_filepath}")
    finally:
        # Release the capture even if reading or writing raised.
        cap.release()

    print(f"Extracted frames from {video_name}")

# Get the video paths: every file directly inside video_folder ending in '.mp4'
video_paths = [os.path.join(video_folder, f) for f in os.listdir(video_folder) if f.endswith('.mp4')]

# Process the videos
def process_video(video_path):
    """Extract the frames of one video into the module-level `output_folder`.

    Single-argument wrapper around `extract_frames` so it can be passed to `pool.map`.
    """
    extract_frames(video_path, output_folder)

# One worker process per CPU core; pool.map returns once every video is processed.
# NOTE(review): with the 'spawn' start method, functions defined in a notebook
# may not be importable by the workers - confirm on the target platform.
with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
    pool.map(process_video, video_paths)

Extracted frames from G3_hazard_segment1
Extracted frames from G3_hazard_segment7
Extracted frames from G1_medical_segment3
Extracted frames from G1_medical_segment1
Extracted frames from G5_hazard_segment5
Extracted frames from G5_hazard_segment3
Extracted frames from G2_medical_segment6
Extracted frames from G1_medical_segment9
Extracted frames from G1_medical_segment4
Extracted frames from G2_medical_segment7
Extracted frames from G2_medical_segment9
Extracted frames from G1_medical_segment7
Extracted frames from G2_medical_segment2
Extracted frames from G5_medical_segment6
Extracted frames from G5_medical_segment1
Extracted frames from G2_hazard_segment7
Extracted frames from G5_medical_segment7
Extracted frames from G5_medical_segment4
Extracted frames from G4_hazard_segment2
Extracted frames from G1_hazard_segment8
Extracted frames from G4_hazard_segment9
Extracted frames from G4_hazard_segment4
Extracted frames from G1_hazard_segment6
Extracted frames from G2_hazard_segment2
Ext

In [None]:
# need to create file listing all of the frames used for validation and for test
import pandas as pd
# The video names are taken from the first column of the converted annotations.
file_path = SAVE_FILE_PATH
df = pd.read_csv(file_path, header=None)
video_names = df[0].unique()
print(video_names)
print("Number of videos: ", len(video_names))

# Fixed seed so the 80/20 train/val split (by video) is reproducible.
np.random.seed(0)
np.random.shuffle(video_names)
split_index = int(0.8 * len(video_names))
train_set = video_names[:split_index]
val_set = video_names[split_index:]

print("Number of videos in train set: ", len(train_set))
print("Number of videos in val set: ", len(val_set))


os.makedirs("./frame_lists", exist_ok=True)
val_file = "./frame_lists/val.csv"
train_file = "./frame_lists/train.csv"

# Root folder containing one sub-folder of extracted frames per video.
FRAMES_ROOT = "./frames"
# Written verbatim as the first line of each frame list.
# NOTE(review): the "vido" spelling is kept byte-for-byte on purpose; confirm
# what the consumer of these files expects before changing it.
FRAME_LIST_HEADER = "original_vido_id video_id frame_id path labels"


def _frame_number(frame_filename):
    """Return the integer after the last underscore of a frame file name (extension removed)."""
    stem = os.path.splitext(frame_filename)[0]
    return int(stem.rsplit("_", 1)[-1])


def build_frame_list_lines(video_names, first_video_id):
    """Build the frame-list lines for a set of videos.

    Args:
        video_names: Iterable of video names; each name without its extension
            must be a sub-folder of `FRAMES_ROOT`.
        first_video_id: Integer id given to the first video; following videos
            get consecutive ids.

    Returns:
        A list of strings ``"<id> <id> <frame_number - 1> <folder>/<file> \"\""``,
        ordered by video and then by frame number.
    """
    lines = []
    for video_id, video_name in enumerate(video_names, start=first_video_id):
        # splitext (not split(".")) so the folder name is derived exactly the
        # way the frame-extraction step derives it.
        folder_name = os.path.splitext(video_name)[0]
        frames_folder_path = os.path.join(FRAMES_ROOT, folder_name)
        # Ignore stray non-frame files that would break the numeric sort.
        frames = sorted(
            (name for name in os.listdir(frames_folder_path) if name.lower().endswith(".jpg")),
            key=_frame_number,
        )
        for frame in frames:
            frame_path = os.path.join(folder_name, frame)
            # Frame files are numbered from 1; the list uses 0-based frame ids.
            lines.append(f'{video_id} {video_id} {_frame_number(frame) - 1} {frame_path} ""')
    return lines


def write_frame_list(list_path, lines):
    """Write the header followed by one line per entry of `lines` to `list_path`."""
    with open(list_path, "w") as f:
        f.write(FRAME_LIST_HEADER + "\n")
        for line in lines:
            f.write(line + "\n")


# Validation videos receive the first ids; training videos continue the numbering.
val_lines = build_frame_list_lines(val_set, first_video_id=0)
train_lines = build_frame_list_lines(train_set, first_video_id=len(val_set))
# Next unused id, kept for any later cell that reads `video_id`.
video_id = len(val_set) + len(train_set)

write_frame_list(val_file, val_lines)
write_frame_list(train_file, train_lines)