In [None]:

# Mount Google Drive at /content/drive so later cells can read and write files there (Colab-specific).
from google.colab import drive
drive.mount('/content/drive')



In [None]:
import cv2
import tensorflow as tf
import numpy as np
import random
import pandas as pd
import pickle
from pathlib import Path
import os
import csv
import xml.etree.ElementTree as ET

In [None]:
# Extract all the labels from the XML files and write them to a single CSV file, which makes the labels easier to handle.

def extract_labels_from_xml(xml_folder_path, attributes):
    """Collect one label record per annotated frame from every XML file in a folder.

    Every file whose name ends in '.xml' is parsed, and each ``image_attribute``
    element found anywhere in its tree yields one record. The record's label is
    the first entry of ``attributes`` (in the order given) that occurs among the
    ``name`` values of the element's descendant ``attribute`` elements, or
    ``'Other'`` when none of them occur.

    Args:
        xml_folder_path: Directory containing the XML annotation files.
        attributes: Ordered iterable of attribute names to recognise. When a
            frame carries several of them, the one listed earliest wins.

    Returns:
        A list of dicts with keys ``'video_name'`` (the file name without its
        '.xml' suffix), ``'frame'`` (the raw ``frame`` attribute string, or
        ``None`` if the element has none) and ``'label'``.

    Raises:
        xml.etree.ElementTree.ParseError: If an XML file is not well formed.
    """
    # Materialise once so a one-shot iterable can be reused for every frame.
    attribute_order = list(attributes)
    all_labels = []

    # Sort the listing so the record order does not depend on the filesystem.
    for xml_file in sorted(os.listdir(xml_folder_path)):
        if not xml_file.endswith('.xml'):
            continue

        video_name = xml_file[:-4]
        root = ET.parse(os.path.join(xml_folder_path, xml_file)).getroot()

        for image_attr in root.findall('.//image_attribute'):
            names_found = {attribute.get('name') for attribute in image_attr.findall('.//attribute')}
            # Choose by position in `attributes` rather than by taking an
            # arbitrary element of a set intersection, whose order can change
            # between runs because of string hash randomisation.
            label = next((name for name in attribute_order if name in names_found), 'Other')
            all_labels.append({'video_name': video_name, 'frame': image_attr.get('frame'), 'label': label})

    return all_labels

def convert_to_csv(all_labels, output_csv_path):
    """Write label records to a CSV file with a header row.

    Args:
        all_labels: Iterable of dicts, each with the keys 'video_name',
            'frame' and 'label'.
        output_csv_path: Destination file path; an existing file is overwritten.
    """
    columns = ['video_name', 'frame', 'label']
    # newline='' lets the csv module control line endings itself.
    with open(output_csv_path, 'w', newline='') as out_file:
        label_writer = csv.DictWriter(out_file, fieldnames=columns)
        label_writer.writeheader()
        label_writer.writerows(all_labels)

# The paths below point at folders in the author's Google Drive (XML annotations in, labels CSV out); replace them with your own paths.
def main():
    """Extract the labels from the XML folder, write them to the labels CSV, and report its location."""
    output_csv_path = '/content/drive/My Drive/labels.csv'
    wanted_attributes = ["Injection", "Optimising_Position", "Polypectomy", "Inspecting_Resection"]

    label_records = extract_labels_from_xml('/content/drive/My Drive/XMLs_Polypectomy', wanted_attributes)
    convert_to_csv(label_records, output_csv_path)

    print(f"CSV file has been created at: {output_csv_path}")

# Run the label extraction when this code is executed directly (as a notebook cell or script), not when it is imported as a module.
if __name__ == "__main__":
    main()

In [None]:
# Extract frames from the videos, keeping every 5th frame (frame_step=5), and match each kept frame to its label from the CSV. The labelled frames are resized, grouped into batches of 16, and saved to pickle files.

def load_labels_from_csv(csv_file_path):
    """Read the labels CSV into a nested lookup table.

    Args:
        csv_file_path: Path to a CSV file with 'video_name', 'frame' and
            'label' columns.

    Returns:
        A dict mapping each video name to a dict of ``{frame (int): label}``.
        If a (video, frame) pair occurs more than once, the last row wins.
    """
    per_video = {}
    with open(csv_file_path, mode='r') as handle:
        for record in csv.DictReader(handle):
            name = record['video_name']
            frame_index = int(record['frame'])
            tag = record['label']
            # Create the per-video table the first time a video is seen.
            per_video.setdefault(name, {})[frame_index] = tag
    return per_video

def _write_frame_batch(batch, output_folder_path, video_name, batch_index):
    """Pickle one list of (frame, label) tuples as '<video_name>_batch_<batch_index>.pkl'."""
    batch_file_path = os.path.join(output_folder_path, f"{video_name}_batch_{batch_index}.pkl")
    with open(batch_file_path, 'wb') as file:
        pickle.dump(batch, file)


def preprocess_video_frames_in_batches(video_folder_path, labels_csv_path, output_folder_path, output_size=(224, 224), frame_step=5, batch_size=16):
    """Sample labelled frames from every '.mov' video and save them as pickled batches.

    Frames are counted from 1 in the order they are read. A frame is kept when
    its number is a multiple of ``frame_step`` and the labels CSV has an entry
    for that (video, frame number) pair. Kept frames are converted from BGR to
    RGB, resized to ``output_size`` and stored together with their label.
    NOTE(review): this assumes the 'frame' values in the CSV use the same
    1-based numbering - confirm against the annotation tool.

    Args:
        video_folder_path: Directory containing the '.mov' video files.
        labels_csv_path: CSV readable by ``load_labels_from_csv``; the video
            name is matched against each file name without its extension.
        output_folder_path: Directory for the batch files; created if missing.
        output_size: Size passed to ``cv2.resize`` for every kept frame.
        frame_step: Keep only frames whose number is a multiple of this value.
        batch_size: Number of (frame, label) tuples per pickle file; the final
            batch of a video may be smaller.
    """
    labels = load_labels_from_csv(labels_csv_path)
    # Create the destination up front so the first batch write cannot fail on a missing folder.
    os.makedirs(output_folder_path, exist_ok=True)

    # Sorted so the processing order does not depend on the filesystem.
    for video_file in sorted(os.listdir(video_folder_path)):
        if not video_file.endswith('.mov'):
            continue

        video_name = Path(video_file).stem
        # Look up this video's labels once instead of once per frame.
        video_labels = labels.get(video_name, {})
        if not video_labels:
            # No labelled frames means nothing would be saved, so skip decoding the video.
            continue

        video_path = os.path.join(video_folder_path, video_file)
        src = cv2.VideoCapture(video_path)
        try:
            if not src.isOpened():
                print(f"Warning: could not open video {video_path}; skipping")
                continue

            frames = []
            batch_count = 0
            actual_frame_number = 0

            while True:
                ret, frame = src.read()
                if not ret:
                    break
                actual_frame_number += 1

                if actual_frame_number % frame_step != 0:
                    continue
                if actual_frame_number not in video_labels:
                    continue

                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame_resized = cv2.resize(frame, output_size)
                frames.append((frame_resized, video_labels[actual_frame_number]))

                if len(frames) == batch_size:
                    _write_frame_batch(frames, output_folder_path, video_name, batch_count)
                    frames = []
                    batch_count += 1

            # Flush the final, possibly smaller, batch.
            if frames:
                _write_frame_batch(frames, output_folder_path, video_name, batch_count)
        finally:
            # Release the capture even if decoding or writing raised.
            src.release()

    print(f"Preprocessed frames saved to {output_folder_path}")


In [None]:
# These are the author's Google Drive paths; replace them with your own.
video_folder_path = '/content/drive/My Drive/polyp video dataset'
labels_csv_path = '/content/drive/My Drive/labels.csv'
output_folder_path = '/content/drive/My Drive/batches matches full'

# exist_ok=True creates the folder when needed without a separate check-then-create step.
os.makedirs(output_folder_path, exist_ok=True)

preprocess_video_frames_in_batches(video_folder_path=video_folder_path, labels_csv_path=labels_csv_path, output_folder_path=output_folder_path, output_size=(224, 224), frame_step=5, batch_size=16)