In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# under "/content/drive/My Drive/..." can be read by the cells below.
drive.mount('/content/drive')


In [None]:
import os
import re
import pickle
from sklearn.preprocessing import LabelEncoder

directory = '/content/drive/My Drive/batches matches full'
files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.pkl')]

# Number of videos assigned to each split: the first NUM_TRAIN_VIDEOS video IDs
# (in ascending order) are used for training, the last NUM_TEST_VIDEOS for testing.
NUM_TRAIN_VIDEOS = 20
NUM_TEST_VIDEOS = 5

# Group files by video ID and keep all batches from the same video in batch order,
# so the temporal structure of each video is preserved.
video_files = {}
for f in files:
    # Match against the file name only, so a directory component can never be
    # mistaken for the "P<video>_batch_<n>" pattern. Files whose name does not
    # match the pattern are skipped.
    match = re.search(r'P(\d+)_batch_(\d+)', os.path.basename(f))
    if match:
        video_id, batch_number = int(match.group(1)), int(match.group(2))
        video_files.setdefault(video_id, []).append((batch_number, f))

# Replace each (batch_number, path) list with the paths alone, ordered by batch number.
for video_id in video_files:
    video_files[video_id] = [f for _, f in sorted(video_files[video_id], key=lambda x: x[0])]

sorted_video_ids = sorted(video_files)

# With fewer videos than the two splits need, the head and tail slices below would
# overlap and the same video would end up in both train and test (data leakage).
# Fail loudly instead of training on a contaminated split (this also catches the
# case where no matching files were found at all).
required_videos = NUM_TRAIN_VIDEOS + NUM_TEST_VIDEOS
if len(sorted_video_ids) < required_videos:
    raise ValueError(
        f"Need at least {required_videos} videos for a disjoint train/test split, "
        f"but found {len(sorted_video_ids)} in {directory!r}"
    )

train_video_ids = sorted_video_ids[:NUM_TRAIN_VIDEOS]  # first videos for training
test_video_ids = sorted_video_ids[-NUM_TEST_VIDEOS:]  # last videos for testing

# Flatten to per-split file lists, video by video, batches in order within a video.
train_files = [file for vid in train_video_ids for file in video_files[vid]]
test_files = [file for vid in test_video_ids for file in video_files[vid]]

def load_sequences_and_labels(file_list):
    """Read pickled batches and split each one into its frames and its labels.

    Every file is expected to unpickle to an iterable of records in which
    element 0 is the frame and element 1 is its label.

    Args:
        file_list: paths of the pickle files to read, in the order they should
            appear in the result.

    Returns:
        A ``(sequences, labels)`` tuple of two lists with one entry per file:
        ``sequences[i]`` is the list of frames of file ``i`` and ``labels[i]``
        the list of labels of the same file.

    Note:
        ``pickle.load`` can execute arbitrary code contained in the file, so
        only pass files from a trusted source.
    """
    all_frames, all_labels = [], []
    for path in file_list:
        with open(path, 'rb') as handle:
            records = pickle.load(handle)
        # One pass per field keeps frames and labels aligned by position.
        all_frames.append([record[0] for record in records])
        all_labels.append([record[1] for record in records])
    return all_frames, all_labels

# Load both splits. Each X_* is a list with one entry per file (the frames of that
# file) and each Y_* is the matching list of per-file label lists.
X_train, Y_train = load_sequences_and_labels(train_files)
X_test, Y_test = load_sequences_and_labels(test_files)

In [None]:
#encoding the labels as numbers manually
label_mapping = {'Injection': 0, 'Optimising_Position': 1, 'Polypectomy': 2, 'Inspecting_Resection':3,'Other':4}

Y_train_int = [[label_mapping[label] for label in seq] for seq in Y_train]
Y_test_int = [[label_mapping[label] for label in seq] for seq in Y_test]

In [None]:
# The sequence generator serves the data as fixed-length sequences so that the temporal relationships between frames can be learned by the LSTMs.

import numpy as np
from tensorflow.keras.utils import Sequence
from tensorflow.keras.utils import to_categorical

class SequentialDataGenerator(Sequence):
    """Keras ``Sequence`` serving fixed-length batches of sequences and one-hot labels.

    Sequences shorter than ``max_sequence_length`` are padded at the end: the
    inputs with zeros and the labels with the one-hot vector of the padding
    class. Batches are always served in the original order (never shuffled) so
    the temporal order of the videos is maintained.

    Args:
        X: list of input sequences (one entry per sequence).
        Y_int: list of integer-encoded label sequences, aligned with ``X``.
        batch_size: number of sequences per batch.
        num_classes: total number of classes. When ``None`` it is inferred as
            the largest label found in ``Y_int`` plus one; pass it explicitly
            if the highest class (e.g. the padding class) may be absent from
            the data.
        max_sequence_length: length every sequence is padded to. Defaults to
            the length of the longest sequence in ``X``.
        pad_class_index: class index used to label padded timesteps. When
            ``None`` the module-level ``label_mapping['Other']`` is looked up
            each time a batch is built.
        **kwargs: forwarded unchanged to the base-class constructor.
    """

    def __init__(self, X, Y_int, batch_size=16, num_classes=None, max_sequence_length=None,
                 pad_class_index=None, **kwargs):
        # Newer Keras versions expect subclasses to initialise the base class.
        super().__init__(**kwargs)
        self.X = X  # sequences
        self.Y_int = Y_int  # integer-encoded labels
        self.batch_size = batch_size  # batch size
        # Without a concrete class count the one-hot encoding and the padding
        # vector cannot be built, so fall back to inferring it from the labels.
        if num_classes is None:
            num_classes = self._infer_num_classes()
        self.num_classes = num_classes  # total number of classes
        self.pad_class_index = pad_class_index  # class used for padded timesteps
        self.max_sequence_length = max_sequence_length or self._max_length()
        self.indexes = np.arange(len(self.X))  # indices of the sequences

    def __len__(self):
        """Return the number of batches per epoch (the last one may be smaller)."""
        return int(np.ceil(len(self.X) / self.batch_size))

    def __getitem__(self, index):
        """Build batch ``index`` and return it as ``(inputs, one_hot_labels)`` arrays."""
        batch_ids = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]

        batch_X = [self.X[k] for k in batch_ids]
        batch_Y_int = [self.Y_int[k] for k in batch_ids]

        # The last frames of a video may not fill a whole sequence, so shorter
        # sequences are padded to equal length: inputs with zeros, labels with
        # the one-hot vector of the padding class ('Other' unless overridden).
        X_temp_padded = self._pad_sequences(batch_X, self.max_sequence_length)
        Y_temp_encoded = [to_categorical(y, num_classes=self.num_classes) for y in batch_Y_int]

        pad_index = self.pad_class_index
        if pad_index is None:
            # Backward-compatible default: resolve the padding class from the
            # notebook-level label mapping at batch time.
            pad_index = label_mapping['Other']
        pad_label = [1 if i == pad_index else 0 for i in range(self.num_classes)]
        Y_temp_padded = self._pad_sequences(Y_temp_encoded, self.max_sequence_length, pad_val=pad_label)

        return np.array(X_temp_padded), np.array(Y_temp_padded)

    def on_epoch_end(self):
        """Do nothing: sequences are deliberately not shuffled between epochs."""
        pass

    def _infer_num_classes(self):
        """Return the largest label in ``Y_int`` plus one (0 when there are no labels)."""
        largest = max((max(seq) for seq in self.Y_int if len(seq)), default=-1)
        return int(largest) + 1

    def _max_length(self):
        """Return the length of the longest sequence in ``X`` (0 when ``X`` is empty)."""
        return max((len(seq) for seq in self.X), default=0)

    def _pad_sequences(self, sequences, max_seq_length, pad_val=0):
        """Pad every sequence at the end, along its first axis, up to ``max_seq_length``.

        Args:
            sequences: iterable of sequences; each is converted to a NumPy array.
            max_seq_length: target length of the first axis.
            pad_val: scalar fill value, or a list that is appended as a whole
                row for every missing timestep.

        Returns:
            A list of NumPy arrays. Sequences that already have at least
            ``max_seq_length`` entries are returned unpadded (not truncated).
        """
        padded_sequences = []
        for seq in sequences:
            seq = np.array(seq)
            pad_size = max_seq_length - len(seq)

            if pad_size <= 0:
                padded_sequences.append(seq)
                continue

            if isinstance(pad_val, list):
                # Row-wise padding: append `pad_size` copies of the pad row.
                pad_rows = np.array([pad_val] * pad_size)
                padded_seq = np.concatenate((seq, pad_rows), axis=0)
            else:
                # Scalar padding: extend only the first axis, leave the others untouched.
                pad_width = ((0, pad_size),) + ((0, 0),) * (seq.ndim - 1)
                padded_seq = np.pad(seq, pad_width, mode='constant', constant_values=pad_val)

            padded_sequences.append(padded_seq)
        return padded_sequences

In [None]:
# Single source of truth for the class count and the batch size, so the two
# generators below cannot drift apart.
num_classes = len(label_mapping)
batch_size = 16

train_generator = SequentialDataGenerator(X_train, Y_train_int, batch_size=batch_size, num_classes=num_classes)
validation_generator = SequentialDataGenerator(X_test, Y_test_int, batch_size=batch_size, num_classes=num_classes)

# Sanity check: build the first training batch and print the shapes of its arrays.
x_sample, y_sample = train_generator[0]
print("X_sample shape:", x_sample.shape)
print("Y_sample shape:", y_sample.shape)