In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets
from torchvision import  transforms
from torchvision import models
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import copy
import random
from sklearn.metrics import confusion_matrix

In [2]:
# Define data transformations
data_transforms = {
    'train': transforms.Compose([
        transforms.Resize((224, 224)),
        #transforms.RandomHorizontalFlip(p=0.5),
        #transforms.RandomRotation(degrees=10),
        #transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
  
    ]),
    'test': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

# Path to your data directory
data_dir = 'Sets'

# Load data from directories, focusing on 'Play' and 'Time_Between' classes only
def filter_classes(dataset, classes_to_include):
    # Filter samples
    filtered_samples = [(path, label) for path, label in dataset.samples if dataset.classes[label] in classes_to_include]
    
    # Reassign targets and samples
    new_targets = []
    new_samples = []
    class_to_idx = {cls: idx for idx, cls in enumerate(classes_to_include)}
    
    for path, label in filtered_samples:
        class_name = dataset.classes[label]
        if class_name in class_to_idx:
            new_label = class_to_idx[class_name]
            new_samples.append((path, new_label))
            new_targets.append(new_label)
    
    dataset.samples = new_samples
    dataset.targets = new_targets
    dataset.classes = classes_to_include
    dataset.class_to_idx = class_to_idx

# Apply the filter to each dataset split
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x]) for x in ['train', 'val', 'test']}
for phase in ['train', 'val', 'test']:
    filter_classes(image_datasets[phase], ['Play', 'Time_Between'])

# Check dataset sizes and class names
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val', 'test']}
class_names = image_datasets['train'].classes

# Print dataset sizes and class names
print("Filtered dataset sizes:", dataset_sizes)
print("Filtered classes:", class_names)

# Print number of frames for each class in each dataset
for phase in ['train', 'val', 'test']:
    print(f"\n{phase.upper()} dataset:")
    class_counts = {class_name: 0 for class_name in class_names}
    for _, label in image_datasets[phase].samples:
        class_name = class_names[label]
        class_counts[class_name] += 1
    for class_name, count in class_counts.items():
        print(f"  {class_name}: {count} frames")

Filtered dataset sizes: {'train': 316795, 'val': 56903, 'test': 63966}
Filtered classes: ['Play', 'Time_Between']

TRAIN dataset:
  Play: 128701 frames
  Time_Between: 188094 frames

VAL dataset:
  Play: 23801 frames
  Time_Between: 33102 frames

TEST dataset:
  Play: 26188 frames
  Time_Between: 37778 frames


In [5]:
from collections import defaultdict

def balance_and_reduce_sequences(dataset, seq_length, max_frames_per_class):
    class_counts = defaultdict(list)
    for idx, (path, label) in enumerate(dataset.samples):
        class_name = dataset.classes[label]
        class_counts[class_name].append((path, label))

    max_sequences_per_class = max_frames_per_class // seq_length

    balanced_samples = []
    for class_name in class_counts:
        samples = class_counts[class_name][:max_sequences_per_class * seq_length]
        balanced_samples.extend(samples)

    dataset.samples = balanced_samples
    dataset.targets = [label for _, label in balanced_samples]
    print(f"Balanced and reduced {len(balanced_samples)} samples for dataset.")

class VideoDataset(Dataset):
    def __init__(self, image_folder_dataset, seq_length, transform=None):
        self.image_folder_dataset = image_folder_dataset
        self.seq_length = seq_length
        self.transform = transform
        self.valid_sequences = self.prepare_sequences()
        print(f"Prepared {len(self.valid_sequences)} valid sequences.")

    def prepare_sequences(self):
        grouped_frames = defaultdict(list)
        for path, label in self.image_folder_dataset.samples:
            video_name, frame_number = self.parse_frame_details(path)
            grouped_frames[video_name].append((int(frame_number), path, label))

        valid_sequences = []
        for frames in grouped_frames.values():
            frames.sort()  # Sortiere Frames nach ihrer Nummer
            # Hier generieren wir überlappende Sequenzen
            for i in range(len(frames) - self.seq_length + 1):  # Erlaubt Überlappung
                # Extra check to ensure continuity
                if frames[i + self.seq_length - 1][0] - frames[i][0] == self.seq_length - 1:
                    sequence = [(frame[1], frame[2]) for frame in frames[i:i + self.seq_length]]
                    valid_sequences.append(sequence)
        return valid_sequences

    def parse_frame_details(self, path):
        basename = os.path.basename(path)
        parts = basename.split('_')
        video_name = '_'.join(parts[:-2])
        frame_number = parts[-1].split('.')[0]
        return video_name, frame_number

    def __len__(self):
        return len(self.valid_sequences)

    def __getitem__(self, idx):
        frame_paths, labels = zip(*self.valid_sequences[idx])
        images = [self.load_transform_image(path) for path in frame_paths]
        images = torch.stack(images)
        label = torch.tensor(labels[0])
        assert torch.all(torch.tensor(labels) == label), "Mismatched labels in a sequence."
        return images, label

    def load_transform_image(self, path):
        img = self.image_folder_dataset.loader(path)
        if self.transform:
            img = self.transform(img)
        return img

seq_length = 60
max_frames_train = 50000
max_frames_val_test = 7500

# Assume image_datasets and data_transforms are defined somewhere else
balance_and_reduce_sequences(image_datasets['train'], seq_length, max_frames_train)
balance_and_reduce_sequences(image_datasets['val'], seq_length, max_frames_val_test)
balance_and_reduce_sequences(image_datasets['test'], seq_length, max_frames_val_test)

video_datasets = {x: VideoDataset(image_datasets[x], seq_length, data_transforms[x]) for x in ['train', 'val', 'test']}
dataloaders = {x: DataLoader(video_datasets[x], batch_size=1, shuffle=True) for x in ['train', 'val', 'test']}

# Print number of sequences in each set
for key, dataset in video_datasets.items():
    print(f"{key.upper()} dataset: {len(dataset)} sequences")

Balanced and reduced 99960 samples for dataset.
Balanced and reduced 15000 samples for dataset.
Balanced and reduced 15000 samples for dataset.
Prepared 88566 valid sequences.
Prepared 13090 valid sequences.
Prepared 13191 valid sequences.
TRAIN dataset: 88566 sequences
VAL dataset: 13090 sequences
TEST dataset: 13191 sequences


In [6]:
import random

def print_random_sequences(dataset, num_sequences=2):
    # select random sequence
    for _ in range(num_sequences):
        random_idx = random.randint(0, len(dataset) - 1)
        images, label = dataset[random_idx]
        print(f"Random Sequence - Label: {dataset.image_folder_dataset.classes[label]}")
        for img_path, _ in dataset.valid_sequences[random_idx]:
            print(f"  {img_path}")
        print("\n")  

print_random_sequences(video_datasets['train'])


Random Sequence - Label: Play
  Sets\train\Play\GH085451_frame_5725.jpg
  Sets\train\Play\GH085451_frame_5726.jpg
  Sets\train\Play\GH085451_frame_5727.jpg
  Sets\train\Play\GH085451_frame_5728.jpg
  Sets\train\Play\GH085451_frame_5729.jpg
  Sets\train\Play\GH085451_frame_5730.jpg
  Sets\train\Play\GH085451_frame_5731.jpg
  Sets\train\Play\GH085451_frame_5732.jpg
  Sets\train\Play\GH085451_frame_5733.jpg
  Sets\train\Play\GH085451_frame_5734.jpg
  Sets\train\Play\GH085451_frame_5735.jpg
  Sets\train\Play\GH085451_frame_5736.jpg
  Sets\train\Play\GH085451_frame_5737.jpg
  Sets\train\Play\GH085451_frame_5738.jpg
  Sets\train\Play\GH085451_frame_5739.jpg
  Sets\train\Play\GH085451_frame_5740.jpg
  Sets\train\Play\GH085451_frame_5741.jpg
  Sets\train\Play\GH085451_frame_5742.jpg
  Sets\train\Play\GH085451_frame_5743.jpg
  Sets\train\Play\GH085451_frame_5744.jpg
  Sets\train\Play\GH085451_frame_5745.jpg
  Sets\train\Play\GH085451_frame_5746.jpg
  Sets\train\Play\GH085451_frame_5747.jpg
  Se

In [None]:
# Check if CUDA is available and use it if possible
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class CNN_LSTM(nn.Module):
    def __init__(self, cnn_model, hidden_size, num_classes=2, num_layers=1):
        super(CNN_LSTM, self).__init__()
        self.cnn = cnn_model
        self.lstm = nn.LSTM(input_size=2048, hidden_size=hidden_size, num_layers=num_layers, batch_first=True) # 2048 for ResNet
        self.fc = nn.Linear(hidden_size, num_classes)
        #self.sigmoid = nn.Sigmoid()
        

    def forward(self, x):
        batch_size, seq_length, C, H, W = x.size()  # Extract the dimensions of the input

        # Reshape input for CNN
        
        c_in = x.view(batch_size * seq_length, C, H, W) 
        c_out = self.cnn(c_in)  # Run through CNN for feature extraction
        c_out = c_out.view(batch_size, seq_length, -1)  

        # Run through LSTM for sequence processing
        r_out, (h_n, c_n) = self.lstm(c_out)  # LSTM layer
        out = self.fc(r_out[:, -1, :])  # Use last output of the LSTM for classification
        return out

# Load the pre-trained ResNet-50 model and modify output feature maps
resnet = models.resnet50(pretrained=True)
cnn_model = nn.Sequential(*list(resnet.children())[:-2], nn.AdaptiveAvgPool2d((1, 1)))

# Define the hidden size, input size, number of classes, and number of LSTM layers
hidden_size = 512
num_classes = 2 
#num_classes = 1  # Binary classification with single output for BCEWithLogitsLoss
num_layers = 1 # Example number of LSTM layers

# Instantiate the combined CNN-LSTM model
cnn_lstm_model = CNN_LSTM(cnn_model, hidden_size, num_classes, num_layers).to(device)

# Define the criterion, optimizer, and learning rate scheduler
#criterion = nn.BCEWithLogitsLoss()  # BCEWithLogitsLoss for binary classification (Sigmoid)
criterion = nn.CrossEntropyLoss()  # CrossEntropyLoss for classification tasks (Softmax)
optimizer_ft = optim.Adam(cnn_lstm_model.parameters(), lr=0.0001)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)


In [None]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=50, batch_update_interval=35):
    since = time.time()  # Track the start time for training duration

    best_model_wts = copy.deepcopy(model.state_dict())  # Keep a copy of the best model weights
    best_acc = 0.0  # Initialize the best accuracy

    print("Training start")  # Print training start message

    for epoch in range(num_epochs):  # Loop over epochs
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        for phase in ['train', 'val']:  # Each epoch has a training and validation phase
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            running_loss = 0.0  # Initialize running loss
            running_corrects = 0  # Initialize running correct predictions
            batch_count = 0  # Initialize batch count

            all_labels = []  # Store all true labels
            all_preds = []  # Store all predictions

            data_iter = iter(dataloaders[phase])  # Create an iterator for the DataLoader
            batch_total = len(dataloaders[phase])  # Total number of batches

            while True:
                try:
                    batch = next(data_iter)  # Get the next batch
                    inputs, labels, _ = batch  # Get the first two elements only, ignore the rest
                except StopIteration:
                    break  # Exit the loop if there are no more batches

                inputs = inputs.to(device)
                labels = labels.to(device).long()  # Ensure labels are of type long for CrossEntropyLoss

                optimizer.zero_grad()  # Zero the parameter gradients

                # Forward pass
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)  # Get the index of the max log-probability
                    loss = criterion(outputs, labels)

                    # Backward pass and optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # Update running loss and correct predictions
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                batch_count += 1

                # Collect labels and predictions for confusion matrix
                all_labels.extend(labels.cpu().numpy())
                all_preds.extend(preds.cpu().numpy())

                # Print update every `batch_update_interval` batches
                if batch_count % batch_update_interval == 0:
                    print(f'Batch {batch_count}/{batch_total}: {phase} Loss: {running_loss / (batch_count * inputs.size(0)):.4f} Acc: {running_corrects.double() / (batch_count * inputs.size(0)):.4f}')

            if phase == 'train':
                scheduler.step()  # Step the learning rate scheduler

            # Calculate epoch loss and accuracy
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            # Print confusion matrix for validation phase
            if phase == 'val':
                cm = confusion_matrix(np.array(all_labels).flatten(), np.array(all_preds).flatten())
                print(f'Confusion Matrix for epoch {epoch}:\n{cm}')

            # Deep copy the best model weights and save the model if it has the best accuracy
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(best_model_wts, 'best_model.pth')  # Save the best model weights

        print()

    # Calculate total training time
    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_acc:.4f}')

    # Load best model weights
    model.load_state_dict(best_model_wts)
    return model

# Instantiate the criterion for multi-class classification
criterion = nn.CrossEntropyLoss()

# Train and evaluate the model with live updates and confusion matrix printing
model_ft = train_model(cnn_lstm_model, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=35)

