In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets
from torchvision import  transforms
from torchvision import models
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import copy
import random
from sklearn.metrics import confusion_matrix


In [2]:
# Define data transformations
data_transforms = {
    'train': transforms.Compose([
        transforms.Resize((224, 224)),
        #transforms.RandomHorizontalFlip(p=0.5),
        #transforms.RandomRotation(degrees=10),
        #transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
  
    ]),
    'test': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

# Path to your data directory
data_dir = 'Sets'

# Load data from directories, focusing on 'Play' and 'Time_Between' classes only
def filter_classes(dataset, classes_to_include):
    # Filter samples
    filtered_samples = [(path, label) for path, label in dataset.samples if dataset.classes[label] in classes_to_include]
    
    # Reassign targets and samples
    new_targets = []
    new_samples = []
    class_to_idx = {cls: idx for idx, cls in enumerate(classes_to_include)}
    
    for path, label in filtered_samples:
        class_name = dataset.classes[label]
        if class_name in class_to_idx:
            new_label = class_to_idx[class_name]
            new_samples.append((path, new_label))
            new_targets.append(new_label)
    
    dataset.samples = new_samples
    dataset.targets = new_targets
    dataset.classes = classes_to_include
    dataset.class_to_idx = class_to_idx

# Apply the filter to each dataset split
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x]) for x in ['train', 'val', 'test']}
for phase in ['train', 'val', 'test']:
    filter_classes(image_datasets[phase], ['Play', 'Time_Between'])

# Check dataset sizes and class names
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val', 'test']}
class_names = image_datasets['train'].classes

# Print dataset sizes and class names
print("Filtered dataset sizes:", dataset_sizes)
print("Filtered classes:", class_names)

# Print number of frames for each class in each dataset
for phase in ['train', 'val', 'test']:
    print(f"\n{phase.upper()} dataset:")
    class_counts = {class_name: 0 for class_name in class_names}
    for _, label in image_datasets[phase].samples:
        class_name = class_names[label]
        class_counts[class_name] += 1
    for class_name, count in class_counts.items():
        print(f"  {class_name}: {count} frames")

Filtered dataset sizes: {'train': 316795, 'val': 56903, 'test': 63966}
Filtered classes: ['Play', 'Time_Between']

TRAIN dataset:
  Play: 128701 frames
  Time_Between: 188094 frames

VAL dataset:
  Play: 23801 frames
  Time_Between: 33102 frames

TEST dataset:
  Play: 26188 frames
  Time_Between: 37778 frames


In [3]:
from collections import defaultdict

# Function to balance the dataset and adjust frame count for each phase
def balance_and_reduce_sequences(dataset, seq_length, max_frames_per_class):
    # Count the number of samples for each class
    class_counts = defaultdict(list)
    for idx, (path, label) in enumerate(dataset.samples):
        class_name = dataset.classes[label]
        class_counts[class_name].append((path, label))

    # Determine the maximum number of sequences per class
    max_sequences_per_class = max_frames_per_class // seq_length

    # Balance the dataset by reducing the larger class sequences
    balanced_samples = []
    for class_name in class_counts:
        samples = class_counts[class_name][:max_sequences_per_class * seq_length]
        balanced_samples.extend(samples)
    
    # Update the dataset with the balanced and reduced samples
    dataset.samples = balanced_samples
    dataset.targets = [label for _, label in balanced_samples]

# Define the sequence length
seq_length = 60  # Number of frames per sequence

# Define maximum frames for each dataset
max_frames_train = 50000
max_frames_val_test = 7500

# Apply the balance and reduce function to each dataset split
balance_and_reduce_sequences(image_datasets['train'], seq_length, max_frames_train)
balance_and_reduce_sequences(image_datasets['val'], seq_length, max_frames_val_test)
balance_and_reduce_sequences(image_datasets['test'], seq_length, max_frames_val_test)

# Continue with creating and using the VideoDataset class, and creating DataLoaders
class VideoDataset(Dataset):
    def __init__(self, image_folder_dataset, seq_length, transform=None):
        self.image_folder_dataset = image_folder_dataset
        self.seq_length = seq_length
        self.transform = transform

        # Sort the samples to ensure they are in the correct order
        self.image_folder_dataset.samples.sort()

    def __len__(self):
        return len(self.image_folder_dataset) // self.seq_length

    def __getitem__(self, idx):
        images = []
        labels = []
        img_paths = []
        for i in range(self.seq_length):
            img_path, label = self.image_folder_dataset.samples[idx * self.seq_length + i]
            img = self.image_folder_dataset.loader(img_path)
            if self.transform:
                img = self.transform(img)
            images.append(img)
            labels.append(label)
            img_paths.append(img_path)

        images = torch.stack(images)
        labels = torch.tensor(labels)
        assert torch.all(labels == labels[0])

        return images, labels[0], img_paths

video_datasets = {x: VideoDataset(image_datasets[x], seq_length, data_transforms[x]) for x in ['train', 'val', 'test']}
dataloaders = {x: DataLoader(video_datasets[x], batch_size=4, shuffle=True) for x in ['train', 'val', 'test']} #Shuffle wurde von False auf True gesetzt

# Check balanced dataset sizes
balanced_dataset_sizes = {x: len(video_datasets[x]) for x in ['train', 'val', 'test']}
print("Balanced dataset sizes:", balanced_dataset_sizes)

for phase in ['train', 'val', 'test']:
    print(f"\n{phase.upper()} dataset:")
    class_counts = {class_name: 0 for class_name in class_names}
    for _, label in image_datasets[phase].samples:
        class_name = class_names[label]
        class_counts[class_name] += 1
    for class_name, count in class_counts.items():
        print(f"  {class_name}: {count} frames")

for phase in ['train', 'val', 'test']:
    print(f"\n{phase.upper()} dataset (after balancing and reducing):")
    class_counts = {class_name: 0 for class_name in class_names}
    for _, label in video_datasets[phase].image_folder_dataset.samples:
        class_name = class_names[label]
        class_counts[class_name] += 1
    for class_name, count in class_counts.items():
        print(f"  {class_name}: {count // seq_length} sequences ({count} frames)")


Balanced dataset sizes: {'train': 1666, 'val': 250, 'test': 250}

TRAIN dataset:
  Play: 49980 frames
  Time_Between: 49980 frames

VAL dataset:
  Play: 7500 frames
  Time_Between: 7500 frames

TEST dataset:
  Play: 7500 frames
  Time_Between: 7500 frames

TRAIN dataset (after balancing and reducing):
  Play: 833 sequences (49980 frames)
  Time_Between: 833 sequences (49980 frames)

VAL dataset (after balancing and reducing):
  Play: 125 sequences (7500 frames)
  Time_Between: 125 sequences (7500 frames)

TEST dataset (after balancing and reducing):
  Play: 125 sequences (7500 frames)
  Time_Between: 125 sequences (7500 frames)


In [4]:
# Select a random sequence from the training dataset and print the frame names
def print_random_sequence(dataloader):
    dataset = dataloader.dataset
    idx = random.randint(0, len(dataset) - 1)
    images, label, img_paths = dataset[idx]
    print(f"Random Sequence - Label: {class_names[label]}")
    for img_path in img_paths:
        print(f"  {img_path}")

# Print a random sequence from the training dataset
print_random_sequence(dataloaders['train'])

Random Sequence - Label: Play
  Sets\train\Play\GH065451_frame_20339.jpg
  Sets\train\Play\GH065451_frame_2034.jpg
  Sets\train\Play\GH065451_frame_20340.jpg
  Sets\train\Play\GH065451_frame_20341.jpg
  Sets\train\Play\GH065451_frame_20342.jpg
  Sets\train\Play\GH065451_frame_20343.jpg
  Sets\train\Play\GH065451_frame_20344.jpg
  Sets\train\Play\GH065451_frame_20345.jpg
  Sets\train\Play\GH065451_frame_20346.jpg
  Sets\train\Play\GH065451_frame_20347.jpg
  Sets\train\Play\GH065451_frame_20348.jpg
  Sets\train\Play\GH065451_frame_20349.jpg
  Sets\train\Play\GH065451_frame_2035.jpg
  Sets\train\Play\GH065451_frame_20350.jpg
  Sets\train\Play\GH065451_frame_20351.jpg
  Sets\train\Play\GH065451_frame_20352.jpg
  Sets\train\Play\GH065451_frame_20353.jpg
  Sets\train\Play\GH065451_frame_20354.jpg
  Sets\train\Play\GH065451_frame_20355.jpg
  Sets\train\Play\GH065451_frame_20356.jpg
  Sets\train\Play\GH065451_frame_20357.jpg
  Sets\train\Play\GH065451_frame_20358.jpg
  Sets\train\Play\GH065451

Frame Input: Accepts a sequence of images as input, with each image representing a frame

CNN Processing:
Each frame is processed individually by the CNN (ResNet 50)
The CNN acts as a feature extractor, converting each image into a high-dimensional feature vector

Pooling:
Features from each frame are reduced to a single vector using nn.AdaptiveAvgPool2d((1, 1)), simplifying each frame's information into a fixed-size vector

LSTM Processing:
The sequence of compacted feature vectors from all frames is fed into the LSTM network
LSTM processes the entire sequence, capturing temporal dependencies and patterns across frames

Sequence Analysis:
The LSTM analyzes the sequence of vectors to understand changes in visual features over time

In [5]:
# Check if CUDA is available and use it if possible
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class CNN_LSTM(nn.Module):
    def __init__(self, cnn_model, hidden_size, num_classes=2, num_layers=1):
        super(CNN_LSTM, self).__init__()
        self.cnn = cnn_model
        self.lstm = nn.LSTM(input_size=2048, hidden_size=hidden_size, num_layers=num_layers, batch_first=True) # 2048 for ResNet
        self.fc = nn.Linear(hidden_size, num_classes)
        #self.sigmoid = nn.Sigmoid()
        

    def forward(self, x):
        batch_size, seq_length, C, H, W = x.size()  # Extract the dimensions of the input

        # Reshape input for CNN
        
        c_in = x.view(batch_size * seq_length, C, H, W) 
        c_out = self.cnn(c_in)  # Run through CNN for feature extraction
        c_out = c_out.view(batch_size, seq_length, -1)  

        # Run through LSTM for sequence processing
        r_out, (h_n, c_n) = self.lstm(c_out)  # LSTM layer
        out = self.fc(r_out[:, -1, :])  # Use last output of the LSTM for classification
        return out

# Load the pre-trained ResNet-50 model and modify output feature maps
resnet = models.resnet50(pretrained=True)
cnn_model = nn.Sequential(*list(resnet.children())[:-2], nn.AdaptiveAvgPool2d((1, 1)))

# Define the hidden size, input size, number of classes, and number of LSTM layers
hidden_size = 512
num_classes = 2 
#num_classes = 1  # Binary classification with single output for BCEWithLogitsLoss
num_layers = 1 # Example number of LSTM layers

# Instantiate the combined CNN-LSTM model
cnn_lstm_model = CNN_LSTM(cnn_model, hidden_size, num_classes, num_layers).to(device)

# Define the criterion, optimizer, and learning rate scheduler
#criterion = nn.BCEWithLogitsLoss()  # BCEWithLogitsLoss for binary classification (Sigmoid)
criterion = nn.CrossEntropyLoss()  # CrossEntropyLoss for classification tasks (Softmax)
optimizer_ft = optim.Adam(cnn_lstm_model.parameters(), lr=0.0001)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)


Using device: cuda:0




In [6]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=35, batch_update_interval=50):
    since = time.time()  # Track the start time for training duration

    best_model_wts = copy.deepcopy(model.state_dict())  # Keep a copy of the best model weights
    best_acc = 0.0  # Initialize the best accuracy

    print("Training start")  # Print training start message

    for epoch in range(num_epochs):  # Loop over epochs
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        for phase in ['train', 'val']:  # Each epoch has a training and validation phase
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            running_loss = 0.0  # Initialize running loss
            running_corrects = 0  # Initialize running correct predictions
            batch_count = 0  # Initialize batch count

            all_labels = []  # Store all true labels
            all_preds = []  # Store all predictions

            data_iter = iter(dataloaders[phase])  # Create an iterator for the DataLoader
            batch_total = len(dataloaders[phase])  # Total number of batches

            while True:
                try:
                    batch = next(data_iter)  # Get the next batch
                    inputs, labels, _ = batch  # Get the first two elements only, ignore the rest
                except StopIteration:
                    break  # Exit the loop if there are no more batches

                inputs = inputs.to(device)
                labels = labels.to(device).long()  # Ensure labels are of type long for CrossEntropyLoss

                optimizer.zero_grad()  # Zero the parameter gradients

                # Forward pass
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)  # Get the index of the max log-probability
                    loss = criterion(outputs, labels)

                    # Backward pass and optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # Update running loss and correct predictions
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                batch_count += 1

                # Collect labels and predictions for confusion matrix
                all_labels.extend(labels.cpu().numpy())
                all_preds.extend(preds.cpu().numpy())

                # Print update every `batch_update_interval` batches
                if batch_count % batch_update_interval == 0:
                    print(f'Batch {batch_count}/{batch_total}: {phase} Loss: {running_loss / (batch_count * inputs.size(0)):.4f} Acc: {running_corrects.double() / (batch_count * inputs.size(0)):.4f}')

            if phase == 'train':
                scheduler.step()  # Step the learning rate scheduler

            # Calculate epoch loss and accuracy
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            # Print confusion matrix for validation phase
            if phase == 'val':
                cm = confusion_matrix(np.array(all_labels).flatten(), np.array(all_preds).flatten())
                print(f'Confusion Matrix for epoch {epoch}:\n{cm}')

            # Deep copy the best model weights and save the model if it has the best accuracy
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(best_model_wts, 'best_model.pth')  # Save the best model weights

        print()

    # Calculate total training time
    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_acc:.4f}')

    # Load best model weights
    model.load_state_dict(best_model_wts)
    return model

# Instantiate the criterion for multi-class classification
criterion = nn.CrossEntropyLoss()

# Train and evaluate the model with live updates and confusion matrix printing
model_ft = train_model(cnn_lstm_model, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=35)



Training start
Epoch 0/34
----------
Batch 50/417: train Loss: 0.7151 Acc: 0.6050
Batch 100/417: train Loss: 0.6750 Acc: 0.6175
Batch 150/417: train Loss: 0.6720 Acc: 0.6150
Batch 200/417: train Loss: 0.6680 Acc: 0.6100
Batch 250/417: train Loss: 0.6687 Acc: 0.6060
Batch 300/417: train Loss: 0.6678 Acc: 0.5908
Batch 350/417: train Loss: 0.6636 Acc: 0.5814
Batch 400/417: train Loss: 0.6598 Acc: 0.5750
train Loss: 0.0035 Acc: 0.0030
Batch 50/63: val Loss: 0.7049 Acc: 0.5300
val Loss: 0.0031 Acc: 0.0022
Confusion Matrix for epoch 0:
[[77 48]
 [75 50]]

Epoch 1/34
----------
Batch 50/417: train Loss: 0.6407 Acc: 0.6000
Batch 100/417: train Loss: 0.6448 Acc: 0.6075
Batch 150/417: train Loss: 0.6416 Acc: 0.6033
Batch 200/417: train Loss: 0.6287 Acc: 0.6038
Batch 250/417: train Loss: 0.6273 Acc: 0.6110
Batch 300/417: train Loss: 0.6281 Acc: 0.6075
Batch 350/417: train Loss: 0.6304 Acc: 0.5964
Batch 400/417: train Loss: 0.6301 Acc: 0.5975
train Loss: 0.0033 Acc: 0.0031
Batch 50/63: val Loss: 0

- Derzeit: Model ohne Sigmoid weil loss bereits einen sigmoid verwendet im Training.
- Versucb mit evtl. batch_size 1 
- class number auf 2 => neue Loss function

Nochmal die label und Formatierung checken

Stand 22.05
Neue Loss function, num_classes = 2 
[[ 72 594]
 [ 96 570]]
 Nach 25 epoch => enrneuter Durchlauf mit 50 

Test Nummer 2: 
Gleicher Code, lr 0.0001, 25 epochs 
[[300 366]
 [294 372]]

 Problem bei diesen Tests: Alle Sets also train und val waren gleichgroß

23.05.
Erneuter Test mit gleichem Code, aber Erhöhung der epochs auf 50 und 70/15/15 Verhältnis der Sets
[[  0 250]
 [  0 250]]


24.05.
Test mit neuem Code und neuen Modell BC_2
Fehler bei Trainingsschleife, nur eine Klasse beachtet

Bei erneutem Test mit BC2 selbes Problem wie bei allen Sigmoid Anwendungen 
=> Detektion von nur einer Klasse

TO DO:
Val Datensatz prüfen

Test 24.05. Epochs 35
[[112 138]
 [115 135]]

Test 25.05. Epochs 35
Neue Sequenzgröße von 60 => Verdoppelung 
[[62 63]
 [63 62]]


Test 26.05.
Ohne Data Augmentation, Sequenz 60 
[[83 42]
 [70 55]]

Test 27.05.
Ohne DA, Sequenz 90
[[24 59]
 [31 52]]

Test 28.05
Neuer Code, welcher gleiches Modell verwendet. Aber getrennte Sequenzen mit 640 Frames betrachtet. Durch padding werden kleinere Sequnezen aufgefüllt. 
Abbruch nach 33 Stunden und 18 epochs, da loss und acc gleichbleibend war:
val Loss: 0.6935 Acc: 0.5000
Confusion Matrix for epoch 17:
[[15  0]
 [15  0]]


STAND 29.05
Binary_Classification Code ist bester. Nach Meeting wird bestehender Ansatz weiter verbessert
=> neues Skript aufbauend auf Binary_Classification 
Zusätzlich wurde bis jetzt eine batch_size von 1 verwendet, welche evtl. negative Auswirkungen hat, daher werden weitere Versuche mit höherer Batch_size durchgefürt (shuffle = True)

Test 29.05
Erneute ausführung des Binary_classification Codes
- Sequence = 60
- shuffle = True und Batch size 4
- Layer = 512
- Scheduler = 7

[[103  22]
 [ 77  48]]
 Bislang bestes Ergebnis mit 62 %

Überlegung layer size anpassen, für Test kleiner als 512, zB 256, 128
Learning Rate Scheduler auf jede bzw. jede 2. epoche anpassen
Eventuelle von AvgPooling auf MaxPooling wechseln 

Test 02.06
Binary_SequenceRow_Classification 
- Sequence = 60
- Batch_size = 2
- Layer = 256
- Scheduler = 2

Bestes Ergebnis: ~ 71.5%
[[2096  875]
 [ 814 2156]]

Hat sehr gut funktioniert, jedoch war DataLoader nicht optimal. Es waren teilweise beide Klassen in einer Sequenz. Diese Sequenzen wurden verworfen.
Ein weiterer Testlauf mit gleichem Code wäre sinnvoll (insofern Test vom 06.06 keine besseren Ergebnisse bringt). Bei neuem Test wird DataLoader angepasst.

Test 06.06 
Binary_Classification_SR_Seperation 
Sequenzen enthalten beide Label, es kommt daher innerhalb einer Sequenz zum wechsel einer Klasse
- Sequence = 60
- Batch_size = 8
- Layer = 256
- Scheduler = 2
- MaxPool2d statt AvgPooling

Bestes Ergebnis: 55% nach knapp 2 Tagen Laufzeit 
[[14378  9052]
 [10694  8956]]


Problem bei diesen Ansatz: Um Labelwechsel zu erkennen, wird für jedes einzelne frame eine Erkennung durchgeführt. Also für jedes frame innerhalb der Sequenz
=> kann zu niedriger Genauigkeit führen weil nicht ganze Sequenz analysiert wird 
=> NOTWENDIGE ANPASSUNG: Das weiterhin nur das letzte Frame der beachtet wird und durch um eins weiterschieben der Label wechsel erkannt wird 

Nächster Test: Binary_SequenceRow_Classification mit angepasstem DataLoader 

TEST 10.06.24
Binary_SequenceRow_Classification mit angepasstem DataLoader: 
Sequenzen können nur aus einem Video gebildet werden 
Sequenzen kontrollieren gesamten Pfad, um Sequenz zu erstellen 
Überlappung innerhalb der Sequenz weitherin vorhanden 

- Sequence = 60
- Batch_size = 2
- Layer = 256
- Scheduler = 4


TEST 11.06.24
Binary_SequenceRow_Classification 
- Sequence = 60
- Batch_size = 2
- Layer = 126
- num_layer = 2
- Scheduler = 2
- nn.AdaptiveAvgPool2d

Batch 1450/1471: val Loss: 0.5096 Acc: 0.8559
[[1325  146]
 [ 278 1192]]

TEST 12.06.24
- Sequence = 60
- Batch_size = 2
- Layer = 128
- num_layer = 2
- Scheduler = 2
- nn.AdaptiveAvgPool2d

Neuer DataLoader: Sequenzen können nur aus einem Video gebildet werden
Überlappung innerhalb der Sequenz weitherin vorhanden 

[[1379   92]
 [ 254 1217]]

Genaugkeit knapp 88%

Überprüfung mit Test-Datensatz 
bestmodel 1
63%
[[1159  312]
 [ 769  702]]

TEST 13.06

- Sequence = 60
- Batch_size = 2
- Layer = 126
- num_layer = 1
- Scheduler = 2
- nn.AdaptiveAvgPool2d
val acc: 88%

[[648  13]
 [143 518]]

 Test:
 bestmodel 2
 61%
 [[566  95]
 [431 230]]


TEST 14.06
seq_length = 60  
max_train_sequences = 1000
Neuer Dataloader => multi label werden nicht trainiert
Stark verringerte Sequenzanzahl 

Misserfolg, viele Epochs haben beide Klassen als eines erkannt => neuer Versuch 

TEST 23.06
mit gesamten Datensatz 
[[3640 5857]
 [2490 7007]]

TRAIN dataset: 126630 sequences
  Label 0: 63315 sequences
  Label 1: 63315 sequences
VAL dataset: 18994 sequences
  Label 0: 9497 sequences
  Label 1: 9497 sequences
TEST dataset: 18994 sequences
  Label 0: 9497 sequences
  Label 1: 9497 sequences


Augmentation 
[[ 662  838]
 [ 345 1155]] Test 
 

 [[1002  498]
 [ 476 1024]]  Val 