In [None]:
import os
import sys

# Root of the project checkout. It is used both to make the project packages
# importable and to locate the .env file. The THESIS_PROJECT_ROOT environment
# variable overrides the default so the notebook is not tied to one machine;
# when it is unset the original hard-coded location is used.
PROJECT_ROOT = os.environ.get(
    "THESIS_PROJECT_ROOT",
    '/home/sonia2oo2soia/projects/Thesis-Project/Thesis-Project',
)
sys.path.insert(0, PROJECT_ROOT)

from dotenv import load_dotenv
load_dotenv(os.path.join(PROJECT_ROOT, '.env'))


import numpy as np
import torch
import gc
import matplotlib.pyplot as plt

from torch.optim.lr_scheduler import StepLR
from torch.nn import BCELoss, BCEWithLogitsLoss
from torch.optim import Adam
from torch.utils import data
from torch.cuda.amp import GradScaler, autocast

from data_handling.data_augmentation import VideoTransform
from data_handling.video_dataset import VideoDataset
from model.violence_detection_model import ViolenceDetectionModel
from utils import collate_fn_pad


# Hyperparameters read by train().
config = dict(
    lr=0.0008279,     # learning rate handed to Adam
    epochs=1,         # number of epochs run per fold
    factor=0.02839,   # gamma of the StepLR scheduler
    batch=20,         # DataLoader batch size
    n_folds=4,        # number of folds requested from the dataset's k_fold
    step_size=1,      # StepLR step size, in epochs
)

def _train_one_epoch(vd_model, dataloader, criterion, optimizer, scaler,
                     device, accumulation_steps, use_amp):
    """Run one training pass over `dataloader` with gradient accumulation.

    Args:
        vd_model: model called as ``vd_model(videos, lengths)``.
        dataloader: iterable yielding ``(videos, labels, lengths)`` batches.
        criterion: loss function applied to ``(outputs, labels)``.
        optimizer: optimizer stepped once per accumulation window.
        scaler: ``GradScaler`` used for the backward pass and optimizer step.
        device: device the videos and labels are moved to.
        accumulation_steps: number of batches accumulated per optimizer step.
        use_amp: whether autocast is enabled for the forward pass.

    Returns:
        ``(mean_batch_loss, step_losses)`` where ``mean_batch_loss`` is the
        mean of the unscaled per-batch losses (NaN if there are no batches)
        and ``step_losses`` holds the mean unscaled loss of each accumulation
        window, one entry per optimizer step.
    """
    vd_model.train()

    n_batches = len(dataloader)
    epoch_loss_sum = 0.0
    window_loss_sum = 0.0
    window_batches = 0
    step_losses = []

    for i, batch in enumerate(dataloader):
        print("batch")

        videos, labels, lengths = batch
        videos, labels = videos.to(device), labels.to(device)

        with autocast(enabled=use_amp):
            outputs = vd_model(videos, lengths)
            raw_loss = criterion(outputs, labels)
            # Scale so the gradients summed over a full window form an average.
            # NOTE(review): a final partial window is still divided by
            # accumulation_steps, so its gradient is smaller than a true mean.
            loss = raw_loss / accumulation_steps

        scaler.scale(loss).backward()

        # Track the UNSCALED loss: the reported numbers must not shrink with
        # accumulation_steps, otherwise they are not comparable to val loss.
        batch_loss = raw_loss.item()
        epoch_loss_sum += batch_loss
        window_loss_sum += batch_loss
        window_batches += 1

        if ((i + 1) % accumulation_steps == 0) or (i + 1 == n_batches):
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            print("opt")
            step_losses.append(window_loss_sum / window_batches)
            window_loss_sum = 0.0
            window_batches = 0

        # Free the batch tensors eagerly before loading the next batch.
        del videos, labels
        gc.collect()
        torch.cuda.empty_cache()

    mean_loss = epoch_loss_sum / n_batches if n_batches else float("nan")
    return mean_loss, step_losses


def _evaluate(vd_model, dataloader, criterion, device, use_amp):
    """Return the mean per-batch loss over `dataloader` without gradients.

    The model is switched to eval mode. The result is NaN if the dataloader
    yields no batches.
    """
    vd_model.eval()

    n_batches = len(dataloader)
    loss_sum = 0.0

    with torch.no_grad():
        for batch in dataloader:
            print("batch")
            videos, labels, lengths = batch
            videos, labels = videos.to(device), labels.to(device)

            with autocast(enabled=use_amp):
                outputs = vd_model(videos, lengths)
                loss = criterion(outputs, labels)

            loss_sum += loss.item()

            del videos, labels
            gc.collect()
            torch.cuda.empty_cache()

    return loss_sum / n_batches if n_batches else float("nan")


def _plot_losses(step_losses, train_losses, val_losses):
    """Show per-optimizer-step losses next to per-epoch train/val losses."""
    fig, ax = plt.subplots(1, 2, figsize=(20, 6))

    # Losses recorded at each optimizer step on the first subplot
    ax[0].plot(step_losses, label="Loss per batch")
    ax[0].set_title('Batch Losses')
    ax[0].set_xlabel('Batch')
    ax[0].set_ylabel('Loss')
    ax[0].legend()

    # Train and validation losses per epoch on the second subplot. The x axis
    # is derived from the data so it always matches the number of points.
    ax[1].plot(np.arange(len(train_losses)), train_losses, label='Train Loss')
    ax[1].plot(np.arange(len(val_losses)), val_losses, label='Validation Loss')
    ax[1].set_title('Training and Validation Losses')
    ax[1].set_xlabel('Epochs')
    ax[1].set_ylabel('Loss')
    ax[1].legend()

    plt.show()
    plt.close(fig)


def train():
    """Train the violence detection model on the first fold and plot losses.

    Reads hyperparameters from the module-level `config` dict. For the first
    fold returned by the dataset's `k_fold`, a fresh model is trained for
    `config["epochs"]` epochs with mixed precision (on CUDA) and gradient
    accumulation, evaluated after every epoch, the loss curves are plotted,
    and the weights are written to 'model_weights.pth'. Returns None.

    `config["accumulation_steps"]` is optional and defaults to 5.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    use_amp = device == "cuda"
    if use_amp:
        # torch.cuda.synchronize() raises when no CUDA device is present,
        # so only touch the CUDA runtime when it is actually available.
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    print("device: {}".format(device))

    print("set transforms for each dataset")
    # NOTE(review): the transform is configured for "RWF-2000" while the
    # dataset below is "RWF-300" - confirm this pairing is intended.
    hf_transforms = VideoTransform(dataset="RWF-2000", json_file="augmentation_values.json")

    print("initialize datasets")
    hf_dataset = VideoDataset(dataset="RWF-300", transformations=hf_transforms)

    accumulation_steps = config.get("accumulation_steps", 5)

    for dataset in [hf_dataset]:
        print("dataset {}".format(dataset.dataset))

        for n, fold in enumerate(dataset.k_fold(n_folds=config["n_folds"])):
            print("fold {}".format(n))
            dataloader = data.DataLoader(fold, batch_size=config["batch"], collate_fn=collate_fn_pad, shuffle=True, num_workers=2)

            print("create model")
            vd_model = ViolenceDetectionModel(json_file="model_settings.json")
            vd_model = vd_model.to(device)

            criterion = BCEWithLogitsLoss()
            optimizer = Adam(params=vd_model.parameters(), lr=config["lr"])
            scheduler = StepLR(optimizer=optimizer, step_size=config["step_size"], gamma=config["factor"])
            scaler = GradScaler(enabled=use_amp)

            # Per-fold histories, so the plots only ever show this fold.
            losses_per_batch = []
            train_losses = []
            val_losses = []

            for epoch in range(config["epochs"]):
                print("epoch {}".format(epoch))

                # ------------train------------
                print("training")

                # use training part of dataset
                fold.flag = False

                print(len(fold))
                print(len(dataloader))

                avg_loss, step_losses = _train_one_epoch(
                    vd_model, dataloader, criterion, optimizer, scaler,
                    device, accumulation_steps, use_amp,
                )
                losses_per_batch.extend(step_losses)
                train_losses.append(avg_loss)
                print("train loss: {}".format(avg_loss))

                scheduler.step()

                # ------------val------------
                print("evaluating")

                # presumably selects the validation part of the fold
                fold.flag = True

                print(len(fold))
                print(len(dataloader))

                avg_loss = _evaluate(vd_model, dataloader, criterion, device, use_amp)
                print("val loss: {}".format(avg_loss))

                val_losses.append(avg_loss)

            _plot_losses(losses_per_batch, train_losses, val_losses)

            torch.save(vd_model.state_dict(), 'model_weights.pth')
            break  # fold
# Run the training loop defined above when this cell is executed.
train()

In [2]:
# Run the standalone execution/train.py script in a shell subprocess; the
# captured output below shows it creating a wandb sweep and starting a run.
! python /home/sonia2oo2soia/projects/Thesis-Project/Thesis-Project/execution/train.py 

Create sweep with ID: urr7b8k9
Sweep URL: https://wandb.ai/soniamatei/vd_model_training/sweeps/urr7b8k9
[34m[1mwandb[0m: Agent Starting Run: g4xf6r5u with config:
[34m[1mwandb[0m: 	batch_size: 24
[34m[1mwandb[0m: 	epoch: 2
[34m[1mwandb[0m: 	factor: 0.006869515440405294
[34m[1mwandb[0m: 	learning_rate: 0.002407053645211852
[34m[1mwandb[0m: 	n_folds: 4
[34m[1mwandb[0m: 	step_size: 1
[34m[1mwandb[0m: 	treshold: 0.5
initialize wandb
[34m[1mwandb[0m: Currently logged in as: [33msoniamatei[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.17.0
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/home/sonia2oo2soia/projects/Thesis-Project/Thesis-Project/execution/wandb/run-20240530_115729-g4xf6r5u[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mgraceful-sweep-1[0m
[34m[1mwandb[0m: ‚≠êÔ∏è View project at [34m[4mhttps://wandb.ai/soni