In [11]:
%load_ext autoreload
%autoreload 2

from torch.utils.data import DataLoader
import sys

sys.path.insert(0, "datasets")
from datasets.MSDWildOptimized import LazyNPZDataset
import numpy as np
from pathlib import Path
import random

import torch
import pandas as pd
import torch.nn.functional as F
from models.VisualOnly import VisualOnlyModel
from sklearn.cluster import AgglomerativeClustering
from losses.DiarizationLoss import DiarizationLogitsLoss
from tqdm import tqdm
from torch.utils.data import Subset
from pairs.config import S3_BUCKET_NAME, S3_VIDEO_DIR
import os
from training.train_multimodal import *
from training.visual_train import *
from torchsummaryX import summary

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Dataloader
NPZ_PATH = "triplet_batches"
NPZ_BATCH_SIZE = 5000
VISUAL_TYPE = "face"  # lip or face

MODEL_BATCH_SIZE = 64

CHECKPOINT_PATH = "model_checkpoints"

os.makedirs(CHECKPOINT_PATH, exist_ok=True)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

cpu


In [13]:
train_dataset = LazyNPZDataset(
    npz_dir=NPZ_PATH,
    batch_size=NPZ_BATCH_SIZE,
    end=65,
    bucket=S3_BUCKET_NAME,
    shuffle_within_file=True,
    visual_type=VISUAL_TYPE,
)

val_dataset = LazyNPZDataset(
    npz_dir=NPZ_PATH,
    batch_size=NPZ_BATCH_SIZE,
    start=66,
    bucket=S3_BUCKET_NAME,
    shuffle_within_file=True,
    visual_type=VISUAL_TYPE,
)

In [14]:
train_loader = DataLoader(
    train_dataset,
    batch_size=MODEL_BATCH_SIZE,
    shuffle=False,
    collate_fn=train_dataset.collate_fn,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=MODEL_BATCH_SIZE,
    shuffle=False,
    collate_fn=val_dataset.collate_fn,
)

In [15]:
num = 0
for batch in tqdm(train_loader):
    # print("Visual Data Shape:", batch[0].shape)
    # print("Audio Data Shape:", batch[1].shape)
    # print("Labels:", batch[2])
    # break
    num += 1
print(f"got through {num} batches")

  0%|          | 0/5079 [00:02<?, ?it/s]


KeyboardInterrupt: 

In [16]:
if VISUAL_TYPE == "face":
    visual_model = VisualOnlyModel(
        embedding_dims=1024, weights_path="pretrained/visual_encoder.pth"
    )
elif VISUAL_TYPE == "lip":
    visual_model = None  # TODO: add ResNet Model
else:
    raise ValueError("Invalid visual type: " + VISUAL_TYPE)

audio_model = None  # TODO: add audio model

model = None  # TODO: create multimodal model

optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
criterion = DiarizationLogitsLoss(0.3, 0.7)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.8, patience=8
)

start_epoch = 0
final_epoch = 100

RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [59]:
metrics = {}
best_valid_acc = 0
for epoch in range(start_epoch, final_epoch):
    print("\nEpoch {}/{}".format(epoch + 1, final_epoch))
    curr_lr = float(scheduler.get_last_lr()[0])
    metrics.update({"lr": curr_lr})
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, criterion)
    print(
        "\nEpoch {}/{}: \nTrain Cls. Acc {:.04f}%\t Train Cls. Loss {:.04f}\t Learning Rate {:.04f}".format(
            epoch + 1, final_epoch, train_acc, train_loss, curr_lr
        )
    )
    metrics.update({"train_cls_acc": train_acc, "train_loss": train_loss})
    valid_acc, valid_loss = evaluate_epoch(model, val_loader, criterion)
    print("Val Cls. Acc {:.04f}%\t Val Cls. Loss {:.04f}".format(valid_acc, valid_loss))
    metrics.update({"valid_cls_acc": valid_acc, "valid_loss": valid_loss})
    if epoch % 5 == 4:
        epoch_ckpt_path = Path(CHECKPOINT_PATH, f"epoch_{epoch+1}.pth")
        save_model(model, metrics, epoch, epoch_ckpt_path)
    if valid_acc >= best_valid_acc:
        best_valid_acc = valid_acc
        save_model(model, metrics, epoch, Path(CHECKPOINT_PATH, "best_visual.pth"))
    if scheduler is not None:
        scheduler.step(valid_loss)
save_model(model, metrics, epoch, Path(CHECKPOINT_PATH, "last_visual.pth"))

NameError: name 'start_epoch' is not defined

In [26]:
# dataset_size = len(full_dataset)
# indices = list(range(dataset_size))
# random.shuffle(indices)
# split = int(0.8 * dataset_size)
# train_indices = indices[:split]
# val_indices   = indices[split:]
# train_subset = Subset(full_dataset, train_indices)
# val_subset   = Subset(full_dataset, val_indices)
# train_loader = DataLoader(
#     train_subset,
#     batch_size=batch_size,
#     shuffle=True,
#     collate_fn=full_dataset.build_batch
# )
# val_loader = DataLoader(
#     val_subset,
#     batch_size=batch_size,
#     shuffle=False,
#     collate_fn=full_dataset.build_batch
# )
# print(f"Train size: {len(train_subset)}   Val size: {len(val_subset)}")
# model = VisualOnlyModel(embedding_dims=512, num_classes=2)
# model = model.float().to(DEVICE)
# optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
# criterion = DiarizationLogitsLoss(0.3, 0.7)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.8, patience=8)
# start_epoch = 0
# final_epoch = 100
# metrics = {}
# best_valid_acc = 0
# for epoch in range(start_epoch, final_epoch):
#         print("\nEpoch {}/{}".format(epoch+1, final_epoch))
#         curr_lr = float(scheduler.get_last_lr()[0])
#         metrics.update({'lr': curr_lr})
#         train_acc, train_loss = train_epoch(model, train_loader, optimizer, criterion)
#         print("\nEpoch {}/{}: \nTrain Cls. Acc {:.04f}%\t Train Cls. Loss {:.04f}\t Learning Rate {:.04f}".format(epoch + 1, final_epoch, train_acc, train_loss, curr_lr))
#         metrics.update({'train_cls_acc': train_acc, 'train_loss': train_loss})
#         valid_acc, valid_loss = evaluate_epoch(model, val_loader, criterion)
#         print("Val Cls. Acc {:.04f}%\t Val Cls. Loss {:.04f}".format(valid_acc, valid_loss))
#         metrics.update({'valid_cls_acc': valid_acc, 'valid_loss': valid_loss})
#         if epoch%5==4:
#             epoch_ckpt_path = Path(CHECKPOINT_PATH, f"epoch_{epoch+1}.pth")
#             save_model(model, metrics, epoch, epoch_ckpt_path)
#         if valid_acc >= best_valid_acc:
#             best_valid_acc = valid_acc
#             save_model(model, metrics, epoch, Path(CHECKPOINT_PATH, 'best_visual.pth'))
#         if scheduler is not None:
#             scheduler.step(valid_loss)
# save_model(model, metrics, epoch, Path(CHECKPOINT_PATH, 'last_visual.pth'))

In [93]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##LOAD CHECKPOINTS
audio_model = CompactAudioEmbedding(input_dim=40, embedding_dim=512, dropout_rate=0.3)
old_audio_dict = torch.load("12.pth", map_location=DEVICE)

new_audio_state_dict = {}
for key, value in old_audio_dict.items():
    if key.startswith("classifier"):
        continue
    new_key = key.replace("encoder.", "")
    new_audio_state_dict[new_key] = value

audio_model.load_state_dict(new_audio_state_dict)

visual_model = ResNet34(embedding_dims=512)

vid_state_dict = torch.load("model_checkpoints/epoch_55.pth", map_location=DEVICE)[
    "model_state_dict"
]

new_vid_state_dict = {}
for key, value in vid_state_dict.items():
    if key.startswith("visual_encoder."):
        new_key = key.replace("visual_encoder.", "")
        new_vid_state_dict[new_key] = value
    elif key.startswith("classifier"):
        continue
    else:
        new_vid_state_dict[key] = value
visual_model.load_state_dict(new_vid_state_dict)

fusion_model = ConcatenationFusionModel(
    audio_model=audio_model,
    visual_model=visual_model,
    fusion_dim=512,
    embedding_dim=512,
    fusion_type="additive",
).to(DEVICE)

# train_rttm_path = "data_sample/all.rttm"
# train_data_path = "preprocessed"

# train_dataset_full = MSDWildChunks(
#     data_path=train_data_path, rttm_path=train_rttm_path, subset=0.8
# )

# # split few_train into train + val
train_size = int(0.8 * full_dataset.length)
val_size = full_dataset.length - train_size

train_subset, val_subset = random_split(
    full_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(69),
)

batch_size = 64
train_loader = DataLoader(
    train_subset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=4,
    pin_memory=True,
)

val_loader = DataLoader(
    val_subset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
    num_workers=4,
    pin_memory=True,
)

unfreeze_schedule = {
    5: "audio_last",  # after 2 epochs - unfreeze last layers of audio encoder
    5: "visual_last",  # same as above for visual
    10: "all",  # unfreeze everything
}

optimizer = optim.AdamW(
    [
        {"params": fusion_model.fusion_linear.parameters()},
        {"params": fusion_model.bn.parameters()},
        {"params": fusion_model.fusion_embedding.parameters()},
        {"params": fusion_model.classifier.parameters()},
    ],
    lr=0.001,
    weight_decay=0.01,
)

criterion = DiarizationLoss(triplet_lambda=0.3, bce_lambda=0.7)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2, factor=0.8)

trained_model, best_val_loss = train_fusion_model(
    fusion_model,
    train_loader,
    val_loader,
    optimizer,
    criterion,
    scheduler,
    DEVICE,
    num_epochs=100,
    unfreeze_schedule=unfreeze_schedule,
    checkpoint_dir="multimodal_concat_512_checkpoints",
)


Epoch 1/100
training: loss: 0.2320, acc: 0.9627
val: loss: 0.1941, acc: 0.9675
Saving to multimodal_concat_512_checkpoints/best_model_1.pth

Epoch 2/100
training: loss: 0.1196, acc: 0.9684
val: loss: 0.1182, acc: 0.9675
Saving to multimodal_concat_512_checkpoints/best_model_2.pth

Epoch 3/100
training: loss: 0.1006, acc: 0.9669
val: loss: 0.1094, acc: 0.9675
Saving to multimodal_concat_512_checkpoints/best_model_3.pth

Epoch 4/100
training: loss: 0.0929, acc: 0.9660
val: loss: 0.1033, acc: 0.9675
Saving to multimodal_concat_512_checkpoints/best_model_4.pth

Epoch 5/100
training: loss: 0.0866, acc: 0.9648
val: loss: 0.1046, acc: 0.9675
Saving to multimodal_concat_512_checkpoints/epoch_multimodal_5.pth

Epoch 6/100
Update Schedule to visual_last
training: loss: 0.0816, acc: 0.9678
val: loss: 0.0829, acc: 0.9675
Saving to multimodal_concat_512_checkpoints/best_model_6.pth

Epoch 7/100
training: loss: 0.0807, acc: 0.9675
val: loss: 0.0823, acc: 0.9675
Saving to multimodal_concat_512_check