In [2]:
!nvidia-smi

Sun Oct  9 16:28:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.102.04   Driver Version: 450.102.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 208...  Off  | 00000000:01:00.0 Off |                  N/A |
|  0%   37C    P8    26W / 300W |     28MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:03:00.0 Off |                  N/A |
|  0%   35C    P8     4W / 300W |     14MiB / 11019MiB |      0%      Default |
|       

In [2]:
from comet_ml import Experiment

%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from PIL import Image
from datetime import datetime
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision.models as models
from torch.utils.data import DataLoader, Dataset

from einops import rearrange

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score


LOG_COMMET = True
DEBUG = False

if DEBUG:
    import matplotlib.pyplot as plt
    get_ipython().magic('matplotlib inline')
    FIG_PATH = "figures_{}".format(datetime.now().strftime("%d-%m-%Y_%H:%M:%S"))
    os.mkdir(FIG_PATH)

if LOG_COMMET:
    experiment = Experiment(
        api_key="7HshjkeTgLasPcZqrDwsPqq3J",
        project_name="conformer",
        workspace="standardai",
        auto_metric_logging=False, 
        log_code=True,
    )
        
    experiment.set_name("mavi-audio-conformer #{}".format(datetime.now().strftime("%d/%m/%Y - %H:%M:%S")))    
    experiment.set_code()
else:
    experiment = Experiment(api_key="7HshjkeTgLasPcZqrDwsPqq3J")

COMET INFO: Experiment is live on comet.ml https://www.comet.com/standardai/conformer/4422c3ea15024224a5d50ead81993c9f



In [3]:
import random
seed = 2023
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [4]:
from utils.colorjitter import StaticColorJitter
from utils.params import *
from utils.data_prep import get_train_val_test
from utils.dataset import DatasetFD, DATAMODE

In [5]:
import torchaudio
from torchaudio import models

In [6]:
best_acc = 0
model_path = "mavi-audio-conformer_tolga.pth"

In [7]:
if LOG_COMMET:
    experiment.log_parameters(param_dict)

device = "cuda:1" if torch.cuda.is_available() else "cpu"

In [8]:
#### DATA ###

# Traverse Data from scratch
df, train_idx, val_idx, test_idx = get_train_val_test()

def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)
    

train_dataset = DatasetFD(df, train_idx, param_dict["n_sample_frames"], param_dict["sampling_mode"])
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True,num_workers=8, pin_memory=True, worker_init_fn=seed_worker)

val_dataset = DatasetFD(df, val_idx, param_dict["n_sample_frames"], param_dict["sampling_mode"], istest=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False,num_workers=8, pin_memory=True, worker_init_fn=seed_worker)

test_dataset = DatasetFD(df, test_idx, param_dict["n_sample_frames"], param_dict["sampling_mode"], istest=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=16, pin_memory=True, worker_init_fn=seed_worker)

if LOG_COMMET:
    experiment.log_parameters({"num_videos": len(df),
                               "train_size": len(train_dataset),
                               "val_size": len(val_dataset),
                               "test_size": len(test_dataset),
                              })

In [9]:
### MODEL ###

In [10]:
class MultiHeadSelfAttentionAISummer(nn.Module):
    def __init__(self, dim, heads=4, dim_head=None):
        """
        Implementation of multi-head attention layer of the original transformer model.
        einsum and einops.rearrange is used whenever possible
        Args:
            dim: token's dimension, i.e. word embedding vector size
            heads: the number of distinct representations to learn
            dim_head: the dim of the head. In general dim_head<dim.
            However, it may not necessary be (dim/heads)
        """
        super().__init__()
        self.dim_head = (int(dim / heads)) if dim_head is None else dim_head
        _dim = self.dim_head * heads
        self.heads = heads
        self.to_qvk = nn.Linear(dim, _dim * 3, bias=False)
        self.W_0 = nn.Linear( _dim, dim, bias=False)
        self.scale_factor = self.dim_head ** -0.5

    def forward(self, x, mask=None):
        assert x.dim() == 3
        # Step 1
        qkv = self.to_qvk(x)  # [batch, tokens, dim*3*heads ]

        # Step 2
        # decomposition to q,v,k and cast to tuple
        # the resulted shape before casting to tuple will be:
        # [3, batch, heads, tokens, dim_head]
        q, k, v = tuple(rearrange(qkv, 'b t (d k h) -> k b h t d ', k=3, h=self.heads))

        # Step 3
        # resulted shape will be: [batch, heads, tokens, tokens]
        scaled_dot_prod = torch.einsum('b h i d , b h j d -> b h i j', q, k) * self.scale_factor

        if mask is not None:
            assert mask.shape == scaled_dot_prod.shape[2:]
            scaled_dot_prod = scaled_dot_prod.masked_fill(mask, -np.inf)

        attention = torch.softmax(scaled_dot_prod, dim=-1)

        # Step 4. Calc result per batch and per head h
        out = torch.einsum('b h i j , b h j d -> b h i d', attention, v)

        # Step 5. Re-compose: merge heads with dim_head d
        out = rearrange(out, "b h t d -> b t (h d)")

        # Step 6. Apply final linear transformation layer
        return self.W_0(out)

In [17]:
class AudioCNN(nn.Module):
    def __init__(self):
        super(AudioCNN, self).__init__()
        self.conformer = conformer = models.Conformer(
                        input_dim=20,
                        num_heads=4,
                        ffn_dim=128,
                        num_layers=4,
                        depthwise_conv_kernel_size=31)
        
        self.transformer_maxpool = nn.MaxPool2d(kernel_size=[1,5], stride=[1,5])
        self.lin = nn.Linear(14000, 2)
        

    def forward(self, x):    
        bs = x.shape[0]
        x = x.permute(0,2,1) # batchx3500x20 -> batchx20x3500        
        x = self.transformer_maxpool(x) # batchx20x3500 -> batchx20x700        
        x = x.permute(0,2,1)        
        x = self.conformer(x, torch.ones(bs).to(device)*700 ) # 700xbatchx20
        x = x[0]        
        #x = x.permute(1,0,2) # batchx20x700 -> 700xbatchx20
        #x = torch.mean(x, dim=0) # batchx20
        x = x.reshape(bs,-1) # batchx20x700 -> batchx14000
        x = self.lin(x)
        return x
    

In [18]:
class CnnLstm(nn.Module):
    def __init__(self, m_rgb_, m_d_):
        super(CnnLstm, self).__init__()  
        self.m_audio = AudioCNN()             
    
    def forward(self, x_rgb, x_d, x_a):
        return self.m_audio(x_a)

In [19]:
def plot_data(uid, frame_idx, imgs, labels, save_dict):
    print("----"*15)
    print(df[["action", "bag_no", "bag_label"]].iloc[uid.data.numpy()])

    n_batch, n_frames, n_ch, h, w = imgs.size()

    f, axarr = plt.subplots(n_batch,n_frames, figsize=(12,8))
    for i in range(n_batch):
        for j in range(n_frames):
            im_ = imgs[i][j].numpy().transpose((1,2,0))            
            axarr[i,j].imshow(im_, interpolation='nearest')
            
    fname = os.path.join(FIG_PATH, "{}_{}_{}.png".format(save_dict["name"], save_dict["epoch"], save_dict["batch_id"]))
    
    plt.savefig(fname)
    
#     plt.show()

In [20]:
def test_epoch(model, test_loader, epoch):
    model.eval()
    
    test_loss = 0
    
    all_labels, all_preds = [], []     
    
    batch_id = 0 
    with experiment.test():
        
        for idx, frame_idx, imgs, dimgs, audio, labels in test_loader:
            batch_id += 1
            
            if DEBUG and epoch % 10 == 0:
                print("XX"*20)                
                print("Test Epoch:{}, Batch:{}".format(epoch, batch_id))
                if epoch == 0:
                    plot_data(idx, frame_idx, imgs, labels, {"epoch": epoch, "batch_id": batch_id, "name": "test"})
                
            all_labels = np.concatenate((all_labels, labels.cpu().data.numpy()), axis=0)
            
            imgs = imgs.to(device)
            dimgs= dimgs.to(device)
            audio = audio.to(device)
            labels = labels.to(device)  
            
            if DATAMODE["RGBONLY"]:                
                output= model(imgs)
            
            elif DATAMODE["RGBD"]:                
                output= model(imgs, dimgs)
                
            elif DATAMODE["DONLY"]:                
                output= model(dimgs)
                
            elif DATAMODE["AONLY"]:                
                output= model(None, None, audio)
            
            elif DATAMODE["RGBDA"]:                
                output= model(imgs, dimgs, audio)
                
            loss = F.cross_entropy(output, labels)
            
            test_loss += loss.item()
            values, indices = torch.max(torch.softmax(output, dim=1), 1)   

            all_preds = np.concatenate((all_preds, indices.cpu().data.numpy()), axis=0)
            
            if DEBUG and epoch % 10 == 0:
                print("Test preds: ", indices.cpu().data.numpy())
            

        test_loss = test_loss / len(test_dataset)        
        test_acc = f1_score(all_labels, all_preds,average='weighted')        
        
        conf_mat = confusion_matrix(all_labels, all_preds)
        clf_report = classification_report(all_labels, all_preds)
        
        global best_acc
        if test_acc > best_acc:
            best_acc = test_acc
            #torch.save(model.state_dict(), model_path)
            
            if isinstance(model, nn.DataParallel):
                train_state = {                
                    'epoch': epoch,
                    'state_dict': model.module.state_dict(),                
                    'model': model,
                }
            else:
                train_state = {                
                    'epoch': epoch,
                    'state_dict': model.state_dict(),                
                    'model': model,
                }
                
            torch.save(
                train_state,
                model_path
            )    
            

        if LOG_COMMET:
            experiment.log_metric("f1_score", test_acc, step=epoch)
            experiment.log_metric("loss", test_loss, step=epoch)        
            experiment.log_confusion_matrix(matrix = conf_mat, step=epoch, file_name="conf_mat_test_{}.json".format(epoch))        
            experiment.log_text(clf_report, step=epoch)    

In [21]:
def train_epoch(model, train_loader, epoch): 

    model.train()

    epoch_loss = 0
    all_labels, all_preds = [], []     
    
    batch_id = 0
    
    with experiment.train():
        
        for idx, frame_idx, imgs, dimgs, audio, labels in train_loader:          
            batch_id += 1            
            
            if DEBUG and epoch % 10 == 0:
                print("//"*15)
                print("Train Epoch:{}, Batch:{}".format(epoch, batch_id))
                if epoch == 0:
                    plot_data(idx, frame_idx, imgs, labels, {"epoch": epoch, "batch_id": batch_id, "name": "train"})
                
            imgs = imgs.to(device)
            dimgs= dimgs.to(device)
            audio = audio.to(device)
            labels = labels.to(device)

            all_labels = np.concatenate((all_labels, labels.cpu().data.numpy()), axis=0)
            
            optimizer.zero_grad() 
            
            if DATAMODE["RGBONLY"]:                
                output= model(imgs)
            
            elif DATAMODE["RGBD"]:                
                output= model(imgs, dimgs)
                
            elif DATAMODE["DONLY"]:                
                output= model(dimgs)
                
            elif DATAMODE["AONLY"]:                
                output= model(None, None, audio)
            
            elif DATAMODE["RGBDA"]:                
                output= model(imgs, dimgs, audio)
                            
            loss = F.cross_entropy(output, labels)                
                
            values, indices = torch.max(torch.softmax(output, dim=1), 1)        
            all_preds = np.concatenate((all_preds, indices.cpu().data.numpy()), axis=0)            
            
            loss.backward()        
            optimizer.step()
            epoch_loss += loss.item()
            
            if DEBUG and epoch % 10 == 0:
                print("Trian preds: ", indices.cpu().data.numpy())

        epoch_loss = epoch_loss / len(train_dataset)         
        tr_acc = f1_score(all_labels, all_preds, average='weighted')
        

        conf_mat = confusion_matrix(all_labels, all_preds)
        clf_report = classification_report(all_labels, all_preds)
            
        if LOG_COMMET:
            experiment.log_metric("f1_score", tr_acc, step=epoch)
            experiment.log_metric("loss", epoch_loss, step=epoch)        
            experiment.log_confusion_matrix(matrix = conf_mat, step=epoch, file_name="conf_mat_train_{}.json".format(epoch))        
            experiment.log_text(clf_report, step=epoch)

In [22]:
from vivitm3 import ViViTBackbone



model = CnnLstm(None, None) # 7:256, 10:256, 14:384
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=param_dict["lr"], weight_decay=param_dict["weight_decay"])

for epoch in range(param_dict["n_epoch"]):
    print(epoch)
    train_epoch(model, train_loader, epoch)
    test_epoch(model, val_loader, epoch)    


if LOG_COMMET:
    experiment.end()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60


KeyboardInterrupt: 

In [23]:
if LOG_COMMET:
    experiment.end()

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.com/standardai/conformer/4422c3ea15024224a5d50ead81993c9f
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     test_f1_score [60]  : (0.4833514833514835, 0.5760066833751045)
COMET INFO:     test_loss [60]      : (0.07030855040801198, 0.07227720398651927)
COMET INFO:     train_f1_score [60] : (0.531109865470852, 0.7501831192247013)
COMET INFO:     train_loss [60]     : (0.03361350630543062, 0.043526457622647285)
COMET INFO:   Others:
COMET INFO:     Name : mavi-audio-conformer #09/10/2022 - 15:03:24
COMET INFO:   Parameters:
COMET INFO:     audio_path        : /home/cak/arda/audio
COMET INFO:     audio_path2       : /home/cak/arda/yeni_train/audio
COMET INFO:     batch_norm        : True
COMET INFO:     batch_size        : 8
COMET INFO:     depth