In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import sys
sys.path.append('../scripts')
import os

import yaml
import torch
from argparse import Namespace
from tqdm.notebook import tqdm

from runner import Runner
from metrics import LWLRAP


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [3]:
from transformers import (
    AdamW,
    get_linear_schedule_with_warmup,
)

In [4]:
import numpy as np

In [5]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
with open('/home/jupyter/rfcx_submission/config/training_config.yaml', 'r') as f:
    args= yaml.load(f,  yaml.FullLoader)

In [7]:
args['epochs'] = 40
args['batch_size'] =16
args['gradient_accumulation_steps'] = 8
args['warmup_steps'] = 0
args['mixup'] = 0
args['aug'] = 0
args['loss'] = 'WeightedFocalLoss'
save_path = 'result/run5_with_aug_with_mixup4_upstream'
args

{'batch_size': 16,
 'ckpt_path': '/home/jupyter/rfcx_submission/notebooks/result/run3_with_aug_with_mixup_upstream/best_model_7.pt',
 'csv_path': '/home/jupyter/rfcx/data/train_tp.csv',
 'data_dir': '/home/jupyter/rfcx/data/train',
 'transformer_config_path': '/home/jupyter/rfcx_submission/config/upstream_config.yaml',
 'transformer_weights_path': '/home/jupyter/rfcx/rfcx/model_weights/mockingjay_mel80_no_delta_cmvn_run4/states-2000.ckpt',
 'training': {'upstream': True, 'downstream': True},
 'epochs': 40,
 'gradient_accumulation_steps': 8,
 'warmup_steps': 0,
 'mixup': 0,
 'aug': 0,
 'loss': 'WeightedFocalLoss'}

In [8]:
os.makedirs(save_path, exist_ok=True)

In [9]:
args= Namespace(**args)

In [10]:
runner= Runner(device, args)

In [11]:
runner.set_data_loader()

using cached dataset
all audio loaded
using cached dataset
all audio loaded
training: number of docs : 76
evaluation: number of docs : 1216


In [12]:
runner.set_model()

In [13]:
ckpt= torch.load('/home/jupyter/rfcx_submission/notebooks/result/run4_with_aug_with_mixup4_upstream/best_model_0.pt',map_location='cpu')
runner.model.upstream.load_state_dict(ckpt['upstream'])
runner.model.downstream.load_state_dict(ckpt['downstream'])


<All keys matched successfully>

In [14]:
runner.args.training['upstream']

True

In [15]:
def save_ckpt(model, optimizer, save_path,args, i):
    print(f'saving model at epoch {i}')
    all_states = {
        'upstream' : model.upstream.state_dict(),
        'downstream' : model.downstream.state_dict(),
        'optimizer' : optimizer.state_dict(),
        'config' : vars(args)
    }
    torch.save(all_states, f'{save_path}/best_model_{i}.pt')

In [16]:
class WeightedFocalLoss(torch.nn.Module):
    "Non weighted version of Focal Loss"
    def __init__(self, alpha=.25, gamma=1):
        super(WeightedFocalLoss, self).__init__()
        self.alpha = torch.tensor([alpha, 1-alpha]).cuda()
        self.gamma = gamma

    def forward(self, inputs, targets):
        BCE_loss = torch.nn.functional.binary_cross_entropy(inputs, targets, reduction='none')
        targets = targets.type(torch.long)
        
        at = self.alpha.gather(0, targets.data.view(-1))
        at = at.data.view(targets.shape[0], -1)

        pt = torch.exp(-BCE_loss)
        F_loss = at*(1-pt)**self.gamma * BCE_loss
        return F_loss.mean()
    
    
class FocalLoss(torch.nn.Module):
    def __init__(self, alpha=5, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma


    def forward(self, inputs, targets):

        BCE_loss = torch.nn.functional.binary_cross_entropy(inputs, targets, reduce=False)
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss

 
        return torch.mean(F_loss)

# num_birds = 24
# pos_weights = torch.ones(num_birds)
# pos_weights = pos_weights * 8   
# loss_function = torch.nn.BCELoss(pos_weights.cuda())
# loss_function = torch.nn.BCELoss()
loss_function = WeightedFocalLoss()

In [17]:
t_total = (
    len(runner.train_dataloader)
    // args.gradient_accumulation_steps
    * args.epochs
)

print(t_total)

360


In [18]:
plist = [
         {'params': runner.model.upstream.parameters(), 'lr': 5e-5,},
         {'params':runner.model.downstream.parameters(), 'lr': 5e-4}
         ]

    
optimizer = AdamW(params= plist, lr= 0.01, )

scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

In [19]:
print(optimizer)
print(scheduler)

AdamW (
Parameter Group 0
    betas: (0.9, 0.999)
    correct_bias: True
    eps: 1e-06
    initial_lr: 5e-05
    lr: 5e-05
    weight_decay: 0.0

Parameter Group 1
    betas: (0.9, 0.999)
    correct_bias: True
    eps: 1e-06
    initial_lr: 0.0005
    lr: 0.0005
    weight_decay: 0.0
)
<torch.optim.lr_scheduler.LambdaLR object at 0x7fcbac4368d0>


In [20]:
accumulation_steps= args.gradient_accumulation_steps

In [21]:
best_lwrap= 0

for i in range(args.epochs):
    
    train_loss = []
    train_lwlrap = []
    
    runner.model.train()
    if not runner.args.training['upstream']:
        runner.model.upstream.eval()

    runner.model.downstream.train()

    pbar = tqdm(enumerate(runner.train_dataloader), 
                                      total= len(runner.train_dataloader))

    runner.model.zero_grad()
    for batch, (data, target) in  pbar :
        
        target = target.cuda()
        
        output = runner.model([d.cuda() for d in data])
        loss = loss_function(output, target)
        
        _score= LWLRAP(output.cpu(), target.cpu())
        
        train_lwlrap.append(_score)
        train_loss.append(loss.item())
        
        pbar.set_description(f"loss : {round(loss.item(), 3)} score: {round(_score,3)}")
        
        
        loss = loss / accumulation_steps
        loss.backward()
        
        if (i+1) % accumulation_steps == 0 or (batch+1) == len(runner.train_dataloader):
            optimizer.step()
            scheduler.step()
            runner.model.zero_grad()
    
    lrs= [group['lr'] for group in optimizer.param_groups]
    print(f'Epoch : {i}  training end. LR: {lrs}  Loss: {round (np.mean(train_loss) , 3)}  lwrlrap_score: {round(np.mean(train_lwlrap), 3)}')
        
        
    with torch.no_grad():
        val_loss = []
        val_lwlrap = []

        runner.model.eval()
        pbar = tqdm(enumerate(runner.eval_dataloader), 
                                          total= len(runner.eval_dataloader))
        for batch, (data, target) in pbar:

            target = target.cuda()

            output = runner.model([d.cuda() for d in data])
            loss = loss_function(output, target)

            val_loss.append(loss.item())

            _score= LWLRAP(output.cpu(), target.cpu())
            val_lwlrap.append(_score)
            pbar.set_description(f"loss : {round(loss.item(), 3)} score: {round(_score,3)}")

        print(f'Valid Loss: {np.mean(val_loss)}  val_lwrlrap_score: {np.mean(val_lwlrap)}')
    
    if np.mean(val_lwlrap) > best_lwrap:
        save_ckpt(runner.model, optimizer, save_path,args, i)
        best_lwrap = np.mean(val_lwlrap)
        
    print('='*100)
save_ckpt(runner.model, optimizer, save_path,args, i)

HBox(children=(FloatProgress(value=0.0, max=76.0), HTML(value='')))


Epoch : 0  training end. LR: [4.986111111111111e-05, 0.0004986111111111111]  Loss: 0.054  lwrlrap_score: 0.648


HBox(children=(FloatProgress(value=0.0, max=1216.0), HTML(value='')))


Valid Loss: 0.020597201708465275  val_lwrlrap_score: 0.9057807875552067
saving model at epoch 0


HBox(children=(FloatProgress(value=0.0, max=76.0), HTML(value='')))


Epoch : 1  training end. LR: [4.972222222222223e-05, 0.0004972222222222222]  Loss: 0.054  lwrlrap_score: 0.64


HBox(children=(FloatProgress(value=0.0, max=1216.0), HTML(value='')))


Valid Loss: 0.02137984785574525  val_lwrlrap_score: 0.9046213430426034


HBox(children=(FloatProgress(value=0.0, max=76.0), HTML(value='')))


Epoch : 2  training end. LR: [4.958333333333334e-05, 0.0004958333333333334]  Loss: 0.055  lwrlrap_score: 0.638


HBox(children=(FloatProgress(value=0.0, max=1216.0), HTML(value='')))


Valid Loss: 0.021699691509160113  val_lwrlrap_score: 0.9031106210436279


HBox(children=(FloatProgress(value=0.0, max=76.0), HTML(value='')))


Epoch : 3  training end. LR: [4.9444444444444446e-05, 0.0004944444444444445]  Loss: 0.055  lwrlrap_score: 0.624


HBox(children=(FloatProgress(value=0.0, max=1216.0), HTML(value='')))


Valid Loss: 0.02161521858521921  val_lwrlrap_score: 0.896377918941557


HBox(children=(FloatProgress(value=0.0, max=76.0), HTML(value='')))


Epoch : 4  training end. LR: [4.930555555555556e-05, 0.0004930555555555556]  Loss: 0.054  lwrlrap_score: 0.634


HBox(children=(FloatProgress(value=0.0, max=1216.0), HTML(value='')))


Valid Loss: 0.02116089756638853  val_lwrlrap_score: 0.8963440301227629


HBox(children=(FloatProgress(value=0.0, max=76.0), HTML(value='')))


Epoch : 5  training end. LR: [4.9166666666666665e-05, 0.0004916666666666666]  Loss: 0.054  lwrlrap_score: 0.636


HBox(children=(FloatProgress(value=0.0, max=1216.0), HTML(value='')))


Valid Loss: 0.020621861099865975  val_lwrlrap_score: 0.8995555595335502


HBox(children=(FloatProgress(value=0.0, max=76.0), HTML(value='')))


Epoch : 6  training end. LR: [4.902777777777778e-05, 0.0004902777777777777]  Loss: 0.051  lwrlrap_score: 0.654


HBox(children=(FloatProgress(value=0.0, max=1216.0), HTML(value='')))


Valid Loss: 0.020090601887587628  val_lwrlrap_score: 0.9018956695830351


HBox(children=(FloatProgress(value=0.0, max=76.0), HTML(value='')))


Epoch : 7  training end. LR: [3.8472222222222225e-05, 0.0003847222222222222]  Loss: 0.052  lwrlrap_score: 0.656


HBox(children=(FloatProgress(value=0.0, max=1216.0), HTML(value='')))


Valid Loss: 0.018938237494263665  val_lwrlrap_score: 0.9156355024437959
saving model at epoch 7


HBox(children=(FloatProgress(value=0.0, max=76.0), HTML(value='')))


Epoch : 8  training end. LR: [3.8333333333333334e-05, 0.00038333333333333334]  Loss: 0.052  lwrlrap_score: 0.661


HBox(children=(FloatProgress(value=0.0, max=1216.0), HTML(value='')))


Valid Loss: 0.018884967273870165  val_lwrlrap_score: 0.917998456065298
saving model at epoch 8


HBox(children=(FloatProgress(value=0.0, max=76.0), HTML(value='')))


Epoch : 9  training end. LR: [3.8194444444444444e-05, 0.0003819444444444444]  Loss: 0.052  lwrlrap_score: 0.665


HBox(children=(FloatProgress(value=0.0, max=1216.0), HTML(value='')))


Valid Loss: 0.01883314309375955  val_lwrlrap_score: 0.9233575569600553
saving model at epoch 9


HBox(children=(FloatProgress(value=0.0, max=76.0), HTML(value='')))


Epoch : 10  training end. LR: [3.805555555555555e-05, 0.00038055555555555553]  Loss: 0.052  lwrlrap_score: 0.662


HBox(children=(FloatProgress(value=0.0, max=1216.0), HTML(value='')))


Valid Loss: 0.018740158557193354  val_lwrlrap_score: 0.9305386159103364
saving model at epoch 10


HBox(children=(FloatProgress(value=0.0, max=76.0), HTML(value='')))


Epoch : 11  training end. LR: [3.791666666666667e-05, 0.00037916666666666665]  Loss: 0.052  lwrlrap_score: 0.658


HBox(children=(FloatProgress(value=0.0, max=1216.0), HTML(value='')))


Valid Loss: 0.01865880679810959  val_lwrlrap_score: 0.9327312423663802
saving model at epoch 11


HBox(children=(FloatProgress(value=0.0, max=76.0), HTML(value='')))


Epoch : 12  training end. LR: [3.777777777777778e-05, 0.00037777777777777777]  Loss: 0.052  lwrlrap_score: 0.658


HBox(children=(FloatProgress(value=0.0, max=1216.0), HTML(value='')))


Valid Loss: 0.01849696535952573  val_lwrlrap_score: 0.9339400230980429
saving model at epoch 12


HBox(children=(FloatProgress(value=0.0, max=76.0), HTML(value='')))


Epoch : 13  training end. LR: [3.763888888888889e-05, 0.0003763888888888889]  Loss: 0.051  lwrlrap_score: 0.677


HBox(children=(FloatProgress(value=0.0, max=1216.0), HTML(value='')))


Valid Loss: 0.018364438360693288  val_lwrlrap_score: 0.9367556559338578
saving model at epoch 13


HBox(children=(FloatProgress(value=0.0, max=76.0), HTML(value='')))


Epoch : 14  training end. LR: [3.7500000000000003e-05, 0.000375]  Loss: 0.053  lwrlrap_score: 0.666


HBox(children=(FloatProgress(value=0.0, max=1216.0), HTML(value='')))


Valid Loss: 0.018371088949176117  val_lwrlrap_score: 0.9368731371421171
saving model at epoch 14


HBox(children=(FloatProgress(value=0.0, max=76.0), HTML(value='')))


Epoch : 15  training end. LR: [2.6944444444444445e-05, 0.00026944444444444444]  Loss: 0.051  lwrlrap_score: 0.658


HBox(children=(FloatProgress(value=0.0, max=1216.0), HTML(value='')))


Valid Loss: 0.01761150794596975  val_lwrlrap_score: 0.9457135976561787
saving model at epoch 15


HBox(children=(FloatProgress(value=0.0, max=76.0), HTML(value='')))


Epoch : 16  training end. LR: [2.6805555555555557e-05, 0.00026805555555555556]  Loss: 0.048  lwrlrap_score: 0.693


OSError: [Errno 12] Cannot allocate memory