In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import sys
sys.path.append('../scripts')
import os

import yaml
import torch
from argparse import Namespace
from tqdm.notebook import tqdm

from runner import Runner
from metrics import LWLRAP


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [3]:
from transformers import (
    AdamW,
    get_linear_schedule_with_warmup,
)

In [4]:
import numpy as np

In [5]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
with open('/home/jupyter/rfcx_submission/config/training_config.yaml', 'r') as f:
    args= yaml.load(f,  yaml.FullLoader)

In [7]:
args['epochs'] = 20
args['batch_size'] =2
args['gradient_accumulation_steps'] =8
args['warmup_steps'] = 0
args['mixup'] = 0
args['aug'] = 0
args['loss'] = 'WeightedFocalLoss'
save_path = 'result/run1_no_aug_no_mixup'
args

{'batch_size': 2,
 'ckpt_path': None,
 'csv_path': '/home/jupyter/rfcx_submission/notebooks/train_tp.csv',
 'data_dir': '/home/jupyter/rfcx/data/train',
 'transformer_config_path': '/home/jupyter/rfcx_submission/config/upstream_config.yaml',
 'transformer_weights_path': '/home/jupyter/rfcx/rfcx/model_weights/mockingjay_mel80_no_delta_cmvn_run4/states-2000.ckpt',
 'training': {'upstream': False, 'downstream': True},
 'epochs': 20,
 'gradient_accumulation_steps': 8,
 'warmup_steps': 0,
 'mixup': 0,
 'aug': 0,
 'loss': 'WeightedFocalLoss'}

In [8]:
os.makedirs(save_path, exist_ok=True)

In [9]:
args= Namespace(**args)

In [10]:
runner= Runner(device, args)

In [11]:
runner.set_data_loader()

training: number of docs : 566
evaluation: number of docs : 566


In [12]:
runner.set_model()

In [13]:
ckpt= torch.load('result/run1_no_aug_no_mixup/best_model_19.pt',map_location='cpu')
runner.model.upstream.load_state_dict(ckpt['upstream'])
runner.model.downstream.load_state_dict(ckpt['downstream'])


<All keys matched successfully>

In [14]:
runner.args.training['upstream']

False

In [15]:
def save_ckpt(model, optimizer, save_path,args, i):
    print(f'saving model at epoch {i}')
    all_states = {
        'upstream' : model.upstream.state_dict(),
        'downstream' : model.downstream.state_dict(),
        'optimizer' : optimizer.state_dict(),
        'config' : vars(args)
    }
    torch.save(all_states, f'{save_path}/best_model_{i}.pt')

In [16]:
class WeightedFocalLoss(torch.nn.Module):
    "Non weighted version of Focal Loss"
    def __init__(self, alpha=.25, gamma=1):
        super(WeightedFocalLoss, self).__init__()
        self.alpha = torch.tensor([alpha, 1-alpha]).cuda()
        self.gamma = gamma

    def forward(self, inputs, targets):
        BCE_loss = torch.nn.functional.binary_cross_entropy(inputs, targets, reduction='none')
        targets = targets.type(torch.long)
        
        at = self.alpha.gather(0, targets.data.view(-1))
        at = at.data.view(targets.shape[0], -1)

        pt = torch.exp(-BCE_loss)
        F_loss = at*(1-pt)**self.gamma * BCE_loss
        return F_loss.mean()
    
    
class FocalLoss(torch.nn.Module):
    def __init__(self, alpha=5, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma


    def forward(self, inputs, targets):

        BCE_loss = torch.nn.functional.binary_cross_entropy(inputs, targets, reduce=False)
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss

 
        return torch.mean(F_loss)

# num_birds = 24
# pos_weights = torch.ones(num_birds)
# pos_weights = pos_weights * 8   
# loss_function = torch.nn.BCELoss(pos_weights.cuda())
# loss_function = torch.nn.BCELoss()
loss_function = WeightedFocalLoss()

In [17]:
t_total = (
    len(runner.train_dataloader)
    // args.gradient_accumulation_steps
    * args.epochs
)

print(t_total)

1400


In [18]:
# plist = [
#          {'params': runner.model.upstream.parameters(), 'lr': 1e-4,},
#          {'params':runner.model.downstream.parameters(), 'lr': 1e-2}
#          ]

    
optimizer = AdamW(params= runner.model.downstream.parameters(), lr= 0.01, )

scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

In [19]:
accumulation_steps= args.gradient_accumulation_steps

In [None]:
best_lwrap= 0

for i in range(args.epochs):
    
    train_loss = []
    train_lwlrap = []
    
    runner.model.train()
    if not runner.args.training['upstream']:
        runner.model.upstream.eval()

    runner.model.downstream.train()

    pbar = tqdm(enumerate(runner.train_dataloader), 
                                      total= len(runner.train_dataloader))

    runner.model.zero_grad()
    for batch, (data, target) in  pbar :
        
        target = target.cuda()
        
        output = runner.model([d.cuda() for d in data])
        loss = loss_function(output, target)
        
        _score= LWLRAP(output.cpu(), target.cpu())
        
        train_lwlrap.append(_score)
        train_loss.append(loss.item())
        
        pbar.set_description(f"loss : {round(loss.item(), 3)} score: {round(_score,3)}")
        
        
        loss = loss / accumulation_steps
        loss.backward()
        
        if (i+1) % accumulation_steps == 0 or (batch+1) == len(runner.train_dataloader):
            optimizer.step()
            scheduler.step()
            runner.model.zero_grad()
    
    for g in optimizer.param_groups:
        
        lr = g['lr']
    print(f'Epoch : {i}  training end. LR: {round(lr, 3)}  Loss: {round (np.mean(train_loss) , 3)}  lwrlrap_score: {round(np.mean(train_lwlrap), 3)}')
        
        
#     with torch.no_grad():
#         val_loss = []
#         val_lwlrap = []

#         runner.model.eval()
#         pbar = tqdm(enumerate(runner.eval_dataloader), 
#                                           total= len(runner.eval_dataloader))
#         for batch, (data, target) in pbar:

#             target = target.cuda()

#             output = runner.model([d.cuda() for d in data])
#             loss = loss_function(output, target)

#             val_loss.append(loss.item())

#             _score= LWLRAP(output.cpu(), target.cpu())
#             val_lwlrap.append(_score)
#             pbar.set_description(f"loss : {round(loss.item(), 3)} score: {round(_score,3)}")

#         print(f'Valid Loss: {np.mean(val_loss)}  val_lwrlrap_score: {np.mean(val_lwlrap)}')
    
#     if np.mean(val_lwlrap) > best_lwrap:
#         save_ckpt(runner.model, optimizer, save_path,args, i)
#         best_lwrap = np.mean(val_lwlrap)
        

# save_ckpt(runner.model, optimizer, save_path,args, i)

HBox(children=(FloatProgress(value=0.0, max=566.0), HTML(value='')))


Epoch : 0  training end. LR: 0.01  Loss: 0.035  lwrlrap_score: 0.534


HBox(children=(FloatProgress(value=0.0, max=566.0), HTML(value='')))


Epoch : 1  training end. LR: 0.01  Loss: 0.057  lwrlrap_score: 0.357


HBox(children=(FloatProgress(value=0.0, max=566.0), HTML(value='')))

In [None]:
# save_ckpt(runner.model, optimizer, save_path,args, i)

In [None]:
# pbar.set_description(f"loss : {round(loss.item(), 3)} score: {round(_score,3)} {output.argmax(axis=1).item()} {target.argmax(axis=1).item()} ")

In [None]:
    with torch.no_grad():
        val_loss = []
        val_lwlrap = []

        runner.model.eval()
        pbar = tqdm(enumerate(runner.eval_dataloader), 
                                          total= len(runner.eval_dataloader))
        for batch, (data, target) in pbar:

            target = target.cuda()

            output = runner.model([d.cuda() for d in data])
            loss = loss_function(output, target)
            print(f'{output.argmax(axis=1)} {target.argmax(axis=1)}')

            val_loss.append(loss.item())

            _score= LWLRAP(output.cpu(), target.cpu())
            val_lwlrap.append(_score)
            pbar.set_description(f"loss : {round(loss.item(), 3)} score: {round(_score,3)}")

        print(f'Valid Loss: {np.mean(val_loss)}  val_lwrlrap_score: {np.mean(val_lwlrap)}')

In [None]:
output

In [None]:
target.argmax(axis=1)

In [None]:
1