In [None]:
'''
File to train the pipeline
clip -> feed forward network
'''
import torch
import numpy as np
from torch.utils.data import DataLoader, random_split
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from model_data import PriceModel, PriceDataset, collate_function, SMAPELoss, LogCoshLoss
from tqdm import tqdm
from time import time

In [2]:

def train_one_epoch(
    epoch,
    model,
    train_loader,
    eval_loader,
    loss_fn,
    metric_fn,
    optimizer,
    scheduler,
    steps,
    save_dir,
    best_metric,
    patience,
    no_improvement,
    grad_accum_steps
):
    """Train `model` for one pass over `train_loader` with periodic evaluation.

    Every `steps` training batches the model is evaluated on `eval_loader`,
    the scheduler is stepped with the evaluation loss, a checkpoint is written
    to `{save_dir}/checkpoint.pth`, a line is appended to `{save_dir}/log.txt`
    and, when the evaluation metric improved (lower is better), the checkpoint
    is also written to `{save_dir}/best.pth`.

    Args:
        epoch: Index of the current epoch (used for logging and checkpoints).
        model: Module called as `model(images, texts)`; must expose
            `model.backbone.device`.
        train_loader: Iterable of `(images, texts, targets)` batches that
            supports `len()`.
        eval_loader: Loader handed to `evaluate`.
        loss_fn: Callable `(predictions, targets) -> scalar tensor`.
        metric_fn: Metric callable handed to `evaluate`.
        optimizer: Optimizer updating `model.parameters()`.
        scheduler: Scheduler stepped with the evaluation loss.
        steps: Number of training batches between two evaluations.
        save_dir: Directory receiving checkpoints and the log file.
        best_metric: Best (lowest) evaluation metric seen so far.
        patience: Number of consecutive non-improving evaluations tolerated
            before early stopping.
        no_improvement: Current count of consecutive non-improving evaluations.
        grad_accum_steps: Number of batches whose gradients are accumulated
            before one optimizer step.

    Returns:
        Tuple `(train_loss, elapsed_seconds, early_stopping, no_improvement,
        best_metric)` where `train_loss` is the sample-weighted mean loss over
        the batches actually processed in this call.
    """
    model.train()
    n_steps = len(train_loader)
    start_time = time()
    losses = []
    samples_seen = 0
    early_stopping = False

    # Start from clean gradients so nothing left over from a previous call
    # (e.g. an interrupted accumulation window) leaks into the first update.
    optimizer.zero_grad()

    for i, batch in enumerate(train_loader):
        images, texts, targets = batch
        batch_len = len(images)
        targets = targets.to(device=model.backbone.device)
        predictions = model(images, texts)
        batch_loss = loss_fn(predictions, targets)

        # Scale so the accumulated gradient is the mean over the window.
        (batch_loss / grad_accum_steps).backward()

        # Also step on the final batch: otherwise the gradients of a trailing,
        # incomplete accumulation window would never be applied and would be
        # mixed into the first update of the next epoch.
        window_complete = (i + 1) % grad_accum_steps == 0
        last_batch = (i + 1) == n_steps
        if window_complete or last_batch:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            optimizer.zero_grad()

        batch_loss_value = batch_loss.item()
        losses.append(batch_loss_value * batch_len)
        samples_seen += batch_len
        print(f'Epoch: {epoch} \t Step[{i+1}/{n_steps}] \t Loss : {batch_loss_value:.3f}')

        if (i + 1) % steps == 0:
            print('Starting Evaluation'+'='*50)
            eval_loss, eval_metric, eval_time = evaluate(model, eval_loader, loss_fn, metric_fn)
            scheduler.step(eval_loss)

            print(f'Eval Time: {eval_time} \t Loss: {eval_loss:.3f} \t Eval Metric: {eval_metric:.3f}')
            checkpoint = {
                'model': model.state_dict(),
                # 'optimizer': optimizer.state_dict(),
                'train_loss': batch_loss_value,
                'eval_loss': eval_loss,
                'epoch': epoch,
                'step': i
            }
            torch.save(checkpoint, f'{save_dir}/checkpoint.pth')
            with open(f'{save_dir}/log.txt', 'a') as f:
                f.write(f'{i},{batch_loss_value},{eval_loss}\n')

            if eval_metric < best_metric:
                best_metric = eval_metric
                no_improvement = 0
                torch.save(checkpoint, f'{save_dir}/best.pth')
            else:
                no_improvement += 1
            if no_improvement >= patience:
                print(f'Early Stopping, no improvement for {no_improvement} steps')
                early_stopping = True
                break
            # evaluate() switched the model to eval mode; restore training mode.
            model.train()

    end_time = time()
    # Average over the samples actually processed: on early stopping only part
    # of the dataset was seen, so dividing by the dataset size would
    # under-report the loss.
    train_loss = np.sum(losses) / samples_seen if samples_seen else 0.0
    return train_loss, end_time - start_time, early_stopping, no_improvement, best_metric


def evaluate(model, eval_loader, loss_fn, metric_fn):
    """Run `model` over `eval_loader` without gradients and report averages.

    The model is switched to eval mode and left in that mode on return.
    Per-batch loss and metric values are weighted by the batch length and
    divided by `len(eval_loader.dataset)`.

    Returns:
        Tuple `(mean_loss, mean_metric, elapsed_seconds)`.
    """
    model.eval()
    total_batches = len(eval_loader)
    dataset_size = len(eval_loader.dataset)
    weighted_losses, weighted_metrics = [], []
    began = time()
    with torch.inference_mode():
        for step, (images, texts, targets) in enumerate(eval_loader, start=1):
            n_items = len(images)
            targets = targets.to(device=model.backbone.device)
            predictions = model(images, texts)

            # Weight each batch by its length so a smaller batch counts
            # proportionally less in the dataset-level average.
            weighted_losses.append(loss_fn(predictions, targets).item() * n_items)
            weighted_metrics.append(metric_fn(predictions, targets).item() * n_items)

            shown_loss = weighted_losses[-1] / n_items
            shown_metric = weighted_metrics[-1] / n_items
            print(f'Evaluation Step[{step}/{total_batches}] \t Loss: {shown_loss:.3f} \t Metric: {shown_metric:.3f}')
        elapsed = time() - began
    mean_loss = np.sum(weighted_losses) / dataset_size
    mean_metric = np.sum(weighted_metrics) / dataset_size
    return mean_loss, mean_metric, elapsed


def train(
    model,
    train_loader,
    eval_loader,
    loss_fn,
    metric_fn,
    optimizer,
    scheduler,
    epochs,
    patience,
    save_dir,
    steps,
    grad_accum_steps,
    best_metric
):
    """Run up to `epochs` training epochs, stopping early when requested.

    Each epoch is delegated to `train_one_epoch`; the early-stopping counter
    and the best metric it returns are carried over into the next epoch.
    Prints a per-epoch summary and the total wall-clock time. Returns None.
    """
    began = time()
    stale_evals = 0  # consecutive evaluations without improvement, kept across epochs
    for epoch in range(epochs):
        train_loss, epoch_time, early_stopping, stale_evals, best_metric = train_one_epoch(
            epoch=epoch,
            model=model,
            train_loader=train_loader,
            eval_loader=eval_loader,
            loss_fn=loss_fn,
            metric_fn=metric_fn,
            optimizer=optimizer,
            scheduler=scheduler,
            steps=steps,
            save_dir=save_dir,
            best_metric=best_metric,
            patience=patience,
            no_improvement=stale_evals,
            grad_accum_steps=grad_accum_steps,
        )
        print(f'Epoch: {epoch} \t Train Time: {epoch_time} \t Loss: {train_loss:.3f}')
        if early_stopping:
            break

    elapsed = time() - began
    print(f'Total Train Time: {elapsed}')
    return
        
    
    
    

In [3]:
def split_dataset(train_size, full_dataset, seed):
    """Randomly split `full_dataset` into train and test subsets.

    Args:
        train_size: Fraction of the dataset assigned to the first subset.
        full_dataset: Dataset supporting `len()`.
        seed: Seed for the split.

    Returns:
        The two subsets produced by `random_split`, train first.

    Note:
        `torch.manual_seed(seed)` reseeds torch's global default generator
        as a side effect, not only the split itself.
    """
    total = len(full_dataset)
    n_train = int(train_size * total)
    n_test = total - n_train
    rng = torch.manual_seed(seed)
    return random_split(full_dataset, [n_train, n_test], generator=rng)

In [4]:
# --- Run configuration -------------------------------------------------------
batch_size = 128
steps = 100            # training batches between two evaluations
grad_accum_steps = 1   # batches accumulated per optimizer step

# --- Model -------------------------------------------------------------------
model = PriceModel(checkpoint='google/siglip2-large-patch16-256', cache_dir='hf_models')
# Resume from a previously saved checkpoint (the file must already exist).
model.load_checkpoint('output/best.pth')

# --- Data --------------------------------------------------------------------
dataset = PriceDataset(annotations_file='student_resource/dataset/train_focused.csv',
                       image_dir='train_images', content_col='focused_sentence')
# 90/10 train/test split with a fixed seed.
train_dataset, test_dataset = split_dataset(0.9, dataset, 0)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True,
                          collate_fn=collate_function)
# Evaluation does not benefit from shuffling: a fixed order keeps the per-batch
# evaluation logs comparable between runs and avoids drawing from the global RNG.
eval_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False,
                         collate_fn=collate_function)

# --- Objective, metric, optimisation -----------------------------------------
loss_fn = LogCoshLoss()
metric_fn = SMAPELoss()
# loss_fn = SMAPELoss()
optimizer = AdamW(model.parameters(), 1e-5)
# The scheduler is stepped with the evaluation loss inside train_one_epoch.
scheduler = ReduceLROnPlateau(optimizer, min_lr=1e-8)

torch.cuda.empty_cache()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [5]:
# check_dataset = torch.utils.data.Subset(train_dataset, list(range(23)))
# check_loader = DataLoader(dataset=check_dataset, batch_size=batch_size, shuffle=True, collate_fn = collate_function)

In [None]:
# Launch training with the objects built in the configuration cell.
train(
    model = model,
    train_loader=train_loader,
    eval_loader=eval_loader,
    loss_fn = loss_fn,
    metric_fn = metric_fn,
    optimizer = optimizer,
    scheduler=scheduler,
    epochs = 3,
    patience = 5,  # consecutive non-improving evaluations tolerated before early stopping
    save_dir = 'output',
    steps = steps,
    grad_accum_steps = grad_accum_steps,
    # Only evaluations whose metric falls below this value overwrite output/best.pth.
    # NOTE(review): presumably the score of the loaded output/best.pth — confirm.
    best_metric = 0.197
)
# Smoke-test variant using the small `check_loader` from the cell above.
# NOTE: this call is stale — it omits `grad_accum_steps` and `best_metric`,
# which train() requires, so both must be added before uncommenting it.
# train(
#     model = model,
#     train_loader=check_loader,
#     eval_loader=check_loader,
#     loss_fn = loss_fn,
#     metric_fn = metric_fn,
#     optimizer = optimizer,
#     scheduler=scheduler,
#     epochs = 3,
#     patience = 5,
#     save_dir = 'test_output',
#     steps = steps
# )

Epoch: 0 	 Step[1/517] 	 Loss : 0.031
Epoch: 0 	 Step[2/517] 	 Loss : 0.302
Epoch: 0 	 Step[3/517] 	 Loss : 0.185
Epoch: 0 	 Step[4/517] 	 Loss : 0.066
Epoch: 0 	 Step[5/517] 	 Loss : 0.093
Epoch: 0 	 Step[6/517] 	 Loss : 0.114
Epoch: 0 	 Step[7/517] 	 Loss : 0.104
Epoch: 0 	 Step[8/517] 	 Loss : 0.065
