# 1. Introduction

This notebook outlines the creation, compilation, and training of a deep learning network using the [TorchSuite](https://github.com/sergio-sanz-rodriguez/torchsuite) framework. In particular, the ConvNeXt model will be used to make predictions on visual quality scores.

## 1.1. Motivation

The widespread use of aesthetic filters on social media introduces new challenges for Image Quality Assessment (IQA), as traditional distortion-based metrics often fail to capture the subjective, content-aware characteristics of these enhancements. This notebook proposes a no-reference IQA baseline model based on the ConvNeXt-Large architecture to evaluate the subjective quality of filter-manipulated images. The output of the model is a Mean Opinion Score (MOS)-like grade scale ranging from 0 to 1, where 1 represents excellent subjective quality and 0 represents poor subjective quality. More details about the proposed model can be found in the TorchSuite repository: papers/ConvNeXt_Ensemble_IQA.pdf.

For the sake of simplicity, this notebook focuses only on the training process of a single ConvNeXt-Large model, and does not implement the entire pipeline.

This work is part of the VCIP 2025 conference’s Image Manipulation Quality Assessment (IMQA) Grand Challenge: https://jiangliu5.github.io/imqac.github.io/. The image database is publicly available through the link.

# 2. Importing Libraries

In [None]:
import os
import torch
import torch.backends.cudnn as cudnn
import pandas as pd
import numpy as np
import random

from torchinfo import summary
from torchvision.transforms import v2
from pathlib import Path
from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts, StepLR
from random import sample
from torchvision.transforms import InterpolationMode
from sklearn.model_selection import KFold

# Import custom libraries
from engines.regression import RegressionEngine
from engines.schedulers import FixedLRSchedulerWrapper
from utils.common_utils import download_data, set_seeds
from utils.regression_utils import display_random_images_regression
from dataloaders.image_dataloaders import RegressionDataset, create_regression_dataloaders
from models.convnext import convnext_tiny, convnext_small, convnext_base, convnext_large, convnext_xlarge, model_urls
from models.pretrained_models import build_pretrained_model

import warnings
os.environ['TORCH_USE_CUDA_DSA'] = "1"
warnings.filterwarnings("ignore", category=UserWarning, module="torch.autograd.graph")
warnings.filterwarnings("ignore", category=FutureWarning, module="onnxscript.converter")

# 3. Importing Dataset

The dataset is organized as follows, with one subdirectory for the images and another subdirectory containing the CSV files with the scores:

```
dataset/
├── all_images/   
│   ├── img1.jpg
│   ├── img2.png
│   └── ...
└── labels/
    ├── train_labels.csv
    └── test_labels.csv
```

Alternatively:

```
dataset/
├── all_images/   
│   ├── img1.jpg
│   ├── img2.png
│   └── ...
└── labels/
    └── labels.csv
```

The labels.csv file can be split into two separate CSV files or dataframes: one for training and another for validation/testing.

The CSV file(s) can also be placed in a different location within the dataset directory if preferred.

The CSV file(s) should contain at least two columns:

1. Image paths in the first column, and
2. Scores (e.g., MOS) in the second column:

```
image_name,mos
Act2_clahe_1,0.372379619
Act4_toning_2,0.705094234
```

Alternatively:

```
image_name,mos
Act2_clahe_1.jpg,0.372379619
Act4_toning_2.jpg,0.705094234
```

Or:

```
image_name,mos
<path>/Act2_clahe_1.jpg,0.372379619
<path>/Act4_toning_2.jpg,0.705094234
```

In [None]:
# Tunable constants shared by the rest of the notebook
NUM_WORKERS = os.cpu_count()
AMOUNT_TO_GET = 1.0
SEED = 42
THEME = 'dark'

# Optionally fetch the dataset archive before resolving the paths below.
# NOTE(review): the flag name is misspelled ("DONWLOAD"); it is kept unchanged
# in case other cells reference it.
DONWLOAD_DATA = False
if DONWLOAD_DATA:
    download_data(
        source="https://drive.google.com/uc?export=download&id=1d81_Lb7J1fpVU7Jw60cvXYrmYF3Qa8lU",
        destination="data/regression")

# Dataset locations. A plain string is used here: the value has no
# placeholders, so an f-string prefix would be misleading.
BASELINE_NAME = "data/regression/VCIP"
BASELINE = Path(BASELINE_NAME)
TARGET_DIR = BASELINE / "EQ420_image"   # image folder passed to the datasets/dataloaders
TARGET_LABEL = BASELINE / "Labels"      # folder holding the CSV file(s) with the scores
TARGET_BASE = BASELINE / "IMQA"

# Make sure the image directory exists. Note that this creates an empty
# directory (including parents) when the dataset has not been downloaded yet.
TARGET_DIR.mkdir(parents=True, exist_ok=True)

# Directory where trained models are written
MODEL_DIR = Path("outputs")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Seed every random number generator used by the notebook
def seed_torch(seed):
    """
    Seed the Python, NumPy and project-level random number generators.

    Args:
        seed: Integer seed applied to all generators.

    Side effects:
        - Sets the PYTHONHASHSEED environment variable to ``str(seed)``.
        - Seeds ``random`` and ``numpy.random``, then calls the project helper
          ``set_seeds`` (presumably seeds torch CPU/CUDA generators - TODO confirm).
        - Forces cuDNN to use deterministic algorithms.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, set_seeds):
        seeder(seed)
    cudnn.deterministic = True

seed_torch(seed=SEED)

# 4. Classes and Functions

In [None]:
def oversample_mos_ranges(df, target_count=60, bins=5, random_state=42):
    """
    Oversample underrepresented MOS ranges up to a target count per bin.

    The 'mos' column is split into `bins` intervals with `pd.cut`. Every
    non-empty bin holding fewer than `target_count` rows is topped up by
    sampling its own rows with replacement. Bins that already hold at least
    `target_count` rows are kept untouched (no undersampling).

    Args:
        df: DataFrame with a numeric 'mos' column. It is NOT modified.
        target_count: Minimum number of rows each non-empty bin should have.
        bins: Bin specification forwarded to `pd.cut` (number of equal-width
            bins or explicit edges).
        random_state: Seed used when drawing the extra rows.

    Returns:
        A new DataFrame with the original columns and a fresh 0..N-1 index,
        ordered bin by bin (original rows of a bin first, then its extra rows).

    Notes:
        - Empty bins are skipped: there is nothing to sample from.
        - Rows whose 'mos' is NaN or outside explicit bin edges get no bin and
          are dropped by the groupby.
    """
    # pd.cut cannot bin an empty column; nothing to oversample in that case
    if df.empty:
        return df.reset_index(drop=True)

    # Attach the bin labels to a copy so the caller's DataFrame is never mutated
    binned = df.assign(mos_bin=pd.cut(df['mos'], bins=bins))

    resampled_dfs = []

    # observed=True skips empty categories, which would otherwise yield empty
    # groups and make `.sample(n > 0)` fail
    for _, bin_df in binned.groupby('mos_bin', observed=True):
        shortfall = target_count - len(bin_df)
        if shortfall > 0:
            # Draw only the missing rows, with replacement, from this bin
            extra_rows = bin_df.sample(
                n=shortfall,
                replace=True,
                random_state=random_state
            )
            resampled_dfs.append(pd.concat([bin_df, extra_rows]))
        else:
            # Already at or above the target: keep as is
            resampled_dfs.append(bin_df)

    resampled_df = pd.concat(resampled_dfs).drop(columns='mos_bin').reset_index(drop=True)
    return resampled_df

# 5. Specifying the Target Device

In [None]:
# Activate cuda benchmark
cudnn.benchmark = True

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

if device == "cuda":
    !nvidia-smi

# 6. Image Visualization

In [None]:
# Transforms used only for display: no normalization, so colours stay natural.
# Resize(512) with a single int rescales the shorter edge to 512 pixels.
manual_transforms = v2.Compose([
    v2.Resize(512),
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True)
])

# Shuffle the labelled rows and pick 7 of the 8 folds for the displayed split
csv_data = pd.read_csv(TARGET_LABEL / 'mos_fold_train.csv').sample(frac=1)
fold_list = list(range(1, 9))
fold_train = sample(fold_list, 7)
fold_test = list(set(fold_list) - set(fold_train))

# Boolean-mask selections are slices of csv_data; dropping the column with a
# non in-place call returns a new frame and avoids SettingWithCopyWarning.
train_data = csv_data[csv_data['folds'].isin(fold_train)].drop(columns='folds')
test_data = csv_data[csv_data['folds'].isin(fold_test)].drop(columns='folds')

train_dataset = RegressionDataset(
    data=train_data,
    image_folder=TARGET_DIR,
    transform=manual_transforms,
    )

# Show a 5 x 3 grid of randomly chosen training images
display_random_images_regression(
    train_dataset,
    n=15,
    rows=5,
    cols=3,
    theme=THEME,
    display_shape=False,
    seed=None)

# 7. Preparing Dataloaders

In [None]:
# Split the csv file into two dataframes: train and test
csv_data = pd.read_csv(TARGET_LABEL / 'mos_fold_train.csv').sample(frac=1)
fold_list = list(range(1, 9))
fold_train = sample(fold_list, 7) # Train split: 7 of 8 folds (87.5% of the folds)
fold_test = list(set(fold_list) - set(fold_train))  # Test split: remaining fold (12.5%)

# Boolean-mask selections are slices of csv_data; dropping the helper column
# with a non in-place call returns a new frame and avoids SettingWithCopyWarning.
train_data = csv_data[csv_data['folds'].isin(fold_train)].drop(columns='folds')
test_data = csv_data[csv_data['folds'].isin(fold_test)].drop(columns='folds')

# Pre-processing transformations
IMG_SIZE = 512
BATCH_SIZE = 2

# Training: light augmentation (rare horizontal flip, near-full random crop),
# then conversion to a float tensor normalized with the ImageNet statistics.
train_transforms = v2.Compose([
    v2.Resize((IMG_SIZE), interpolation=InterpolationMode.BICUBIC),
    v2.RandomHorizontalFlip(p=0.2),
    v2.RandomResizedCrop(IMG_SIZE, scale=(0.95, 1.0), ratio=(0.95, 1.05)),
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

# Test: deterministic resize + normalization only.
# NOTE(review): Resize with a single int keeps the aspect ratio, so test images
# share one shape only if the source images do - confirm, otherwise batching
# with BATCH_SIZE > 1 may fail.
test_transforms = v2.Compose([
    v2.Resize((IMG_SIZE), interpolation=InterpolationMode.BICUBIC),
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

# Create the training and test dataloaders (both read from the same image folder)
train_dataloader, test_dataloader = create_regression_dataloaders(
        train_data=train_data,
        test_data=test_data,
        train_image_folder=TARGET_DIR,
        test_image_folder=TARGET_DIR,
        train_transform=train_transforms,
        test_transform=test_transforms,
        batch_size=BATCH_SIZE,
        num_workers=0 #NUM_WORKERS
        )
dataloaders = {
    'train':         train_dataloader,
    'test':          test_dataloader
    }

# 8. Creating a ConvNeXt-Large Model

In [None]:
# Hyper-parameters of the regression head and regularization
NUM_METRICS = 1      # number of outputs predicted by the model
DROPOUT = 0.3        # used for both the dropout and the drop-path rate
HIDDEN_DIM = 256     # hidden size of the MLP head
MODEL_ARCH = "convnext_large"

# Build a ConvNeXt-Large model from the convnext.py library,
# fine-tuning the whole network (backbone not frozen)
convnext_kwargs = dict(
    pretrained=True,
    in_22k=True,
    freeze_backbone=False,
    mlp_hidden_dim=HIDDEN_DIM,
    output_dim=NUM_METRICS,
    drop_path_rate=DROPOUT,
    dropout=DROPOUT,
)
model = convnext_large(**convnext_kwargs)

# Alternative: PyTorch's default ConvNeXt.
# For this task, better performance is achieved with the model above.
#model = build_pretrained_model(
#    model="convnext_l",
#    mlp_hidden_dim=HIDDEN_DIM,
#    output_dim=NUM_METRICS,
#    dropout=DROPOUT,
#    freeze_backbone=False,
#    device=device,
#    seed=SEED)

# Layer-by-layer overview for one batch of IMG_SIZE x IMG_SIZE RGB images
summary(
    model,
    input_size=(BATCH_SIZE, 3, IMG_SIZE, IMG_SIZE),
    col_names=["input_size", "output_size", "num_params", "trainable"],
    col_width=20,
    row_settings=["var_names"],
)

# 9. Training the Model

In [None]:
# Training constants
EPOCHS = 150
LR = 1e-5
model_type = "model_convnext_reg"
model_name = f"{model_type}.pth"

# AdamW optimizer over all model parameters
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=LR,
    betas=(0.9, 0.999),
    weight_decay=1e-4,
)

# Loss function: MSE loss
loss_fn = torch.nn.MSELoss()
# Alternative: Huber loss with delta between 0.0 and 1.0:
# Low delta -> more robust to outliers: switches to Mean Absolute Error (L1 loss) sooner, good for noisy data
# Higher delta -> smoother fit: stays in Mean Squared Error (L2 loss) region longer, better for clean data
# loss_fn = torch.nn.HuberLoss(delta=0.1)

# Cosine annealing with warm restarts; the learning rate decays down to LR / 100
scheduler = CosineAnnealingWarmRestarts(
    optimizer,
    T_0=10,
    T_mult=2,
    eta_min=LR / 100,
)
#scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS, eta_min=LR/100)

# Instantiate the regression engine with the created model and the target device
engine = RegressionEngine(
    model=model,                                # Model to be trained
    optimizer=optimizer,                        # Optimizer
    loss_fn=loss_fn,                            # Loss function
    scheduler=scheduler,                        # Learning-rate scheduler
    use_distillation=False,                     # Optional, use_distillation is False by default
    log_verbose=True,                           # Verbosity
    theme=THEME,                                # Theme
    device=device,                              # Target device
)

# Configure the training method and launch the training run
results = engine.train(
    target_dir=MODEL_DIR,                       # Directory where the model will be saved
    model_name=model_name,                      # Name of the model
    resume=False,                               # Do not resume from the last saved checkpoint
    save_best_model=["last", "loss", "r2"],     # Save the best models based on different criteria
    keep_best_models_in_memory=False,           # Do not keep the models in memory, to save training time and memory
    dataloaders=dataloaders,                    # Dictionary with the dataloaders
    apply_validation=True,                      # Enable validation step
    augmentation_strategy="always",             # Augmentation strategy
    epochs=EPOCHS,                              # Total number of epochs
    amp=True,                                   # Enable Automatic Mixed Precision (AMP)
    enable_clipping=False,                      # Disable gradient clipping, only useful if training becomes unstable
    debug_mode=False,                           # Disable debug mode
    accumulation_steps=2,                       # Effective batch size = batch_size x accumulation_steps
)