# Hugging Face SegFormer end-to-end evaluation (MVP)

In [None]:
# Toggle: when True, the install cells below also install the pinned
# torch 2.1.2+cu121 and EETQ 1.0.0 wheels.
use_eetq = False
# Toggle: when True, a half-precision copy of the base model is added to `models`.
use_half = False

In [None]:
%%time
# Optional (EETQ only): install the torch 2.1.2 + cu121 wheel for CPython 3.10 /
# linux x86_64 directly from its URL. The index-url variant kept below as a
# comment is listed as "not working" in the Difficulties notes of this notebook.
if use_eetq:
    # !pip install torch==2.1.2 --index-url https://download.pytorch.org/whl/cu121
    torch_whl = "torch-2.1.2%2Bcu121-cp310-cp310-linux_x86_64.whl"
    torch_url = "https://download.pytorch.org/whl/cu121"
    !pip install -qq {torch_url}/{torch_whl} # --no-cache-dir

In [None]:
%%time
# Pinned versions of the model library and the metric library used below.
!pip install -qq 'transformers==4.44.0'
!pip install -qq 'evaluate==0.4.2'

In [None]:
%%time
# Quantization back-ends. Only quanto / optimum-quanto are installed
# unconditionally; the commented-out lines are alternatives that are not installed.
# NOTE(review): optimum-quanto is the only install here without a version pin.
!pip install -qq "quanto==0.2.0"
!pip install -qq "optimum-quanto"
# !pip install -qq "accelerate==0.33.0"
# !pip install -qq "bitsandbytes==0.43.3" # --upgrade # >0.37.0
# !pip install -qq "hqq==0.1.8" # hqq slow install
# GPTQ "optimum==1.21.4" "auto-gptq==0.7.1"
#!pip install -qq "optimum==1.21.4"
#!pip install -qq "auto-gptq==0.7.1"
# !pip install -qq autoawq
if use_eetq:
    # EETQ 1.0.0 release wheel; the file name encodes cu121, torch 2.1.2 and cp310.
    eetq_whl = 'EETQ-1.0.0+cu121+torch2.1.2-cp310-cp310-linux_x86_64.whl'
    eetq_url = 'https://github.com/NetEase-FuXi/EETQ/releases/download/v1.0.0'
    !pip install -qq --no-cache-dir {eetq_url}/{eetq_whl}
    !pip install -qq "accelerate==0.33.0"

In [None]:
%%time
# Pinned Weights & Biases client used for experiment logging.
!pip install -qq 'wandb==0.17.8'

In [None]:
# Environment sanity checks: accelerate self-test, CUDA compiler version,
# and the installed packages whose `pip list` line mentions "cuda".
!accelerate test
!nvcc -V
!pip list | grep cuda

In [None]:
from evaluate import load
from transformers import (
    SegformerImageProcessor, SegformerForSemanticSegmentation,
    QuantoConfig,
    # BitsAndBytesConfig,
    # HqqConfig,
    # EetqConfig,
    # GPTQConfig
)
from datasets import load_dataset, load_from_disk
# turn off dataset.map() message
# https://github.com/huggingface/datasets/issues/1627
# any logging level higher than WARNING turns off the progress bar
# https://github.com/huggingface/datasets/issues/2651
# not_verbose = bool(logger.getEffectiveLevel() > WARNING)
from datasets.utils.logging import set_verbosity_error

### Difficulties
- EETQ import: `ImportError: /opt/conda/lib/python3.10/site-packages/EETQ.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZN3c106detail23torchInternalAssertFailEPKcS2_jS2_RKSs`
    - Attempted fix (did not work): install [torch 2.1.2](https://pypi.org/project/torch/2.1.2/) for [EETQ 1.0.0](https://github.com/NetEase-FuXi/EETQ/releases)
    - Attempted fix (did not work): [torch 2.1.2 + cu121](https://pytorch.org/get-started/previous-versions/) `!pip install torch==2.1.2 --index-url https://download.pytorch.org/whl/cu121`
    - Working fix: install the wheel [torch-2.1.2%2Bcu121-cp310-cp310-linux_x86_64.whl](https://download.pytorch.org/whl/torch_stable.html) directly: `!pip install -qq --no-cache-dir {torch_url}/{torch_whl}`
- EETQ conversion: `RuntimeError: [FT][ERROR] Unsupported Arch Assertion fail: /data/EETQ/csrc/cutlass_kernels/cutlass_preprocessors.cc:125`
    - See [Unsupported Arch Assertion fail #30](https://github.com/NetEase-FuXi/EETQ/issues/30)
- GPTQ quantization: `RuntimeError: We can only quantize pure text model`

In [None]:
import torch
from PIL import Image
from copy import deepcopy
from os import environ, makedirs
from os.path import exists
from datetime import datetime
#from pathlib import Path

In [None]:
import quanto

In [None]:
# https://docs.wandb.ai/guides/track/environment-variables
# https://docs.wandb.ai/ref/python/init
import wandb
# from kaggle_secrets import UserSecretsClient
# wandb_key = UserSecretsClient().get_secret("WANDB_API_KEY")
# wandb.login(key=wandb_key)

In [None]:
# Run configuration: W&B tagging, dataset, model checkpoint, metric and sharding.
wandb_tag_mode = "eval"
wandb_tag_runmode = "multiple-runs"
dataset_name = "scene_parse_150"
model_name_short = "b2"
model_name = f"nvidia/segformer-{model_name_short}-finetuned-ade-512-512"
metric_name = 'mean_iou'
# The evaluation loop processes the chosen split in `ds_num_shards` shards.
ds_num_shards = 100
# Progress-print interval (in shards). Integer division keeps this an int
# (the loop uses it as a modulus) and max() keeps it >= 1 for small shard
# counts, so `k % ds_shards_mod` can never divide by zero.
ds_shards_mod = max(1, ds_num_shards // 10)

In [None]:
#model_id = f"{model_repo}/{model_name}"
#model_save_path = f"{drive_mount_path}/MyDrive/Models/{model_id}"
#tokenizer_save_path = f"{drive_mount_path}/MyDrive/Tokenizer/{model_id}"
# Local cache locations, relative to the current working directory.
model_save_path = f"./models/{model_name}"
# NOTE: despite the name, this directory holds the SegFormer image processor
# (it is what the Processor section loads from and saves to).
tokenizer_save_path = f"./tokenizers/{model_name}"
dataset_save_path = f"./datasets/{dataset_name}"

In [None]:
# W&B project and entity are published as environment variables;
# create_wandb_run() reads them back from `environ`.
environ['WANDB_PROJECT'] = f'segformer-{dataset_name}-{wandb_tag_mode}-{wandb_tag_runmode}'
environ['WANDB_ENTITY'] = 'ba-segformer' # team

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# dtype passed as `torch_dtype` when checkpoints are loaded below.
compute_dtype = torch.float32
device

## Model

In [None]:
# Registry of loaded model variants, keyed by a short variant name.
models = {}

In [None]:
%%time
if not exists(model_save_path):
    makedirs(model_save_path, exist_ok=True)
try:
    print("loading from disk")
    models[model_name_short] = SegformerForSemanticSegmentation.from_pretrained(
        model_save_path
    )
except:
    print("loading from source and saving to disk")
    models[model_name_short] = SegformerForSemanticSegmentation.from_pretrained(
        model_name,
        torch_dtype=compute_dtype,
        # device_map="auto" # not implemented for SegFormer
    )
    models[model_name_short].save_pretrained(model_save_path)
models[model_name_short].to(device)
print(models[model_name_short].get_memory_footprint())
models[model_name_short].config._name_or_path

In [None]:
# Label metadata taken from the base model's config.
id2label = models[model_name_short].config.id2label
label2id = models[model_name_short].config.label2id
num_labels = len(id2label)
# Label value the model config marks as ignored by the semantic loss.
ignore_index = models[model_name_short].config.semantic_loss_ignore_index # 255

### Quantize

In [None]:
# Optional fp16 variant: an independent half-precision copy of the base model.
if use_half:
    half_key = f"{model_name_short}-half"
    models[half_key] = deepcopy(models[model_name_short]).half()
    # The original referenced an undefined `model_half` (NameError). Print
    # explicitly, because a bare expression inside an `if` body is not displayed.
    print(models[half_key].get_memory_footprint())

In [None]:
# quanto: build one weight-only quantized variant of the checkpoint per dtype.
bits_quanto_w = ['float8', 'int8', 'int4', 'int2']
config_quanto = {
    nbits: QuantoConfig(
        weights=nbits,
        # activations='int8',
        # modules_to_not_convert = None
    )
    for nbits in bits_quanto_w
}
for nbits in bits_quanto_w:
    model_htype = f"{model_name_short}-quanto-{nbits}"
    model_save_path_quanto = f"{model_save_path}/{model_name}-{model_htype}"
    # Load order: saved quantized copy -> local base checkpoint quantized on
    # load -> source checkpoint quantized on load. `except Exception` (not a
    # bare except) keeps Ctrl-C / kernel interrupts working during the loads.
    try:
        print(f"loading local {model_htype}")
        models[model_htype] = SegformerForSemanticSegmentation.from_pretrained(
            model_save_path_quanto
            # local_files_only=True,
        )
    except Exception:
        try:
            print(f"loading local {model_name}")
            models[model_htype] = SegformerForSemanticSegmentation.from_pretrained(
                model_save_path,
                local_files_only=True,
                torch_dtype=compute_dtype,
                quantization_config=config_quanto[nbits],
            )
        except Exception:
            print(f"loading online {model_name}")
            models[model_htype] = SegformerForSemanticSegmentation.from_pretrained(
                model_name,
                #local_files_only=True,
                torch_dtype=compute_dtype,
                quantization_config=config_quanto[nbits],
            )
        # The quantized model is deliberately not saved:
        # ValueError: The model is quantized with quanto and is not
        # serializable - check out the warnings from the logger on
        # the traceback to understand the reason why the quantized
        # model is not serializable.
        # models[model_htype].save_pretrained(model_save_path_quanto)
    quanto.freeze(models[model_htype])
    models[model_htype] = models[model_htype].to(device)

In [None]:
# Drop the unquantized base model from the registry so it is not evaluated.
# The `None` default makes the cell safe to re-run (no KeyError on second run).
models.pop(model_name_short, None);

In [None]:
# One line per registered variant: memory footprint in MB and current device.
for variant_name, variant_model in models.items():
    footprint_mb = variant_model.get_memory_footprint()*1.0e-6
    print(f"{variant_name} size {footprint_mb:.2f} MB on {variant_model.device}")

## Processor

In [None]:
# Load the image processor: local copy first, otherwise fetch it from the
# model repo and cache it locally for the next run.
try:
    image_processor = SegformerImageProcessor.from_pretrained(
        tokenizer_save_path
    )
except Exception as err:
    # `Exception` rather than a bare except, so interrupts are not swallowed.
    print(f"local image processor unavailable ({type(err).__name__}), loading from source")
    image_processor = SegformerImageProcessor.from_pretrained(
        model_name
    )
    image_processor.save_pretrained(tokenizer_save_path)
image_processor

## Metric

In [None]:
# Load the metric named by `metric_name` through `evaluate.load`.
metric = load(metric_name)
# Show the metric's `features` attribute as the cell output.
metric.features

In [None]:
# Keyword arguments intended for the metric computation.
# NOTE(review): the evaluation loop in this notebook does not reference this
# dict; it passes `num_labels` and `ignore_index` inline instead.
metrics_kwargs = {
    'num_labels': num_labels, # Sceneparse 150, ADE20K 3688,
    'reduce_labels': False, # ignore bg cls
    # mandatory?, background class?
    'ignore_index': ignore_index
}

## Dataset

In [None]:
# Load the dataset: local copy first, otherwise download it and cache it on disk.
try:
    dataset = load_from_disk(dataset_save_path)
except Exception as err:
    # `Exception` rather than a bare except, so interrupts are not swallowed.
    print(f"local dataset unavailable ({type(err).__name__}), loading from source")
    dataset = load_dataset(dataset_name, trust_remote_code=True)
    dataset.save_to_disk(dataset_save_path)
dataset

## WandB

In [None]:
# Authenticate with W&B using the API key from the environment.
# Raises KeyError if WANDB_API_KEY is not set.
wandb.login(relogin=True, force=True, key=environ['WANDB_API_KEY'])

In [None]:
def create_wandb_run(verbose=False):
    """Start a new W&B run for the model variant currently being evaluated.

    Project and entity come from the ``WANDB_PROJECT`` / ``WANDB_ENTITY``
    environment variables. The run name is the notebook global ``quant_used``
    and the group is the global ``dataset_name``, so both must be set before
    this is called.

    Args:
        verbose: print the created run object when True.

    Returns:
        The run object returned by ``wandb.init``.
    """
    init_kwargs = {
        'project': environ['WANDB_PROJECT'],
        'entity': environ['WANDB_ENTITY'],
        'name': quant_used,
        'group': dataset_name,
    }
    run = wandb.init(**init_kwargs)
    # The new run must also be the globally active run.
    assert run is wandb.run
    if verbose:
        print(run)
    return run

In [None]:
def create_wandb_run_meta(wandb_run, verbose=False):
    """Attach tags and a notes string to a W&B run and return it.

    Reads the notebook globals ``model_name``, ``dataset_name``, ``device``,
    ``wandb_tag_mode``, ``quant_used``, ``bias_dtype``, ``model_used``,
    ``ds_num_shards`` and ``ds_shards_mod``; they must be set before calling.

    Args:
        wandb_run: run object whose ``tags`` and ``notes`` are overwritten.
        verbose: print the run object when True.

    Returns:
        The same ``wandb_run`` object that was passed in.
    """
    wandb_tags = [model_name, dataset_name, device.type, wandb_tag_mode]
    # The last tag identifies the variant: the quantization name if there is
    # one, otherwise the dtype the model runs in.
    if quant_used:
        wandb_tags.append(quant_used)
    elif bias_dtype == torch.float16:  # torch.half is an alias of torch.float16
        wandb_tags.append(str(bias_dtype))
    else:
        wandb_tags.append(str(model_used.config.torch_dtype))
    wandb_run.tags = wandb_tags
    wandb_run.notes = (
        f"{datetime.now().isoformat()}, "
        f"model size {model_used.get_memory_footprint()*1.0e-6:.2f} MB, "
        f"{ds_num_shards=}, {ds_shards_mod=}"
    )
    if verbose:
        print(wandb_run)
    return wandb_run

## Eval

Note: Depending on the dataset, images and annotations/segmentation maps might have to be converted to PIL mode `RGB` or `L` before preprocessing (see `convert_to_RGB` below).

In [None]:
def get_processed_inputs(dataset, bias_dtype = None):
    """Turn one dataset shard into model-ready tensors on ``device``.

    The shard is first passed through ``convert_to_RGB`` via ``dataset.map``
    and then through the notebook-global ``image_processor``.

    Args:
        dataset: dataset shard exposing ``map`` and the columns ``'image'``
            and ``'annotation'``.
        bias_dtype: when this is ``torch.float16`` the pixel values are cast
            to half precision; any other value (including None) leaves them
            as the processor produced them.

    Returns:
        Tuple ``(pixel_values, labels)``, both moved to ``device``.
    """
    set_verbosity_error()  # silences the dataset.map() progress output
    converted = dataset.map(convert_to_RGB, batched=True)
    batch = image_processor.preprocess(
        images=converted['image'],
        segmentation_maps=converted['annotation'],
        return_tensors="pt",
        # do_rescale=False
    )
    pixel_values = batch['pixel_values'].to(device)
    labels = batch['labels'].to(device)
    wants_half = bias_dtype is not None and bias_dtype == torch.float16
    if wants_half:
        pixel_values = pixel_values.half()
    return pixel_values, labels

def convert_to_RGB(dataset):
    """Batched ``map`` callback that normalises the PIL modes of a batch.

    Args:
        dataset: mapping with the keys ``'image'`` and ``'annotation'``, each
            an iterable of objects exposing ``convert(mode)``.

    Returns:
        Dict with ``'image'`` converted to mode ``"RGB"`` and ``'annotation'``
        converted to mode ``"L"``, each as a new list.
    """
    def _convert_all(items, mode):
        # Convert every item of one column to the requested mode.
        return [item.convert(mode) for item in items]

    return {
        'image': _convert_all(dataset['image'], "RGB"),
        'annotation': _convert_all(dataset['annotation'], "L"),
    }
    
def infer_model(model, pixel_values, labels):
    """Run one forward pass with gradient tracking disabled.

    Args:
        model: callable accepting ``pixel_values`` and ``labels`` keyword
            arguments and returning an object with ``loss`` and ``logits``.
        pixel_values: forwarded unchanged to ``model``.
        labels: forwarded unchanged to ``model``.

    Returns:
        Tuple ``(loss, logits)`` read from the model output.
    """
    forward_kwargs = {'pixel_values': pixel_values, 'labels': labels}
    with torch.no_grad():
        result = model(**forward_kwargs)
    return result.loss, result.logits

### Difficulties

- hqq8: `RuntimeError: expected mat1 and mat2 to have the same dtype, but got: float != c10::Half`, which is a different error from `RuntimeError: Input type (c10::Half) and bias type (float) should be the same`
- `/opt/conda/lib/python3.10/site-packages/bitsandbytes/nn/modules.py:435: UserWarning: Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed. warnings.warn(`
- `/opt/conda/lib/python3.10/site-packages/datasets/features/image.py:348: UserWarning: Downcasting array dtype int64 to int32 to be compatible with 'Pillow' warnings.warn(f"Downcasting array dtype {dtype} to {dest_dtype} to be compatible with 'Pillow'")`

In [None]:
%%time
# Be aware if using CPU and half(): RuntimeError: "slow_conv2d_cpu" not implemented for 'Half'
ds_subset = 'validation' # 'train'
verbose=False

for model in models:
    results = []
    quant_used = model
    model_used = models[quant_used]
    #if quant_used == 'hqq8':
    #    bias_dtype = torch.half
    #else:
    bias_dtype = model_used.base_model.encoder.block[0][0].mlp.dense1.bias.dtype
    model_used.eval()
    for k in range(0, ds_num_shards):
        if k == 0 or k % ds_shards_mod == 0:
            print(f"ds shard {k}/{ds_num_shards}")
        create_wandb_run_meta(create_wandb_run(verbose), verbose)
        pixel_values, labels = get_processed_inputs(
            dataset[ds_subset].shard(num_shards=ds_num_shards, index=k),
            bias_dtype
        )
        loss, logits = infer_model(model_used, pixel_values, labels)
        predictions = torch.nn.functional.interpolate(
            logits,
            # upsample/upsize pred
            size=labels.shape[-2:], # img_size
            mode="bilinear", align_corners=False
        ).argmax(dim=1)
        '''
        metric.add_batch( # .detach()
            predictions=predictions.cpu().numpy(),
            references=labels.cpu().numpy()
        )
        '''
        results = metric.compute(
            predictions=predictions.cpu().numpy(),
            references=labels.cpu().numpy(),
            num_labels=len(id2label), # Sceneparse 150, ADE20K 3688,
            # reduce_labels=False, # ignore bg cls
            # mandatory?, background class?
            ignore_index=model_used.config.semantic_loss_ignore_index # 255
        )
        wandb.log({
            'mean_iou': results['mean_iou'],
            'mean_accuracy': results['mean_accuracy'],
            'overall_accuracy': results['overall_accuracy'],
            'memory_footprint_MB': float(f"{model_used.get_memory_footprint()*1.0e-6:.2f}")
        })
        wandb.finish(quiet=verbose)

In [None]:
# Finish whatever W&B run is still active, e.g. after an interrupted evaluation loop.
wandb.finish()