In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file found under the Kaggle input directory, one full path per line
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="datasets.features.image")


In [3]:
!pip install -q transformers datasets evaluate segments-ai
# apt-get install git-lfs
# git lfs install
# huggingface-cli login

In [4]:
import wandb

import os

## Load and Prepare Dataset

In [5]:
from datasets import load_dataset

# Hugging Face Hub id of the dataset; reused later when downloading id2label.json.
repo_id = "mattmdjaga/human_parsing_dataset"

# Load the dataset identified by repo_id.
dataset = load_dataset(repo_id)


## Shuffle and Split Dataset

### Renaming the Dataset Columns

In [6]:
# Rename the columns to the names the rest of the notebook reads:
# 'image' -> 'pixel_values' and 'mask' -> 'label'
dataset = (
    dataset
    .rename_column('image', 'pixel_values')
    .rename_column('mask', 'label')
)

In [7]:
# Shuffle with a fixed seed, then hold out 20% for evaluation. The split is
# seeded too: train_test_split shuffles by default, so without a seed the
# train/test membership would differ on every run.
dataset = dataset.shuffle(seed=1)
dataset = dataset["train"].train_test_split(test_size=0.2, seed=1)
train_ds = dataset["train"]
test_ds = dataset["test"]

# Keep the first 10,000 samples of the train split (or all of them if it is smaller)
train_ds = train_ds.select(range(min(10000, len(train_ds))))

# Keep the first 1,000 samples of the test split (or all of them if it is smaller)
test_ds = test_ds.select(range(min(1000, len(test_ds))))

In [8]:
import json
from huggingface_hub import hf_hub_download

# Download the class-id -> class-name mapping stored in the dataset repo.
filename = "id2label.json"
id2label_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="dataset")
# Read it through a context manager so the file handle is closed after parsing.
with open(id2label_path, "r") as id2label_file:
    id2label = json.load(id2label_file)
# JSON object keys are strings; convert them to integer class ids.
id2label = {int(k): v for k, v in id2label.items()}
# Reverse mapping: class name -> class id.
label2id = {v: k for k, v in id2label.items()}

num_labels = len(id2label)


In [9]:
import json

# Show the label map as indented JSON beneath a heading line
print("id2label.json Content:", json.dumps(id2label, indent=4), sep="\n")


id2label.json Content:
{
    "0": "Background",
    "1": "Hat",
    "2": "Hair",
    "3": "Sunglasses",
    "4": "Upper-clothes",
    "5": "Skirt",
    "6": "Pants",
    "7": "Dress",
    "8": "Belt",
    "9": "Left-shoe",
    "10": "Right-shoe",
    "11": "Face",
    "12": "Left-leg",
    "13": "Right-leg",
    "14": "Left-arm",
    "15": "Right-arm",
    "16": "Bag",
    "17": "Scarf"
}


## Image Processor and Data Augmentation

In [10]:
from torchvision.transforms import ColorJitter
from transformers import SegformerImageProcessor

# Image processor built with its default settings; both transform functions call it.
processor = SegformerImageProcessor()
# Colour augmentation; train_transforms applies it to the images only, never to the masks.
jitter = ColorJitter(
    brightness=0.25,
    contrast=0.25,
    saturation=0.25,
    hue=0.1,
)

def train_transforms(example_batch):
    """Augment and preprocess one training batch.

    Every entry of ``example_batch['pixel_values']`` is passed through the
    module-level ``jitter``; the entries of ``example_batch['label']`` are
    passed on untouched. Both lists are then handed to the module-level
    ``processor`` and its result is returned as-is.
    """
    augmented = list(map(jitter, example_batch['pixel_values']))
    masks = list(example_batch['label'])
    return processor(augmented, masks)


def val_transforms(example_batch):
    """Preprocess one evaluation batch without any augmentation.

    The entries of ``example_batch['pixel_values']`` and
    ``example_batch['label']`` are copied into fresh lists and handed to the
    module-level ``processor``; its result is returned as-is.
    """
    pictures = list(example_batch['pixel_values'])
    masks = list(example_batch['label'])
    return processor(pictures, masks)


# Attach the on-the-fly transforms: augmentation for training, plain preprocessing for evaluation
for _split, _transform in ((train_ds, train_transforms), (test_ds, val_transforms)):
    _split.set_transform(_transform)

## Fine Tune a SegFormer

In [11]:
from transformers import SegformerForSemanticSegmentation
# Continue from the locally saved fine-tuned checkpoint when it is on disk.
# On a fresh session that directory does not exist, so fall back to the
# pretrained base model instead of failing.
model_name = "/kaggle/working/model_(2)/checkpoint-4000"
pretrained_model_name = "nvidia/mit-b0"
model_source = model_name if os.path.isdir(model_name) else pretrained_model_name
model = SegformerForSemanticSegmentation.from_pretrained(
    model_source,
    id2label=id2label,
    label2id=label2id,
)


## Training Arguments

In [12]:
from transformers import TrainingArguments

# Optimisation hyper-parameters
epochs = 50
lr = 6e-5
batch_size = 16

training_args = TrainingArguments(
    # where checkpoints are written and how many are kept
    output_dir="/kaggle/working/model_(2)",
    save_total_limit=3,
    load_best_model_at_end=True,
    # optimisation
    learning_rate=lr,
    weight_decay=0.01,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluate and checkpoint on the same 1000-step cadence
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    eval_accumulation_steps=2,
    # experiment tracking
    report_to="wandb",  # enable logging to W&B
    run_name="MiT-Sagformer-0",
    logging_steps=1,
)


## Evaluation Metric

In [13]:
import torch
from torch import nn
import evaluate

# Mean-IoU metric loaded through the `evaluate` library; compute_metrics below calls metric.compute().
metric = evaluate.load("mean_iou")

def compute_metrics(eval_pred, chunk_size=32):
    """Compute mean-IoU metrics for one evaluation pass.

    The logits are upsampled to the label resolution and reduced to class ids
    ``chunk_size`` samples at a time. Upsampling the whole evaluation set in a
    single call materialises a float tensor of shape
    (num_samples, num_labels, label_height, label_width), which can exhaust
    memory; each sample is interpolated independently, so chunking along the
    batch axis yields the same predictions.

    Args:
        eval_pred: pair ``(logits, labels)``. ``logits`` is a NumPy array (it
            is passed to ``torch.from_numpy``) with the class dimension at
            axis 1; the last two axes of ``labels`` give the target size.
        chunk_size: number of samples upsampled per step. Must be >= 1.

    Returns:
        The dict returned by ``metric.compute``, with ``per_category_accuracy``
        and ``per_category_iou`` replaced by one ``accuracy_<name>`` and one
        ``iou_<name>`` entry per class, named through ``id2label``.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be >= 1")

    logits, labels = eval_pred
    target_size = tuple(labels.shape[-2:])

    pred_chunks = []
    with torch.no_grad():
        for start in range(0, len(logits), chunk_size):
            logits_chunk = torch.from_numpy(logits[start:start + chunk_size])
            # scale the logits to the size of the label
            upsampled = nn.functional.interpolate(
                logits_chunk,
                size=target_size,
                mode="bilinear",
                align_corners=False,
            )
            # keep only the class ids; the large float tensor is dropped per chunk
            pred_chunks.append(upsampled.argmax(dim=1).numpy())

    if pred_chunks:
        pred_labels = np.concatenate(pred_chunks, axis=0)
    else:
        # empty evaluation set: np.concatenate rejects an empty list
        pred_labels = np.empty((0, *target_size), dtype=np.int64)

    metrics = metric.compute(
        predictions=pred_labels,
        references=labels,
        num_labels=len(id2label),
        ignore_index=-1,
        reduce_labels=processor.do_reduce_labels,
    )

    # add per category metrics as individual key-value pairs
    per_category_accuracy = metrics.pop("per_category_accuracy").tolist()
    per_category_iou = metrics.pop("per_category_iou").tolist()

    metrics.update({f"accuracy_{id2label[i]}": v for i, v in enumerate(per_category_accuracy)})
    metrics.update({f"iou_{id2label[i]}": v for i, v in enumerate(per_category_iou)})

    return metrics


## Trainer

In [14]:
from transformers import Trainer

# Wire the model, the two dataset splits and the metric callback into the Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=test_ds,
)


In [None]:
# Run fine-tuning with the trainer built above; training_args sets report_to="wandb", so the run is logged to W&B.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msayedhanan[0m ([33msayedhanan-virtual-university-of-pakistan[0m). Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss,Mean Iou,Mean Accuracy,Overall Accuracy,Accuracy Background,Accuracy Hat,Accuracy Hair,Accuracy Sunglasses,Accuracy Upper-clothes,Accuracy Skirt,Accuracy Pants,Accuracy Dress,Accuracy Belt,Accuracy Left-shoe,Accuracy Right-shoe,Accuracy Face,Accuracy Left-leg,Accuracy Right-leg,Accuracy Left-arm,Accuracy Right-arm,Accuracy Bag,Accuracy Scarf,Iou Background,Iou Hat,Iou Hair,Iou Sunglasses,Iou Upper-clothes,Iou Skirt,Iou Pants,Iou Dress,Iou Belt,Iou Left-shoe,Iou Right-shoe,Iou Face,Iou Left-leg,Iou Right-leg,Iou Left-arm,Iou Right-arm,Iou Bag,Iou Scarf
1000,0.1147,0.111936,0.69823,0.797594,0.96388,0.989463,0.792911,0.896427,0.65499,0.918034,0.899057,0.913413,0.890686,0.307133,0.705677,0.710873,0.909403,0.832087,0.861647,0.832224,0.849456,0.844319,0.548884,0.982305,0.695148,0.793873,0.545263,0.829608,0.819215,0.844968,0.774784,0.241525,0.54883,0.557239,0.820955,0.732723,0.745141,0.712466,0.719046,0.723843,0.481216
2000,0.1034,0.114878,0.694927,0.798868,0.963595,0.99003,0.798873,0.897977,0.663286,0.901628,0.921087,0.918703,0.883312,0.332959,0.711832,0.721291,0.917153,0.82642,0.859873,0.835391,0.859391,0.830948,0.509464,0.982401,0.686852,0.793755,0.548919,0.829725,0.822763,0.840993,0.770175,0.255029,0.545886,0.556385,0.819194,0.721697,0.734552,0.714116,0.719422,0.72106,0.445756
3000,0.1253,0.114571,0.697498,0.796754,0.963645,0.990283,0.812957,0.895693,0.643852,0.920236,0.886792,0.914694,0.870817,0.347248,0.693176,0.707506,0.906612,0.833168,0.850855,0.823352,0.83013,0.851712,0.562495,0.982595,0.69101,0.79527,0.544408,0.828381,0.805371,0.840632,0.76423,0.269055,0.554967,0.564978,0.822223,0.726568,0.741139,0.717719,0.723643,0.724506,0.458264
4000,0.0745,0.119346,0.697802,0.797206,0.963539,0.990622,0.783743,0.899836,0.662328,0.927238,0.88736,0.923021,0.821516,0.400057,0.670312,0.727449,0.910022,0.845996,0.856304,0.831416,0.859884,0.843441,0.509171,0.9826,0.688957,0.796632,0.554907,0.827092,0.800248,0.841903,0.757662,0.286001,0.547251,0.570056,0.824527,0.72668,0.736791,0.71655,0.724751,0.725879,0.451941
5000,0.0808,0.121473,0.694101,0.793134,0.962817,0.99011,0.802762,0.908075,0.66061,0.91894,0.899837,0.9244,0.830866,0.369882,0.700784,0.702719,0.897545,0.842678,0.839821,0.841857,0.82996,0.841156,0.474405,0.982709,0.68871,0.79692,0.555324,0.823708,0.785071,0.838908,0.737559,0.279086,0.552232,0.560727,0.825458,0.73427,0.744752,0.71394,0.726624,0.725664,0.422149
6000,0.0991,0.121702,0.701752,0.802259,0.963898,0.9909,0.790988,0.895143,0.650452,0.910867,0.907743,0.906644,0.858971,0.396961,0.69947,0.710068,0.913379,0.857187,0.848813,0.845468,0.847116,0.859427,0.551072,0.982706,0.690744,0.798754,0.551839,0.831115,0.801318,0.8329,0.765232,0.292136,0.561831,0.571201,0.824717,0.733158,0.747962,0.717866,0.729278,0.722554,0.476221
7000,0.0812,0.121086,0.700042,0.798237,0.963866,0.990567,0.79322,0.898134,0.662586,0.916684,0.890299,0.910994,0.872246,0.381593,0.718547,0.714771,0.914088,0.833693,0.855639,0.836606,0.840838,0.839322,0.498433,0.98276,0.695865,0.799413,0.557482,0.831596,0.802848,0.834479,0.760237,0.289042,0.557459,0.56694,0.82631,0.72974,0.74871,0.723701,0.730471,0.725642,0.438061
8000,0.0928,0.122522,0.702641,0.80146,0.963969,0.99039,0.79215,0.891586,0.661262,0.91681,0.901375,0.916268,0.867047,0.394485,0.703521,0.711023,0.911822,0.849053,0.848631,0.850539,0.844153,0.841075,0.535084,0.982825,0.687671,0.798186,0.559816,0.829054,0.801484,0.840137,0.758135,0.293505,0.560666,0.570059,0.827591,0.739182,0.751802,0.723631,0.731213,0.729113,0.463463




## Inference

In [None]:
from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation
from transformers import SegformerImageProcessor

processor = SegformerImageProcessor()
# Checkpoint used for inference. The training cell writes its checkpoints
# under "/kaggle/working/model_(2)", so also look there when the checkpoint is
# not at the top-level path. If neither directory exists, the first path is
# kept and from_pretrained reports the missing checkpoint.
checkpoint_candidates = (
    "/kaggle/working/checkpoint-1000",
    "/kaggle/working/model_(2)/checkpoint-1000",
)
model_path = next(
    (path for path in checkpoint_candidates if os.path.isdir(path)),
    checkpoint_candidates[0],
)

# processor = SegformerImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
model = SegformerForSemanticSegmentation.from_pretrained(model_path)


In [None]:
# test_ds has val_transforms attached via set_transform, so test_ds[5] returns
# the processor output: there is no 'label' key, and 'pixel_values' is an
# array rather than the PIL image whose .size is read further down.
# Read the untransformed example from a format-reset view instead.
raw_example = test_ds.with_format(None)[5]
image = raw_example['pixel_values']
gt_seg = raw_example['label']

In [None]:
from transformers import SegformerImageProcessor, AutoModelForSemanticSegmentation
from PIL import Image
import requests
import matplotlib.pyplot as plt
import torch.nn as nn

In [None]:
import torch
from torch import nn

# Inference only: run the forward pass and upsampling without autograd, so no
# computation graph is kept in memory.
with torch.no_grad():
    inputs = processor(images=image, return_tensors="pt")
    outputs = model(**inputs)
    logits = outputs.logits  # shape (batch_size, num_labels, height/4, width/4)

    # First, rescale logits to original image size
    upsampled_logits = nn.functional.interpolate(
        logits,
        size=image.size[::-1], # (height, width)
        mode='bilinear',
        align_corners=False
    )

# Second, apply argmax on the class dimension
pred_seg = upsampled_logits.argmax(dim=1)[0]


In [None]:
# Draw the predicted class-id map on an explicit Axes with a title;
# plt.show() keeps the AxesImage repr out of the cell output.
fig, ax = plt.subplots()
ax.imshow(pred_seg)
ax.set_title("Predicted segmentation")
plt.show()