<a href="https://colab.research.google.com/github/sdelta/ImageGen/blob/main/stylegan2_clip_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Prepare dependencies

In [1]:
# Mount Google Drive at /content/drive. Later cells use relative
# drive/MyDrive/... paths to read the dataset archive and to write training
# output, which presumably relies on the working directory being /content.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install click requests tqdm pyspng ninja imageio-ffmpeg==0.4.3 open_clip_torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspng
  Downloading pyspng-0.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (205 kB)
[K     |████████████████████████████████| 205 kB 4.4 MB/s 
[?25hCollecting ninja
  Downloading ninja-1.11.1-py2.py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (145 kB)
[K     |████████████████████████████████| 145 kB 56.3 MB/s 
[?25hCollecting imageio-ffmpeg==0.4.3
  Downloading imageio_ffmpeg-0.4.3-py3-none-manylinux2010_x86_64.whl (26.9 MB)
[K     |████████████████████████████████| 26.9 MB 738 kB/s 
[?25hCollecting open_clip_torch
  Downloading open_clip_torch-2.9.1-py3-none-any.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 69.3 MB/s 
Collecting huggingface-hub
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 74.8 MB/s 
Collecting protobuf==3.20.*
  Downloading protobuf-3.20.3-cp38-c

In [3]:
!wget https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/transfer-learning-source-nets/ffhq-res256-mirror-paper256-noaug.pkl

--2022-12-31 11:06:30--  https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/transfer-learning-source-nets/ffhq-res256-mirror-paper256-noaug.pkl
Resolving nvlabs-fi-cdn.nvidia.com (nvlabs-fi-cdn.nvidia.com)... 18.160.225.32, 18.160.225.53, 18.160.225.119, ...
Connecting to nvlabs-fi-cdn.nvidia.com (nvlabs-fi-cdn.nvidia.com)|18.160.225.32|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 295744285 (282M) [binary/octet-stream]
Saving to: ‘ffhq-res256-mirror-paper256-noaug.pkl’


2022-12-31 11:06:41 (28.4 MB/s) - ‘ffhq-res256-mirror-paper256-noaug.pkl’ saved [295744285/295744285]



In [4]:
# Copy the dataset archive from the mounted Drive into the working directory;
# the training command reads it as --data=ffhq.zip.
# NOTE(review): the relative path assumes the notebook runs from the directory
# that contains the drive/ mount point (presumably /content) — confirm.
!cp drive/MyDrive/datasets/ffhq_256/ffhq.zip ./

In [None]:
# List the working directory (presumably to confirm the pickle and ffhq.zip are present).
!ls

### Debug code

In [None]:
from PIL import Image
import requests

url = "http://images.cocodataset.org/val2017/000000039769.jpg"

# Use a timeout so a stalled connection cannot hang the cell, and raise on an
# HTTP error status so PIL is never asked to decode an error page.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

In [None]:
import torch
import open_clip
from PIL import Image
import numpy as np

# Default to the CPU and switch to the GPU when one is visible; the assert
# below then stops the notebook early if no GPU was found.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
assert device == "cuda"

# Load the CLIP model together with its image preprocessing transform (the
# middle return value is discarded), plus the tokenizer for the same model name.
model, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-B-32-quickgelu',
    pretrained='laion400m_e32',
)
tokenizer = open_clip.get_tokenizer('ViT-B-32-quickgelu')

model = model.to(device)

In [None]:
src_images = [image]
src_texts = ["cat", "dog"]
# Stack the preprocessed images directly in torch: open_clip's preprocess
# transform yields torch tensors, so the np.stack -> torch.tensor round trip
# only added an extra copy through NumPy.
images = torch.stack([preprocess(img) for img in src_images]).to(device)
texts = tokenizer(src_texts).to(device)

In [None]:
# Pure inference: skip autograd so no graph is retained with the features
# (the same way CLIPSubloss.__init__ encodes its prompt).
with torch.no_grad():
    texts_features = model.encode_text(texts)
    # Scale each embedding to unit length so dot products are cosine similarities.
    texts_features /= texts_features.norm(dim=-1, keepdim=True)

In [None]:
# Pure inference: skip autograd so no graph is retained with the features.
with torch.no_grad():
    image_features = model.encode_image(images)
    # Scale each embedding to unit length so dot products are cosine similarities.
    image_features /= image_features.norm(dim=-1, keepdim=True)

In [None]:
# Dot products between every text embedding and every image embedding;
# presumably shaped (num_texts, num_images).
sim = texts_features @ image_features.T

In [None]:
# Show the shape of the preprocessed image batch.
images.shape

In [None]:
# Show the text-vs-image similarity values.
sim

In [None]:
from torch.nn import functional as tfn
from torchvision import transforms


def normalize(x, mean, std):
    """Standardise ``x`` per channel: ``(x - mean) / std``.

    ``mean`` and ``std`` each get two singleton axes inserted after their
    first axis (a 1-D ``(C,)`` tensor becomes ``(C, 1, 1)``), so they
    broadcast over the last two axes of ``x``.
    """
    channel_mean = mean[:, None, None]
    channel_std = std[:, None, None]
    return (x - channel_mean) / channel_std

# Hand-rolled preprocessing written with plain tensor ops: convert the PIL
# image to a tensor, add a leading batch axis, resize to 224x224 with bicubic
# interpolation, then standardise with the OpenAI CLIP mean/std constants.
start = transforms.ToTensor()(image)[None].to(device)
sized = tfn.interpolate(start, size=224, mode='bicubic')
normed = normalize(
    sized,
    torch.tensor(open_clip.OPENAI_DATASET_MEAN, device=device),
    torch.tensor(open_clip.OPENAI_DATASET_STD, device=device),
)

In [None]:
# Compare open_clip's own preprocessing with the hand-rolled version.
# torch.equal is an exact check (same shape and identical elements), so any
# difference in resizing, cropping or rounding between the two makes it False.
torch.equal(images, normed)

In [None]:
import matplotlib.pyplot as plt

# Show the first image as produced by open_clip's preprocess; permute(1, 2, 0)
# moves the first axis (presumably channels) last, as imshow expects.
# NOTE(review): if this tensor is mean/std-normalised its values fall outside
# [0, 1], so the displayed colours will not match the source photo — confirm.
plt.imshow(images[0].cpu().permute(1, 2, 0))

In [None]:
# Show the first image from the hand-rolled pipeline for a visual comparison.
# `normed` has been mean/std-normalised, so values can fall outside [0, 1].
plt.imshow(normed[0].cpu().permute(1, 2, 0))

In [None]:
# Resize-and-crop steps only (no tensor conversion or normalisation):
# a bicubic resize to 224 followed by a 224x224 centre crop.
tr_lst = []
tr_lst.append(transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC))
tr_lst.append(transforms.CenterCrop(224))

# Chain the steps above into a single callable.
my_preprocess = transforms.Compose(tr_lst)

In [None]:
# Apply only the Resize step (tr_lst[0]) to the un-normalised image tensor
# and display the result.
plt.imshow(tr_lst[0](start[0]).cpu().permute(1, 2, 0))

In [None]:
# Show the shape of the encoded image features.
image_features.shape

In [None]:
# Take a copy of `start` that is cut off from any autograd history and turn it
# into a leaf that records gradients, so the CLIP loss can be backpropagated
# to the pixels.
# NOTE(review): the name `input` shadows the Python builtin; it is kept
# because a later cell passes `input` to get_similarities.
input = start.detach()
input.requires_grad_()

In [None]:
class CLIPSubloss(object):
    """Scores images against one fixed text prompt with a CLIP model.

    The prompt is tokenised and encoded once at construction time. Each call
    to ``get_similarities`` resizes and normalises the images, encodes them,
    and returns the dot products between the unit-length prompt embedding and
    the (approximately) unit-length image embeddings.
    """

    # Side length, in pixels, that images are resized to before encoding.
    _INPUT_SIZE = 224
    # Added to the embedding norm so the division can never hit exactly zero.
    _NORM_EPS = 1e-5

    def __init__(self, device, clip_phrase, clip_model=None):
        """Encode ``clip_phrase`` and prepare the normalisation constants.

        Args:
            device: torch device (or device string) for the model and tensors.
            clip_phrase: text prompt the images are compared against.
            clip_model: optional model exposing ``encode_text`` and
                ``encode_image``. When omitted, the notebook-level ``model``
                is used, so existing two-argument calls behave as before.
        """
        self.device = device
        # nn.Module.to() moves the module in place, so a shared model object
        # ends up on `device` for every other user as well.
        self.model = (model if clip_model is None else clip_model).to(device)
        tokenizer = open_clip.get_tokenizer('ViT-B-32-quickgelu')
        # The prompt embedding is a constant: compute it without autograd.
        with torch.no_grad():
            self.texts_features = self.model.encode_text(tokenizer([clip_phrase]).to(device))
            self.texts_features /= self.texts_features.norm(dim=-1, keepdim=True)

        # Per-channel constants shaped (C, 1, 1); built once here instead of
        # being re-created and re-uploaded on every forward pass.
        self._mean = torch.tensor(open_clip.OPENAI_DATASET_MEAN, device=device).view(-1, 1, 1)
        self._std = torch.tensor(open_clip.OPENAI_DATASET_STD, device=device).view(-1, 1, 1)

    def _preprocess_images(self, images):
        """Resize a batch to 224x224 (bicubic) and standardise it per channel.

        NOTE(review): assumes ``images`` is a 4-D (N, C, H, W) batch whose
        values are in [0, 1], as produced by ``ToTensor`` — confirm that the
        callers' images (e.g. generator outputs) use that range.
        """
        resized = torch.nn.functional.interpolate(images, size=self._INPUT_SIZE, mode='bicubic')
        return (resized - self._mean) / self._std

    def get_similarities(self, images):
        """Return the prompt-vs-image similarity for every image in the batch.

        The result is the matrix product of the prompt features with the
        transposed, norm-scaled image features; presumably shaped (1, N) for
        a single prompt and N images.
        """
        images_features = self.model.encode_image(self._preprocess_images(images))
        images_norm = images_features.norm(dim=-1, keepdim=True) + self._NORM_EPS
        return torch.matmul(self.texts_features, (images_features / images_norm).permute(1, 0))

# Build the loss helper for the prompt "glasses".
clip_subloss = CLIPSubloss(device, "glasses")

# Run one forward/backward pass with autograd anomaly detection switched on,
# so a failing backward op is reported together with the forward call that
# created it.
with torch.autograd.set_detect_anomaly(True):
    gen_clip = clip_subloss.get_similarities(input)
    # Mean similarity scaled by 4 (presumably mirroring --clip_reg_interval=4
    # in the training command — confirm), then backpropagated.
    (gen_clip.mean() * 4).backward()


In [None]:
# Display the preprocessing transform returned by open_clip to inspect its steps.
preprocess

### Load repo

In [8]:
# Delete any existing checkout so this section can be re-run from a clean
# state; -f keeps the command quiet when the directory does not exist.
! rm -fR stylegan2-ada-pytorch

In [9]:
# Fetch the sdelta/stylegan2-ada-pytorch repository; its train.py is the
# script run in the Finetune section.
# NOTE(review): the clone is not pinned to a commit or tag, so a re-run uses
# whatever the default branch points to at that time.
!git clone https://github.com/sdelta/stylegan2-ada-pytorch.git

Cloning into 'stylegan2-ada-pytorch'...
remote: Enumerating objects: 229, done.[K
remote: Counting objects: 100% (65/65), done.[K
remote: Compressing objects: 100% (65/65), done.[K
remote: Total 229 (delta 44), reused 1 (delta 0), pack-reused 164[K
Receiving objects: 100% (229/229), 1.15 MiB | 9.09 MiB/s, done.
Resolving deltas: 100% (127/127), done.


### Finetune

In [10]:
# Fine-tune from the pretrained FFHQ pickle on ffhq.zip, writing results to Drive.
# How the flags appear in the printed "Training options" below:
#   --mirror=1              -> training_set_kwargs.xflip = true
#   --snap=10               -> image/network snapshot ticks = 10
#   --metrics="none"        -> metrics = []
#   --freezed=10            -> D block_kwargs.freeze_layers = 10
#   --freezed_mapping=True  -> mapping_kwargs.trainable = false (G and D)
# --clip_phrase / --clip_reg_interval are options of the cloned repository;
# presumably they select the CLIP text prompt and how often the CLIP term is
# applied — confirm against that repository's train.py.
!python stylegan2-ada-pytorch/train.py --outdir=drive/MyDrive/stylegan_finetuning --data=ffhq.zip \
    --mirror=1 --gpus=1 --resume=ffhq-res256-mirror-paper256-noaug.pkl --kimg=1500 --cfg=paper256 \
    --snap=10 --metrics="none" \
    --freezed=10 --freezed_mapping=True \
    --clip_phrase='glasses' --clip_reg_interval=4


Training options:
{
  "num_gpus": 1,
  "image_snapshot_ticks": 10,
  "network_snapshot_ticks": 10,
  "metrics": [],
  "random_seed": 0,
  "training_set_kwargs": {
    "class_name": "training.dataset.ImageFolderDataset",
    "path": "ffhq.zip",
    "use_labels": false,
    "max_size": 70000,
    "xflip": true,
    "resolution": 256
  },
  "data_loader_kwargs": {
    "pin_memory": true,
    "num_workers": 3,
    "prefetch_factor": 2
  },
  "G_kwargs": {
    "class_name": "training.networks.Generator",
    "z_dim": 512,
    "w_dim": 512,
    "mapping_kwargs": {
      "num_layers": 8,
      "trainable": false
    },
    "synthesis_kwargs": {
      "channel_base": 16384,
      "channel_max": 512,
      "num_fp16_res": 4,
      "conv_clamp": 256
    }
  },
  "D_kwargs": {
    "class_name": "training.networks.Discriminator",
    "block_kwargs": {
      "freeze_layers": 10
    },
    "mapping_kwargs": {
      "trainable": false
    },
    "epilogue_kwargs": {
      "mbstd_group_size": 8
    }

### Results

Failure: the generator does not produce noticeably more people with glasses. Furthermore, starting from tick 30 the image grid shows signs of mode collapse: all faces have a similar structure.