<a href="https://colab.research.google.com/github/roberttwomey/ml-art-code/blob/master/sg3/StyleGAN3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

EMAR349 ML for the Arts - Twomey - [ml.roberttwomey.com](https://ml.roberttwomey.com)

# StyleGAN3

This notebook explores **StyleGAN3** (aka Alias-Free GAN) released in [this repo](https://github.com/NVlabs/stylegan3) by NVIDIA in 2021. Adapted for ML for the Arts from [a Colab](https://colab.research.google.com/drive/1OkZZa5Yzt4scTwbwBORkki5-zNEAdXW1#scrollTo=IkBzNIQ9QsFB) produced by [crimeacs](https://twitter.com/EarthML1).
____
**[UPD 18.10.2021]** Added ThisSneakersDoesn'tExist model by [@stan_vossen](https://twitter.com/stan_vossen)  +  seems like [@l4rz](https://twitter.com/l4rz) killed the model for cosplay

[UPD 17.10.2021] Added Music Video Generation (originally inspired by [this tweet](https://twitter.com/hexorcismos/status/1449032666574213125?s=20))

[UPD 14.10.2021] Added Cosplay Faces trained by [@l4rz](https://twitter.com/l4rz)

In [None]:
#!pip install --upgrade torch==1.9.1+cu111 torchvision==0.10.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html
!pip install --upgrade torch==1.11.0+cu113 torchvision==0.12.0+cu113 -f https://download.pytorch.org/whl/torch_stable.html
!pip install einops ninja
#!pip install --upgrade https://download.pytorch.org/whl/nightly/cu111/torch-1.11.0.dev20211012%2Bcu111-cp37-cp37m-linux_x86_64.whl https://download.pytorch.org/whl/nightly/cu111/torchvision-0.12.0.dev20211012%2Bcu111-cp37-cp37m-linux_x86_64.whl
!git clone https://github.com/NVlabs/stylegan3
!pip install einops ninja

In [None]:
import sys
sys.path.append('./stylegan3')

import tensorflow
import io
import os, time
import pickle
import shutil
import numpy as np
from PIL import Image
import torch
import torch.nn.functional as F
import requests
import torchvision.transforms as transforms
import torchvision.transforms.functional as TF
from tqdm.notebook import tqdm
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from IPython.display import display
from einops import rearrange

In [None]:
#@title Generate an image
#@markdown StyleGAN3 pre-trained models for config T (translation equiv.) and config R (translation and rotation equiv.)
seed = 4011 #@param {type:"slider", min:0, max:9999, step:1}

baselink ='https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/'
model = "stylegan3-r-metfaces-1024x1024.pkl" #@param ["sneakers", "stylegan2-cosplay-faces-512x512-px", "stylegan3-r-afhqv2-512x512.pkl", "stylegan3-r-ffhq-1024x1024.pkl", "stylegan3-r-ffhqu-1024x1024.pkl","stylegan3-r-ffhqu-256x256.pkl","stylegan3-r-metfaces-1024x1024.pkl","stylegan3-r-metfacesu-1024x1024.pkl","stylegan3-t-afhqv2-512x512.pkl","stylegan3-t-ffhq-1024x1024.pkl","stylegan3-t-ffhqu-1024x1024.pkl","stylegan3-t-ffhqu-256x256.pkl","stylegan3-t-metfaces-1024x1024.pkl","stylegan3-t-metfacesu-1024x1024.pkl"]

if model == "stylegan2-cosplay-faces-512x512-px":
    baselink = 'https://l4rz.net/'
    model = 'cosplayface-snapshot-004000-18160-FID367.pkl'

if model == 'sneakers':
    if 'sneaksnap.pkl' not in os.listdir('/content/stylegan3'):
        !gdown --id 1ReK9P4dkkClvpswdSuew35xCx2xjVsQa
    baselink = '/content/stylegan3/'
    model = 'sneaksnap.pkl'

# Generate an image using pre-trained model
!python stylegan3/gen_images.py --outdir=out --trunc=1 \
 --seeds=$seed --network=$baselink$model

In [None]:
import matplotlib.pyplot as plt
from PIL import Image

# Display the PNG written for the selected seed, without axis decorations.
image_path = '/content/out/seed%04d.png' % seed
img = Image.open(image_path)
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(img)
ax.axis('off');

Open the file browser at left and download still images from `/out` to your local machine.

In [None]:
#@title Generate an interpolation video
%cd /content/stylegan3

start_seed = 42 #@param {type:"number"}
stop_seed = 669 #@param {type:"number"}
n_cols =  1#@param {type:"number"}
n_rows = 2 #@param {type:"number"}

#@markdown How many key frames to have?
num_keyframes = 3 #@param {type:"number"}

#@markdown How many frames for interpolation?
w_frames = 90 #@param {type:"number"}

#@markdown Total length in frames is `num_keyframes`*`w_frames`

assert stop_seed > start_seed, 'Stop_seed should be larger then start_seed'
baselink ='https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/'
model = "stylegan3-r-metfaces-1024x1024.pkl" #@param ["sneakers", "stylegan2-cosplay-faces-512x512-px", "stylegan3-r-afhqv2-512x512.pkl", "stylegan3-r-ffhq-1024x1024.pkl", "stylegan3-r-ffhqu-1024x1024.pkl","stylegan3-r-ffhqu-256x256.pkl","stylegan3-r-metfaces-1024x1024.pkl","stylegan3-r-metfacesu-1024x1024.pkl","stylegan3-t-afhqv2-512x512.pkl","stylegan3-t-ffhq-1024x1024.pkl","stylegan3-t-ffhqu-1024x1024.pkl","stylegan3-t-ffhqu-256x256.pkl","stylegan3-t-metfaces-1024x1024.pkl","stylegan3-t-metfacesu-1024x1024.pkl"]

if model == "stylegan2-cosplay-faces-512x512-px":
    baselink = 'https://l4rz.net/'
    model = 'cosplayface-snapshot-004000-18160-FID367.pkl'

if model == 'sneakers':
    if 'sneaksnap.pkl' not in os.listdir('/content/stylegan3'):
        !gdown --id 1ReK9P4dkkClvpswdSuew35xCx2xjVsQa
    baselink = '/content/stylegan3/'
    model = 'sneaksnap.pkl'

# Render a  grid of interpolations for seeds N through K.
!python gen_video.py --output=lerp.mp4 --trunc=1 --seeds=$start_seed-$stop_seed --grid={n_rows}x{n_cols} \
    --network=$baselink$model --num-keyframes=$num_keyframes \
    --w-frames=$w_frames

In [None]:

from IPython.display import HTML
from base64 import b64encode
# Embed the rendered video inline as a base64 data URL.
# The context manager closes the file handle once the bytes are read.
with open('lerp.mp4', 'rb') as video_file:
    mp4 = video_file.read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML("""
<video width=400 controls>
      <source src="%s" type="video/mp4">
</video>
""" % data_url)

Open file browser at left and download `lerp.mp4` to save it on your local machine. (Found in `/stylegan3` directory)

# Activities:
- Try experimenting with different seeds. You can use the image generation code above to test out seeds.
- Slow down the interpolation, generating a higher number of frames between keyframes.
  - change `w_frames` in the code above.
- Try a different number of keyframes.
- Try a different model. See a list here: [awesome-pretrained-stylegan3](https://github.com/justinpinkney/awesome-pretrained-stylegan3)
- Play with the truncation value (currently 1).
  - Select **Show code** and modify the `--trunc=1` argument to `!python gen_video.py`
- Make your own `gen_video.py` method below that pauses (holds) on each keyframe as opposed to smoothly interpolating through. (source is [here](https://github.com/NVlabs/stylegan3/blob/main/gen_video.py))


In [None]:
#@title # Optional: Generate 🎵 music video
#@markdown ##**Choose your settings**
from IPython.display import clear_output
%cd /content/stylegan3

import requests
import pickle
import torch
import os
import numpy as np
import matplotlib.pyplot as plt

import librosa
from scipy.io import wavfile

import time
import torchvision.transforms.functional as TF
from tqdm.notebook import tqdm
from PIL import ImageOps

def fetch(url_or_path):
    """Open a remote URL or a local file for binary reading.

    Args:
        url_or_path: An ``http://`` / ``https://`` URL, or a filesystem path.

    Returns:
        A binary file-like object positioned at its start. For URLs this is
        an in-memory ``io.BytesIO`` holding the whole response body; otherwise
        it is the handle returned by ``open(url_or_path, 'rb')``, which the
        caller is responsible for closing.

    Raises:
        Whatever ``raise_for_status()`` raises when the HTTP request fails.
    """
    location = str(url_or_path)
    if not location.startswith(('http://', 'https://')):
        return open(url_or_path, 'rb')

    response = requests.get(url_or_path)
    response.raise_for_status()
    return io.BytesIO(response.content)

def fetch_model(url_or_path):
    basename = os.path.basename(url_or_path)
    if os.path.exists(basename):
        return basename
    else:
        !wget -c '{url_or_path}'
        return basename

baselink ='https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/'
model = "stylegan3-r-metfaces-1024x1024.pkl" #@param ["sneakers", "stylegan2-cosplay-faces-512x512-px", "stylegan3-r-afhqv2-512x512.pkl", "stylegan3-r-ffhq-1024x1024.pkl", "stylegan3-r-ffhqu-1024x1024.pkl","stylegan3-r-ffhqu-256x256.pkl","stylegan3-r-metfaces-1024x1024.pkl","stylegan3-r-metfacesu-1024x1024.pkl","stylegan3-t-afhqv2-512x512.pkl","stylegan3-t-ffhq-1024x1024.pkl","stylegan3-t-ffhqu-1024x1024.pkl","stylegan3-t-ffhqu-256x256.pkl","stylegan3-t-metfaces-1024x1024.pkl","stylegan3-t-metfacesu-1024x1024.pkl"]

if model == "stylegan2-cosplay-faces-512x512-px":
    baselink = 'https://l4rz.net/'
    model = 'cosplayface-snapshot-004000-18160-FID367.pkl'

network_url = baselink + model
device = torch.device('cuda:0')

if model == 'sneakers':
    if 'sneaksnap.pkl' not in os.listdir('/content/stylegan3'):
        !gdown --id 1ReK9P4dkkClvpswdSuew35xCx2xjVsQa
    network_url = '/content/stylegan3/sneaksnap.pkl'

with open(fetch_model(network_url), 'rb') as fp:
  G = pickle.load(fp)['G_ema'].to(device)

# Form parameters for the music video.
# NOTE(review): `seed` is not referenced again in the visible part of this
# cell (no RNG is seeded with it) — confirm whether it is meant to have an effect.
seed =  42#@param {type:"number"}

#@markdown How variable should the video be? (lower values - less variable)
#if you are reading that - you are smart enough to map frequencies to psi as well
# Passed as `truncation_psi` to G.mapping when the audio frames are mapped.
truncation_psi = 0.5 #@param {type:"number"}

#@markdown How *strongly* should the image change?
# Multiplier applied to every mapped latent before interpolation.
effect_strength =  1#@param {type:"number"}

# Standard deviation (over 10000 random z samples) of the mapping network output.
# NOTE(review): `zs` / `w_stds` are not used by the visible remainder of this
# cell — presumably left over from an earlier version; confirm before removing.
zs = torch.randn([10000, G.mapping.z_dim], device=device)
w_stds = G.mapping(zs, None).std(0)

#@markdown Link to MP3 audio file (you can also extact music from a Youtube link)
audio_link = 'https://cdn.pixabay.com/download/audio/2021/03/26/audio_dd57ac8732.mp3?filename=east-2-west-3513.mp3' #@param {type:"string"}
if 'youtu.be' not in audio_link:
    !wget {audio_link} -O audio.mp3
else:
    !youtube-dl --extract-audio --audio-format mp3 https://youtu.be/0OkiUUU3Odw -o music_temp.mp3
    !ffmpeg -i music_temp.mp3 -af silenceremove=1:0:-50dB audio.mp3

# Cut the requested excerpt out of audio.mp3 and turn it into a log-mel
# spectrogram whose columns are later fed to G.mapping, one column per keyframe.

#@markdown Cut audio to N seconds
cut_start =  15#@param {type:"number"}
cut_end =  30#@param {type:"number"}

# Excerpt length in seconds; used further down to derive the video frame rate.
cut_len = cut_end-cut_start

#@markdown How many frames to use for interpolation?
interp_frames =  5#@param {type:"number"}

#@markdown Which frequencies to use?
freqs = 'all' #@param ['low', 'high', 'all']

# `arr` holds the samples and `fr` the sample rate; the slice converts the
# start/end times in seconds into sample indices.
arr, fr = librosa.load('audio.mp3')
arr = arr[int(fr*cut_start):int(fr*cut_end)]

# Save the excerpt so it can be muxed with the rendered video afterwards.
wavfile.write('audio.wav', fr, arr)

# Earlier torch.stft-based variant, kept for reference:
# stft = torch.stft(torch.tensor(arr),
#            G.mapping.z_dim*2-1,
#            hop_length=G.mapping.z_dim//4,
#            center=False,
#            pad_mode='reflect',
#            normalized=True,
#            onesided=True,
#            return_complex=True)

# n_mels equals z_dim so that each spectrogram column has exactly as many
# entries as the mapping network's latent input.
stft=librosa.feature.melspectrogram(y=arr,
                               sr=fr,
                               n_fft=2048,
                               hop_length=G.mapping.z_dim*4,
                               n_mels=G.mapping.z_dim)

# NOTE(review): bins that are exactly zero become -inf here — confirm whether
# silent passages need an epsilon before the log.
stft = torch.log(torch.tensor(stft).abs())

# Optional emphasis, applied to the log values: 'low' scales the rows from the
# midpoint upward by 10, 'high' scales the rows below the midpoint.
# NOTE(review): presumably mel rows run from low to high frequency, in which
# case 'low' boosts the upper bands — confirm this matches the intent.
if freqs == 'low':
    stft[stft.size(0)//2:, :] *= 10

if freqs == 'high':
    stft[:stft.size(0)//2, :] *= 10

# Clear the cell output accumulated so far.
clear_output()

#FRAMES
import time
import torchvision.transforms.functional as TF
from tqdm.notebook import tqdm

# Map every spectrogram column to a latent, then render `interp_frames`
# linearly interpolated images between each pair of consecutive latents.
# Frames are written to samples/<timestring>/NNNN.png; `count` ends up as the
# total number of frames written.
zq = []
with torch.no_grad():
    timestring = time.strftime('%Y%m%d%H%M%S')
    # rand_z = torch.randn(stft.size(-1), G.mapping.z_dim).to(device)
    # q = (G.mapping(rand_z, None, truncation_psi=truncation_psi))

    for i in range(stft.size(-1)):
        # One spectrogram column, used directly as the mapping network input.
        frame = stft[:,i].to(device)
        z = torch.mean(G.mapping(frame.unsqueeze(0), None, truncation_psi=truncation_psi), dim=0)
        zq.append(z.unsqueeze(0)*effect_strength)

    # Loop-invariant setup: create the output folder and the interpolation
    # weights once instead of on every iteration.
    os.makedirs(f'samples/{timestring}', exist_ok=True)
    i_val = torch.linspace(0,1,interp_frames).to(device)

    count = 0
    for k in tqdm(range(len(zq)-1)):
        for interpolation in tqdm(i_val, leave=False):
            interp = torch.lerp(zq[k], zq[k+1], interpolation)
            images = G.synthesis(interp)
            # Generator output is rescaled from [-1, 1] to [0, 1] for PIL.
            images = ((images + 1)/2).clamp(0,1)
            pil_image = TF.to_pil_image(images[0].cpu())
            if model == 'sneakers':
                pil_image = ImageOps.invert(pil_image)
            pil_image.save(f'samples/{timestring}/{count:04}.png')
            count+=1


#VIDEO
from IPython import display
from base64 import b64encode
from tqdm.notebook import tqdm
from PIL import Image

fps = count/cut_len

frames = []
# tqdm.write('Generating video...')
for i in sorted(os.listdir(f'samples/{timestring}')): #
    frames.append(Image.open(f"samples/{timestring}/{i}"))

from subprocess import Popen, PIPE
p = Popen(['ffmpeg', '-y', '-f', 'image2pipe', '-vcodec', 'png', '-r', str(fps), '-i', '-', '-vcodec', 'libx264', '-r', str(fps), '-pix_fmt', 'yuv420p', '-crf', '17', '-preset', 'veryslow', 'video.mp4'], stdin=PIPE)
for im in tqdm(frames):
    im.save(p.stdin, 'PNG')
p.stdin.close()
p.wait()

!ffmpeg -y -i video.mp4 -i audio.wav -map 0 -map 1:a -c:v copy -shortest video_audio.mp4

clear_output()
# mp4 = open('video.mp4','rb').read()
mp4 = open('video_audio.mp4','rb').read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()

display.HTML("""
<video width=400 controls>
      <source src="%s" type="video/mp4">
</video>
""" % data_url)

#@markdown P.S.: *If it crushed - look for `video-audio.mp4` in `stylegan3` folder*

In [None]:
# @title
# # work in progress
# # make visualizer
# # stop looping, start parallelizing
# # Clone Real-ESRGAN and enter the Real-ESRGAN
# !git clone https://github.com/xinntao/Real-ESRGAN.git
# %cd Real-ESRGAN
# # Set up the environment
# !pip install --upgrade basicsr
# # !pip install facexlib
# # !pip install gfpgan
# # !pip install -r requirements.txt
# # !python setup.py develop
# # # Download the pre-trained model
# # !wget https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth -P experiments/pretrained_models