In [1]:
import torch
print(torch.__version__, torch.version.cuda, torch.cuda.is_available())


2.8.0+cu128 12.8 True


In [6]:
# Johnson Main Module (Van Gogh)
import os, math, time, random
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from torchvision import models, transforms
from PIL import Image
import numpy as np

# --------------------- Config ---------------------
CONTENT_IMAGE = "content test2.jpg"     # (your content path)
STYLE_IMAGE   = "Van_Gogh_-_Starry_Night.jpg"   # (your style path)
OUTPUT_IMAGE  = "johnson_stylized2.jpg"
MODEL_PATH    = "transform_net.pth"

# Minimal "dataset" defaults to the single content image; point this to a folder for better training
CONTENT_DIR   = "content"       # folder of training content images (e.g., "./coco_subset"); "." uses the single content image
IMAGE_SIZE    = 512       # train/infer resolution (256/512/768). Higher => better quality, slower training
BATCH_SIZE    = 4
NUM_EPOCHS    = 2         # increase (e.g., 2–4) if you have a folder of content images
LEARNING_RATE = 1e-3
SEED          = 42
DEVICE        = "cuda" if torch.cuda.is_available() else "cpu"

# Perceptual loss weights (Johnson et al. typical ranges)
CONTENT_WEIGHT = 0.5
STYLE_WEIGHT   = 1e4      # try 5e3–1e4; if style is weak, raise this
TV_WEIGHT      = 1e-6

# VGG style/content layers (Johnson et al.: content=relu2_2; style=relu1_2, relu2_2, relu3_3, relu4_3)
LAYER_IDX = {
    "relu1_2": 3,
    "relu2_2": 8,
    "relu3_3": 15,
    "relu4_3": 22
}
STYLE_LAYERS   = ["relu1_2","relu2_2","relu3_3","relu4_3"]
CONTENT_LAYER  = "relu2_2"

random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)

# --------------------- Utils ---------------------
IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(1,3,1,1)
IMAGENET_STD  = torch.tensor([0.229, 0.224, 0.225]).view(1,3,1,1)

def load_image(path, size=None):
    img = Image.open(path).convert("RGB")
    if size is not None:
        img = img.resize((size, size), Image.BICUBIC)
    return img

def to_tensor(img):
    return transforms.ToTensor()(img)  # in [0,1]

def to_image(t: torch.Tensor) -> Image.Image:
    t = t.detach().clamp(0,1).cpu()
    return transforms.ToPILImage()(t.squeeze(0))

def normalize_batch(x):
    mean = IMAGENET_MEAN.to(x.device)
    std  = IMAGENET_STD.to(x.device)
    return (x - mean) / std

def gram_matrix(feat: torch.Tensor):
    # feat: [N,C,H,W] -> Gram per sample, normalized by C*H*W
    N, C, H, W = feat.size()
    f = feat.view(N, C, H*W)
    G = torch.bmm(f, f.transpose(1,2)) / (C*H*W)
    return G

def tv_loss(x):
    return (torch.mean(torch.abs(x[..., :, 1:] - x[..., :, :-1])) +
            torch.mean(torch.abs(x[..., 1:, :] - x[..., :-1, :])))

# --------------------- Transform Net (Johnson et al.) ---------------------
class ConvLayer(nn.Module):
    def __init__(self, in_c, out_c, kernel, stride):
        super().__init__()
        pad = kernel // 2
        self.pad = nn.ReflectionPad2d(pad)
        self.conv = nn.Conv2d(in_c, out_c, kernel, stride)
    def forward(self, x):
        return self.conv(self.pad(x))

class ResidualBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.conv1 = ConvLayer(channels, channels, 3, 1)
        self.in1   = nn.InstanceNorm2d(channels, affine=True)
        self.conv2 = ConvLayer(channels, channels, 3, 1)
        self.in2   = nn.InstanceNorm2d(channels, affine=True)
    def forward(self, x):
        y = F.relu(self.in1(self.conv1(x)))
        y = self.in2(self.conv2(y))
        return x + y

class UpsampleConvLayer(nn.Module):
    def __init__(self, in_c, out_c, kernel, upsample=None):
        super().__init__()
        self.upsample = upsample
        self.pad = nn.ReflectionPad2d(kernel // 2)
        self.conv = nn.Conv2d(in_c, out_c, kernel, 1)
    def forward(self, x):
        if self.upsample:
            x = F.interpolate(x, scale_factor=self.upsample, mode="nearest")
        return self.conv(self.pad(x))

class TransformNet(nn.Module):
    def __init__(self):
        super().__init__()
        # Encoder
        self.conv1 = ConvLayer(3, 32, 9, 1)
        self.in1   = nn.InstanceNorm2d(32, affine=True)
        self.conv2 = ConvLayer(32, 64, 3, 2)
        self.in2   = nn.InstanceNorm2d(64, affine=True)
        self.conv3 = ConvLayer(64, 128, 3, 2)
        self.in3   = nn.InstanceNorm2d(128, affine=True)
        # Residuals
        self.res   = nn.Sequential(*[ResidualBlock(128) for _ in range(5)])
        # Decoder
        self.up1   = UpsampleConvLayer(128, 64, 3, upsample=2)
        self.in4   = nn.InstanceNorm2d(64, affine=True)
        self.up2   = UpsampleConvLayer(64, 32, 3, upsample=2)
        self.in5   = nn.InstanceNorm2d(32, affine=True)
        self.conv4 = ConvLayer(32, 3, 9, 1)
    def forward(self, x):
        y = F.relu(self.in1(self.conv1(x)))
        y = F.relu(self.in2(self.conv2(y)))
        y = F.relu(self.in3(self.conv3(y)))
        y = self.res(y)
        y = F.relu(self.in4(self.up1(y)))
        y = F.relu(self.in5(self.up2(y)))
        y = torch.tanh(self.conv4(y))
        return (y + 1) / 2.0  # map from [-1,1] to [0,1]

# --------------------- VGG16 Loss Network ---------------------
class VGG16Features(nn.Module):
    def __init__(self, layers_idx):
        super().__init__()
        vgg = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_FEATURES).features.eval()
        for p in vgg.parameters(): p.requires_grad_(False)
        self.vgg = vgg
        self.layers_idx = layers_idx
    def forward(self, x):
        feats = {}
        for i, layer in enumerate(self.vgg):
            x = layer(x)
            for name, idx in self.layers_idx.items():
                if i == idx:
                    feats[name] = x
        return feats

# --------------------- Dataset ---------------------
class SingleOrFolderDataset(Dataset):
    def __init__(self, content_dir_or_file, image_size):
        super().__init__()
        p = Path(content_dir_or_file)
        if p.is_dir():
            self.paths = [str(pp) for pp in p.glob("*") if pp.suffix.lower() in [".jpg",".png",".jpeg",".bmp",".webp"]]
        else:
            self.paths = [str(p)]
        self.T = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(0.6, 1.0), ratio=(0.75, 1.33)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
        ])
    def __len__(self):
        return max(200, len(self.paths))  # repeat if just one file
    def __getitem__(self, idx):
        p = self.paths[idx % len(self.paths)]
        img = Image.open(p).convert("RGB")
        return self.T(img)

# --------------------- Prep ---------------------
assert Path(CONTENT_IMAGE).exists(), f"Not found: {CONTENT_IMAGE}"
assert Path(STYLE_IMAGE).exists(),   f"Not found: {STYLE_IMAGE}"

device = torch.device(DEVICE)
print("Using device:", device)

# Style targets (Gram matrices of VGG features for the style image)
style_img = load_image(STYLE_IMAGE, size=IMAGE_SIZE)
style_t   = to_tensor(style_img).unsqueeze(0).to(device)
vgg_loss_net = VGG16Features(LAYER_IDX).to(device)
with torch.no_grad():
    style_feats = vgg_loss_net(normalize_batch(style_t))
style_targets = {k: gram_matrix(v).detach() for k, v in style_feats.items() if k in STYLE_LAYERS}

# --------------------- Train Transform Net (per-style) ---------------------
transform_net = TransformNet().to(device)
optim = torch.optim.Adam(transform_net.parameters(), lr=LEARNING_RATE)

ds = SingleOrFolderDataset(CONTENT_DIR if Path(CONTENT_DIR).exists() else CONTENT_IMAGE, IMAGE_SIZE)
loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, num_workers=2)

print(f"Training on {len(ds)} samples (repeating if needed). Epochs={NUM_EPOCHS}")
global_step = 0
for epoch in range(NUM_EPOCHS):
    t0 = time.time()
    for batch in loader:
        global_step += 1
        batch = batch.to(device)
        optim.zero_grad()

        y_hat = transform_net(batch)                # [0,1]
        y_hat_norm = normalize_batch(y_hat.clone()) # normalize for VGG

        # VGG features
        feats_y_hat = vgg_loss_net(y_hat_norm)
        feats_x     = vgg_loss_net(normalize_batch(batch))

        # Content loss (relu2_2)
        content_loss = F.mse_loss(feats_y_hat[CONTENT_LAYER], feats_x[CONTENT_LAYER])

        # Style loss (sum over layers)
        style_loss = 0.0
        for k in STYLE_LAYERS:
            gm_y = gram_matrix(feats_y_hat[k])
            gm_s = style_targets[k].expand_as(gm_y)
            style_loss += F.mse_loss(gm_y, gm_s)

        # TV regularization
        tv = tv_loss(y_hat)

        loss = CONTENT_WEIGHT * content_loss + STYLE_WEIGHT * style_loss + TV_WEIGHT * tv
        loss.backward()
        optim.step()

        if global_step % 50 == 0:
            print(f"epoch {epoch+1}/{NUM_EPOCHS} step {global_step:05d} | "
                  f"content {content_loss.item():.4f} | style {style_loss.item():.4f} | tv {tv.item():.5f}")

    print(f"Epoch {epoch+1} done in {time.time()-t0:.1f}s")

torch.save(transform_net.state_dict(), MODEL_PATH)
print(f"Saved model to {MODEL_PATH}")

# --------------------- Stylize your content image ---------------------
content_img = load_image(CONTENT_IMAGE, size=IMAGE_SIZE)
content_t   = to_tensor(content_img).unsqueeze(0).to(device)

transform_net.eval()
with torch.no_grad():
    out = transform_net(content_t)

to_image(out).save(OUTPUT_IMAGE)
print(f"Saved stylized image to {OUTPUT_IMAGE}")


Using device: cuda
Training on 49981 samples (repeating if needed). Epochs=2
epoch 1/2 step 00050 | content 93.7331 | style 0.1886 | tv 0.01818
epoch 1/2 step 00100 | content 115.5576 | style 0.0714 | tv 0.06030
epoch 1/2 step 00150 | content 166.1798 | style 0.0260 | tv 0.11096
epoch 1/2 step 00200 | content 162.0240 | style 0.0194 | tv 0.12365
epoch 1/2 step 00250 | content 172.3693 | style 0.0130 | tv 0.12441
epoch 1/2 step 00300 | content 139.0951 | style 0.0098 | tv 0.12284
epoch 1/2 step 00350 | content 149.9817 | style 0.0080 | tv 0.12195
epoch 1/2 step 00400 | content 133.9678 | style 0.0078 | tv 0.12116
epoch 1/2 step 00450 | content 163.1645 | style 0.0059 | tv 0.13026
epoch 1/2 step 00500 | content 148.6179 | style 0.0066 | tv 0.12953
epoch 1/2 step 00550 | content 145.7158 | style 0.0044 | tv 0.12841
epoch 1/2 step 00600 | content 156.4479 | style 0.0049 | tv 0.12683
epoch 1/2 step 00650 | content 158.4084 | style 0.0043 | tv 0.13104
epoch 1/2 step 00700 | content 140.4634 

In [7]:
# Inference for Johnson et al. transform net (single-style)
# Inputs:
#   MODEL_PATH: trained weights you saved
#   INPUT_CONTENT: any content image you want to stylize
# Output:
#   OUTPUT_IMAGE: stylized result

import torch
from PIL import Image
from torchvision import transforms
import torch.nn as nn
import torch.nn.functional as F

MODEL_PATH    = "transform_net.pth"
INPUT_CONTENT = "content test3.jpg"   # <-- put your content image here
OUTPUT_IMAGE  = "stylized_result.jpg"
IMAGE_SIZE    = 512
DEVICE        = "cuda" if torch.cuda.is_available() else "cpu"

# ----- minimal helpers (match training-time scaling [0,1]) -----
def load_image(path, size=None):
    img = Image.open(path).convert("RGB")
    if size is not None:
        img = img.resize((size, size), Image.BICUBIC)
    return img

to_tensor = transforms.ToTensor()
to_pil    = transforms.ToPILImage()

# ----- TransformNet definition (must match what you trained) -----
class ConvLayer(nn.Module):
    def __init__(self, in_c, out_c, kernel, stride):
        super().__init__()
        pad = kernel // 2
        self.pad = nn.ReflectionPad2d(pad)
        self.conv = nn.Conv2d(in_c, out_c, kernel, stride)
    def forward(self, x):
        return self.conv(self.pad(x))

class ResidualBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.conv1 = ConvLayer(channels, channels, 3, 1)
        self.in1   = nn.InstanceNorm2d(channels, affine=True)
        self.conv2 = ConvLayer(channels, channels, 3, 1)
        self.in2   = nn.InstanceNorm2d(channels, affine=True)
    def forward(self, x):
        y = F.relu(self.in1(self.conv1(x)))
        y = self.in2(self.conv2(y))
        return x + y

class UpsampleConvLayer(nn.Module):
    def __init__(self, in_c, out_c, kernel, upsample=None):
        super().__init__()
        self.upsample = upsample
        self.pad = nn.ReflectionPad2d(kernel // 2)
        self.conv = nn.Conv2d(in_c, out_c, kernel, 1)
    def forward(self, x):
        if self.upsample:
            x = F.interpolate(x, scale_factor=self.upsample, mode="nearest")
        return self.conv(self.pad(x))

class TransformNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = ConvLayer(3, 32, 9, 1)
        self.in1   = nn.InstanceNorm2d(32, affine=True)
        self.conv2 = ConvLayer(32, 64, 3, 2)
        self.in2   = nn.InstanceNorm2d(64, affine=True)
        self.conv3 = ConvLayer(64, 128, 3, 2)
        self.in3   = nn.InstanceNorm2d(128, affine=True)
        self.res   = nn.Sequential(*[ResidualBlock(128) for _ in range(5)])
        self.up1   = UpsampleConvLayer(128, 64, 3, upsample=2)
        self.in4   = nn.InstanceNorm2d(64, affine=True)
        self.up2   = UpsampleConvLayer(64, 32, 3, upsample=2)
        self.in5   = nn.InstanceNorm2d(32, affine=True)
        self.conv4 = ConvLayer(32, 3, 9, 1)
    def forward(self, x):
        y = F.relu(self.in1(self.conv1(x)))
        y = F.relu(self.in2(self.conv2(y)))
        y = F.relu(self.in3(self.conv3(y)))
        y = self.res(y)
        y = F.relu(self.in4(self.up1(y)))
        y = F.relu(self.in5(self.up2(y)))
        y = torch.tanh(self.conv4(y))
        return (y + 1) / 2.0

# ----- run inference -----
net = TransformNet().to(DEVICE)
net.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
net.eval()

img = load_image(INPUT_CONTENT, size=IMAGE_SIZE)
t   = to_tensor(img).unsqueeze(0).to(DEVICE)

with torch.no_grad():
    out = net(t).clamp(0,1)

to_pil(out.squeeze(0).cpu()).save(OUTPUT_IMAGE)
print(f"Saved: {OUTPUT_IMAGE}")


Saved: stylized_result.jpg


In [1]:
# Main Johnson module ( Van Gogh )
import os, math, time, random
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from torchvision import models, transforms
from PIL import Image
import numpy as np

# --------------------- Config ---------------------
CONTENT_IMAGE = "content test2.jpg"     # (your content path)
STYLE_IMAGE   = "Van_Gogh_-_Starry_Night.jpg"   # (your style path)
OUTPUT_IMAGE  = "johnson_stylized3.jpg"
MODEL_PATH    = "transform_net2.pth"

# Minimal "dataset" defaults to the single content image; point this to a folder for better training
CONTENT_DIR   = "content"       # folder of training content images (e.g., "./coco_subset"); "." uses the single content image
IMAGE_SIZE    = 512       # train/infer resolution (256/512/768). Higher => better quality, slower training
BATCH_SIZE    = 4
NUM_EPOCHS    = 2         # increase (e.g., 2–4) if you have a folder of content images
LEARNING_RATE = 1e-3
SEED          = 42
DEVICE        = "cuda" if torch.cuda.is_available() else "cpu"

# Perceptual loss weights (Johnson et al. typical ranges)
CONTENT_WEIGHT = 0.3
STYLE_WEIGHT   = 2e4
TV_WEIGHT      = 5e-7

# VGG style/content layers (Johnson et al.: content=relu2_2; style=relu1_2, relu2_2, relu3_3, relu4_3)
LAYER_IDX = {
    "relu1_2": 3,
    "relu2_2": 8,
    "relu3_3": 15,
    "relu4_3": 22
}
STYLE_LAYERS   = ["relu1_2","relu2_2","relu3_3","relu4_3"]
CONTENT_LAYER  = "relu2_2"

random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)

# --------------------- Utils ---------------------
IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(1,3,1,1)
IMAGENET_STD  = torch.tensor([0.229, 0.224, 0.225]).view(1,3,1,1)

def load_image(path, size=None):
    img = Image.open(path).convert("RGB")
    if size is not None:
        img = img.resize((size, size), Image.BICUBIC)
    return img

def to_tensor(img):
    return transforms.ToTensor()(img)  # in [0,1]

def to_image(t: torch.Tensor) -> Image.Image:
    t = t.detach().clamp(0,1).cpu()
    return transforms.ToPILImage()(t.squeeze(0))

def normalize_batch(x):
    mean = IMAGENET_MEAN.to(x.device)
    std  = IMAGENET_STD.to(x.device)
    return (x - mean) / std

def gram_matrix(feat: torch.Tensor):
    # feat: [N,C,H,W] -> Gram per sample, normalized by C*H*W
    N, C, H, W = feat.size()
    f = feat.view(N, C, H*W)
    G = torch.bmm(f, f.transpose(1,2)) / (C*H*W)
    return G

def tv_loss(x):
    return (torch.mean(torch.abs(x[..., :, 1:] - x[..., :, :-1])) +
            torch.mean(torch.abs(x[..., 1:, :] - x[..., :-1, :])))

# --------------------- Transform Net (Johnson et al.) ---------------------
class ConvLayer(nn.Module):
    def __init__(self, in_c, out_c, kernel, stride):
        super().__init__()
        pad = kernel // 2
        self.pad = nn.ReflectionPad2d(pad)
        self.conv = nn.Conv2d(in_c, out_c, kernel, stride)
    def forward(self, x):
        return self.conv(self.pad(x))

class ResidualBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.conv1 = ConvLayer(channels, channels, 3, 1)
        self.in1   = nn.InstanceNorm2d(channels, affine=True)
        self.conv2 = ConvLayer(channels, channels, 3, 1)
        self.in2   = nn.InstanceNorm2d(channels, affine=True)
    def forward(self, x):
        y = F.relu(self.in1(self.conv1(x)))
        y = self.in2(self.conv2(y))
        return x + y

class UpsampleConvLayer(nn.Module):
    def __init__(self, in_c, out_c, kernel, upsample=None):
        super().__init__()
        self.upsample = upsample
        self.pad = nn.ReflectionPad2d(kernel // 2)
        self.conv = nn.Conv2d(in_c, out_c, kernel, 1)
    def forward(self, x):
        if self.upsample:
            x = F.interpolate(x, scale_factor=self.upsample, mode="nearest")
        return self.conv(self.pad(x))

class TransformNet(nn.Module):
    def __init__(self):
        super().__init__()
        # Encoder
        self.conv1 = ConvLayer(3, 32, 9, 1)
        self.in1   = nn.InstanceNorm2d(32, affine=True)
        self.conv2 = ConvLayer(32, 64, 3, 2)
        self.in2   = nn.InstanceNorm2d(64, affine=True)
        self.conv3 = ConvLayer(64, 128, 3, 2)
        self.in3   = nn.InstanceNorm2d(128, affine=True)
        # Residuals
        self.res   = nn.Sequential(*[ResidualBlock(128) for _ in range(5)])
        # Decoder
        self.up1   = UpsampleConvLayer(128, 64, 3, upsample=2)
        self.in4   = nn.InstanceNorm2d(64, affine=True)
        self.up2   = UpsampleConvLayer(64, 32, 3, upsample=2)
        self.in5   = nn.InstanceNorm2d(32, affine=True)
        self.conv4 = ConvLayer(32, 3, 9, 1)
    def forward(self, x):
        y = F.relu(self.in1(self.conv1(x)))
        y = F.relu(self.in2(self.conv2(y)))
        y = F.relu(self.in3(self.conv3(y)))
        y = self.res(y)
        y = F.relu(self.in4(self.up1(y)))
        y = F.relu(self.in5(self.up2(y)))
        y = torch.tanh(self.conv4(y))
        return (y + 1) / 2.0  # map from [-1,1] to [0,1]

# --------------------- VGG16 Loss Network ---------------------
class VGG16Features(nn.Module):
    def __init__(self, layers_idx):
        super().__init__()
        vgg = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_FEATURES).features.eval()
        for p in vgg.parameters(): p.requires_grad_(False)
        self.vgg = vgg
        self.layers_idx = layers_idx
    def forward(self, x):
        feats = {}
        for i, layer in enumerate(self.vgg):
            x = layer(x)
            for name, idx in self.layers_idx.items():
                if i == idx:
                    feats[name] = x
        return feats

# --------------------- Dataset ---------------------
class SingleOrFolderDataset(Dataset):
    def __init__(self, content_dir_or_file, image_size):
        super().__init__()
        p = Path(content_dir_or_file)
        if p.is_dir():
            self.paths = [str(pp) for pp in p.glob("*") if pp.suffix.lower() in [".jpg",".png",".jpeg",".bmp",".webp"]]
        else:
            self.paths = [str(p)]
        self.T = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(0.6, 1.0), ratio=(0.75, 1.33)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
        ])
    def __len__(self):
        return max(200, len(self.paths))  # repeat if just one file
    def __getitem__(self, idx):
        p = self.paths[idx % len(self.paths)]
        img = Image.open(p).convert("RGB")
        return self.T(img)

# --------------------- Prep ---------------------
assert Path(CONTENT_IMAGE).exists(), f"Not found: {CONTENT_IMAGE}"
assert Path(STYLE_IMAGE).exists(),   f"Not found: {STYLE_IMAGE}"

device = torch.device(DEVICE)
print("Using device:", device)

# Style targets (Gram matrices of VGG features for the style image)
style_img = load_image(STYLE_IMAGE, size=IMAGE_SIZE)
style_t   = to_tensor(style_img).unsqueeze(0).to(device)
vgg_loss_net = VGG16Features(LAYER_IDX).to(device)
with torch.no_grad():
    style_feats = vgg_loss_net(normalize_batch(style_t))
style_targets = {k: gram_matrix(v).detach() for k, v in style_feats.items() if k in STYLE_LAYERS}

# --------------------- Train Transform Net (per-style) ---------------------
transform_net = TransformNet().to(device)
optim = torch.optim.Adam(transform_net.parameters(), lr=LEARNING_RATE)

ds = SingleOrFolderDataset(CONTENT_DIR if Path(CONTENT_DIR).exists() else CONTENT_IMAGE, IMAGE_SIZE)
loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, num_workers=2)

print(f"Training on {len(ds)} samples (repeating if needed). Epochs={NUM_EPOCHS}")
global_step = 0
for epoch in range(NUM_EPOCHS):
    t0 = time.time()
    for batch in loader:
        global_step += 1
        batch = batch.to(device)
        optim.zero_grad()

        y_hat = transform_net(batch)                # [0,1]
        y_hat_norm = normalize_batch(y_hat.clone()) # normalize for VGG

        # VGG features
        feats_y_hat = vgg_loss_net(y_hat_norm)
        feats_x     = vgg_loss_net(normalize_batch(batch))

        # Content loss (relu2_2)
        content_loss = F.mse_loss(feats_y_hat[CONTENT_LAYER], feats_x[CONTENT_LAYER])

        # Style loss (sum over layers)
        style_loss = 0.0
        for k in STYLE_LAYERS:
            gm_y = gram_matrix(feats_y_hat[k])
            gm_s = style_targets[k].expand_as(gm_y)
            style_loss += F.mse_loss(gm_y, gm_s)

        # TV regularization
        tv = tv_loss(y_hat)

        loss = CONTENT_WEIGHT * content_loss + STYLE_WEIGHT * style_loss + TV_WEIGHT * tv
        loss.backward()
        optim.step()

        if global_step % 50 == 0:
            print(f"epoch {epoch+1}/{NUM_EPOCHS} step {global_step:05d} | "
                  f"content {content_loss.item():.4f} | style {style_loss.item():.4f} | tv {tv.item():.5f}")

    print(f"Epoch {epoch+1} done in {time.time()-t0:.1f}s")

torch.save(transform_net.state_dict(), MODEL_PATH)
print(f"Saved model to {MODEL_PATH}")

# --------------------- Stylize your content image ---------------------
content_img = load_image(CONTENT_IMAGE, size=IMAGE_SIZE)
content_t   = to_tensor(content_img).unsqueeze(0).to(device)

transform_net.eval()
with torch.no_grad():
    out = transform_net(content_t)

to_image(out).save(OUTPUT_IMAGE)
print(f"Saved stylized image to {OUTPUT_IMAGE}")


Using device: cuda
Training on 49981 samples (repeating if needed). Epochs=2
epoch 1/2 step 00050 | content 89.7276 | style 0.1904 | tv 0.01657
epoch 1/2 step 00100 | content 111.8093 | style 0.0880 | tv 0.05242
epoch 1/2 step 00150 | content 174.2180 | style 0.0257 | tv 0.11028
epoch 1/2 step 00200 | content 171.5444 | style 0.0197 | tv 0.13163
epoch 1/2 step 00250 | content 183.7618 | style 0.0119 | tv 0.12803
epoch 1/2 step 00300 | content 153.4534 | style 0.0089 | tv 0.13070
epoch 1/2 step 00350 | content 167.8714 | style 0.0079 | tv 0.12718
epoch 1/2 step 00400 | content 153.0028 | style 0.0071 | tv 0.13008
epoch 1/2 step 00450 | content 181.9163 | style 0.0056 | tv 0.13547
epoch 1/2 step 00500 | content 165.8787 | style 0.0051 | tv 0.13324
epoch 1/2 step 00550 | content 166.4273 | style 0.0045 | tv 0.13566
epoch 1/2 step 00600 | content 178.4802 | style 0.0051 | tv 0.13308
epoch 1/2 step 00650 | content 182.4744 | style 0.0041 | tv 0.13657
epoch 1/2 step 00700 | content 168.4446 

In [4]:
# Hyper Parameter Van Gogh 
import os, math, time, random
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from torchvision import models, transforms
from PIL import Image
import numpy as np

# --------------------- Config ---------------------
CONTENT_IMAGE = "content test2.jpg"     # (your content path)
STYLE_IMAGE   = "Van_Gogh_-_Starry_Night.jpg"   # (your style path)
OUTPUT_IMAGE  = "johnson_stylized5.jpg"
MODEL_PATH    = "transform_net4.pth"

# Minimal "dataset" defaults to the single content image; point this to a folder for better training
CONTENT_DIR   = "content"       # folder of training content images (e.g., "./coco_subset"); "." uses the single content image
IMAGE_SIZE    = 512       # train/infer resolution (256/512/768). Higher => better quality, slower training
BATCH_SIZE    = 4
NUM_EPOCHS    = 2         # increase (e.g., 2–4) if you have a folder of content images
LEARNING_RATE = 1e-3
SEED          = 42
DEVICE        = "cuda" if torch.cuda.is_available() else "cpu"

# Perceptual loss weights (Johnson et al. typical ranges)
CONTENT_WEIGHT = 1.0
STYLE_WEIGHT   = 5e3
TV_WEIGHT      = 1e-6

# VGG style/content layers (Johnson et al.: content=relu2_2; style=relu1_2, relu2_2, relu3_3, relu4_3)
LAYER_IDX = {
    "relu1_2": 3,
    "relu2_2": 8,
    "relu3_3": 15,
    "relu4_3": 22
}
STYLE_LAYERS   = ["relu1_2","relu2_2","relu3_3","relu4_3"]
CONTENT_LAYER  = "relu2_2"

random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)

# --------------------- Utils ---------------------
IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(1,3,1,1)
IMAGENET_STD  = torch.tensor([0.229, 0.224, 0.225]).view(1,3,1,1)

def load_image(path, size=None):
    img = Image.open(path).convert("RGB")
    if size is not None:
        img = img.resize((size, size), Image.BICUBIC)
    return img

def to_tensor(img):
    return transforms.ToTensor()(img)  # in [0,1]

def to_image(t: torch.Tensor) -> Image.Image:
    t = t.detach().clamp(0,1).cpu()
    return transforms.ToPILImage()(t.squeeze(0))

def normalize_batch(x):
    mean = IMAGENET_MEAN.to(x.device)
    std  = IMAGENET_STD.to(x.device)
    return (x - mean) / std

def gram_matrix(feat: torch.Tensor):
    # feat: [N,C,H,W] -> Gram per sample, normalized by C*H*W
    N, C, H, W = feat.size()
    f = feat.view(N, C, H*W)
    G = torch.bmm(f, f.transpose(1,2)) / (C*H*W)
    return G

def tv_loss(x):
    return (torch.mean(torch.abs(x[..., :, 1:] - x[..., :, :-1])) +
            torch.mean(torch.abs(x[..., 1:, :] - x[..., :-1, :])))

# --------------------- Transform Net (Johnson et al.) ---------------------
class ConvLayer(nn.Module):
    def __init__(self, in_c, out_c, kernel, stride):
        super().__init__()
        pad = kernel // 2
        self.pad = nn.ReflectionPad2d(pad)
        self.conv = nn.Conv2d(in_c, out_c, kernel, stride)
    def forward(self, x):
        return self.conv(self.pad(x))

class ResidualBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.conv1 = ConvLayer(channels, channels, 3, 1)
        self.in1   = nn.InstanceNorm2d(channels, affine=True)
        self.conv2 = ConvLayer(channels, channels, 3, 1)
        self.in2   = nn.InstanceNorm2d(channels, affine=True)
    def forward(self, x):
        y = F.relu(self.in1(self.conv1(x)))
        y = self.in2(self.conv2(y))
        return x + y

class UpsampleConvLayer(nn.Module):
    def __init__(self, in_c, out_c, kernel, upsample=None):
        super().__init__()
        self.upsample = upsample
        self.pad = nn.ReflectionPad2d(kernel // 2)
        self.conv = nn.Conv2d(in_c, out_c, kernel, 1)
    def forward(self, x):
        if self.upsample:
            x = F.interpolate(x, scale_factor=self.upsample, mode="nearest")
        return self.conv(self.pad(x))

class TransformNet(nn.Module):
    def __init__(self):
        super().__init__()
        # Encoder
        self.conv1 = ConvLayer(3, 32, 9, 1)
        self.in1   = nn.InstanceNorm2d(32, affine=True)
        self.conv2 = ConvLayer(32, 64, 3, 2)
        self.in2   = nn.InstanceNorm2d(64, affine=True)
        self.conv3 = ConvLayer(64, 128, 3, 2)
        self.in3   = nn.InstanceNorm2d(128, affine=True)
        # Residuals
        self.res   = nn.Sequential(*[ResidualBlock(128) for _ in range(5)])
        # Decoder
        self.up1   = UpsampleConvLayer(128, 64, 3, upsample=2)
        self.in4   = nn.InstanceNorm2d(64, affine=True)
        self.up2   = UpsampleConvLayer(64, 32, 3, upsample=2)
        self.in5   = nn.InstanceNorm2d(32, affine=True)
        self.conv4 = ConvLayer(32, 3, 9, 1)
    def forward(self, x):
        y = F.relu(self.in1(self.conv1(x)))
        y = F.relu(self.in2(self.conv2(y)))
        y = F.relu(self.in3(self.conv3(y)))
        y = self.res(y)
        y = F.relu(self.in4(self.up1(y)))
        y = F.relu(self.in5(self.up2(y)))
        y = torch.tanh(self.conv4(y))
        return (y + 1) / 2.0  # map from [-1,1] to [0,1]

# --------------------- VGG16 Loss Network ---------------------
class VGG16Features(nn.Module):
    def __init__(self, layers_idx):
        super().__init__()
        vgg = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_FEATURES).features.eval()
        for p in vgg.parameters(): p.requires_grad_(False)
        self.vgg = vgg
        self.layers_idx = layers_idx
    def forward(self, x):
        feats = {}
        for i, layer in enumerate(self.vgg):
            x = layer(x)
            for name, idx in self.layers_idx.items():
                if i == idx:
                    feats[name] = x
        return feats

# --------------------- Dataset ---------------------
class SingleOrFolderDataset(Dataset):
    def __init__(self, content_dir_or_file, image_size):
        super().__init__()
        p = Path(content_dir_or_file)
        if p.is_dir():
            self.paths = [str(pp) for pp in p.glob("*") if pp.suffix.lower() in [".jpg",".png",".jpeg",".bmp",".webp"]]
        else:
            self.paths = [str(p)]
        self.T = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(0.6, 1.0), ratio=(0.75, 1.33)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
        ])
    def __len__(self):
        return max(200, len(self.paths))  # repeat if just one file
    def __getitem__(self, idx):
        p = self.paths[idx % len(self.paths)]
        img = Image.open(p).convert("RGB")
        return self.T(img)

# --------------------- Prep ---------------------
assert Path(CONTENT_IMAGE).exists(), f"Not found: {CONTENT_IMAGE}"
assert Path(STYLE_IMAGE).exists(),   f"Not found: {STYLE_IMAGE}"

device = torch.device(DEVICE)
print("Using device:", device)

# Style targets (Gram matrices of VGG features for the style image)
style_img = load_image(STYLE_IMAGE, size=IMAGE_SIZE)
style_t   = to_tensor(style_img).unsqueeze(0).to(device)
vgg_loss_net = VGG16Features(LAYER_IDX).to(device)
with torch.no_grad():
    style_feats = vgg_loss_net(normalize_batch(style_t))
style_targets = {k: gram_matrix(v).detach() for k, v in style_feats.items() if k in STYLE_LAYERS}

# --------------------- Train Transform Net (per-style) ---------------------
transform_net = TransformNet().to(device)
optim = torch.optim.Adam(transform_net.parameters(), lr=LEARNING_RATE)

ds = SingleOrFolderDataset(CONTENT_DIR if Path(CONTENT_DIR).exists() else CONTENT_IMAGE, IMAGE_SIZE)
loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, num_workers=2)

print(f"Training on {len(ds)} samples (repeating if needed). Epochs={NUM_EPOCHS}")
global_step = 0
for epoch in range(NUM_EPOCHS):
    t0 = time.time()
    for batch in loader:
        global_step += 1
        batch = batch.to(device)
        optim.zero_grad()

        y_hat = transform_net(batch)                # [0,1]
        y_hat_norm = normalize_batch(y_hat.clone()) # normalize for VGG

        # VGG features
        feats_y_hat = vgg_loss_net(y_hat_norm)
        feats_x     = vgg_loss_net(normalize_batch(batch))

        # Content loss (relu2_2)
        content_loss = F.mse_loss(feats_y_hat[CONTENT_LAYER], feats_x[CONTENT_LAYER])

        # Style loss (sum over layers)
        style_loss = 0.0
        for k in STYLE_LAYERS:
            gm_y = gram_matrix(feats_y_hat[k])
            gm_s = style_targets[k].expand_as(gm_y)
            style_loss += F.mse_loss(gm_y, gm_s)

        # TV regularization
        tv = tv_loss(y_hat)

        loss = CONTENT_WEIGHT * content_loss + STYLE_WEIGHT * style_loss + TV_WEIGHT * tv
        loss.backward()
        optim.step()

        if global_step % 50 == 0:
            print(f"epoch {epoch+1}/{NUM_EPOCHS} step {global_step:05d} | "
                  f"content {content_loss.item():.4f} | style {style_loss.item():.4f} | tv {tv.item():.5f}")

    print(f"Epoch {epoch+1} done in {time.time()-t0:.1f}s")

torch.save(transform_net.state_dict(), MODEL_PATH)
print(f"Saved model to {MODEL_PATH}")

# --------------------- Stylize your content image ---------------------
content_img = load_image(CONTENT_IMAGE, size=IMAGE_SIZE)
content_t   = to_tensor(content_img).unsqueeze(0).to(device)

transform_net.eval()
with torch.no_grad():
    out = transform_net(content_t)

to_image(out).save(OUTPUT_IMAGE)
print(f"Saved stylized image to {OUTPUT_IMAGE}")


Using device: cuda
Training on 49981 samples (repeating if needed). Epochs=2
epoch 1/2 step 00050 | content 86.1079 | style 0.1638 | tv 0.02250
epoch 1/2 step 00100 | content 116.9638 | style 0.0531 | tv 0.07404
epoch 1/2 step 00150 | content 142.3306 | style 0.0230 | tv 0.11302
epoch 1/2 step 00200 | content 133.6479 | style 0.0157 | tv 0.12329
epoch 1/2 step 00250 | content 135.8813 | style 0.0122 | tv 0.12260
epoch 1/2 step 00300 | content 109.3613 | style 0.0108 | tv 0.12176
epoch 1/2 step 00350 | content 114.5092 | style 0.0088 | tv 0.12198
epoch 1/2 step 00400 | content 105.2851 | style 0.0095 | tv 0.11909
epoch 1/2 step 00450 | content 119.9118 | style 0.0070 | tv 0.12765
epoch 1/2 step 00500 | content 104.9453 | style 0.0091 | tv 0.13067
epoch 1/2 step 00550 | content 103.9139 | style 0.0064 | tv 0.12674
epoch 1/2 step 00600 | content 110.0003 | style 0.0079 | tv 0.12633
epoch 1/2 step 00650 | content 106.5581 | style 0.0065 | tv 0.12715
epoch 1/2 step 00700 | content 95.5246 |

In [5]:
# kandinsky Module
import os, math, time, random
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from torchvision import models, transforms
from PIL import Image
import numpy as np

# --------------------- Config ---------------------
CONTENT_IMAGE = "content test2.jpg"     # (your content path)
STYLE_IMAGE   = "kandinsky style.jpg"   # (your style path)
OUTPUT_IMAGE  = "kandinsky_johnson_stylized.jpg"
MODEL_PATH    = "kandinsky_transform_net.pth"

# Minimal "dataset" defaults to the single content image; point this to a folder for better training
CONTENT_DIR   = "content"       # folder of training content images (e.g., "./coco_subset"); "." uses the single content image
IMAGE_SIZE    = 512       # train/infer resolution (256/512/768). Higher => better quality, slower training
BATCH_SIZE    = 4
NUM_EPOCHS    = 2         # increase (e.g., 2–4) if you have a folder of content images
LEARNING_RATE = 1e-3
SEED          = 42
DEVICE        = "cuda" if torch.cuda.is_available() else "cpu"

# Perceptual loss weights (Johnson et al. typical ranges)
CONTENT_WEIGHT = 0.5
STYLE_WEIGHT   = 1e4      # try 5e3–1e4; if style is weak, raise this
TV_WEIGHT      = 1e-6

# VGG style/content layers (Johnson et al.: content=relu2_2; style=relu1_2, relu2_2, relu3_3, relu4_3)
LAYER_IDX = {
    "relu1_2": 3,
    "relu2_2": 8,
    "relu3_3": 15,
    "relu4_3": 22
}
STYLE_LAYERS   = ["relu1_2","relu2_2","relu3_3","relu4_3"]
CONTENT_LAYER  = "relu2_2"

random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)

# --------------------- Utils ---------------------
IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(1,3,1,1)
IMAGENET_STD  = torch.tensor([0.229, 0.224, 0.225]).view(1,3,1,1)

def load_image(path, size=None):
    img = Image.open(path).convert("RGB")
    if size is not None:
        img = img.resize((size, size), Image.BICUBIC)
    return img

def to_tensor(img):
    return transforms.ToTensor()(img)  # in [0,1]

def to_image(t: torch.Tensor) -> Image.Image:
    t = t.detach().clamp(0,1).cpu()
    return transforms.ToPILImage()(t.squeeze(0))

def normalize_batch(x):
    mean = IMAGENET_MEAN.to(x.device)
    std  = IMAGENET_STD.to(x.device)
    return (x - mean) / std

def gram_matrix(feat: torch.Tensor):
    # feat: [N,C,H,W] -> Gram per sample, normalized by C*H*W
    N, C, H, W = feat.size()
    f = feat.view(N, C, H*W)
    G = torch.bmm(f, f.transpose(1,2)) / (C*H*W)
    return G

def tv_loss(x):
    return (torch.mean(torch.abs(x[..., :, 1:] - x[..., :, :-1])) +
            torch.mean(torch.abs(x[..., 1:, :] - x[..., :-1, :])))

# --------------------- Transform Net (Johnson et al.) ---------------------
class ConvLayer(nn.Module):
    def __init__(self, in_c, out_c, kernel, stride):
        super().__init__()
        pad = kernel // 2
        self.pad = nn.ReflectionPad2d(pad)
        self.conv = nn.Conv2d(in_c, out_c, kernel, stride)
    def forward(self, x):
        return self.conv(self.pad(x))

class ResidualBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.conv1 = ConvLayer(channels, channels, 3, 1)
        self.in1   = nn.InstanceNorm2d(channels, affine=True)
        self.conv2 = ConvLayer(channels, channels, 3, 1)
        self.in2   = nn.InstanceNorm2d(channels, affine=True)
    def forward(self, x):
        y = F.relu(self.in1(self.conv1(x)))
        y = self.in2(self.conv2(y))
        return x + y

class UpsampleConvLayer(nn.Module):
    def __init__(self, in_c, out_c, kernel, upsample=None):
        super().__init__()
        self.upsample = upsample
        self.pad = nn.ReflectionPad2d(kernel // 2)
        self.conv = nn.Conv2d(in_c, out_c, kernel, 1)
    def forward(self, x):
        if self.upsample:
            x = F.interpolate(x, scale_factor=self.upsample, mode="nearest")
        return self.conv(self.pad(x))

class TransformNet(nn.Module):
    def __init__(self):
        super().__init__()
        # Encoder
        self.conv1 = ConvLayer(3, 32, 9, 1)
        self.in1   = nn.InstanceNorm2d(32, affine=True)
        self.conv2 = ConvLayer(32, 64, 3, 2)
        self.in2   = nn.InstanceNorm2d(64, affine=True)
        self.conv3 = ConvLayer(64, 128, 3, 2)
        self.in3   = nn.InstanceNorm2d(128, affine=True)
        # Residuals
        self.res   = nn.Sequential(*[ResidualBlock(128) for _ in range(5)])
        # Decoder
        self.up1   = UpsampleConvLayer(128, 64, 3, upsample=2)
        self.in4   = nn.InstanceNorm2d(64, affine=True)
        self.up2   = UpsampleConvLayer(64, 32, 3, upsample=2)
        self.in5   = nn.InstanceNorm2d(32, affine=True)
        self.conv4 = ConvLayer(32, 3, 9, 1)
    def forward(self, x):
        y = F.relu(self.in1(self.conv1(x)))
        y = F.relu(self.in2(self.conv2(y)))
        y = F.relu(self.in3(self.conv3(y)))
        y = self.res(y)
        y = F.relu(self.in4(self.up1(y)))
        y = F.relu(self.in5(self.up2(y)))
        y = torch.tanh(self.conv4(y))
        return (y + 1) / 2.0  # map from [-1,1] to [0,1]

# --------------------- VGG16 Loss Network ---------------------
class VGG16Features(nn.Module):
    def __init__(self, layers_idx):
        super().__init__()
        vgg = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_FEATURES).features.eval()
        for p in vgg.parameters(): p.requires_grad_(False)
        self.vgg = vgg
        self.layers_idx = layers_idx
    def forward(self, x):
        feats = {}
        for i, layer in enumerate(self.vgg):
            x = layer(x)
            for name, idx in self.layers_idx.items():
                if i == idx:
                    feats[name] = x
        return feats

# --------------------- Dataset ---------------------
class SingleOrFolderDataset(Dataset):
    def __init__(self, content_dir_or_file, image_size):
        super().__init__()
        p = Path(content_dir_or_file)
        if p.is_dir():
            self.paths = [str(pp) for pp in p.glob("*") if pp.suffix.lower() in [".jpg",".png",".jpeg",".bmp",".webp"]]
        else:
            self.paths = [str(p)]
        self.T = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(0.6, 1.0), ratio=(0.75, 1.33)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
        ])
    def __len__(self):
        return max(200, len(self.paths))  # repeat if just one file
    def __getitem__(self, idx):
        p = self.paths[idx % len(self.paths)]
        img = Image.open(p).convert("RGB")
        return self.T(img)

# --------------------- Prep ---------------------
assert Path(CONTENT_IMAGE).exists(), f"Not found: {CONTENT_IMAGE}"
assert Path(STYLE_IMAGE).exists(),   f"Not found: {STYLE_IMAGE}"

device = torch.device(DEVICE)
print("Using device:", device)

# Style targets (Gram matrices of VGG features for the style image)
style_img = load_image(STYLE_IMAGE, size=IMAGE_SIZE)
style_t   = to_tensor(style_img).unsqueeze(0).to(device)
vgg_loss_net = VGG16Features(LAYER_IDX).to(device)
with torch.no_grad():
    style_feats = vgg_loss_net(normalize_batch(style_t))
style_targets = {k: gram_matrix(v).detach() for k, v in style_feats.items() if k in STYLE_LAYERS}

# --------------------- Train Transform Net (per-style) ---------------------
transform_net = TransformNet().to(device)
optim = torch.optim.Adam(transform_net.parameters(), lr=LEARNING_RATE)

ds = SingleOrFolderDataset(CONTENT_DIR if Path(CONTENT_DIR).exists() else CONTENT_IMAGE, IMAGE_SIZE)
loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, num_workers=2)

print(f"Training on {len(ds)} samples (repeating if needed). Epochs={NUM_EPOCHS}")
global_step = 0
for epoch in range(NUM_EPOCHS):
    t0 = time.time()
    for batch in loader:
        global_step += 1
        batch = batch.to(device)
        optim.zero_grad()

        y_hat = transform_net(batch)                # [0,1]
        y_hat_norm = normalize_batch(y_hat.clone()) # normalize for VGG

        # VGG features
        feats_y_hat = vgg_loss_net(y_hat_norm)
        feats_x     = vgg_loss_net(normalize_batch(batch))

        # Content loss (relu2_2)
        content_loss = F.mse_loss(feats_y_hat[CONTENT_LAYER], feats_x[CONTENT_LAYER])

        # Style loss (sum over layers)
        style_loss = 0.0
        for k in STYLE_LAYERS:
            gm_y = gram_matrix(feats_y_hat[k])
            gm_s = style_targets[k].expand_as(gm_y)
            style_loss += F.mse_loss(gm_y, gm_s)

        # TV regularization
        tv = tv_loss(y_hat)

        loss = CONTENT_WEIGHT * content_loss + STYLE_WEIGHT * style_loss + TV_WEIGHT * tv
        loss.backward()
        optim.step()

        if global_step % 50 == 0:
            print(f"epoch {epoch+1}/{NUM_EPOCHS} step {global_step:05d} | "
                  f"content {content_loss.item():.4f} | style {style_loss.item():.4f} | tv {tv.item():.5f}")

    print(f"Epoch {epoch+1} done in {time.time()-t0:.1f}s")

torch.save(transform_net.state_dict(), MODEL_PATH)
print(f"Saved model to {MODEL_PATH}")

# --------------------- Stylize your content image ---------------------
content_img = load_image(CONTENT_IMAGE, size=IMAGE_SIZE)
content_t   = to_tensor(content_img).unsqueeze(0).to(device)

transform_net.eval()
with torch.no_grad():
    out = transform_net(content_t)

to_image(out).save(OUTPUT_IMAGE)
print(f"Saved stylized image to {OUTPUT_IMAGE}")


Using device: cuda
Training on 49981 samples (repeating if needed). Epochs=2
epoch 1/2 step 00050 | content 214.3441 | style 0.1052 | tv 0.05966
epoch 1/2 step 00100 | content 250.9243 | style 0.0377 | tv 0.08799
epoch 1/2 step 00150 | content 255.2714 | style 0.0332 | tv 0.09363
epoch 1/2 step 00200 | content 258.5710 | style 0.0263 | tv 0.10379
epoch 1/2 step 00250 | content 276.2679 | style 0.0179 | tv 0.10499
epoch 1/2 step 00300 | content 239.4314 | style 0.0156 | tv 0.10274
epoch 1/2 step 00350 | content 248.5527 | style 0.0134 | tv 0.10206
epoch 1/2 step 00400 | content 232.0596 | style 0.0160 | tv 0.09910
epoch 1/2 step 00450 | content 252.0291 | style 0.0106 | tv 0.10957
epoch 1/2 step 00500 | content 238.8265 | style 0.0120 | tv 0.10728
epoch 1/2 step 00550 | content 239.7148 | style 0.0101 | tv 0.11011
epoch 1/2 step 00600 | content 251.8123 | style 0.0119 | tv 0.10883
epoch 1/2 step 00650 | content 260.0256 | style 0.0084 | tv 0.11139
epoch 1/2 step 00700 | content 247.6087

In [6]:
# Pablo Picasso Module
import os, math, time, random
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from torchvision import models, transforms
from PIL import Image
import numpy as np

# --------------------- Config ---------------------
CONTENT_IMAGE = "content test2.jpg"     # (your content path)
STYLE_IMAGE   = "Pablo_Picasso_style.jpeg"   # (your style path)
OUTPUT_IMAGE  = "Pablo_Picasso_johnson_stylized.jpg"
MODEL_PATH    = "Pablo_Picasso_transform_net.pth"

# Minimal "dataset" defaults to the single content image; point this to a folder for better training
CONTENT_DIR   = "content"       # folder of training content images (e.g., "./coco_subset"); "." uses the single content image
IMAGE_SIZE    = 512       # train/infer resolution (256/512/768). Higher => better quality, slower training
BATCH_SIZE    = 4
NUM_EPOCHS    = 2         # increase (e.g., 2–4) if you have a folder of content images
LEARNING_RATE = 1e-3
SEED          = 42
DEVICE        = "cuda" if torch.cuda.is_available() else "cpu"

# Perceptual loss weights (Johnson et al. typical ranges)
CONTENT_WEIGHT = 0.5
STYLE_WEIGHT   = 1e4      # try 5e3–1e4; if style is weak, raise this
TV_WEIGHT      = 1e-6

# VGG style/content layers (Johnson et al.: content=relu2_2; style=relu1_2, relu2_2, relu3_3, relu4_3)
LAYER_IDX = {
    "relu1_2": 3,
    "relu2_2": 8,
    "relu3_3": 15,
    "relu4_3": 22
}
STYLE_LAYERS   = ["relu1_2","relu2_2","relu3_3","relu4_3"]
CONTENT_LAYER  = "relu2_2"

random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)

# --------------------- Utils ---------------------
IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(1,3,1,1)
IMAGENET_STD  = torch.tensor([0.229, 0.224, 0.225]).view(1,3,1,1)

def load_image(path, size=None):
    img = Image.open(path).convert("RGB")
    if size is not None:
        img = img.resize((size, size), Image.BICUBIC)
    return img

def to_tensor(img):
    return transforms.ToTensor()(img)  # in [0,1]

def to_image(t: torch.Tensor) -> Image.Image:
    t = t.detach().clamp(0,1).cpu()
    return transforms.ToPILImage()(t.squeeze(0))

def normalize_batch(x):
    mean = IMAGENET_MEAN.to(x.device)
    std  = IMAGENET_STD.to(x.device)
    return (x - mean) / std

def gram_matrix(feat: torch.Tensor):
    # feat: [N,C,H,W] -> Gram per sample, normalized by C*H*W
    N, C, H, W = feat.size()
    f = feat.view(N, C, H*W)
    G = torch.bmm(f, f.transpose(1,2)) / (C*H*W)
    return G

def tv_loss(x):
    return (torch.mean(torch.abs(x[..., :, 1:] - x[..., :, :-1])) +
            torch.mean(torch.abs(x[..., 1:, :] - x[..., :-1, :])))

# --------------------- Transform Net (Johnson et al.) ---------------------
class ConvLayer(nn.Module):
    def __init__(self, in_c, out_c, kernel, stride):
        super().__init__()
        pad = kernel // 2
        self.pad = nn.ReflectionPad2d(pad)
        self.conv = nn.Conv2d(in_c, out_c, kernel, stride)
    def forward(self, x):
        return self.conv(self.pad(x))

class ResidualBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.conv1 = ConvLayer(channels, channels, 3, 1)
        self.in1   = nn.InstanceNorm2d(channels, affine=True)
        self.conv2 = ConvLayer(channels, channels, 3, 1)
        self.in2   = nn.InstanceNorm2d(channels, affine=True)
    def forward(self, x):
        y = F.relu(self.in1(self.conv1(x)))
        y = self.in2(self.conv2(y))
        return x + y

class UpsampleConvLayer(nn.Module):
    def __init__(self, in_c, out_c, kernel, upsample=None):
        super().__init__()
        self.upsample = upsample
        self.pad = nn.ReflectionPad2d(kernel // 2)
        self.conv = nn.Conv2d(in_c, out_c, kernel, 1)
    def forward(self, x):
        if self.upsample:
            x = F.interpolate(x, scale_factor=self.upsample, mode="nearest")
        return self.conv(self.pad(x))

class TransformNet(nn.Module):
    def __init__(self):
        super().__init__()
        # Encoder
        self.conv1 = ConvLayer(3, 32, 9, 1)
        self.in1   = nn.InstanceNorm2d(32, affine=True)
        self.conv2 = ConvLayer(32, 64, 3, 2)
        self.in2   = nn.InstanceNorm2d(64, affine=True)
        self.conv3 = ConvLayer(64, 128, 3, 2)
        self.in3   = nn.InstanceNorm2d(128, affine=True)
        # Residuals
        self.res   = nn.Sequential(*[ResidualBlock(128) for _ in range(5)])
        # Decoder
        self.up1   = UpsampleConvLayer(128, 64, 3, upsample=2)
        self.in4   = nn.InstanceNorm2d(64, affine=True)
        self.up2   = UpsampleConvLayer(64, 32, 3, upsample=2)
        self.in5   = nn.InstanceNorm2d(32, affine=True)
        self.conv4 = ConvLayer(32, 3, 9, 1)
    def forward(self, x):
        y = F.relu(self.in1(self.conv1(x)))
        y = F.relu(self.in2(self.conv2(y)))
        y = F.relu(self.in3(self.conv3(y)))
        y = self.res(y)
        y = F.relu(self.in4(self.up1(y)))
        y = F.relu(self.in5(self.up2(y)))
        y = torch.tanh(self.conv4(y))
        return (y + 1) / 2.0  # map from [-1,1] to [0,1]

# --------------------- VGG16 Loss Network ---------------------
class VGG16Features(nn.Module):
    def __init__(self, layers_idx):
        super().__init__()
        vgg = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_FEATURES).features.eval()
        for p in vgg.parameters(): p.requires_grad_(False)
        self.vgg = vgg
        self.layers_idx = layers_idx
    def forward(self, x):
        feats = {}
        for i, layer in enumerate(self.vgg):
            x = layer(x)
            for name, idx in self.layers_idx.items():
                if i == idx:
                    feats[name] = x
        return feats

# --------------------- Dataset ---------------------
class SingleOrFolderDataset(Dataset):
    def __init__(self, content_dir_or_file, image_size):
        super().__init__()
        p = Path(content_dir_or_file)
        if p.is_dir():
            self.paths = [str(pp) for pp in p.glob("*") if pp.suffix.lower() in [".jpg",".png",".jpeg",".bmp",".webp"]]
        else:
            self.paths = [str(p)]
        self.T = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(0.6, 1.0), ratio=(0.75, 1.33)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
        ])
    def __len__(self):
        return max(200, len(self.paths))  # repeat if just one file
    def __getitem__(self, idx):
        p = self.paths[idx % len(self.paths)]
        img = Image.open(p).convert("RGB")
        return self.T(img)

# --------------------- Prep ---------------------
assert Path(CONTENT_IMAGE).exists(), f"Not found: {CONTENT_IMAGE}"
assert Path(STYLE_IMAGE).exists(),   f"Not found: {STYLE_IMAGE}"

device = torch.device(DEVICE)
print("Using device:", device)

# Style targets (Gram matrices of VGG features for the style image)
style_img = load_image(STYLE_IMAGE, size=IMAGE_SIZE)
style_t   = to_tensor(style_img).unsqueeze(0).to(device)
vgg_loss_net = VGG16Features(LAYER_IDX).to(device)
with torch.no_grad():
    style_feats = vgg_loss_net(normalize_batch(style_t))
style_targets = {k: gram_matrix(v).detach() for k, v in style_feats.items() if k in STYLE_LAYERS}

# --------------------- Train Transform Net (per-style) ---------------------
transform_net = TransformNet().to(device)
optim = torch.optim.Adam(transform_net.parameters(), lr=LEARNING_RATE)

ds = SingleOrFolderDataset(CONTENT_DIR if Path(CONTENT_DIR).exists() else CONTENT_IMAGE, IMAGE_SIZE)
loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, num_workers=2)

print(f"Training on {len(ds)} samples (repeating if needed). Epochs={NUM_EPOCHS}")
global_step = 0
for epoch in range(NUM_EPOCHS):
    t0 = time.time()
    for batch in loader:
        global_step += 1
        batch = batch.to(device)
        optim.zero_grad()

        y_hat = transform_net(batch)                # [0,1]
        y_hat_norm = normalize_batch(y_hat.clone()) # normalize for VGG

        # VGG features
        feats_y_hat = vgg_loss_net(y_hat_norm)
        feats_x     = vgg_loss_net(normalize_batch(batch))

        # Content loss (relu2_2)
        content_loss = F.mse_loss(feats_y_hat[CONTENT_LAYER], feats_x[CONTENT_LAYER])

        # Style loss (sum over layers)
        style_loss = 0.0
        for k in STYLE_LAYERS:
            gm_y = gram_matrix(feats_y_hat[k])
            gm_s = style_targets[k].expand_as(gm_y)
            style_loss += F.mse_loss(gm_y, gm_s)

        # TV regularization
        tv = tv_loss(y_hat)

        loss = CONTENT_WEIGHT * content_loss + STYLE_WEIGHT * style_loss + TV_WEIGHT * tv
        loss.backward()
        optim.step()

        if global_step % 50 == 0:
            print(f"epoch {epoch+1}/{NUM_EPOCHS} step {global_step:05d} | "
                  f"content {content_loss.item():.4f} | style {style_loss.item():.4f} | tv {tv.item():.5f}")

    print(f"Epoch {epoch+1} done in {time.time()-t0:.1f}s")

torch.save(transform_net.state_dict(), MODEL_PATH)
print(f"Saved model to {MODEL_PATH}")

# --------------------- Stylize your content image ---------------------
content_img = load_image(CONTENT_IMAGE, size=IMAGE_SIZE)
content_t   = to_tensor(content_img).unsqueeze(0).to(device)

transform_net.eval()
with torch.no_grad():
    out = transform_net(content_t)

to_image(out).save(OUTPUT_IMAGE)
print(f"Saved stylized image to {OUTPUT_IMAGE}")


Using device: cuda
Training on 49981 samples (repeating if needed). Epochs=2
epoch 1/2 step 00050 | content 68.7457 | style 0.0267 | tv 0.01154
epoch 1/2 step 00100 | content 67.5726 | style 0.0153 | tv 0.01926
epoch 1/2 step 00150 | content 95.6307 | style 0.0062 | tv 0.03187
epoch 1/2 step 00200 | content 85.1662 | style 0.0056 | tv 0.03692
epoch 1/2 step 00250 | content 96.1107 | style 0.0046 | tv 0.03700
epoch 1/2 step 00300 | content 55.4893 | style 0.0040 | tv 0.03730
epoch 1/2 step 00350 | content 67.7435 | style 0.0029 | tv 0.03518
epoch 1/2 step 00400 | content 53.3389 | style 0.0035 | tv 0.03565
epoch 1/2 step 00450 | content 78.9932 | style 0.0030 | tv 0.03894
epoch 1/2 step 00500 | content 71.6288 | style 0.0029 | tv 0.03594
epoch 1/2 step 00550 | content 65.4218 | style 0.0028 | tv 0.03793
epoch 1/2 step 00600 | content 86.7335 | style 0.0032 | tv 0.03858
epoch 1/2 step 00650 | content 85.6664 | style 0.0031 | tv 0.03875
epoch 1/2 step 00700 | content 73.2846 | style 0.003