# REINFORCEMENT LEARNING FOR AUTONOMOUS PAINTING USING MULTI-OBJECTIVE REWARD FUNCTIONS
# Three Painting Modes

This notebook extends the beginner Paint-RL project with three painting modes:

1. **Paint-by-number** — agent fills semantic/quantized color regions (region-fill actions).
2. **Stylization** — agent tries to produce a painterly version of a photo; the reward is a lightweight perceptual-style proxy (MSE between the Gaussian-blurred canvas and the Gaussian-blurred target, plus mean-color similarity). No VGG network is used here.
3. **Contour reconstruction** — agent learns to draw the object's edges/outlines.


The notebook uses small images (32×32) for fast experimentation. Upload an image where applicable, or use auto-generated targets.


In [None]:
# DL framework, img and gif r/w, kmeans
!pip install -q torch torchvision pillow imageio matplotlib scikit-image scikit-learn

print('Installed packages: torch, torchvision, pillow, imageio, matplotlib, scikit-image, scikit-learn')


Installed packages: torch, torchvision, pillow, imageio, matplotlib, scikit-image, scikit-learn


## Upload a target image

Upload an image to use as the target for modes that need it (paint-by-number, stylization, contour). If you don't upload, demo circle/shape targets are generated automatically.

In [None]:
# Ask for a target image through the Colab upload widget.
# TARGET_IMAGE_PATH stays None when nothing is uploaded (or when not running
# in Colab), in which case the notebook falls back to generated targets.
TARGET_IMAGE_PATH = None
try:
    from google.colab import files
except ImportError:
    # No Colab upload widget available here; behave as if nothing was uploaded.
    uploaded = {}
else:
    uploaded = files.upload()

if uploaded:
    # Use the first uploaded file name as the target path.
    TARGET_IMAGE_PATH = next(iter(uploaded))
    print('Uploaded:', TARGET_IMAGE_PATH)
else:
    print('No upload detected. Notebook will use generated targets where needed.')


## Choose a mode

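Set `MODE` to one of: `'paint_by_number'`, `'stylization'`, `'contour'`. The cell below prompts for the mode and falls back to `'paint_by_number'` when the input is not one of these. (The environment code also has a `'creative'` mode, but the prompt below does not accept it.)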

In [None]:
# Modes accepted by the interactive prompt.
valid_modes = ["paint_by_number", "stylization", "contour"]

# Read the requested mode, then normalise surrounding whitespace and letter case.
MODE = input("Enter mode (paint_by_number / stylization / contour): ")
MODE = MODE.strip().lower()

# Anything outside the accepted list falls back to the default mode.
if not (MODE in valid_modes):
    print("Invalid mode! Defaulting to 'paint_by_number'.")
    MODE = "paint_by_number"

print("MODE =", MODE)


Enter mode (paint_by_number / stylization / contour): contour
MODE = contour


In [None]:
# img- resize, draw- canvas,strokes filter- blur
import numpy as np
from PIL import Image, ImageDraw, ImageFilter
from skimage import color, filters
import math


from sklearn.cluster import KMeans


def load_and_resize(path_or_pil, size=32):
    """Load an image, convert it to RGB and resize it to a square float array.

    Args:
        path_or_pil: a filesystem path (``str`` or ``os.PathLike``) or an
            object exposing PIL's ``convert`` method (e.g. a ``PIL.Image``).
        size: side length in pixels of the square output.

    Returns:
        ``float32`` array of shape ``(size, size, 3)`` with values in [0, 1].
    """
    import os  # local import: only needed for the PathLike check

    if isinstance(path_or_pil, (str, os.PathLike)):
        # Close the file handle as soon as the pixels have been converted;
        # convert() returns an independent in-memory image.
        with Image.open(path_or_pil) as opened:
            img = opened.convert('RGB')
    else:
        img = path_or_pil.convert('RGB')
    img = img.resize((size, size), Image.BILINEAR)  # bilinear resampling for a smooth downscale
    return np.array(img).astype(np.float32) / 255.0

def quantize_colors(img, n_colors=6, seed=0):
    """Cluster the pixels of an RGB image into ``n_colors`` colours with k-means.

    Args:
        img: array of shape ``(H, W, 3)``.
        n_colors: number of k-means clusters.
        seed: ``random_state`` passed to ``KMeans``.

    Returns:
        ``(quant, labels, palette)`` where ``palette`` holds the cluster
        centres, ``labels`` is the ``(H, W)`` cluster index of each pixel and
        ``quant`` is the image rebuilt from ``palette[labels]``.
    """
    height, width, _channels = img.shape
    # One row per pixel, one column per colour channel.
    pixels = img.reshape(-1, 3)
    model = KMeans(n_clusters=n_colors, random_state=seed)
    model.fit(pixels)
    palette = model.cluster_centers_
    labels = model.labels_.reshape(height, width)
    return palette[labels], labels, palette

def edge_map(img_gray):
    """Return the Sobel response of a grayscale image, min-max scaled towards [0, 1].

    The small epsilon in the denominator avoids division by zero when the
    response is constant (e.g. a blank canvas), in which case the result is
    all zeros.
    """
    response = filters.sobel(img_gray)
    lowest = response.min()
    span = response.max() - response.min() + 1e-8
    return (response - lowest) / span

def symmetry_score(canvas):
    """Score left/right mirror symmetry of an ``(H, W, C)`` array.

    Returns the negated mean squared difference between the left ``W // 2``
    columns and the horizontally flipped right ``W // 2`` columns, so a
    perfectly mirrored canvas scores 0 and asymmetry gives negative values.
    For odd widths the middle column is ignored.
    """
    _height, width, _channels = canvas.shape  # height, width, channels
    half = width // 2
    left_half = canvas[:, :half, :]
    # Take the right-most `half` columns and reverse them along the width axis.
    mirrored_right = canvas[:, width - half:, :][:, ::-1, :]
    difference = left_half - mirrored_right
    return -np.mean(difference ** 2)

from skimage import color as skcolor

def hue_std_score(canvas):
    """Return the negated standard deviation of the canvas hue channel.

    The canvas is clipped to [0, 1] and converted from RGB to HSV; a wider
    spread of hues gives a larger standard deviation and therefore a lower
    (more negative) score.
    """
    clipped = np.clip(canvas, 0, 1)
    hue_channel = skcolor.rgb2hsv(clipped)[:, :, 0]
    spread = np.std(hue_channel)
    return -float(spread)

from skimage import filters as skfilters

def contrast_score(canvas):
    """Return the mean Sobel gradient magnitude of the grayscale canvas.

    The canvas is clipped to [0, 1] before the grayscale conversion. Higher
    values mean more/stronger edges; a flat canvas scores 0.
    """
    gray = skcolor.rgb2gray(np.clip(canvas, 0, 1))
    # Combine the two directional Sobel responses into one magnitude per pixel.
    response_h = skfilters.sobel_h(gray)
    response_v = skfilters.sobel_v(gray)
    magnitude = np.sqrt(response_h**2 + response_v**2)
    return float(np.mean(magnitude))

class PaintEnvMulti:
    """Small painting environment with one reward function per mode.

    Modes handled by ``step``:

    * ``'paint_by_number'`` - the action selects one colour-quantised region of
      the target and blends a colour into it.
    * ``'stylization'`` - the action draws a coloured line; the reward compares
      the blurred canvas with the blurred target plus their mean colours.
    * ``'contour'`` - the action draws a black line; the reward compares the
      Sobel edge maps of canvas and target.
    * ``'creative'`` - the action draws a coloured line on a target-free
      canvas; the reward combines symmetry, hue spread and contrast.

    Any other mode leaves the canvas untouched and yields a reward of 0.0.

    Observations are channel-first ``float32`` arrays: the canvas alone
    (3 channels) in creative mode, canvas stacked with target (6 channels)
    otherwise.
    """

    def __init__(self, mode='paint_by_number', target=None, canvas_size=32, max_steps=30, n_regions=6):
        """Build the target and mode-specific caches, then reset the canvas.

        Args:
            mode: one of the mode strings listed in the class docstring.
            target: path or PIL image passed to ``load_and_resize``; ``None``
                selects a generated demo target. Ignored in creative mode.
            canvas_size: side length of the square canvas in pixels.
            max_steps: number of steps after which ``step`` reports ``done``.
            n_regions: number of colour regions used by paint-by-number.
        """
        self.mode = mode
        self.canvas_size = canvas_size
        self.max_steps = max_steps
        self.n_regions = n_regions
        if target is None:
            if mode in ('paint_by_number','stylization'):
                # Demo target: filled red disc on a white background.
                tgt = Image.new('RGB',(canvas_size,canvas_size),(255,255,255))
                d = ImageDraw.Draw(tgt); d.ellipse((6,6,26,26), fill=(200,50,50))
                self.target = np.array(tgt).astype(np.float32)/255.0
            elif mode == 'contour':
                # Demo target: black circle outline on a white background.
                tgt = Image.new('RGB',(canvas_size,canvas_size),(255,255,255))
                d = ImageDraw.Draw(tgt); d.ellipse((6,6,26,26), outline=(0,0,0), width=2)
                self.target = np.array(tgt).astype(np.float32)/255.0
            else:
                self.target = None
        else:
            self.target = load_and_resize(target, size=canvas_size) if mode!='creative' else None

        if self.mode == 'paint_by_number' and self.target is not None:
            try:
                quant, labels, palette = quantize_colors(self.target, n_colors=self.n_regions)
            except Exception:
                # Best-effort fallback when clustering fails: a single region
                # (label 0 everywhere) and a grey-ramp palette.
                h,w,_ = self.target.shape
                labels = np.zeros((h,w), dtype=np.int32)
                palette = np.linspace(0,1,self.n_regions)[:,None].repeat(3,axis=1)
            self.region_labels = labels
            self.palette = palette
            # One (H, W, 1) float mask per region id.
            self.region_masks = [(self.region_labels == i).astype(np.float32)[:,:,None] for i in range(self.n_regions)]
        elif self.mode == 'contour' and self.target is not None:
            gray = skcolor.rgb2gray(self.target)
            self.target_edges = edge_map(gray)
        elif self.mode == 'stylization' and self.target is not None:
            # The target never changes, so blur it once up front.
            self.target_blur = self._gaussian_blur(self.target)
        self.reset()

    @staticmethod
    def _gaussian_blur(arr):
        """Blur a [0, 1] RGB array with PIL's GaussianBlur (radius 2); returns float32 in [0, 1]."""
        as_image = Image.fromarray((arr*255).astype('uint8'))
        return np.array(as_image.filter(ImageFilter.GaussianBlur(radius=2))).astype(np.float32)/255.0

    def reset(self):
        """Start a new episode on an all-white canvas and return the first observation."""
        if self.target is None:
            self.canvas = np.ones((self.canvas_size, self.canvas_size, 3), dtype=np.float32)
        else:
            self.canvas = np.ones_like(self.target)
        self.step_idx = 0
        return self._obs()

    def _obs(self):
        """Return the channel-first float32 observation (canvas, plus target outside creative mode)."""
        if self.mode == 'creative':
            obs = self.canvas.transpose(2,0,1).astype(np.float32)
        else:
            obs = np.concatenate([self.canvas, self.target], axis=2).transpose(2,0,1).astype(np.float32)
        return obs

    def step(self, action):
        """Apply one action and return ``(obs, reward, done, info)``.

        In paint-by-number mode only ``action[0]`` (region selector in [0, 1])
        and ``action[1:4]`` (RGB) are read. In the line-drawing modes the
        action is unpacked as ``x0, y0, x1, y1, r, g, b, thickness``.
        ``info`` is always an empty dict.
        """
        if self.mode == 'paint_by_number':
            region_norm = action[0]
            region_id = int(np.clip(np.floor(region_norm*self.n_regions), 0, self.n_regions-1))
            r,g,b = np.clip(action[1:4],0,1)
            alpha = 0.6
            # Soft-edged blend weight for the chosen region, computed once.
            strength = alpha*self.mask_region_strength(self.region_masks[region_id])
            self.canvas = (1-strength)*self.canvas + strength*np.array([r,g,b])[None,None,:]
            reward = self._paint_by_number_reward(region_id, r, g, b)
        elif self.mode == 'stylization':
            self._draw_line(action)
            reward = self._stylization_reward()
        elif self.mode == 'contour':
            self._draw_line(action, thickness_scale=1.0, monochrome=True)
            reward = self._contour_reward()
        elif self.mode == 'creative':
            self._draw_line(action)
            reward = self._creative_reward()
        else:
            reward = 0.0
        self.step_idx += 1
        done = (self.step_idx >= self.max_steps)
        return self._obs(), float(reward), done, {}

    def mask_region_strength(self, mask):
        """Return the ``(H, W, 1)`` region mask smoothed with a Gaussian (sigma 1.0)."""
        from scipy.ndimage import gaussian_filter
        return gaussian_filter(mask[:,:,0], sigma=1.0)[:,:,None]

    def _draw_line(self, action, thickness_scale=1.0, monochrome=False):
        """Draw one line on the canvas from an 8-element action.

        Coordinates and colours are clipped to [0, 1]; coordinates are scaled
        to pixel indices. With ``monochrome=True`` the line is black and the
        colour components are ignored.
        """
        x0,y0,x1,y1,r,g,b,thickness = action
        s = self.canvas_size
        x0_px = int(np.clip(x0,0,1)*(s-1)); y0_px = int(np.clip(y0,0,1)*(s-1))
        x1_px = int(np.clip(x1,0,1)*(s-1)); y1_px = int(np.clip(y1,0,1)*(s-1))
        # Line width never drops below 3 pixels.
        w = max(3, int(1 + thickness * (s//3) * thickness_scale))
        # Colour components are boosted by 1.5x before clipping to [0, 1].
        RR = int(np.clip(r*1.5,0,1)*255)
        GG = int(np.clip(g*1.5,0,1)*255)
        BB = int(np.clip(b*1.5,0,1)*255)
        col = (RR, GG, BB, 255)
        from PIL import Image as PILImage, ImageDraw as PILDraw
        pil = PILImage.fromarray((self.canvas*255).astype('uint8'))
        draw = PILDraw.Draw(pil, 'RGBA')
        fill = (0,0,0,255) if monochrome else col
        draw.line((x0_px,y0_px,x1_px,y1_px), fill=fill, width=w)
        self.canvas = np.array(pil).astype(np.float32)/255.0

    def _paint_by_number_reward(self, region_id, r,g,b):
        """Negative squared colour error inside one region.

        The squared differences between canvas and the region's palette colour
        are summed over the region's pixels and all three channels, then
        divided by the number of region pixels. ``r``, ``g`` and ``b`` are
        accepted but not used. An empty region yields ``-0.0``.
        """
        target_region_color = self.palette[region_id]
        canvas_region = (self.canvas * self.region_masks[region_id]).reshape(-1,3)
        target_vals = (target_region_color[None,:] * self.region_masks[region_id].reshape(-1,1))
        mask_flat = self.region_masks[region_id].reshape(-1)
        if mask_flat.sum() < 1:
            return -0.0
        mse = np.sum(((canvas_region - target_vals)**2) * mask_flat[:,None]) / (mask_flat.sum()+1e-8)
        return -mse

    def _stylization_reward(self):
        """0.7 * (negative MSE of blurred canvas vs blurred target) + 0.3 * (negative MSE of mean colours)."""
        canvas_blur = self._gaussian_blur(self.canvas)
        l2 = -np.mean((canvas_blur - self.target_blur)**2)
        canvas_mean = np.mean(self.canvas.reshape(-1,3), axis=0)
        target_mean = np.mean(self.target.reshape(-1,3), axis=0)
        color_sim = -np.mean((canvas_mean - target_mean)**2)
        return float(0.7*l2 + 0.3*color_sim)

    def _contour_reward(self):
        """Negative MSE between the normalised edge maps of canvas and target."""
        canvas_gray = skcolor.rgb2gray(np.clip(self.canvas,0,1))
        canvas_edges = edge_map(canvas_gray)
        mse = np.mean((canvas_edges - self.target_edges)**2)
        return -mse

    def _creative_reward(self):
        """Target-free reward: sum of symmetry, hue-spread and contrast scores.

        ``step`` calls this in creative mode. It combines the three
        module-level aesthetic scores with equal weight.

        NOTE(review): the equal weighting is a neutral placeholder - confirm
        the intended weights for the creative objective.
        """
        return float(
            symmetry_score(self.canvas)
            + hue_std_score(self.canvas)
            + contrast_score(self.canvas)
        )

## Actor-Critic model (shared across modes)

In [None]:
import torch, torch.nn as nn
from torch.distributions import Normal

class ActorCritic(nn.Module):
    """Shared-trunk actor-critic: CNN features plus episode progress.

    The actor outputs a tanh-bounded mean per action dimension together with a
    learned, state-independent standard deviation; the critic outputs one
    scalar value per observation.

    Args:
        in_channels: number of channels of the observation image.
        action_dim: number of action components.
        hidden: width of the shared fully connected layer.
        input_size: side length of the square observation. It is only used to
            size the first linear layer, so observations passed to ``forward``
            must have this spatial size.
    """

    def __init__(self, in_channels=6, action_dim=8, hidden=128, input_size=32):
        super().__init__()
        # Two padded 3x3 convolutions keep the spatial size; Flatten yields one vector per sample.
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, 32, 3, padding=1), nn.ReLU(),
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(),
            nn.Flatten()
        )

        # Probe the conv stack with a dummy image to learn the flattened size;
        # no gradients are needed for this shape query.
        with torch.no_grad():
            dummy = torch.zeros(1, in_channels, input_size, input_size)
            conv_out = self.conv(dummy).shape[1]
        # +1 input for the scalar step fraction appended in forward().
        self.shared = nn.Sequential(nn.Linear(conv_out+1, hidden), nn.ReLU())
        self.mu = nn.Linear(hidden, action_dim)
        # log-std initialised to -1 for every action dimension.
        self.log_std = nn.Parameter(torch.zeros(action_dim)-1.0)
        self.value = nn.Linear(hidden,1)

    def forward(self, x, step_frac):
        """Compute policy parameters and state value.

        Args:
            x: observation batch of shape ``(B, in_channels, H, W)``.
            step_frac: 1-D tensor of shape ``(B,)`` holding episode progress.

        Returns:
            ``(mu, std, value)`` with ``mu`` of shape ``(B, action_dim)`` in
            [-1, 1], ``std`` of shape ``(action_dim,)`` and ``value`` of
            shape ``(B,)``.
        """
        z = self.conv(x)
        z = torch.cat([z, step_frac.unsqueeze(1)], dim=1)
        h = self.shared(z)
        mu = torch.tanh(self.mu(h))
        std = torch.exp(self.log_std)
        value = self.value(h).squeeze(-1)
        return mu, std, value


In [None]:
def save_full_painting_gif(mode, target_path, model, device, num_strokes=120, size=128, duration=0.12):
    """Paint one episode with the policy mean and save it as a timelapse GIF.

    A fresh 32x32 ``PaintEnvMulti`` is created for ``mode``/``target_path``.
    The GIF starts with the blank canvas and then holds, for every stroke, the
    canvas before and after the stroke.

    Args:
        mode: environment mode string.
        target_path: target passed to ``PaintEnvMulti`` (may be ``None``).
        model: policy network returning ``(mu, std, value)``.
        device: torch device the model lives on.
        num_strokes: maximum number of strokes (also the env's ``max_steps``).
        size: side length in pixels of the saved frames.
        duration: per-frame duration forwarded to ``imageio.mimsave``.

    Returns:
        The path of the written GIF, ``full_paint_timelapse_{mode}.gif``.
    """
    # Local import: this function is defined before the cell that imports imageio.
    import imageio
    from PIL import Image as PILImage

    env = PaintEnvMulti(mode=mode, target=target_path, canvas_size=32, max_steps=num_strokes)
    obs = env.reset()

    def snapshot():
        """Return the current canvas as a resized uint8 frame."""
        return np.asarray(PILImage.fromarray((env.canvas*255).astype('uint8')).resize((size,size)))

    # Blank canvas first.
    frames = [snapshot()]

    for step in range(num_strokes):
        # Canvas before the stroke.
        frames.append(snapshot())

        # Inference only: no autograd graph is needed to pick the action.
        with torch.no_grad():
            obs_t = torch.tensor(obs).unsqueeze(0).to(device)
            step_frac = torch.tensor([step/env.max_steps], dtype=torch.float32).to(device)
            mu, std, _ = model(obs_t, step_frac)

        # Use the policy mean (no sampling noise) for the final rendering.
        action = map_action_raw_to_env(mu.detach()[0].cpu().numpy())

        obs, reward, done, _ = env.step(action)

        # Canvas after the stroke.
        frames.append(snapshot())

        if done:
            break

    outpath = f'full_paint_timelapse_{mode}.gif'
    imageio.mimsave(outpath, frames, duration=duration)
    print("Saved:", outpath)
    return outpath


In [None]:
# Training loop that accepts mode and optional target path
import torch.optim as optim
import torch.nn.functional as F
from collections import namedtuple
from torch.distributions import Normal
import imageio, numpy as np
# One recorded environment step of a rollout: the observation and step fraction
# fed to the model, the raw (unmapped) sampled action and its log-probability,
# the reward and done flag returned by the environment, and the critic's value.
Transition = namedtuple('Transition', ['obs','step_frac','action_raw','logp','reward','done','value'])

#convert raw network outputs to environment range
def map_action_raw_to_env(a_raw):
    """Map raw policy outputs from the nominal [-1, 1] range onto [0, 1].

    Values outside [-1, 1] (e.g. after adding sampling noise) are clamped to
    the ends of the [0, 1] range.
    """
    shifted = a_raw + 1.0
    return np.clip(shifted / 2.0, 0.0, 1.0)

#samples an action from a Normal Distribution and return the sample and its log- probability
def action_from_dist(mu, std):
    """Draw one action from ``Normal(mu, std)``.

    Returns:
        ``(raw, logp)`` - the reparameterised sample and its log-probability
        summed over the last (action) dimension.
    """
    policy = Normal(mu, std)
    sample = policy.rsample()
    log_prob = policy.log_prob(sample).sum(dim=-1)
    return sample, log_prob

#Main Training loop
def train_mode(mode='paint_by_number', target_path=None, num_iterations=200, episodes_per_iter=6, max_steps=30, lr=3e-4, gamma=0.99, save_every=60):
    """Train an actor-critic painter for one mode and write demo GIFs.

    Every iteration collects ``episodes_per_iter`` episodes with the current
    policy, computes discounted returns per episode, and takes one Adam step
    on policy-gradient loss (advantage = return - value) + 0.5 * value MSE
    - 0.01 * entropy.

    Args:
        mode: environment mode string.
        target_path: target passed to ``PaintEnvMulti`` (``None`` uses the
            environment's generated target).
        num_iterations: number of collect-and-update iterations.
        episodes_per_iter: episodes collected per iteration (must be >= 1).
        max_steps: steps per episode (must be >= 1).
        lr: Adam learning rate.
        gamma: discount factor.
        save_every: accepted for backward compatibility.
            NOTE(review): this value is currently ignored - demo GIFs are
            written on iteration 1 and every 10th iteration. Confirm whether
            it should control the demo cadence.

    Raises:
        ValueError: if ``episodes_per_iter`` or ``max_steps`` is below 1.
    """
    if episodes_per_iter < 1 or max_steps < 1:
        # Without at least one transition the update batch cannot be built.
        raise ValueError("episodes_per_iter and max_steps must both be >= 1")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    env = PaintEnvMulti(mode=mode, target=target_path, canvas_size=32, max_steps=max_steps, n_regions=6)
    # Creative mode observes the canvas only; other modes see canvas + target.
    in_ch = 3 if mode=='creative' else 6
    model = ActorCritic(in_channels=in_ch, action_dim=8, hidden=128).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for it in range(1, num_iterations+1):
        transitions = []
        rewards = []
        mses = []
        for ep in range(episodes_per_iter):
            obs = env.reset()
            ep_reward = 0.0
            for step in range(max_steps):
                # Rollout is inference only; log-probs are recomputed with
                # gradients in the batched update below.
                with torch.no_grad():
                    obs_t = torch.tensor(obs).unsqueeze(0).to(device)
                    step_frac = torch.tensor([step/max_steps], dtype=torch.float32).to(device)
                    mu, std, value = model(obs_t, step_frac)
                    raw, logp = action_from_dist(mu, std)
                action = raw.cpu().numpy()[0]
                action_mapped = map_action_raw_to_env(action)
                next_obs, reward, done, _info = env.step(action_mapped)
                transitions.append(Transition(obs=obs.copy(), step_frac=step/max_steps, action_raw=action, logp=logp.cpu().numpy(), reward=reward, done=done, value=value.cpu().numpy()))
                obs = next_obs
                ep_reward += reward
                if done: break
            rewards.append(ep_reward)
            if mode != 'creative' and env.target is not None:
                mses.append(np.mean((env.canvas - env.target)**2))

        # Discounted returns, walking backwards; the done flag zeroes the
        # running return at each episode boundary.
        returns = []
        R = 0.0
        for tr in reversed(transitions):
            R = tr.reward + gamma * R * (1.0 - float(tr.done))
            returns.append(R)
        returns.reverse()
        returns = torch.tensor(returns, dtype=torch.float32).to(device)

        obs_batch = torch.tensor(np.stack([tr.obs for tr in transitions], axis=0)).to(device)
        step_batch = torch.tensor([tr.step_frac for tr in transitions], dtype=torch.float32).to(device)
        action_raw_batch = torch.tensor(np.stack([tr.action_raw for tr in transitions], axis=0)).to(device)

        mu_b, std_b, value_b = model(obs_batch, step_batch)
        dist = Normal(mu_b, std_b)
        logp_b = dist.log_prob(action_raw_batch).sum(dim=-1)
        advantages = returns - value_b.detach()
        value_loss = F.mse_loss(value_b, returns)
        policy_loss = -(advantages.detach() * logp_b).mean()
        entropy_loss = -dist.entropy().sum(dim=-1).mean() * 0.01
        loss = policy_loss + 0.5 * value_loss + entropy_loss

        optimizer.zero_grad(); loss.backward(); optimizer.step()

        if it % 10 == 0:
            avg_r = float(np.mean(rewards))
            avg_mse = float(np.mean(mses)) if mses else 0.0
            print(f"Iter {it:4d} | AvgReward {avg_r:+.4f} | AvgMSE {avg_mse:.6f} | Loss {loss.item():.4f}")

        # Save a demo GIF on the first iteration and every 10th iteration.
        if it % 10 == 0 or it == 1:
            frames = demo_mode_frames(env, model, device, num_strokes=max_steps, repeat_each=3, overlay_thickness=2)
            fname = f'paint_mode_{mode}_iter{it}.gif'
            # duration controls playback speed; increase for slower playback.
            imageio.mimsave(fname, frames, duration=0.22)
            print('Saved demo:', fname)


    print('Training finished for mode:', mode)
    print("Generating final stroke-by-stroke GIF...")
    save_full_painting_gif(mode, target_path, model, device, num_strokes=max_steps)


#creates stroke visible GIF
def demo_mode_frames(env, model, device, num_strokes=30, repeat_each=2, overlay_thickness=2, deterministic=False):
    """
    Generate frames for a slow, stroke-visible GIF.

    The environment is reset and stepped with actions from ``model``. Each
    frame is the canvas resized to 128x128; in the line-drawing modes the most
    recent stroke is redrawn on top with a thicker line so it stands out.

    - num_strokes: maximum number of actions to apply.
    - repeat_each: how many times to repeat each frame (slows GIF).
    - overlay_thickness: extra thickness for a visibly highlighted stroke overlay.
    - deterministic: if True use the policy mean; if False (default) sample
      ``mu + std * noise`` for every action.

    Returns a list of uint8 RGB arrays: the blank canvas, one (repeated) frame
    per stroke, and finally the target image when the environment has one.
    """
    obs = env.reset()
    frames = []
    from PIL import Image as PILImage, ImageDraw as PILDraw

    # initial canvas
    canvas = PILImage.fromarray((env.canvas*255).astype('uint8')).resize((128,128))
    frames.append(np.asarray(canvas))

    s = env.canvas_size
    # factor from canvas pixels to the 128x128 presentation frame
    scale = 128.0 / float(s)
    # Paint-by-number actions are region fills ([region, r, g, b, ...]), not
    # line endpoints, so there is no stroke geometry to highlight.
    has_stroke = env.mode != 'paint_by_number'

    for step in range(num_strokes):
        # Inference only: no autograd graph is needed to pick the action.
        with torch.no_grad():
            obs_t = torch.tensor(obs).unsqueeze(0).to(device)
            step_frac = torch.tensor([step/env.max_steps], dtype=torch.float32).to(device)
            mu, std, _ = model(obs_t, step_frac)
            if deterministic:
                raw_t = mu
            else:
                raw_t = mu + std * torch.randn_like(mu)
        raw = raw_t.detach()[0].cpu().numpy()
        action_mapped = map_action_raw_to_env(raw)

        # apply the action to the env (this updates env.canvas)
        obs, reward, done, _ = env.step(action_mapped)

        # take a snapshot of the updated canvas and resize for presentation
        canvas = PILImage.fromarray((env.canvas*255).astype('uint8')).resize((128,128))
        vis = canvas.copy().convert('RGBA')

        if has_stroke:
            # action layout in line-drawing modes:
            # [x0, y0, x1, y1, r, g, b, thickness]
            x0, y0, x1, y1, r, g, b, thickness = action_mapped
            x0_px = int(np.clip(x0,0,1)*(s-1)); y0_px = int(np.clip(y0,0,1)*(s-1))
            x1_px = int(np.clip(x1,0,1)*(s-1)); y1_px = int(np.clip(y1,0,1)*(s-1))
            w_px = int(1 + thickness * (s//4) * 1.0)

            draw = PILDraw.Draw(vis)
            # scale coords to resized canvas
            ox0, oy0, ox1, oy1 = int(x0_px*scale), int(y0_px*scale), int(x1_px*scale), int(y1_px*scale)
            overlay_width = max(2, int(w_px * scale * overlay_thickness))

            # contour strokes are black; other modes use the stroke colour
            if env.mode == 'contour':
                draw.line((ox0,oy0,ox1,oy1), fill=(0,0,0,255), width=overlay_width)
            else:
                vis_color = (int(np.clip(r,0,1)*255), int(np.clip(g,0,1)*255), int(np.clip(b,0,1)*255), 255)
                draw.line((ox0,oy0,ox1,oy1), fill=vis_color, width=overlay_width)

        # convert back to an RGB array once and hold it for repeat_each frames
        # (always at least one) so each stroke shows in slow motion
        frame = np.asarray(vis.convert('RGB'))
        frames.extend([frame] * max(1, repeat_each))

        if done:
            break

    # finally append the target image (if any) so viewers can compare
    if env.target is not None:
        target = PILImage.fromarray((env.target*255).astype('uint8')).resize((128,128))
        frames.append(np.asarray(target))

    return frames



## Run training for selected mode

Set `MODE` (above) and `TARGET_IMAGE_PATH` (upload cell) then run the cell below. Example: `train_mode(mode='contour', target_path=TARGET_IMAGE_PATH, num_iterations=200)`

In [None]:
# Example run (edit parameters as desired). Uses MODE variable from earlier cell and uploaded TARGET_IMAGE_PATH.
# Fall back to None when the upload cell was never run and the name is undefined.
TARGET_IMAGE_PATH = globals().get('TARGET_IMAGE_PATH')

print('Running mode =', MODE, 'target =', TARGET_IMAGE_PATH)
train_mode(
    mode=MODE,
    target_path=TARGET_IMAGE_PATH,
    num_iterations=120,
    episodes_per_iter=6,
    max_steps=30,
    save_every=60,
)


Running mode = contour target = penguin.png
Saved demo: paint_mode_contour_iter1.gif
Iter   10 | AvgReward -5.1042 | AvgMSE 0.321707 | Loss -0.3553
Saved demo: paint_mode_contour_iter10.gif
Iter   20 | AvgReward -4.7878 | AvgMSE 0.267949 | Loss 0.7089
Saved demo: paint_mode_contour_iter20.gif
Iter   30 | AvgReward -4.8739 | AvgMSE 0.256703 | Loss 1.1088
Saved demo: paint_mode_contour_iter30.gif
Iter   40 | AvgReward -4.8967 | AvgMSE 0.252604 | Loss -0.6901
Saved demo: paint_mode_contour_iter40.gif
Iter   50 | AvgReward -4.6471 | AvgMSE 0.268326 | Loss 1.7715
Saved demo: paint_mode_contour_iter50.gif
Iter   60 | AvgReward -4.7329 | AvgMSE 0.273923 | Loss -0.1421
Saved demo: paint_mode_contour_iter60.gif
Iter   70 | AvgReward -4.8385 | AvgMSE 0.285763 | Loss 0.2020
Saved demo: paint_mode_contour_iter70.gif
Iter   80 | AvgReward -4.7715 | AvgMSE 0.265250 | Loss 0.6219
Saved demo: paint_mode_contour_iter80.gif
Iter   90 | AvgReward -4.6940 | AvgMSE 0.275868 | Loss 0.9259
Saved demo: paint_

## View generated demo GIFs

In [None]:
import glob, os, re, IPython.display as disp

def _demo_sort_key(path):
    """Order demo GIFs by name prefix, then by numeric iteration (iter10 < iter90 < iter120)."""
    match = re.search(r'_iter(\d+)\.gif$', path)
    if match is None:
        return (path, -1)
    return (path[:match.start()], int(match.group(1)))

# Per-iteration demo GIFs written during training, in iteration order.
demo_gifs = sorted(glob.glob('paint_mode_*.gif'), key=_demo_sort_key)

# Final stroke-by-stroke timelapse GIFs written when training finishes.
final_gifs = sorted(glob.glob('full_paint_timelapse_*.gif'))

# Display demo GIFs
print("=== Demo GIFs (Iteration Previews) ===")
if not demo_gifs:
    print('No demo GIFs found yet. Run training to produce them.')
else:
    for g in demo_gifs:
        print('Found:', g)
        disp.display(disp.Image(g))

# Display final timelapse GIFs
print("\n=== Final Timelapse GIFs ===")
if not final_gifs:
    print('No final timelapse GIFs found yet. They are written when training finishes.')
else:
    for g in final_gifs:
        print('Found:', g)
        disp.display(disp.Image(g))

# Download instructions
print("\n=== Download Instructions ===")
if demo_gifs:
    # Pick the most recently written file rather than the last name in sort order.
    latest_demo = max(demo_gifs, key=os.path.getmtime)
    print(f"To download latest demo GIF:\nfrom google.colab import files; files.download('{latest_demo}')")
if final_gifs:
    latest_final = max(final_gifs, key=os.path.getmtime)
    print(f"To download latest final timelapse GIF:\nfrom google.colab import files; files.download('{latest_final}')")

In [None]:
# Colab-only: ask the browser to download the final timelapse GIF named for the 'stylization' mode.
# NOTE(review): presumably this file exists only after a stylization training run has finished - confirm before running.
from google.colab import files; files.download('full_paint_timelapse_stylization.gif')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Colab-only: ask the browser to download the iteration-90 demo GIF named for the 'stylization' mode.
# NOTE(review): presumably this file exists only after a stylization run has reached iteration 90 - confirm before running.
from google.colab import files; files.download('paint_mode_stylization_iter90.gif')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Colab-only: ask the browser to download the iteration-90 demo GIF named for the 'stylization' mode.
# NOTE(review): this repeats an earlier download cell with the same file name - confirm whether it is still needed.
from google.colab import files; files.download('paint_mode_stylization_iter90.gif')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Colab-only: ask the browser to download the final timelapse GIF named for the 'stylization' mode.
# NOTE(review): this repeats an earlier download cell with the same file name - confirm whether it is still needed.
from google.colab import files; files.download('full_paint_timelapse_stylization.gif')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>