# **TIPS: Text-Induced Pose Synthesis**

This notebook demonstrates the inference pipeline of TIPS.

*Accepted at the European Conference on Computer Vision (ECCV) 2022.*

https://prasunroy.github.io/tips


## Getting started

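Download and extract the required resources. **Warning:** the cell below first runs `rm * -Rf`, which deletes the contents of the current working directory, so run it only in a disposable workspace (e.g. a fresh Colab session).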

In [None]:
# WARNING: the next line recursively force-deletes everything matched by `*`
# in the current working directory. Run this cell only in a disposable
# workspace.
!rm * -Rf
# Download the resource archive from Google Drive and save it as tips.zip.
# NOTE(review): the URL carries confirm/uuid/at query tokens -- presumably
# session-specific; verify the link still works before relying on it.
!wget "https://drive.google.com/uc?id=1zTG9M06ckW0z4MvJks3-JSC8sJguiZJH&confirm=t&uuid=eff55096-c74c-413e-874f-c8ca2ea4cb27&at=ALgDtsy0Pr78iHzB8TK5eTUpy5j2:1676916580137" -O tips.zip
# Extract quietly (-q), overwriting existing files (-o); the archive is
# removed only when unzip succeeds (&&).
!unzip -oq tips.zip && rm tips.zip
# List the extracted contents as a quick sanity check.
!ls -lah

Import dependencies

In [None]:
import datetime
import numpy as np
import os
import pandas as pd
from PIL import Image

In [None]:
from tips import TIPS
from tips import visualize_skeletons, visualize

In [None]:
from google.colab import files

Configure environment

In [None]:
# Seeded generator: makes the noise vector `z` below reproducible across runs.
prng = np.random.default_rng(1)

# Checkpoint files passed to the TIPS constructor.
ckpt_text2pose = './checkpoints/text2pose_75000.pth'
ckpt_refinenet = './checkpoints/refinenet_100.pth'
ckpt_pose2pose = './checkpoints/pose2pose_260500.pth'

# One timestamp per execution of this cell keeps the outputs of different
# runs in separate folders.
timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

data_root = './data'
save_root_df2df = f'./output/{timestamp}/df2df'
save_root_df2rw = f'./output/{timestamp}/df2rw'

# Per-image tables indexed by file_id, plus the two lists of (imgA, imgB)
# test pairs. All paths are derived from data_root so the data folder can be
# relocated by editing a single line.
keypoints = pd.read_csv(f'{data_root}/keypoints.csv', index_col='file_id')
encodings = pd.read_csv(f'{data_root}/encodings.csv', index_col='file_id')
img_descs = pd.read_csv(f'{data_root}/descriptions.csv', index_col='file_id')
img_pairs_df2df = pd.read_csv(f'{data_root}/img_pairs_df2df.csv')
img_pairs_df2rw = pd.read_csv(f'{data_root}/img_pairs_df2rw.csv')

# Font file handed to visualize(); bbox is the box passed to Image.crop()
# for every panel.
font = f'{data_root}/FreeMono.ttf'
bbox = (40, 0, 216, 256)


def file_id(path):
    """Return the base name of `path` without its directory and extension.

    The result is used as the row label for the `file_id`-indexed tables.
    """
    return os.path.splitext(os.path.basename(path))[0]


# exist_ok=True makes the cell safe to re-run without a separate isdir check.
os.makedirs(save_root_df2df, exist_ok=True)
os.makedirs(save_root_df2rw, exist_ok=True)

# Sample a random noise vector from a standard normal distribution
z = prng.normal(size=128).astype(np.float32)

## Initialize TIPS

In [None]:
# Build the TIPS object from the three checkpoint paths set in the
# configuration cell (text2pose, refinenet, pose2pose -- passed positionally
# in that order). Presumably this loads the model weights; confirm in tips.
tips = TIPS(ckpt_text2pose, ckpt_refinenet, ckpt_pose2pose)

## Generation with DeepFashion targets (*within distribution*)

#### Load a random test sample

In [None]:
# Draw the pair index from the seeded generator `prng` instead of NumPy's
# unseeded global state, so a fresh "Restart & Run All" always shows the same
# sample. Re-running this cell advances the generator and picks another pair.
# `high` is exclusive, matching the range of np.random.randint(0, n).
index = int(prng.integers(0, len(img_pairs_df2df)))

# Relative paths (under data_root) of the source (A) and target (B) images.
pair = img_pairs_df2df.iloc[index]
fpA = pair.imgA
fpB = pair.imgB

source_image = Image.open(f'{data_root}/{fpA}')
target_image = Image.open(f'{data_root}/{fpB}')

# Values at positions 2..37 of the keypoints row (36 integers).
# NOTE(review): presumably 18 (x, y) keypoints -- confirm against keypoints.csv.
source_keypoints = keypoints.loc[file_id(fpA)].values[2:38].astype(np.int32)
target_keypoints = keypoints.loc[file_id(fpB)].values[2:38].astype(np.int32)

# First 84 values of the encodings row, as float32.
source_text_encoding = encodings.loc[file_id(fpA)].values[0:84].astype(np.float32)
target_text_encoding = encodings.loc[file_id(fpB)].values[0:84].astype(np.float32)

# Text descriptions printed next to the text guided results.
source_text_description = img_descs.loc[file_id(fpA)].description
target_text_description = img_descs.loc[file_id(fpB)].description

#### Keypoints guided benchmark

In [None]:
# Keypoints guided baseline: generate the target view from the source image
# and the source/target keypoints.
generated_image = tips.benchmark(source_image, source_keypoints, target_keypoints)

# (key, uncropped panel) pairs; both skeletons use the same head colour.
panels = [
    ('iA', source_image),
    ('iB', target_image),
    ('iB_k', generated_image),
    ('kA', Image.fromarray(
        visualize_skeletons([source_keypoints], head_color=(100, 255, 100)))),
    ('kB', Image.fromarray(
        visualize_skeletons([target_keypoints], head_color=(100, 255, 100)))),
]

# Every panel is cropped to the same bounding box before it is laid out.
images_dict = {key: panel.crop(bbox) for key, panel in panels}

# A single row of five panels.
layout = [
    ['iA', 'kA', 'iB', 'kB', 'iB_k'],
]

grid = visualize(images_dict, layout, True, font)

display(grid)

#### Partially text guided pipeline

In [None]:
# Partially text guided: the source pose comes from its keypoints, the target
# is specified by its text encoding together with the noise vector z.
out1 = tips.pipeline(source_image, source_keypoints, target_text_encoding, z)

# (key, uncropped panel) pairs. out1 supplies two generated images
# ('iB_c', 'iB_f') and two keypoint sets ('kB_c', 'kB_f'); each skeleton kind
# is drawn with its own head colour.
panels = [
    ('iA', source_image),
    ('iB', target_image),
    ('iB_c', out1['iB_c']),
    ('iB_f', out1['iB_f']),
    ('kA', Image.fromarray(
        visualize_skeletons([source_keypoints], head_color=(100, 255, 100)))),
    ('kB_c', Image.fromarray(
        visualize_skeletons([out1['kB_c']], head_color=(255, 100, 100)))),
    ('kB_f', Image.fromarray(
        visualize_skeletons([out1['kB_f']], head_color=(100, 100, 255)))),
]

# Every panel is cropped to the same bounding box before it is laid out.
images_dict = {key: panel.crop(bbox) for key, panel in panels}

# One row per generated variant ('_c' on top, '_f' below).
layout = [
    ['iA', 'kA', 'iB', 'kB_c', 'iB_c'],
    ['iA', 'kA', 'iB', 'kB_f', 'iB_f'],
]

grid = visualize(images_dict, layout, True, font)

display(grid)
# Print the description with one sentence per line.
print('\nTarget description:\n\n' + target_text_description.replace('. ', '.\n'))

#### Fully text guided pipeline

In [None]:
# Fully text guided: both the source and the target are specified by their
# text encodings; no ground-truth keypoints are passed in.
out2 = tips.pipeline_full(source_image, source_text_encoding, target_text_encoding, z)

# (key, uncropped panel) pairs. out2 supplies the generated images
# ('iB_c', 'iB_f') and keypoint sets for both source ('kA_*') and target
# ('kB_*'); '_c' and '_f' skeletons are drawn with different head colours.
panels = [
    ('iA', source_image),
    ('iB', target_image),
    ('iB_c', out2['iB_c']),
    ('iB_f', out2['iB_f']),
    ('kA_c', Image.fromarray(
        visualize_skeletons([out2['kA_c']], head_color=(255, 100, 100)))),
    ('kA_f', Image.fromarray(
        visualize_skeletons([out2['kA_f']], head_color=(100, 100, 255)))),
    ('kB_c', Image.fromarray(
        visualize_skeletons([out2['kB_c']], head_color=(255, 100, 100)))),
    ('kB_f', Image.fromarray(
        visualize_skeletons([out2['kB_f']], head_color=(100, 100, 255)))),
]

# Every panel is cropped to the same bounding box before it is laid out.
images_dict = {key: panel.crop(bbox) for key, panel in panels}

# One row per generated variant ('_c' on top, '_f' below).
layout = [
    ['iA', 'kA_c', 'iB', 'kB_c', 'iB_c'],
    ['iA', 'kA_f', 'iB', 'kB_f', 'iB_f'],
]

grid = visualize(images_dict, layout, True, font)

display(grid)
# Print both descriptions with one sentence per line.
print('\nSource description:\n\n' + source_text_description.replace('. ', '.\n'))
print('\nTarget description:\n\n' + target_text_description.replace('. ', '.\n'))

## Generation with Real World targets (*out of distribution*)

#### Load a random test sample

In [None]:
# Draw the pair index from the seeded generator `prng` instead of NumPy's
# unseeded global state, so a fresh "Restart & Run All" always shows the same
# sample. Re-running this cell advances the generator and picks another pair.
# `high` is exclusive, matching the range of np.random.randint(0, n).
index = int(prng.integers(0, len(img_pairs_df2rw)))

# Relative paths (under data_root) of the source (A) and target (B) images.
pair = img_pairs_df2rw.iloc[index]
fpA = pair.imgA
fpB = pair.imgB

source_image = Image.open(f'{data_root}/{fpA}')
target_image = Image.open(f'{data_root}/{fpB}')

# Values at positions 2..37 of the keypoints row (36 integers).
# NOTE(review): presumably 18 (x, y) keypoints -- confirm against keypoints.csv.
source_keypoints = keypoints.loc[file_id(fpA)].values[2:38].astype(np.int32)
target_keypoints = keypoints.loc[file_id(fpB)].values[2:38].astype(np.int32)

# First 84 values of the encodings row, as float32.
source_text_encoding = encodings.loc[file_id(fpA)].values[0:84].astype(np.float32)
target_text_encoding = encodings.loc[file_id(fpB)].values[0:84].astype(np.float32)

# Text descriptions printed next to the text guided results.
source_text_description = img_descs.loc[file_id(fpA)].description
target_text_description = img_descs.loc[file_id(fpB)].description

#### Keypoints guided benchmark

In [None]:
# Keypoints guided baseline: generate the target view from the source image
# and the source/target keypoints.
generated_image = tips.benchmark(source_image, source_keypoints, target_keypoints)

# (key, uncropped panel) pairs; both skeletons use the same head colour.
panels = [
    ('iA', source_image),
    ('iB', target_image),
    ('iB_k', generated_image),
    ('kA', Image.fromarray(
        visualize_skeletons([source_keypoints], head_color=(100, 255, 100)))),
    ('kB', Image.fromarray(
        visualize_skeletons([target_keypoints], head_color=(100, 255, 100)))),
]

# Every panel is cropped to the same bounding box before it is laid out.
images_dict = {key: panel.crop(bbox) for key, panel in panels}

# A single row of five panels.
layout = [
    ['iA', 'kA', 'iB', 'kB', 'iB_k'],
]

grid = visualize(images_dict, layout, True, font)

display(grid)

#### Partially text guided pipeline

In [None]:
# Partially text guided: the source pose comes from its keypoints, the target
# is specified by its text encoding together with the noise vector z.
out1 = tips.pipeline(source_image, source_keypoints, target_text_encoding, z)

# (key, uncropped panel) pairs. out1 supplies two generated images
# ('iB_c', 'iB_f') and two keypoint sets ('kB_c', 'kB_f'); each skeleton kind
# is drawn with its own head colour.
panels = [
    ('iA', source_image),
    ('iB', target_image),
    ('iB_c', out1['iB_c']),
    ('iB_f', out1['iB_f']),
    ('kA', Image.fromarray(
        visualize_skeletons([source_keypoints], head_color=(100, 255, 100)))),
    ('kB_c', Image.fromarray(
        visualize_skeletons([out1['kB_c']], head_color=(255, 100, 100)))),
    ('kB_f', Image.fromarray(
        visualize_skeletons([out1['kB_f']], head_color=(100, 100, 255)))),
]

# Every panel is cropped to the same bounding box before it is laid out.
images_dict = {key: panel.crop(bbox) for key, panel in panels}

# One row per generated variant ('_c' on top, '_f' below).
layout = [
    ['iA', 'kA', 'iB', 'kB_c', 'iB_c'],
    ['iA', 'kA', 'iB', 'kB_f', 'iB_f'],
]

grid = visualize(images_dict, layout, True, font)

display(grid)
# Print the description with one sentence per line.
print('\nTarget description:\n\n' + target_text_description.replace('. ', '.\n'))

#### Fully text guided pipeline

In [None]:
# Fully text guided: both the source and the target are specified by their
# text encodings; no ground-truth keypoints are passed in.
out2 = tips.pipeline_full(source_image, source_text_encoding, target_text_encoding, z)

# (key, uncropped panel) pairs. out2 supplies the generated images
# ('iB_c', 'iB_f') and keypoint sets for both source ('kA_*') and target
# ('kB_*'); '_c' and '_f' skeletons are drawn with different head colours.
panels = [
    ('iA', source_image),
    ('iB', target_image),
    ('iB_c', out2['iB_c']),
    ('iB_f', out2['iB_f']),
    ('kA_c', Image.fromarray(
        visualize_skeletons([out2['kA_c']], head_color=(255, 100, 100)))),
    ('kA_f', Image.fromarray(
        visualize_skeletons([out2['kA_f']], head_color=(100, 100, 255)))),
    ('kB_c', Image.fromarray(
        visualize_skeletons([out2['kB_c']], head_color=(255, 100, 100)))),
    ('kB_f', Image.fromarray(
        visualize_skeletons([out2['kB_f']], head_color=(100, 100, 255)))),
]

# Every panel is cropped to the same bounding box before it is laid out.
images_dict = {key: panel.crop(bbox) for key, panel in panels}

# One row per generated variant ('_c' on top, '_f' below).
layout = [
    ['iA', 'kA_c', 'iB', 'kB_c', 'iB_c'],
    ['iA', 'kA_f', 'iB', 'kB_f', 'iB_f'],
]

grid = visualize(images_dict, layout, True, font)

display(grid)
# Print both descriptions with one sentence per line.
print('\nSource description:\n\n' + source_text_description.replace('. ', '.\n'))
print('\nTarget description:\n\n' + target_text_description.replace('. ', '.\n'))

## Generate all *within distribution* samples

This will generate all *within distribution* test samples for reproducibility.

Note: Output will be compressed and downloaded as a zip archive for offline viewing.


In [None]:
layout = [
    ['iA', 'kA',    'iB', 'kB',    'iB_k0'],
    ['iA', 'kA',    'iB', 'kB_c1', 'iB_c1'],
    ['iA', 'kA',    'iB', 'kB_f1', 'iB_f1'],
    ['iA', 'kA_c2', 'iB', 'kB_c2', 'iB_c2'],
    ['iA', 'kA_f2', 'iB', 'kB_f2', 'iB_f2']
]

for i in range(len(img_pairs_df2df)):
    fpA = img_pairs_df2df.iloc[i].imgA
    fpB = img_pairs_df2df.iloc[i].imgB
    
    source_text_encoding = encodings.loc[file_id(fpA)].values[0:84].astype(np.float32)
    target_text_encoding = encodings.loc[file_id(fpB)].values[0:84].astype(np.float32)
    
    source_keypoints = keypoints.loc[file_id(fpA)].values[2:38].astype(np.int32)
    target_keypoints = keypoints.loc[file_id(fpB)].values[2:38].astype(np.int32)
    
    source_image = Image.open(f'{data_root}/{fpA}')
    target_image = Image.open(f'{data_root}/{fpB}')
    
    iB_k = tips.benchmark(source_image, source_keypoints, target_keypoints)
    out1 = tips.pipeline(source_image, source_keypoints, target_text_encoding, z)
    out2 = tips.pipeline_full(source_image, source_text_encoding, target_text_encoding, z)
    
    images_dict = {
        'iA': source_image.crop(bbox),
        'iB': target_image.crop(bbox),
        'iB_k0': iB_k.crop(bbox),
        'iB_c1': out1['iB_c'].crop(bbox),
        'iB_f1': out1['iB_f'].crop(bbox),
        'iB_c2': out2['iB_c'].crop(bbox),
        'iB_f2': out2['iB_f'].crop(bbox),
        'kA': Image.fromarray(visualize_skeletons([source_keypoints], head_color=(100, 255, 100))).crop(bbox),
        'kB': Image.fromarray(visualize_skeletons([target_keypoints], head_color=(100, 255, 100))).crop(bbox),
        'kA_c2': Image.fromarray(visualize_skeletons([out2['kA_c']], head_color=(255, 100, 100))).crop(bbox),
        'kA_f2': Image.fromarray(visualize_skeletons([out2['kA_f']], head_color=(100, 100, 255))).crop(bbox),
        'kB_c1': Image.fromarray(visualize_skeletons([out1['kB_c']], head_color=(255, 100, 100))).crop(bbox),
        'kB_f1': Image.fromarray(visualize_skeletons([out1['kB_f']], head_color=(100, 100, 255))).crop(bbox),
        'kB_c2': Image.fromarray(visualize_skeletons([out2['kB_c']], head_color=(255, 100, 100))).crop(bbox),
        'kB_f2': Image.fromarray(visualize_skeletons([out2['kB_f']], head_color=(100, 100, 255))).crop(bbox),
    }
    
    grid = visualize(images_dict, layout, True, font)
    grid.save(f'{save_root_df2df}/{file_id(fpA)}____{file_id(fpB)}.png')
    print(f'\r[DF2DF] Testing TIPS inference pipeline... {i+1}/{len(img_pairs_df2df)}', end='')

print('')

!zip -rq tips_output_df2df.zip $save_root_df2df

files.download('tips_output_df2df.zip')

## Generate all *out of distribution* samples

This will generate all *out of distribution* test samples for reproducibility.

Note: Output will be compressed and downloaded as a zip archive for offline viewing.


In [None]:
layout = [
    ['iA', 'kA',    'iB', 'kB',    'iB_k0'],
    ['iA', 'kA',    'iB', 'kB_c1', 'iB_c1'],
    ['iA', 'kA',    'iB', 'kB_f1', 'iB_f1'],
    ['iA', 'kA_c2', 'iB', 'kB_c2', 'iB_c2'],
    ['iA', 'kA_f2', 'iB', 'kB_f2', 'iB_f2']
]

for i in range(len(img_pairs_df2rw)):
    fpA = img_pairs_df2rw.iloc[i].imgA
    fpB = img_pairs_df2rw.iloc[i].imgB
    
    source_text_encoding = encodings.loc[file_id(fpA)].values[0:84].astype(np.float32)
    target_text_encoding = encodings.loc[file_id(fpB)].values[0:84].astype(np.float32)
    
    source_keypoints = keypoints.loc[file_id(fpA)].values[2:38].astype(np.int32)
    target_keypoints = keypoints.loc[file_id(fpB)].values[2:38].astype(np.int32)
    
    source_image = Image.open(f'{data_root}/{fpA}')
    target_image = Image.open(f'{data_root}/{fpB}')
    
    iB_k = tips.benchmark(source_image, source_keypoints, target_keypoints)
    out1 = tips.pipeline(source_image, source_keypoints, target_text_encoding, z)
    out2 = tips.pipeline_full(source_image, source_text_encoding, target_text_encoding, z)
    
    images_dict = {
        'iA': source_image.crop(bbox),
        'iB': target_image.crop(bbox),
        'iB_k0': iB_k.crop(bbox),
        'iB_c1': out1['iB_c'].crop(bbox),
        'iB_f1': out1['iB_f'].crop(bbox),
        'iB_c2': out2['iB_c'].crop(bbox),
        'iB_f2': out2['iB_f'].crop(bbox),
        'kA': Image.fromarray(visualize_skeletons([source_keypoints], head_color=(100, 255, 100))).crop(bbox),
        'kB': Image.fromarray(visualize_skeletons([target_keypoints], head_color=(100, 255, 100))).crop(bbox),
        'kA_c2': Image.fromarray(visualize_skeletons([out2['kA_c']], head_color=(255, 100, 100))).crop(bbox),
        'kA_f2': Image.fromarray(visualize_skeletons([out2['kA_f']], head_color=(100, 100, 255))).crop(bbox),
        'kB_c1': Image.fromarray(visualize_skeletons([out1['kB_c']], head_color=(255, 100, 100))).crop(bbox),
        'kB_f1': Image.fromarray(visualize_skeletons([out1['kB_f']], head_color=(100, 100, 255))).crop(bbox),
        'kB_c2': Image.fromarray(visualize_skeletons([out2['kB_c']], head_color=(255, 100, 100))).crop(bbox),
        'kB_f2': Image.fromarray(visualize_skeletons([out2['kB_f']], head_color=(100, 100, 255))).crop(bbox),
    }
    
    grid = visualize(images_dict, layout, True, font)
    grid.save(f'{save_root_df2rw}/{file_id(fpA)}____{file_id(fpB)}.png')
    print(f'\r[DF2RW] Testing TIPS inference pipeline... {i+1}/{len(img_pairs_df2rw)}', end='')

print('')

!zip -rq tips_output_df2rw.zip $save_root_df2rw

files.download('tips_output_df2rw.zip')

# ***Thank you for checking out TIPS!***
