# GPT

가장 주인공 같은 인물, 혹은 움직임을 주기에 가장 적합해 보이는 인물 하나를 지정해서 설명해 달라고 요청해야 할 것 같다.

seg_prompt는 위치로 인물을 지정하는 것이 가장 정확한 듯하다. 또한 인물이 손에 들고 있는 물건까지 언급해 주면 그 물건도 함께 마스킹되도록 하였다. 다만 이미지의 다른 곳에도 있는 흔한 물건까지 포함시키면, 전혀 다른 위치에 있는 같은 종류의 물건도 같은 마스크에 포함되어 버린다.

motion_prompt는 다른 물건 언급 없이 동작 자체만 설명하는 것이 좋을 것 같다. 

In [None]:
# GPT prompt used to pick one character on the page and describe its motion.
# The API could not be called from this workspace, so the prompt was run
# externally and its result was copied into the notebook by hand.

completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are an expert in analyzing storybook illustrations and describing animations for human-like characters. Return results in a dictionary format."},
        {
            # The output template below is a well-formed dictionary so that it
            # agrees with the "dictionary format" requested in the system message.
            "role": "user", "content": """
Analyze the provided storybook page image.
1. Recognize the texts and illustrations on the storybook image.
2. Choose one human character that you think would be best to animate.
3. Describe the character's position on the image and what he/she is holding.
4. Suggest a simple motion in HumanML3D style based on context. You should focus only on the character's motion and do not include any other objects.

Output Format:
{
    "name": "One-word name for the character",
    "description": "Name and location and object. ex) A boy on left side holding a gun.",
    "motion": "Simple motion description for the character in HumanML3D dataset style. Start with 'A person is'."
}
"""
        },
        # The page image is passed inline as a base64-encoded JPEG data URL.
        {"role": "user", "content": [{"type": "image_url", "image_url": {"url":  f"data:image/jpeg;base64,{base64_image}"}}]}
    ]
)

# 데이터 경로 지정

In [1]:
import os
import json

# Name of the page image to process; every path below is derived from it.
image_name = "a1"
data_dir = f"/home/jovyan/data/axe/{image_name}/"  # working directory for this book/page
image_path = f"{data_dir}{image_name}.png"

# Sub-directories for the character assets and for the inpainting files.
char_dir = f"{data_dir}char/"
inpaint_dir = f"{data_dir}inpaint/"
for directory in (char_dir, inpaint_dir):
    os.makedirs(directory, exist_ok=True)

motion_path = f"{char_dir}{image_name}.bvh"

seg_prompt = "A boy on the left side."  # prompt used for segmentation
motion_prompt = "\"A person is running energetically with a skipping motion.\""  # prompt used for motion generation

# Saved to a separate file so the same variables can be used after switching kernels.
env_vars = dict(
    data_dir=data_dir,
    image_name=image_name,
    image_path=image_path,
    char_dir=char_dir,
    motion_path=motion_path,
    inpaint_dir=inpaint_dir,
    seg_prompt=seg_prompt,
    motion_prompt=motion_prompt,
)

with open("env_vars.json", "w") as f:
    json.dump(env_vars, f)

# Segmentation

In [1]:
import json

# Read the shared variables back from env_vars.json.
with open("env_vars.json", "r") as f:
    env_vars = json.load(f)

# Bind each stored value to a notebook-level name (None when a key is absent).
(
    data_dir,
    image_name,
    image_path,
    char_dir,
    motion_path,
    inpaint_dir,
    seg_prompt,
    motion_prompt,
) = map(
    env_vars.get,
    (
        "data_dir",
        "image_name",
        "image_path",
        "char_dir",
        "motion_path",
        "inpaint_dir",
        "seg_prompt",
        "motion_prompt",
    ),
)

In [None]:
from PIL import Image
from lang_sam import LangSAM
import numpy as np
import cv2
import yaml

In [3]:
# Build the LangSAM model that the segmentation cell queries with a text prompt.
sam_type = "sam2.1_hiera_large"
model = LangSAM(sam_type=sam_type)

In [None]:
import os  # local import: this cell's import list does not bring in os

# Segment the character described by seg_prompt on the full page image.
image_pil = Image.open(image_path).convert("RGB")
result = model.predict([image_pil], [seg_prompt])[0]

# Stop with a readable message when nothing was detected; otherwise the
# reductions below fail on empty input with an unrelated-looking error.
if len(result["boxes"]) == 0:
    raise ValueError(f"No region matched seg_prompt {seg_prompt!r} in {image_path}")

mask = np.logical_or.reduce(result["masks"]).astype(int)  # merge every generated mask into one
mask = np.asarray(mask * 255, dtype=np.uint8)

kernel = np.ones((10, 10), np.uint8)
dilated_mask = cv2.dilate(mask, kernel, iterations=1)  # dilated mask used for inpainting

# Union of all detected boxes as (left, top, right, bottom).
bbox = np.array([np.min(result["boxes"][:, 0]), np.min(result["boxes"][:, 1]), np.max(result["boxes"][:, 2]), np.max(result["boxes"][:, 3])], dtype=np.float32)

# Convert to built-in ints (safe for slicing and for YAML) and clamp to the page.
# Without clamping, PIL pads an out-of-range crop while NumPy slicing clips or
# wraps, so texture.png and mask.png could end up with different sizes.
# assumes mask has the same height/width as image_pil — TODO confirm
page_width, page_height = image_pil.size
l, t, r, b = [int(round(float(x))) for x in bbox]
l, t = max(l, 0), max(t, 0)
r, b = min(r, page_width), min(b, page_height)

cropped_image = image_pil.crop((l, t, r, b))
cropped_mask = mask[t:b, l:r]

# Store the bounding box position within the full image separately.
with open(os.path.join(char_dir, "bounding_box.yaml"), 'w') as f:
    yaml.dump({
        'left': l,
        'top': t,
        'right': r,
        'bottom': b
    }, f)

# Save the crop around the mask; it is the texture used to build the character.
cropped_image.save(os.path.join(char_dir, "texture.png"))
mask = Image.fromarray(mask)
dilated_mask = Image.fromarray(dilated_mask)
cropped_mask = Image.fromarray(cropped_mask)

mask.save(os.path.join(data_dir, image_name + "_mask.png"))
dilated_mask.save(os.path.join(inpaint_dir, image_name + "_mask.png"))
cropped_mask.save(os.path.join(char_dir, "mask.png"))

# Inpainting

In [1]:
import json

# Read the shared variables back from env_vars.json.
with open("env_vars.json", "r") as f:
    env_vars = json.load(f)

# Bind each stored value to a notebook-level name (None when a key is absent).
(
    data_dir,
    image_name,
    image_path,
    char_dir,
    motion_path,
    inpaint_dir,
    seg_prompt,
    motion_prompt,
) = map(
    env_vars.get,
    (
        "data_dir",
        "image_name",
        "image_path",
        "char_dir",
        "motion_path",
        "inpaint_dir",
        "seg_prompt",
        "motion_prompt",
    ),
)

In [None]:
import shutil
# Copy the original page into inpaint_dir, where the dilated mask was saved,
# so the image and its mask sit in the same input directory.
shutil.copy(image_path, inpaint_dir + image_name + ".png")

# Inpaint the background with the character removed.
# indir and outdir are both inpaint_dir.
# NOTE(review): the result is presumably written under the mask's file name
# (<image_name>_mask.png), which a later cell loads as the background — confirm
# against lama's bin/predict.py.
!cd lama && export TORCH_HOME=$(pwd) && export PYTHONPATH=$(pwd) && python bin/predict.py model.path=$(pwd)/big-lama indir={inpaint_dir} outdir={inpaint_dir}

# Pose Estimation

In [1]:
import json

# Read the shared variables back from env_vars.json.
with open("env_vars.json", "r") as f:
    env_vars = json.load(f)

# Bind each stored value to a notebook-level name (None when a key is absent).
(
    data_dir,
    image_name,
    image_path,
    char_dir,
    motion_path,
    inpaint_dir,
    seg_prompt,
    motion_prompt,
) = map(
    env_vars.get,
    (
        "data_dir",
        "image_name",
        "image_path",
        "char_dir",
        "motion_path",
        "inpaint_dir",
        "seg_prompt",
        "motion_prompt",
    ),
)

In [None]:
import torch, torchvision
import mmpose
from IPython.display import Image, display
import cv2
import os
import yaml
import numpy as np
from mmpose.apis import (inference_top_down_pose_model, init_pose_model, vis_pose_result)

In [None]:
# Cropped character image written by the segmentation step.
texture_path = char_dir + "/texture.png"
cropped = cv2.imread(texture_path)

# Custom model fine-tuned on sketch images, as provided by Meta.
pose_config = '/home/jovyan/mmpose/config.py'
pose_checkpoint = '/home/jovyan/mmpose/best_AP_epoch_72.pth'
pose_model = init_pose_model(pose_config, pose_checkpoint)

# Estimate the joint positions on the cropped character image.
pose_results, _ = inference_top_down_pose_model(
    pose_model,
    texture_path,
    person_results=None,
)

# Optional debug visualisation of the estimated pose (left disabled):
# vis_result = vis_pose_result(
#     pose_model,
#     char_dir + "/image.png",
#     pose_results)

# cv2.imwrite("test.png", vis_result)
# display(Image("test.png"))

In [None]:
# Keep only the (x, y) columns of the first detection's keypoints.
# assumes the rows follow the COCO-17 keypoint order (0 nose, 5/6 shoulders,
# 11/12 hips, ...) — TODO confirm against the pose model's config
kpts = np.array(pose_results[0]['keypoints'])[:, :2]


def _to_pixel(point):
    """Round an (x, y) pair and return it as a list of built-in ints.

    The explicit int(...) keeps NumPy scalar types out of the YAML file,
    regardless of what round() returns for a NumPy scalar.
    """
    return [int(round(float(value))) for value in point]


mid_hip = (kpts[11] + kpts[12]) / 2
mid_shoulder = (kpts[5] + kpts[6]) / 2

# (joint name, location, parent joint) in the order written to char_cfg.yaml.
joint_specs = [
    ('root',           mid_hip,      None),
    ('hip',            mid_hip,      'root'),
    ('torso',          mid_shoulder, 'hip'),
    ('neck',           kpts[0],      'torso'),
    ('right_shoulder', kpts[6],      'torso'),
    ('right_elbow',    kpts[8],      'right_shoulder'),
    ('right_hand',     kpts[10],     'right_elbow'),
    ('left_shoulder',  kpts[5],      'torso'),
    ('left_elbow',     kpts[7],      'left_shoulder'),
    ('left_hand',      kpts[9],      'left_elbow'),
    ('right_hip',      kpts[12],     'root'),
    ('right_knee',     kpts[14],     'right_hip'),
    ('right_foot',     kpts[16],     'right_knee'),
    ('left_hip',       kpts[11],     'root'),
    ('left_knee',      kpts[13],     'left_hip'),
    ('left_foot',      kpts[15],     'left_knee'),
]
skeleton = [
    {'loc': _to_pixel(location), 'name': name, 'parent': parent}
    for name, location, parent in joint_specs
]

# Store the joint positions together with the texture's height and width.
char_cfg = {'skeleton': skeleton, 'height': cropped.shape[0], 'width': cropped.shape[1]}

with open(char_dir + '/char_cfg.yaml', 'w') as f:
    yaml.dump(char_cfg, f)

# Draw every joint and its name on a copy of the texture for visual inspection.
joint_overlay = cropped.copy()
for joint in skeleton:
    x, y = joint['loc']
    name = joint['name']
    cv2.circle(joint_overlay, (int(x), int(y)), 5, (0, 0, 0), 5)
    cv2.putText(joint_overlay, name, (int(x), int(y+15)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1, 2)
cv2.imwrite(char_dir + '/joint_overlay.png', joint_overlay)

# Motion Generation

In [1]:
import json

# Read the shared variables back from env_vars.json.
with open("env_vars.json", "r") as f:
    env_vars = json.load(f)

# Bind each stored value to a notebook-level name (None when a key is absent).
(
    data_dir,
    image_name,
    image_path,
    char_dir,
    motion_path,
    inpaint_dir,
    seg_prompt,
    motion_prompt,
) = map(
    env_vars.get,
    (
        "data_dir",
        "image_name",
        "image_path",
        "char_dir",
        "motion_path",
        "inpaint_dir",
        "seg_prompt",
        "motion_prompt",
    ),
)

In [None]:
# Generate the motion from the text prompt.
# NOTE(review): the stored motion_prompt already contains literal double quotes;
# the surrounding "" pairs are presumably empty shell strings, so the embedded
# quotes are what keep the prompt together as one argument — confirm before
# changing either side.
!cd momask-codes && python gen_t2m.py --gpu_id 0 --ext exp1 --motion_length 100 --text_prompt ""{motion_prompt}""

In [None]:
import shutil
import glob
import os

source_dir = '/home/jovyan/momask-codes/generation/exp1/animations/0'


def _latest_file(pattern):
    """Return the path matching *pattern* with the largest os.path.getctime.

    Raises:
        FileNotFoundError: if no file matches, instead of the opaque
            "max() arg is an empty sequence" error.
    """
    matches = glob.glob(pattern)
    if not matches:
        raise FileNotFoundError(
            f"No file matches {pattern!r}; run the motion generation cell first."
        )
    return max(matches, key=os.path.getctime)


# Copy the newest generated .bvh motion to motion_path.
latest_file = _latest_file(f'{source_dir}/*.bvh')
shutil.copy(latest_file, motion_path)

# Copy the newest generated .mp4 into the character directory.
latest_file = _latest_file(f'{source_dir}/*.mp4')
shutil.copy(latest_file, char_dir+image_name+".mp4")

# Animating

In [1]:
import json

# Read the shared variables back from env_vars.json.
with open("env_vars.json", "r") as f:
    env_vars = json.load(f)

# Bind each stored value to a notebook-level name (None when a key is absent).
(
    data_dir,
    image_name,
    image_path,
    char_dir,
    motion_path,
    inpaint_dir,
    seg_prompt,
    motion_prompt,
) = map(
    env_vars.get,
    (
        "data_dir",
        "image_name",
        "image_path",
        "char_dir",
        "motion_path",
        "inpaint_dir",
        "seg_prompt",
        "motion_prompt",
    ),
)

In [2]:
import yaml
from PIL import Image

def _load_cfg(path):
    """Parse the YAML file at *path* with yaml.safe_load."""
    with open(path, 'r') as stream:
        return yaml.safe_load(stream)


def _save_cfg(path, cfg):
    """Write *cfg* back to *path* in block style."""
    with open(path, 'w') as stream:
        yaml.dump(cfg, stream, default_flow_style=False)


# The render window is 1.5x the character texture in each dimension.
image = char_dir + "texture.png"
with Image.open(image) as img:
    width, height = img.size
window_width, window_height = int(width * 1.5), int(height * 1.5)

new_character_cfg = char_dir + "char_cfg.yaml"
video_path = data_dir+ image_name + ".gif"
new_bvh = motion_path

# Update the config file used for rendering: character config, output path, window size.
yaml_path = "/home/jovyan/data/animating/config/mvc/pbl.yaml"
config = _load_cfg(yaml_path)
config['scene']['ANIMATED_CHARACTERS'][0]['character_cfg'] = new_character_cfg
config['controller']['OUTPUT_VIDEO_PATH'] = video_path
config['view']['WINDOW_DIMENSIONS'] = [window_width, window_height]  # resize the output video
_save_cfg(yaml_path, config)

# Update the motion config so it points at the generated BVH file.
yaml_path = "/home/jovyan/data/animating/config/motion/test.yaml"
config = _load_cfg(yaml_path)
config['filepath'] = new_bvh
_save_cfg(yaml_path, config)

In [None]:
from animated_drawings import render
# Apply the generated motion to the segmented image using the joint positions;
# the result is saved as a gif file.
mvc_config_path = '/home/jovyan/data/animating/config/mvc/pbl.yaml'
render.start(mvc_config_path)

# 배경 합치기

In [None]:
from moviepy.editor import VideoFileClip, ImageClip, CompositeVideoClip
import yaml

# Load the character's bounding box on the full page.
yaml_file = char_dir + "bounding_box.yaml"
with open(yaml_file, 'r') as file:
    bounding_box = yaml.safe_load(file)

left, right = bounding_box['left'], bounding_box['right']
top, bottom = bounding_box['top'], bounding_box['bottom']
center_x, center_y = (left + right) / 2, (top + bottom) / 2

# Animated character gif written by the rendering step.
video_path = data_dir+ image_name + ".gif"
gif_file = video_path
clip = VideoFileClip(gif_file, has_mask=True)

# Top-left position that puts the clip's centre on the bounding-box centre.
adjusted_x = center_x - (clip.w / 2)
adjusted_y = center_y - (clip.h / 2)

# Still background, given the same duration and fps as the gif.
background_file = inpaint_dir + image_name + "_mask.png"
background = ImageClip(background_file).set_duration(clip.duration).set_fps(clip.fps)

# Combine the inpainted background with the gif.
layers = [background, clip.set_position((adjusted_x, adjusted_y))]
final_clip = CompositeVideoClip(layers)

# Save the result as a video file.
final_clip.write_videofile(image_name + ".mp4")