# From Audio to Photoreal Embodiment: Synthesizing Humans in Conversations

In this tutorial, we will walk you through setting up the environment and running the gradio app that will let you drive a photorealistic avatar using your voice.

More useful links: [arXiv](https://arxiv.org/abs/2401.01885) | [Code](https://github.com/facebookresearch/audio2photoreal/) | [Project page](https://people.eecs.berkeley.edu/~evonne_ng/projects/audio2photoreal/)

# Environment setup
Simply run through all of the 3 cells below. This will install the proper environment, download assets, and place them in the right places.

In [165]:
# Setup environment and install requirements
!pip install -r scripts/requirements.txt

Collecting einops==0.7.0 (from -r scripts/requirements.txt (line 2))
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fairseq==0.12.2 (from -r scripts/requirements.txt (line 3))
  Using cached fairseq-0.12.2-cp39-cp39-linux_x86_64.whl
Collecting gradio==4.31.3 (from -r scripts/requirements.txt (line 4))
  Downloading gradio-4.31.3-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting gradio_client==0.16.3 (from -r scripts/requirements.txt (line 5))
  Downloading gradio_client-0.16.3-py3-none-any.whl (315 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.8/315.8 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub==0.19.4 (from -r scripts/requirements.txt (line 6))

In [4]:
# download models, rendering assets, and prerequisite models respectively
!wget http://audio2photoreal_models.berkeleyvision.org/PXB184_models.tar
!tar xvf PXB184_models.tar
!rm PXB184_models.tar

!mkdir -p checkpoints/ca_body/data/
!wget https://github.com/facebookresearch/ca_body/releases/download/v0.0.1-alpha/PXB184.tar.gz
!tar xvf PXB184.tar.gz --directory checkpoints/ca_body/data/
!rm PXB184.tar.gz

!wget http://audio2photoreal_models.berkeleyvision.org/asset_models.tar
!tar xvf asset_models.tar
!rm asset_models.tar

--2024-10-15 12:52:34--  http://audio2photoreal_models.berkeleyvision.org/PXB184_models.tar
Resolving audio2photoreal_models.berkeleyvision.org (audio2photoreal_models.berkeleyvision.org)... 128.32.162.150
Connecting to audio2photoreal_models.berkeleyvision.org (audio2photoreal_models.berkeleyvision.org)|128.32.162.150|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1350748160 (1.3G) [application/octet-stream]
Saving to: 'PXB184_models.tar'


2024-10-15 12:56:16 (5.83 MB/s) - 'PXB184_models.tar' saved [1350748160/1350748160]

checkpoints/diffusion/c1_face/model000155000.pt
checkpoints/diffusion/c1_face/args.json
checkpoints/diffusion/c1_pose/model000340000.pt
checkpoints/diffusion/c1_pose/args.json
checkpoints/guide/c1_pose/args.json
checkpoints/guide/c1_pose/checkpoints/iter-0100000.pt
checkpoints/vq/c1_pose/args.json
checkpoints/vq/c1_pose/net_iter300000.pth
--2024-10-15 12:56:43--  https://github.com/facebookresearch/ca_body/releases/downlo

In [62]:
# install pytorch3d

import sys
import torch
pyt_version_str=torch.__version__.split("+")[0].replace(".", "")
version_str="".join([
    f"py3{sys.version_info.minor}_cu",
    torch.version.cuda.replace(".",""),
    f"_pyt{pyt_version_str}"
])
print(version_str)
!pip install fvcore iopath
!pip install "git+https://github.com/facebookresearch/pytorch3d.git"

py39_cu118_pyt250
Collecting fvcore
  Using cached fvcore-0.1.5.post20221221-py3-none-any.whl
Collecting iopath
  Using cached iopath-0.1.10-py3-none-any.whl
Collecting yacs>=0.1.6 (from fvcore)
  Downloading yacs-0.1.8-py3-none-any.whl (14 kB)
Collecting termcolor>=1.1 (from fvcore)
  Downloading termcolor-2.5.0-py3-none-any.whl (7.8 kB)
Installing collected packages: yacs, termcolor, iopath, fvcore
Successfully installed fvcore-0.1.5.post20221221 iopath-0.1.10 termcolor-2.5.0 yacs-0.1.8
[0mCollecting git+https://github.com/facebookresearch/pytorch3d.git
  Cloning https://github.com/facebookresearch/pytorch3d.git to /tmp/pip-req-build-rgbba8qd
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/pytorch3d.git /tmp/pip-req-build-rgbba8qd
  Resolved https://github.com/facebookresearch/pytorch3d.git to commit e13848265d9d57927fca99d13061e8fba8d468d0
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages:

In [51]:
# Smoke test: build a tiny mesh on the GPU and push it through the PyTorch3D rasterizer.
import pytorch3d
import torch
from pytorch3d.renderer import MeshRasterizer, PerspectiveCameras, RasterizationSettings
from pytorch3d.structures import Meshes

# A simple test mesh: four random vertices and two triangles sharing the edge (0, 2).
vertices = torch.rand((4, 3), device="cuda")
faces = torch.tensor([[0, 1, 2], [0, 2, 3]], device="cuda")
mesh = Meshes(verts=[vertices], faces=[faces])

# Rendering test: camera and raster settings are both left at the library defaults.
cameras = PerspectiveCameras(device="cuda")
raster_settings = RasterizationSettings()
rasterizer = MeshRasterizer(cameras=cameras, raster_settings=raster_settings)

# Try to rasterize the mesh.
fragments = rasterizer(mesh)

ModuleNotFoundError: No module named 'pytorch3d'

In [79]:
# Report the installed versions of torch, torchvision and torchaudio, one per line.
import torch
import torchvision
import torchaudio

print(
    torch.__version__,
    torchvision.__version__,
    torchaudio.__version__,
    sep="\n",
)

ModuleNotFoundError: No module named 'torchvision'

In [55]:
# End-to-end PyTorch3D check: import the library, report its version, and
# render an icosphere with a Phong shader.
import torch
import numpy as np
import matplotlib.pyplot as plt

# Check that PyTorch3D imports successfully. The imports live inside the try
# block so the hint below is actually reachable; the error is re-raised so the
# original traceback is still shown.
try:
    import pytorch3d
    from pytorch3d.structures import Meshes
    from pytorch3d.io import load_objs_as_meshes
    from pytorch3d.renderer import (
        PerspectiveCameras,
        RasterizationSettings,
        MeshRenderer,
        MeshRasterizer,
        SoftPhongShader,
        PointLights,
    )
    from pytorch3d.utils import ico_sphere
except ImportError:
    print("PyTorch3D 未正确安装，请检查安装。")
    raise

# Report the version of PyTorch3D itself (previously torch.__version__ was
# printed under this label).
print(f"PyTorch3D version: {pytorch3d.__version__}")

# Create a simple 3D model (an icosphere); fall back to the CPU without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
mesh = ico_sphere(4, device)  # more subdivision levels give a denser mesh

# Set up the camera.
# NOTE(review): no R/T is passed, so the camera uses the PerspectiveCameras
# defaults — confirm the sphere is actually in view from there.
cameras = PerspectiveCameras(device=device)

# Set up the light source.
lights = PointLights(device=device, location=[[0.0, 0.0, -3.0]])

# Rasterization settings.
raster_settings = RasterizationSettings(
    image_size=512,
    blur_radius=0.0,
    faces_per_pixel=1,
)

# Build the renderer from a rasterizer and a shader.
renderer = MeshRenderer(
    rasterizer=MeshRasterizer(cameras=cameras, raster_settings=raster_settings),
    shader=SoftPhongShader(device=device, cameras=cameras, lights=lights)
)

# Render the image and display its first three channels, dropping any further channel.
images = renderer(mesh)
plt.figure(figsize=(7, 7))
plt.imshow(images[0, ..., :3].cpu().numpy())
plt.axis("off")
plt.show()


ImportError: cannot import name 'Image' from 'PIL' (unknown location)

In [59]:
import torch

# Report whether CUDA is usable, which CUDA version this PyTorch build targets,
# and the PyTorch version itself.
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"PyTorch version: {torch.__version__}")

CUDA available: True
CUDA version: 12.0
PyTorch version: 2.4.1.post303


# Run the model
**Important!!** Before you can run the model, there are two things you must fix.


1.   Fix the `collections` imports. With Python >= 3.10, Google Colab will complain `ImportError: cannot import name 'Mapping' from 'collections'`.
As a result, you *will* need to manually correct the import in every file that the environment complains about, changing it from `collections` to `collections.abc` (for example, `from collections import Mapping` becomes `from collections.abc import Mapping`). You can just directly click into those files and change the import. See [this post](https://stackoverflow.com/questions/69381312/importerror-cannot-import-name-mapping-from-collections-using-python-3-10) for more details.

2.   Change the demo script to deploy a public link. You will need to go into `audio2photoreal/demo/demo.py` and on line 272, change from `demo.launch(show_api=False)` to `demo.launch(share=True)`

These are all the file paths I had to change:


* /usr/local/lib/python3.10/dist-packages/attrdict/mapping.py
* /usr/local/lib/python3.10/dist-packages/attrdict/mixins.py
* /usr/local/lib/python3.10/dist-packages/attrdict/merge.py
* /usr/local/lib/python3.10/dist-packages/attrdict/default.py
* /content/audio2photoreal/demo/demo.py

If anyone knows how to revert colab to python==3.9 and would like to share that tidbit with me, would greatly appreciate an email ping :)

After you finish those two changes, you can go ahead and run the below cell. It will return a *public URL* that you can click into.


In [57]:
# Quick check that PyTorch can see a CUDA device; the boolean result is the
# cell's displayed output.
import torch
torch.cuda.is_available()

True

In [164]:
!python -m pip install pip==23.1.1

Collecting pip==23.1.1
  Using cached pip-23.1.1-py3-none-any.whl.metadata (4.1 kB)
Using cached pip-23.1.1-py3-none-any.whl (2.1 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.3.1
    Uninstalling pip-24.3.1:
      Successfully uninstalled pip-24.3.1
Successfully installed pip-23.1.1


In [161]:
!pip install gradio
!pip install attrdict
!pip install fairseq
!pip install mediapy

Collecting gradio
  Using cached gradio-4.44.1-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Using cached aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting anyio<5.0,>=3.0 (from gradio)
  Using cached anyio-4.6.2.post1-py3-none-any.whl.metadata (4.7 kB)
Collecting fastapi<1.0 (from gradio)
  Using cached fastapi-0.115.4-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Using cached ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Using cached gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Using cached httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub>=0.19.3 (from gradio)
  Using cached huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting importlib-resources<7.0,>=1.3 (from gradio)
  Using cached importlib_resources-6.4.5-py3-none-any.whl.metadata (4.0 kB)
Collecting matp

In [162]:
!pip install tensorboard
!pip install scikit-learn
!pip install PyYAML
!pip install einops
!pip install opencv-python


Collecting tensorboard
  Using cached tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting absl-py>=0.4 (from tensorboard)
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting grpcio>=1.48.2 (from tensorboard)
  Downloading grpcio-1.67.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting markdown>=2.6.8 (from tensorboard)
  Using cached Markdown-3.7-py3-none-any.whl.metadata (7.0 kB)
Collecting protobuf!=4.24.0,>=3.19.6 (from tensorboard)
  Downloading protobuf-5.28.3-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard)
  Using cached tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl.metadata (1.1 kB)
Collecting werkzeug>=1.0.1 (from tensorboard)
  Downloading werkzeug-3.0.6-py3-none-any.whl.metadata (3.7 kB)
Collecting importlib-metadata>=4.4 (from markdown>=2.6.8->tensorboard)
  Using cached importlib_metadata-8.5.0-

In [1]:
# Run the `demo` module as a script, using the `python` executable found on PATH.
!python -m demo

Traceback (most recent call last):
  File "/home/shannon/.conda/envs/cas/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/shannon/.conda/envs/cas/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/mnt/d/Work/virtualCAS3/demo.py", line 15, in <module>
    import torchaudio
  File "/home/shannon/.conda/envs/cas/lib/python3.9/site-packages/torchaudio/__init__.py", line 2, in <module>
    from . import _extension  # noqa  # usort: skip
  File "/home/shannon/.conda/envs/cas/lib/python3.9/site-packages/torchaudio/_extension/__init__.py", line 38, in <module>
    _load_lib("libtorchaudio")
  File "/home/shannon/.conda/envs/cas/lib/python3.9/site-packages/torchaudio/_extension/utils.py", line 60, in _load_lib
    torch.ops.load_library(path)
  File "/home/shannon/.conda/envs/cas/lib/python3.9/site-packages/torch/_ops.py", line 1295, in load_library
    ctypes.CDLL(path)
  File 