In [1]:
# Load IPython's autoreload extension and switch it to mode 2 so that edits to
# imported project modules are picked up on the next cell run, without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [35]:
import sys

# Put the parent directory on the import path so the `src.*` packages used by
# the following cells can be imported. The membership guard keeps repeated
# runs of this cell from stacking duplicate entries.
if sys.path.count("../") == 0:
    sys.path.extend(["../"])
    print("[sys.path]:", sys.path)

In [36]:
import torch

from src.model.actor import ACTORStyleEncoder, ACTORStyleDecoder

In [37]:
import os

# Directory that holds the three pretrained TMR checkpoints.
# The default is the original machine-specific location; set the
# TMR_WEIGHTS_DIR environment variable to point the notebook at another
# checkout without editing this cell. A trailing "/" is stripped so the joined
# paths never contain a doubled separator.
WEIGHTS_DIR = os.environ.get(
    "TMR_WEIGHTS_DIR",
    "/home/nadir/motion-linner/models/tmr_humanml3d_guoh3dfeats/last_weights",
).rstrip("/")

# Kept as plain strings (not Path objects) so downstream code sees the same
# type as before.
MOTION_DECODER_WEIGHTS_PATH = f"{WEIGHTS_DIR}/motion_decoder.pt"
MOTION_ENCODER_WEIGHTS_PATH = f"{WEIGHTS_DIR}/motion_encoder.pt"
TEXT_ENCODER_WEIGHTS_PATH = f"{WEIGHTS_DIR}/text_encoder.pt"

In [38]:
# Reference only (not executed): text-embedding entries copied from the project's Hydra config.
# text_to_token_emb:
#   _target_: src.data.text.TokenEmbeddings
#   path: datasets/annotations/${hydra:runtime.choices.data}
#   modelname: distilbert-base-uncased
#   preload: true

# text_to_sent_emb:
#   _target_: src.data.text.SentenceEmbeddings
#   path: datasets/annotations/${hydra:runtime.choices.data}
#   modelname: sentence-transformers/all-mpnet-base-v2
#   preload: true

In [None]:
from transformers import AutoModel, AutoTokenizer

# One checkpoint id shared by both loaders, so the tokenizer vocabulary and the
# model weights always come from the same pretrained release.
_TEXT_CHECKPOINT = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(_TEXT_CHECKPOINT)
text_model = AutoModel.from_pretrained(_TEXT_CHECKPOINT)

print("Text processing setup complete - ready to use with TMR text encoder")

Text processing setup complete - ready to use with TMR text encoder


In [None]:
# Transformer hyper-parameters shared by all three ACTOR-style modules; only
# `nfeats` (and `vae` for the encoders) differs between them.
_actor_kwargs = dict(
    latent_dim=256,
    ff_size=1024,
    num_layers=6,
    num_heads=4,
    dropout=0.1,
    activation='gelu',
)

# NOTE(review): torch.load unpickles the checkpoint file, which can execute
# arbitrary code -- only point these paths at checkpoints you trust.

# Motion encoder: consumes 263-dimensional motion features.
motion_encoder = ACTORStyleEncoder(nfeats=263, vae=True, **_actor_kwargs)
motion_encoder.load_state_dict(
    torch.load(MOTION_ENCODER_WEIGHTS_PATH, map_location='cpu')
)

# Text encoder: consumes 768-dimensional token embeddings.
text_encoder = ACTORStyleEncoder(nfeats=768, vae=True, **_actor_kwargs)
text_encoder.load_state_dict(
    torch.load(TEXT_ENCODER_WEIGHTS_PATH, map_location='cpu')
)

# Motion decoder: produces 263-dimensional motion features.
motion_decoder = ACTORStyleDecoder(nfeats=263, **_actor_kwargs)
# Deliberately the last bare expression of the cell, so its return value (the
# key-matching report) is what the notebook displays.
motion_decoder.load_state_dict(
    torch.load(MOTION_DECODER_WEIGHTS_PATH, map_location='cpu'),
)

<All keys matched successfully>

In [None]:
def process_text_for_tmr(text, max_length=77, return_mask=False):
    """Turn raw text into token-level features for the TMR text encoder.

    The text is tokenised with the module-level ``tokenizer`` (padded or
    truncated to exactly ``max_length`` tokens) and passed through the
    module-level ``text_model`` with gradients disabled.

    Args:
        text: Input passed straight to ``tokenizer``.
        max_length: Fixed token length every input is padded/truncated to.
        return_mask: When True, also return the tokenizer's attention mask as
            a boolean tensor (True for real tokens, False for padding) so the
            caller can keep padded positions out of downstream attention.
            Defaults to False, which preserves the original single-tensor
            return value.

    Returns:
        The model's ``last_hidden_state``; the notebook's sample run shows
        shape ``(1, 77, 768)`` for one sentence. With ``return_mask=True`` a
        ``(features, mask)`` tuple is returned instead.
    """
    tokens = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

    # Pure feature extraction: no autograd graph is needed.
    with torch.no_grad():
        text_features = text_model(**tokens).last_hidden_state

    if return_mask:
        # The tokenizer marks real tokens with 1 and padding with 0.
        return text_features, tokens["attention_mask"].to(torch.bool)

    return text_features

sample_text = "a person walks forward and then turns around"
text_features = process_text_for_tmr(sample_text)
print(f"Text features shape: {text_features.shape}")

# process_text_for_tmr pads every sentence to a fixed length, so an all-True
# mask would present the padding positions to the encoder as real tokens.
# Re-tokenise with the same padded length and use the tokenizer's attention
# mask instead (True = real token, False = padding).
# NOTE(review): assumes the encoder treats True as "valid position" -- confirm
# against ACTORStyleEncoder.forward.
mask = tokenizer(
    sample_text,
    padding='max_length',
    truncation=True,
    max_length=text_features.shape[1],
    return_tensors='pt',
)["attention_mask"].to(torch.bool)

# Inference only: disable dropout and gradient tracking.
text_encoder.eval()
with torch.no_grad():
    text_latent = text_encoder({"x": text_features, "mask": mask})
    print(f"Text latent shape: {text_latent.shape}")

Text features shape: torch.Size([1, 77, 768])
Text latent shape: torch.Size([1, 2, 256])


In [None]:
from src.model.tmr import TMR

# Per-term loss weights handed to the model constructor.
_loss_weights = {"recons": 1.0, "latent": 1.0e-5, "kl": 1.0e-5, "contrastive": 0.1}

# Wrap the three pretrained components in a single TMR model.
tmr_model = TMR(
    motion_encoder=motion_encoder,
    text_encoder=text_encoder,
    motion_decoder=motion_decoder,
    vae=True,
    lmd=_loss_weights,
    lr=1e-4,
    temperature=0.7,
    threshold_selfsim=0.80,
    threshold_selfsim_metrics=0.95,
)
print("TMR model created successfully!")

# Report the size of each component.
_motion_encoder_size = sum(weight.numel() for weight in motion_encoder.parameters())
_text_encoder_size = sum(weight.numel() for weight in text_encoder.parameters())
_motion_decoder_size = sum(weight.numel() for weight in motion_decoder.parameters())
print(f"Motion encoder parameters: {_motion_encoder_size}")
print(f"Text encoder parameters: {_text_encoder_size}")
print(f"Motion decoder parameters: {_motion_decoder_size}")

# Switch everything to evaluation mode, in the same order as before.
for _module in (tmr_model, motion_encoder, text_encoder):
    _module.eval()
# Kept as the final bare expression so the cell's displayed value is unchanged.
motion_decoder.eval()

In [None]:
def generate_motion_from_text(text, motion_length=120):
    """Generate a motion from a text prompt with the pretrained TMR model.

    Args:
        text: Prompt handed to ``process_text_for_tmr``.
        motion_length: Number of output frames requested; it sets the length
            of the all-True mask passed to the model as ``mask``.

    Returns:
        Whatever ``tmr_model`` returns for the call (the notebook prints its
        ``.shape``).

    Raises:
        ValueError: If ``motion_length`` is smaller than 1.
    """
    if motion_length < 1:
        raise ValueError(
            f"motion_length must be a positive integer, got {motion_length}"
        )

    text_features = process_text_for_tmr(text)

    # The text encoder is called elsewhere in this notebook with a dict holding
    # both "x" and "mask"; without "mask" the model has no validity mask for
    # the text input. process_text_for_tmr pads to a fixed length, so recover
    # which positions are real tokens by re-tokenising at that same length.
    # NOTE(review): assumes True means "valid position" -- confirm against
    # ACTORStyleEncoder.forward.
    text_mask = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=text_features.shape[1],
        return_tensors='pt',
    )["attention_mask"].to(torch.bool)

    text_x_dict = {
        "x": text_features,
        "mask": text_mask,
        # Count of real (non-padding) tokens per sentence, kept consistent
        # with the mask above.
        "length": text_mask.sum(dim=1).tolist(),
    }

    # Every requested output frame is valid.
    mask = torch.ones(1, motion_length, dtype=torch.bool)

    with torch.no_grad():
        generated_motion = tmr_model(text_x_dict, mask=mask)

    return generated_motion

# Prompts used to smoke-test text-to-motion generation.
sample_texts = [
    "a person walks forward",
    "someone is dancing",
    "a person sits down and then stands up",
    "running in a circle",
]

print("Generating motions from text descriptions...")
# Best-effort demo: a failure on one prompt is reported and the loop moves on
# to the next prompt instead of aborting the cell.
for position, text in enumerate(sample_texts, start=1):
    try:
        motion = generate_motion_from_text(text)
    except Exception as e:
        print(f"✗ Error with text {position}: '{text}' -> {e}")
    else:
        print(f"✓ Generated motion {position}: '{text}' -> shape: {motion.shape}")

print("\nYou can now use the pretrained TMR model for text-to-motion generation!")

In [27]:
# Bare expression: shows the text encoder's `seqTransEncoder` attribute as the
# cell output.
# NOTE(review): the output recorded under this cell is an AttributeError about
# a 'str' object lacking 'dtype', which this attribute access alone would not
# obviously raise -- the output may be left over from an earlier, different
# execution; re-run on a fresh kernel to confirm.
text_encoder.seqTransEncoder

AttributeError: 'str' object has no attribute 'dtype'

In [3]:
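# Reference only (not executed): Hydra model config (TEMOS target) listing the encoder/decoder hyper-parameters.
# _target_: src.model.TEMOS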

# motion_encoder:
#   _target_: src.model.ACTORStyleEncoder
#   nfeats: ${data.motion_loader.nfeats}
#   vae: true
#   latent_dim: 256
#   ff_size: 1024
#   num_layers: 6
#   num_heads: 4
#   dropout: 0.1
#   activation: gelu

# text_encoder:
#   _target_: src.model.ACTORStyleEncoder
#   nfeats: 768
#   vae: true
#   latent_dim: 256
#   ff_size: 1024
#   num_layers: 6
#   num_heads: 4
#   dropout: 0.1
#   activation: gelu

# motion_decoder:
#   _target_: src.model.ACTORStyleDecoder
#   nfeats: ${data.motion_loader.nfeats}
#   latent_dim: 256
#   ff_size: 1024
#   num_layers: 6
#   num_heads: 4
#   dropout: 0.1
#   activation: gelu

# vae: true

# lmd:
#   recons: 1.0
#   latent: 1.0e-5
#   kl: 1.0e-5

# lr: 1e-4