In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [3]:
pose_tensor = torch.tensor(np.load("train_data.npy"))
#切分训练/测试集
train_tensor = pose_tensor[:400]
test_tensor = pose_tensor[400:]

x1, x2, x3 = torch.chunk(train_tensor, chunks=3, dim=2)
x1 = x1.permute(2,0,3,1)[0]
D1 = x1[:,0,:9]
motion1 = x1[:,1:].reshape(-1,120,23,6)

x2 = x2.permute(2,0,3,1)[0]
D2 = x2[:,0,:9]
motion2 = x2[:,1:].reshape(-1,120,23,6)

x3 = x3.permute(2,0,3,1)[0]
cam = x3[:,1:,:6]

In [4]:
#38, 3, 121
#转换成标准的 PyTorch DataLoader 格式
from torch.utils.data import Dataset, DataLoader

class CustomMotionDataset(Dataset):
    def __init__(self, seq1, seq2, vec1, vec2, labels):
        self.seq1 = seq1.to(torch.float32)
        self.seq2 = seq2.to(torch.float32)
        self.vec1 = vec1.to(torch.float32)
        self.vec2 = vec2.to(torch.float32)
        self.labels = labels.to(torch.float32)

    def __len__(self):
        return len(self.seq1)

    def __getitem__(self, idx):
        return {
            "seq1": self.seq1[idx],
            "seq2": self.seq2[idx],
            "vec1": self.vec1[idx],
            "vec2": self.vec2[idx],
            "label": self.labels[idx]
        }
    

dataset = CustomMotionDataset(motion1, motion2, D1, D2, cam)

batch_size = 16
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4)

In [8]:
# build transformer，输入双人动作，输出时序编码，学习映射
class TimeSeriesTransformer(nn.Module):
    def __init__(self, input_dim, output_dim, embed_dim=512, num_heads=4, num_layers=2):
        super().__init__()
        self.embedding = nn.Linear(input_dim, embed_dim)
        encoder_layers = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=512, dropout=0.1, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layers, num_layers)
        self.output_layer = nn.Linear(embed_dim, output_dim)
    
    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x)
        return self.output_layer(x)

class VectorProcessor(nn.Module):
    def __init__(self, input_dim, embed_dim=512):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, embed_dim)
        )
    
    def forward(self, x):
        return self.fc(x)

#输出 Toric 摄像机参数
class FusionModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.series_model = TimeSeriesTransformer(input_dim=6*23, output_dim=512)
        self.vector_model = VectorProcessor(input_dim=9, embed_dim=512)
        self.fusion_fc = nn.Sequential(
            nn.Linear(512 * 4, 512),
            nn.ReLU(),
            nn.Linear(512, 6)
        )
    
    def forward(self, seq1, seq2, vec1, vec2):
        batch_size = seq1.shape[0]
        seq1 = seq1.view(batch_size, 120, -1)  # Reshape (120, 23, 6) -> (120, 23*6)
        seq2 = seq2.view(batch_size, 120, -1)
        
        seq1_feat = self.series_model(seq1)
        seq2_feat = self.series_model(seq2)
        vec1_feat = self.vector_model(vec1).unsqueeze(1).repeat(1, 120, 1)  # Expand to match time dimension
        vec2_feat = self.vector_model(vec2).unsqueeze(1).repeat(1, 120, 1)
        
        fused = torch.cat([seq1_feat, seq2_feat, vec1_feat, vec2_feat], dim=-1) #输出的6维对应论文中定义的 Toric 特征 (L2Dx, L2Dy, R2Dx, R2Dy, Theta, Phi)，即镜头左/右人物在屏幕上的归一化位置加开合角、仰俯角等。
        output = self.fusion_fc(fused)
        return output

In [9]:
model = FusionModel().to("cuda")
loss_fn = nn.MSELoss()

In [13]:
import torch.optim as optim

# 初始化模型

optimizer = optim.Adam(model.parameters(), lr=1e-5)


# 训练循环
epochs = 1000
for epoch in range(epochs):
    for batch in dataloader:
        seq1, seq2, vec1, vec2, labels = batch["seq1"], batch["seq2"], batch["vec1"], batch["vec2"], batch["label"]
        
        optimizer.zero_grad()
        output = model(seq1.to("cuda"), seq2.to("cuda"), vec1.to("cuda"), vec2.to("cuda"))
        loss = loss_fn(output, labels.to("cuda"))
        loss.backward()
        optimizer.step()
    
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

Epoch [1/1000], Loss: 1.0676
Epoch [2/1000], Loss: 1.1067
Epoch [3/1000], Loss: 0.8388
Epoch [4/1000], Loss: 0.8743
Epoch [5/1000], Loss: 0.9676
Epoch [6/1000], Loss: 0.8351
Epoch [7/1000], Loss: 1.1806
Epoch [8/1000], Loss: 0.7693
Epoch [9/1000], Loss: 0.5348
Epoch [10/1000], Loss: 0.6891
Epoch [11/1000], Loss: 0.4762
Epoch [12/1000], Loss: 0.6074
Epoch [13/1000], Loss: 0.6333
Epoch [14/1000], Loss: 0.5520
Epoch [15/1000], Loss: 0.4761
Epoch [16/1000], Loss: 0.5094
Epoch [17/1000], Loss: 0.4549
Epoch [18/1000], Loss: 0.8415
Epoch [19/1000], Loss: 0.4172
Epoch [20/1000], Loss: 0.4136
Epoch [21/1000], Loss: 0.5024
Epoch [22/1000], Loss: 0.7283
Epoch [23/1000], Loss: 0.4468
Epoch [24/1000], Loss: 0.3606
Epoch [25/1000], Loss: 0.4811
Epoch [26/1000], Loss: 0.5387
Epoch [27/1000], Loss: 0.6752
Epoch [28/1000], Loss: 0.3941
Epoch [29/1000], Loss: 0.5264
Epoch [30/1000], Loss: 0.4792
Epoch [31/1000], Loss: 0.3775
Epoch [32/1000], Loss: 0.5860
Epoch [33/1000], Loss: 0.2955
Epoch [34/1000], Lo

In [None]:
# Epoch [1000/1000], Loss: 0.0061 ，
# 用之前定义的 MSE（均方误差）来衡量模型输出和真实 Toric 特征的差距。越小说明预测越准。

In [10]:
#把训练好的权重存下来
import torch
torch.save(model.state_dict(), "model_no_att.pth")

In [13]:
model = FusionModel() # 先重新创建一个相同结构的网络
model.load_state_dict(torch.load("model_no_att.pth", weights_only=True))# 把磁盘上的权重加载进来
model = model.to("cuda")

In [14]:
test_tensor[10].shape

torch.Size([138, 3, 121])

In [19]:
#预先在训练集上计算好的各类特征的均值和标准差从磁盘上读进来，准备用于反标准化，恢复到真实物理量
Mean = torch.tensor(np.load("normalization/Mean.npy")).to("cpu") #动作特征均值
Std = torch.tensor(np.load("normalization/Std.npy")).to("cpu") #标准差
D_mean = torch.tensor(np.load("normalization/D_Mean.npy")).to("cpu") #根节点平移向量 D 的均值
D_std = torch.tensor(np.load("normalization/D_Std.npy")).to("cpu")
C_mean = torch.tensor(np.load("normalization/C_Mean.npy")).to("cpu") #Toric 相机参数 C 的均值
C_std = torch.tensor(np.load("normalization/C_Std.npy")).to("cpu") #标准差

#拆分
x1, x2, x3 = torch.chunk(test_tensor[-1], chunks=3, dim=1)
x1 = x1.permute(1,2,0)[0]
D1 = x1[0][:9].to(torch.float32)
motion1 = x1[1:].reshape(120,23,6).to(torch.float32)

x2 = x2.permute(1,2,0)[0]
D2 = x2[0][:9].to(torch.float32)
motion2 = x2[1:].reshape(120,23,6).to(torch.float32)

cam = model(motion1.to("cuda").unsqueeze(0), motion2.to("cuda").unsqueeze(0), 
            D1.to("cuda").unsqueeze(0), D2.to("cuda").unsqueeze(0))[0].to(torch.float32).to("cpu")

# 反标准化Denormalize，还原回真实坐标
motion1 = motion1*(Std)+Mean
motion2 = motion2*(Std)+Mean

D1 = D1*(D_std)+D_mean
D2 = D2*(D_std)+D_mean

cam = cam*(C_std)+C_mean

#保存结果到 .npy，准备可视化
np.save("../Camera/generated_data/new_joint_vecs/train/0_p0.npy",motion1)
np.save("../Camera/generated_data/new_joint_vecs/train/0_p1.npy",motion2)
np.save("../Camera/generated_data/canon_data/train/0_p0.npy",D1)
np.save("../Camera/generated_data/canon_data/train/0_p1.npy",D2)
np.save("../Camera/generated_data/camera/0.npy",cam.detach().numpy())

In [23]:
#可视化脚本把这些.npy文件绘出来

In [22]:
cam

tensor([[-0.3077,  0.3124,  0.2330,  0.3106,  3.0105,  0.0698],
        [-0.3094,  0.3102,  0.2323,  0.3084,  3.0063,  0.0668],
        [-0.3050,  0.3094,  0.2400,  0.3070,  2.9929,  0.0667],
        [-0.3014,  0.3094,  0.2361,  0.3056,  2.9873,  0.0769],
        [-0.3105,  0.3070,  0.2413,  0.3107,  2.9589,  0.0649],
        [-0.3171,  0.3017,  0.2473,  0.3131,  2.9468,  0.0726],
        [-0.3113,  0.2963,  0.2483,  0.3011,  2.9268,  0.0617],
        [-0.3170,  0.2909,  0.2339,  0.2991,  2.9536,  0.0646],
        [-0.3159,  0.2900,  0.2456,  0.2957,  2.9546,  0.0621],
        [-0.3106,  0.2893,  0.2369,  0.3016,  2.9262,  0.0580],
        [-0.3085,  0.2942,  0.2412,  0.3118,  2.9338,  0.0593],
        [-0.3108,  0.2880,  0.2477,  0.3134,  2.9290,  0.0602],
        [-0.3114,  0.2917,  0.2528,  0.3033,  2.9059,  0.0547],
        [-0.3128,  0.2901,  0.2501,  0.3032,  2.9317,  0.0485],
        [-0.3167,  0.2854,  0.2488,  0.3063,  2.9056,  0.0533],
        [-0.3071,  0.2970,  0.2439,  0.3

In [None]:
# motion1 and motion2 and D1 and D2

In [11]:
! python vis_generated_data.py

(120, 23, 6) (9,)
0000:: torch.Size([1, 120, 23, 6])
torch.Size([120, 22, 3])
生成第0_p0个视频:
Traceback (most recent call last):
  File "/home/s5701147/MS/1/vis_generated_data.py", line 191, in <module>
    assert 1==2
           ^^^^
AssertionError


In [None]:
# next step is Camera_vis.ipynb