In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import model.SocialLSTM as s
import torch
import torch.optim as optim
import torch.nn as nn

In [57]:
#parameters
obs_len = 10
pred_len = 5
grid_size = 4
hidden_dim = 128
pos_embed_dim = 64
grid_embed_dim = 64
dropout = 0.0
learning_rate = 1e-3
epochs = 30

In [58]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:

import utils.slstm_utils as slstm_utils
# import importlib
# importlib.reload(slstm_utils)
import os
current_dir = os.getcwd()

file_path = os.path.join(current_dir, "data","annotations","bookstore","video0", "annotations.txt")
processed_df = slstm_utils.data_cleaning(file_path)
agent_position_dict, frame_ids = slstm_utils.generate_frame_to_cordinate_map(processed_df)
agent_idx_lookup = slstm_utils.agent_hidden_idx_lookup(processed_df)
sequences = slstm_utils.generate_frame_sequences(frame_ids,obs_len=obs_len, pred_len=pred_len, stride=5)

#generate_tensor_from_frames()

In [61]:
max_no_agents = len(agent_idx_lookup)
model = s.SocialLSTM( pos_embedding_size=pos_embed_dim, grid_embedding_size=grid_embed_dim,
                   hidden_dim=hidden_dim, grid_size=grid_size, dropout=dropout, max_agents=max_no_agents)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

In [62]:

#sequences = sequences[0:1000]
print(len(sequences))
for epoch in range(epochs):
    print(" starting epoch ",epoch)
    total_loss = 0.0
    for window in sequences:
    
        input_tensor , mask_tensor = slstm_utils.generate_tensor_from_frames(window,agent_position_dict,agent_idx_lookup )
        social_grid_tensor = slstm_utils.compute_social_grids(input_tensor,mask_tensor)

        input_tensor = input_tensor.to(device)
        mask_tensor = mask_tensor.to(device)
        social_grid_tensor = social_grid_tensor.to(device)
        max_no_agents = input_tensor.size(1)
        # obs_input = input_tensor[:obs_len]              # [8, max_agents, 2]
        # obs_social_grid_tensor = social_grid_tensor[:obs_len]
        # obs_mask = mask_tensor[:obs_len]

        hidden_states = torch.zeros(max_no_agents, hidden_dim, device=device)
        cell_states = torch.zeros(max_no_agents, hidden_dim, device=device)
        
        

        outputs, hidden_states, cell_states = model(input_tensor, social_grid_tensor, hidden_states, cell_states, mask_tensor)
        

        predicted = outputs[obs_len:]  # future frames
        target = input_tensor[obs_len:]
        target_mask = mask_tensor[obs_len:].unsqueeze(-1)
        # pred_len = input_tensor.size(0) - obs_len
        # predicted = []
        # last_positions = obs_input[-1]

        # for t in range(pred_len):
        #     grid_t = slstm_utils.compute_social_grids(last_positions.unsqueeze(0), obs_mask[-1:].unsqueeze(0))  # shape [1, max_agents, grid_cells, max_agents]
        #     grid_t = grid_t.to(device)
        #     out, hidden_states, cell_states = model(last_positions.unsqueeze(0), grid_t, hidden_states, cell_states, obs_mask[-1:].unsqueeze(0))
        #     last_positions = out[0]  
        #     predicted.append(last_positions)
        
        # predicted = torch.stack(predicted) 
        # target = input_tensor[obs_len:]   
        # target_mask = mask_tensor[obs_len:].unsqueeze(-1)
        #print(predicted)
        loss = criterion(predicted * target_mask, target * target_mask)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
           

    print(f"Epoch [{epoch+1}/{epochs}] Loss: {total_loss/len(sequences):.4f}")



2665
 starting epoch  0
Epoch [1/30] Loss: 140200.4101
 starting epoch  1
Epoch [2/30] Loss: 93352.7468
 starting epoch  2
Epoch [3/30] Loss: 61797.1459
 starting epoch  3
Epoch [4/30] Loss: 43060.7628
 starting epoch  4
Epoch [5/30] Loss: 31200.8390
 starting epoch  5
Epoch [6/30] Loss: 22125.6443
 starting epoch  6
Epoch [7/30] Loss: 17533.6168
 starting epoch  7
Epoch [8/30] Loss: 14235.1311
 starting epoch  8
Epoch [9/30] Loss: 10963.1870
 starting epoch  9
Epoch [10/30] Loss: 7510.8319
 starting epoch  10
Epoch [11/30] Loss: 4942.1275
 starting epoch  11
Epoch [12/30] Loss: 2995.6592
 starting epoch  12
Epoch [13/30] Loss: 1757.2342
 starting epoch  13
Epoch [14/30] Loss: 997.8651
 starting epoch  14
Epoch [15/30] Loss: 603.7648
 starting epoch  15
Epoch [16/30] Loss: 355.3172
 starting epoch  16
Epoch [17/30] Loss: 218.4224
 starting epoch  17
Epoch [18/30] Loss: 197.9390
 starting epoch  18
Epoch [19/30] Loss: 143.9775
 starting epoch  19
Epoch [20/30] Loss: 122.4478
 starting e

In [65]:
for t in range(len(target)):
    print(f"Frame {t+1}")
    print("Target positions:\n", target[t])
    print("Predicted positions:\n", predicted[t])
    print("-"*50)
print(window)

Frame 1
Target positions:
 tensor([[   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0.0000,    0.0000],
        [   0

In [None]:





# fps = cap.get(cv2.CAP_PROP_FPS)
# frame_width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
# frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
# if(total_frames == len(frame_ids)):
#     print("No of frames match")
# print("frame dim",frame_width,frame_height)
# #len(frame_ids)

In [None]:
frame_idx = 0
print(predicted.shape)
print(target.shape)
pred_len, max_agents, _ = predicted.shape
pred_mapping = {}
for t, f_idx in enumerate(window[obs_len:]):
    positions = []
    for agent_id in range(max_agents):
        x, y = predicted[t, agent_id].cpu().detach().numpy()
        if not (x == 0 and y == 0): 
            positions.append((int(x), int(y)))
    pred_mapping[f_idx] = positions
print(pred_mapping)

torch.Size([2, 116, 2])
{13330: [(1156, 321), (1160, 320), (325, 1341), (807, 1468), (1342, 1352), (875, 1453), (878, 1539), (887, 1149), (1351, 1427), (1429, 1434)], 13331: [(1156, 321), (1160, 320), (325, 1341), (806, 1465), (1343, 1349), (873, 1447), (878, 1539), (887, 1150), (1351, 1427), (1429, 1434)]}


In [69]:
# Convert dict to list of trajectories per agent
num_agents = len(pred_mapping[list(pred_mapping.keys())[0]])
pred_traj = [[] for _ in range(num_agents)]

# Collect positions per agent across frames
for frame_idx in sorted(pred_mapping.keys()):
    positions = pred_mapping[frame_idx]
    for agent_id, pos in enumerate(positions):
        pred_traj[agent_id].append(pos)

print(pred_traj)

[[(1156, 321), (1156, 321)], [(1160, 320), (1160, 320)], [(325, 1341), (325, 1341)], [(807, 1468), (806, 1465)], [(1342, 1352), (1343, 1349)], [(875, 1453), (873, 1447)], [(878, 1539), (878, 1539)], [(887, 1149), (887, 1150)], [(1351, 1427), (1351, 1427)], [(1429, 1434), (1429, 1434)]]


In [None]:
import cv2
video_file_path = os.path.join(current_dir, "data","videos","bookstore","video0", "video.mov")
#video_path = "video0.mp4"
cap = cv2.VideoCapture(video_file_path)

obs_frame_idx = window[obs_len]  # change this to your last observed frame index (0-based)
cap.set(cv2.CAP_PROP_POS_FRAMES, obs_frame_idx)
ret, last_frame = cap.read()
cap.release()

frame = last_frame.copy()

colors = [(0, 0, 255), (0, 255, 0), (255, 0, 0), (0, 255, 255),
          (255, 0, 255), (255, 255, 0)]  

for agent_id, traj in enumerate(pred_traj):
    color = colors[agent_id % len(colors)]
    for i in range(len(traj)-1):
        alpha = 0.3 + 0.7 * (i / (len(traj)-1))  
        overlay_color = tuple(int(c*alpha) for c in color)
        cv2.line(frame, traj[i], traj[i+1], overlay_color, 2)
        cv2.circle(frame, traj[i], 3, overlay_color, -1)
    cv2.circle(frame, traj[-1], 5, color, -1) 

cv2.imwrite("pred_on_last_frame.png", frame)
cv2.imshow("Predicted Trajectories", frame)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [None]:
# projections from true label.

