In [None]:
from ultralytics import YOLO
import cv2
import numpy as np
from collections import deque
from tensorflow.keras.models import load_model

# Load the YOLO tracker and the sequence-based action recognition model.
yolo_model = YOLO(r'C:\Users\poory\Desktop\work\bonji\action detection\track\best.pt')
action_recognition_model = load_model('action_recognition_model.h5')

# Video and frame handling parameters
video_path = r'C:\Users\poory\Desktop\work\bonji\action detection\track\video\croped2.avi'
cap = cv2.VideoCapture(video_path)
ret = True
frame_skip_interval = 20  # only every 20th frame is tracked/classified
frame_count = 0
IMG_HEIGHT, IMG_WIDTH = 64, 64  # size each person crop is resized to
SEQ_LENGTH = 20  # number of crops per person fed to the action model at once

# List of actions (assuming the same order as used during training)
ACTIONS_LIST = ['sleeping', 'using the phone', 'sitting and working', 'sitting and talking', 'working', 'eating food']

# Actions that are reported as "working"; everything else is "not working".
WORKING_ACTIONS = {'sitting and working', 'working'}

# Per-person sliding window of preprocessed crops, keyed by tracker ID.
# NOTE(review): entries are never evicted, so IDs that leave the scene keep
# their buffers for the lifetime of the run.
person_frames = {}

try:
    if not cap.isOpened():
        raise IOError(f"Could not open video: {video_path}")

    while ret:
        ret, frame = cap.read()
        if not ret:
            # End of stream or read failure: `frame` is None here and must
            # not be handed to the tracker.
            break
        frame_count += 1

        # Skip frames for processing efficiency
        if frame_count % frame_skip_interval != 0:
            continue

        # Track people in the frame; persist=True keeps tracker state
        # between successive calls.
        results = yolo_model.track(frame, persist=True)
        tracked_frame = results[0].plot()
        frame_h, frame_w = frame.shape[:2]

        # Loop through each detection in the frame
        for detection in results[0].boxes:
            # Only class 0 is processed.
            # NOTE(review): assumes class index 0 is 'person' in this custom
            # model -- confirm against the training config.
            if int(detection.cls[0]) != 0:
                continue

            # The tracker may not have assigned an ID to this box yet; without
            # an ID the crop cannot be attached to a per-person sequence.
            if detection.id is None:
                continue
            person_id = int(detection.id)

            # Clamp the box to the image so negative coordinates cannot wrap
            # around in the slice below, and skip boxes with no area.
            x1, y1, x2, y2 = map(int, detection.xyxy[0])
            x1, y1 = max(x1, 0), max(y1, 0)
            x2, y2 = min(x2, frame_w), min(y2, frame_h)
            if x2 <= x1 or y2 <= y1:
                continue

            # Crop and preprocess the bounding box image.
            # cv2.resize expects dsize as (width, height).
            person_frame = frame[y1:y2, x1:x2]
            resized_person_frame = cv2.resize(person_frame, (IMG_WIDTH, IMG_HEIGHT))
            normalized_person_frame = resized_person_frame / 255.0

            # Initialize frame deque for new persons; maxlen makes it a
            # sliding window that drops the oldest crop automatically.
            if person_id not in person_frames:
                person_frames[person_id] = deque(maxlen=SEQ_LENGTH)

            # Add the processed frame to the person's deque
            person_frames[person_id].append(normalized_person_frame)

            # Not enough history yet for a prediction
            if len(person_frames[person_id]) < SEQ_LENGTH:
                continue

            # Stack the window into one batch of a single sequence:
            # (1, SEQ_LENGTH, IMG_HEIGHT, IMG_WIDTH, channels).
            # NOTE(review): assumes the model was trained on this layout and
            # on BGR crops scaled to [0, 1] -- confirm against training code.
            frame_sequence = np.expand_dims(
                np.array(list(person_frames[person_id])), axis=0
            )

            # Predict the action
            predicted_probs = action_recognition_model.predict(frame_sequence)[0]
            predicted_label = int(np.argmax(predicted_probs))
            predicted_action = ACTIONS_LIST[predicted_label]

            # Map action to "working" or "not working"
            if predicted_action in WORKING_ACTIONS:
                action_status = "working"
            else:
                action_status = "not working"

            # Display the working status just above the bounding box, keeping
            # the text baseline inside the image for boxes at the top edge.
            cv2.putText(tracked_frame, action_status, (x1, max(y1 - 10, 15)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

        # Display the frame with action recognition results
        cv2.imshow('Person Tracking and Action Recognition', tracked_frame)

        # Break on 'q' key press
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Cleanup runs even if tracking or prediction raises, so the video handle
    # and the display window are always released.
    cap.release()
    cv2.destroyAllWindows()
