In [None]:
import os
import cv2
import numpy as np
import tensorflow as tf
from collections import deque
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

# Constants
# Spatial size (in pixels) that every frame is resized to before modelling.
IMAGE_HEIGHT = 64
IMAGE_WIDTH = 64

# Number of frames that make up one input sequence.
SEQUENCE_LENGTH = 20

# Dataset root; it is expected to hold one sub-folder of videos per action.
DATA_PATH = r"C:\Users\poory\Desktop\work\bonji\action detection\track\new_dataset"

# Action names. The position of a name in this list is its integer class id.
ACTIONS = [
    'sleeping',
    'using the phone',
    'sitting and working',
    'sitting and talking',
    'working',
    'eating food',
]

def retrieve_frames(video_path):
    """Sample up to SEQUENCE_LENGTH evenly spaced frames from a video.

    Each sampled frame is resized to IMAGE_WIDTH x IMAGE_HEIGHT pixels and
    divided by 255.0 (which maps 8-bit pixel values into [0, 1]).

    Args:
        video_path: Path of the video file to read.

    Returns:
        A list of frame arrays. It contains fewer than SEQUENCE_LENGTH
        entries (possibly none) when the video cannot be opened or a frame
        read fails; callers should check the length.
    """
    frames_list = []
    video_capture = cv2.VideoCapture(video_path)
    try:
        # An unreadable file yields no frames instead of relying on the
        # first read() failing.
        if not video_capture.isOpened():
            return frames_list

        # Stride between sampled frames; never below 1 so short videos are
        # still read frame by frame.
        frame_count = video_capture.get(cv2.CAP_PROP_FRAME_COUNT)
        skip_frames = max(int(frame_count / SEQUENCE_LENGTH), 1)

        for i in range(SEQUENCE_LENGTH):
            video_capture.set(cv2.CAP_PROP_POS_FRAMES, i * skip_frames)
            success, frame = video_capture.read()
            if not success:
                break

            # cv2.resize expects dsize as (width, height); the resulting
            # array then has shape (IMAGE_HEIGHT, IMAGE_WIDTH, channels).
            resized_frame = cv2.resize(frame, (IMAGE_WIDTH, IMAGE_HEIGHT))
            normalized_frame = resized_frame / 255.0  # Normalize to [0, 1]
            frames_list.append(normalized_frame)
    finally:
        # Release the capture even if resizing or reading raises.
        video_capture.release()

    return frames_list

def assemble_action_dataset():
    """Load every usable video under DATA_PATH into feature/label arrays.

    For each name in ACTIONS, the folder DATA_PATH/<name> is listed and each
    entry is passed to retrieve_frames. Only entries that yield exactly
    SEQUENCE_LENGTH frames are kept; the rest are skipped and counted.

    Returns:
        A tuple (features, labels) where features is the array built from
        the kept frame sequences and labels holds, for each sequence, the
        index of its action in ACTIONS.

    Raises:
        OSError: If an action folder is missing or cannot be listed.
    """
    features_list, labels_list = [], []
    for action_index, action_name in enumerate(ACTIONS):
        print(f'Extracting data for action: {action_name}')
        action_dir = os.path.join(DATA_PATH, action_name)
        skipped = 0

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order (and therefore any seeded split) reproducible.
        for file_name in sorted(os.listdir(action_dir)):
            video_file_path = os.path.join(action_dir, file_name)
            video_frames = retrieve_frames(video_file_path)
            if len(video_frames) != SEQUENCE_LENGTH:
                skipped += 1
                continue
            features_list.append(video_frames)
            labels_list.append(action_index)

        # Make dropped clips visible instead of discarding them silently.
        if skipped:
            print(f'Skipped {skipped} file(s) for action: {action_name}')

    return np.asarray(features_list), np.array(labels_list)

# Prepare the dataset
# Load all usable clips, then hold out 25% of them for testing. The fixed
# random_state makes the shuffle repeatable for a given sample order.
X_data, y_data = assemble_action_dataset()
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

# Create CNN-LSTM model
def create_action_recognition_model():
    """Build the uncompiled CNN-LSTM action classifier.

    Three Conv2D + MaxPooling2D stages (64, 128 and 256 filters) are applied
    to every frame through TimeDistributed, the per-frame feature maps are
    flattened, and the resulting sequence goes through dropout and an LSTM
    before a softmax layer with one unit per entry in ACTIONS.

    Returns:
        A Sequential model whose input shape is
        (SEQUENCE_LENGTH, IMAGE_HEIGHT, IMAGE_WIDTH, 3).
    """
    sequence_shape = (SEQUENCE_LENGTH, IMAGE_HEIGHT, IMAGE_WIDTH, 3)
    network = Sequential()

    # Per-frame convolutional feature extractor.
    for stage, filters in enumerate((64, 128, 256)):
        conv = Conv2D(filters, (3, 3), activation='relu', padding='same')
        if stage == 0:
            # Only the first layer declares the input shape.
            network.add(TimeDistributed(conv, input_shape=sequence_shape))
        else:
            network.add(TimeDistributed(conv))
        network.add(TimeDistributed(MaxPooling2D((2, 2))))

    network.add(TimeDistributed(Flatten()))
    # Dropout layer to prevent overfitting
    network.add(Dropout(0.5))
    # The LSTM emits only its final output, summarising the whole sequence.
    network.add(LSTM(128, return_sequences=False))
    network.add(Dense(len(ACTIONS), activation='softmax'))
    return network

# Compile and train the model
# Labels are integer class ids, hence the sparse categorical loss.
action_recognition_model = create_action_recognition_model()
action_recognition_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Monitor the validation metric produced by validation_split below. Watching
# the training 'accuracy' would keep training while the model overfits and
# make restore_best_weights pick the most overfit epoch.
early_stop = EarlyStopping(monitor='val_accuracy', patience=10, mode='max', restore_best_weights=True)

action_recognition_model.fit(X_train, y_train, epochs=100, batch_size=4, shuffle=True, validation_split=0.2, callbacks=[early_stop])

# Save the trained model
action_recognition_model.save('action_recognition_model.h5')

# Function to predict on video
def predict_action_in_video(video_path, output_path):
    """Annotate a video with the predicted action and write it to disk.

    Frames are read one by one and kept in a sliding window of the last
    SEQUENCE_LENGTH preprocessed frames. Once the window is full, the
    module-level action_recognition_model classifies it and the predicted
    action name is drawn on the current frame. Every frame is written to the
    output, so the output has as many frames as were read; frames read
    before the window first fills are written without a label.

    Args:
        video_path: Path of the input video.
        output_path: Path of the annotated video to create.

    Returns:
        None.
    """
    video_capture = cv2.VideoCapture(video_path)
    video_writer = None
    try:
        original_width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        original_height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        video_writer = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc('M', 'P', '4', 'V'),
                                       video_capture.get(cv2.CAP_PROP_FPS), (original_width, original_height))

        # Sliding window: appending to a full deque drops the oldest frame.
        frame_buffer = deque(maxlen=SEQUENCE_LENGTH)

        while video_capture.isOpened():
            success, frame = video_capture.read()
            if not success:
                break

            # cv2.resize expects dsize as (width, height); this matches the
            # (IMAGE_HEIGHT, IMAGE_WIDTH, 3) frame shape the model declares.
            resized_frame = cv2.resize(frame, (IMAGE_WIDTH, IMAGE_HEIGHT))
            normalized_frame = resized_frame / 255.0
            frame_buffer.append(normalized_frame)

            if len(frame_buffer) == SEQUENCE_LENGTH:
                # Add the batch axis the model expects.
                model_input = np.expand_dims(np.asarray(frame_buffer), axis=0)
                predicted_probs = action_recognition_model.predict(model_input)[0]
                predicted_label = np.argmax(predicted_probs)
                predicted_action = ACTIONS[predicted_label]

                cv2.putText(frame, predicted_action, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Write every frame, labelled or not, so the start of the video
            # is not dropped from the output.
            video_writer.write(frame)
    finally:
        # Release both handles even if prediction or writing raises.
        video_capture.release()
        if video_writer is not None:
            video_writer.release()
