In [None]:
# importing necessary modules
import os
import subprocess

import numpy as np
import torch
from PIL import Image
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel

In [None]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cpu


# Video frame extraction using ffmpeg

In [None]:
# Specify the path to ffmpeg
FFMPEG_PATH = "/home/taruns/mmer/MMER/ffmpeg-git-20240301-amd64-static/ffmpeg"

# Base path where your class folders are located
base_path = "/home/taruns/rohit/CricShot10dataset"

# Output directory to save frames
output_dir = "/home/taruns/rohit/frames"
os.makedirs(output_dir, exist_ok=True)

# Iterate over each class folder
for class_folder in os.listdir(base_path):
    class_folder_path = os.path.join(base_path, class_folder)

    # Ensure it's a directory
    if not os.path.isdir(class_folder_path):
        continue

    # Create an output folder for each class
    class_output_dir = os.path.join(output_dir, class_folder)
    os.makedirs(class_output_dir, exist_ok=True)

    # Iterate over each video file in the class folder
    for video_file in os.listdir(class_folder_path):
        video_path = os.path.join(class_folder_path, video_file)

        # Nested directories cannot be decoded as videos; skip them up front
        if not os.path.isfile(video_path):
            continue

        video_name = os.path.splitext(video_file)[0]
        output_pattern = os.path.join(class_output_dir, f"{video_name}_%04d.jpg")

        # Command to extract frames at 1 fps.
        # Passed as an argument list (no shell), so paths containing spaces or
        # shell metacharacters reach ffmpeg unchanged. -loglevel is a global
        # option and therefore goes before the input.
        ffmpeg_command = [
            FFMPEG_PATH,
            "-loglevel", "quiet",
            "-i", video_path,
            "-vf", "fps=1",
            output_pattern,
        ]

        # Execute the command
        try:
            subprocess.run(ffmpeg_command, check=True)
            print(f"Processed {video_file} in {class_folder}")
        except (subprocess.CalledProcessError, OSError):
            # OSError covers a missing or non-executable ffmpeg binary, which the
            # shell previously reported as a non-zero exit status; keep going
            # with the remaining videos either way.
            print(f"Failed to process video: {video_file} in {class_folder}")


Processed lofted_0063.avi in lofted
Processed lofted_0059.avi in lofted
Processed lofted_0064.avi in lofted
Processed lofted_0065.avi in lofted
Processed lofted_0179.avi in lofted
Processed lofted_0062.avi in lofted
Processed lofted_0071.avi in lofted
Processed lofted_0061.avi in lofted
Processed lofted_0058.avi in lofted
Processed lofted_0067.avi in lofted
Processed lofted_0043.avi in lofted
Processed lofted_0045.avi in lofted
Processed lofted_0016.avi in lofted
Processed lofted_0046.avi in lofted
Processed lofted_0015.avi in lofted
Processed lofted_0014.avi in lofted
Processed lofted_0017.avi in lofted
Processed lofted_0020.avi in lofted
Processed lofted_0051.avi in lofted
Processed lofted_0024.avi in lofted
Processed lofted_0026.avi in lofted
Processed lofted_0173.avi in lofted
Processed lofted_0050.avi in lofted
Processed lofted_0018.avi in lofted
Processed lofted_0032.avi in lofted
Processed lofted_0054.avi in lofted
Processed lofted_0040.avi in lofted
Processed lofted_0047.avi in

# Frame embedding creation with CLIP

In [None]:
# Path to the model and processor directories
# (local directories; they are handed to from_pretrained when CLIP is loaded)
saved_model_path = '/home/taruns/project/clip/model'
saved_processor_path = '/home/taruns/project/clip/processor'

In [None]:
# Load the CLIP model and processor
# Both are read from the local directories held in saved_processor_path /
# saved_model_path.
processor = CLIPProcessor.from_pretrained(saved_processor_path)
clip_model = CLIPModel.from_pretrained(saved_model_path)
# Same expression as the device cell near the top of the notebook, repeated
# here so this cell does not depend on that cell having been run.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Move the model to the selected device. NOTE(review): as the last expression of
# the cell, the returned value is what the notebook displays.
clip_model.to(device)

In [None]:
def batch_process_images(image_paths, batch_size, processor, model, device):
    """Embed images in chunks of ``batch_size`` and return all embeddings stacked.

    Args:
        image_paths: Sequence of image file paths that PIL can open.
        batch_size: Number of images sent through the model per forward pass.
        processor: Callable invoked as
            ``processor(text=None, images=[...], return_tensors="pt")``; its
            result must support ``.to(device)`` and ``**`` unpacking.
        model: Object exposing ``get_image_features(**inputs)``.
        device: Device the preprocessed batch is moved to before the forward pass.

    Returns:
        np.ndarray: The per-batch outputs concatenated along axis 0, in input
        order (presumably one embedding row per image; verify against the
        model). An empty ``image_paths`` yields an empty ``(0, 0)`` float32
        array instead of raising.
    """
    embeddings = []
    for start in range(0, len(image_paths), batch_size):
        batch_paths = image_paths[start:start + batch_size]

        batch_images = []
        for path in batch_paths:
            # convert() returns a fully loaded copy, so the underlying file can
            # be closed right away instead of lingering until garbage collection.
            with Image.open(path) as img:
                batch_images.append(img.convert("RGB"))

        tokens = processor(
            text=None,
            images=batch_images,
            return_tensors="pt"
        ).to(device)

        # Inference only: skip building the autograd graph to save memory.
        with torch.no_grad():
            batch_embeddings = model.get_image_features(**tokens)

        batch_embeddings = batch_embeddings.detach().cpu().numpy()
        embeddings.append(batch_embeddings)

    if not embeddings:
        # np.concatenate refuses an empty list; hand back an empty result instead.
        return np.empty((0, 0), dtype=np.float32)
    return np.concatenate(embeddings, axis=0)

In [None]:
# Path where the extracted frames are stored
main_folder = '/home/taruns/rohit/frames'
# Root directory that receives the embedding files; created here if it is missing
output_folder = '/home/taruns/rohit/frames_embeddings2'
os.makedirs(output_folder, exist_ok=True)

In [None]:
# Every directory directly under the frames folder gets its own pass
subfolders = [entry.path for entry in os.scandir(main_folder) if entry.is_dir()]
total_subfolders = len(subfolders)
processed_subfolders = 0

for processed_subfolders, subfolder in enumerate(subfolders, start=1):
    # Mirror the directory name under the embeddings root
    output_subfolder = os.path.join(output_folder, os.path.basename(subfolder))
    os.makedirs(output_subfolder, exist_ok=True)

    image_files = [
        name for name in os.listdir(subfolder)
        if name.endswith(('.png', '.jpg', '.jpeg'))
    ]

    # Directories without images are still counted, just not embedded
    if len(image_files) > 0:
        image_paths = [os.path.join(subfolder, name) for name in image_files]
        embeddings = batch_process_images(
            image_paths,
            batch_size=100,
            processor=processor,
            model=clip_model,
            device=device,
        )

        # One .npy file per image, named after the image it came from
        for frame_name, emb in zip(image_files, embeddings):
            stem = frame_name.rsplit('.', 1)[0]  # drop the extension
            np.save(os.path.join(output_subfolder, f'{stem}_embedding.npy'), emb)

    print(f"Processed {processed_subfolders}/{total_subfolders} subfolders.")

print("All subfolders processed.")


Processed 1/4 subfolders.
Processed 2/4 subfolders.
Processed 3/4 subfolders.
Processed 4/4 subfolders.
All subfolders processed.


# Training an LSTM classifier on the frame embeddings

In [None]:
# Load embeddings and their labels
def load_embeddings_and_labels(embeddings_folder):
    """Read every ``*_embedding.npy`` file below ``embeddings_folder``.

    Each immediate sub-directory is treated as one class. Class ids are handed
    out in sorted directory-name order starting at 0, and files inside a class
    are read in sorted order, so the result is deterministic for a given tree.

    Args:
        embeddings_folder: Directory containing one sub-directory per class.

    Returns:
        tuple: ``(embeddings, labels, label_mapping)`` where
            - ``embeddings`` is a float32 tensor with the loaded arrays stacked
              along a new first axis (an empty 1-D tensor if nothing was found),
            - ``labels`` is a long tensor holding the class id of each row,
            - ``label_mapping`` maps class directory name -> class id.

    Raises:
        ValueError: If the loaded arrays do not all share the same shape.
    """
    embeddings = []
    labels = []
    label_mapping = {}  # To convert class names to numerical labels

    for class_folder in sorted(os.listdir(embeddings_folder)):
        class_path = os.path.join(embeddings_folder, class_folder)
        if not os.path.isdir(class_path):
            continue

        # Directory names are unique, so the next free id is the mapping's size.
        class_id = len(label_mapping)
        label_mapping[class_folder] = class_id

        for emb_file in sorted(os.listdir(class_path)):
            if emb_file.endswith('_embedding.npy'):
                embeddings.append(np.load(os.path.join(class_path, emb_file)))
                labels.append(class_id)

    if embeddings:
        # Stack once in NumPy: torch.tensor() on a list of ndarrays converts
        # element by element, which is very slow and triggers a UserWarning.
        embeddings = torch.as_tensor(np.stack(embeddings), dtype=torch.float32)
    else:
        embeddings = torch.empty(0, dtype=torch.float32)
    labels = torch.tensor(labels, dtype=torch.long)
    return embeddings, labels, label_mapping

In [None]:
# Define the LSTM neural network
class LSTMNetwork(nn.Module):
    """Single-layer, batch-first LSTM followed by a linear classification head.

    ``forward`` takes a ``(batch, seq_len, input_size)`` tensor and returns
    ``(batch, num_classes)`` scores computed from the last time step only.
    """

    def __init__(self, input_size=512, hidden_size=256, num_classes=4):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=1, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        sequence_out, _state = self.lstm(x)
        # Only the final time step feeds the classifier
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

In [None]:
# Load data
embeddings_folder = '/home/taruns/rohit/frames_embeddings2'
embeddings, labels, class_label_mapping = load_embeddings_and_labels(embeddings_folder)

# Split data
# unsqueeze(1) inserts a sequence axis of length 1, so every embedding is fed to
# the LSTM as a one-step sequence.
dataset = TensorDataset(embeddings.unsqueeze(1), labels)
train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size  # remainder, so the three sizes always sum up

# Seed the split so train/val/test membership is identical on every run;
# without a generator the reported metrics are not comparable between runs.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
# NOTE(review): the split is per embedding file, i.e. per frame. Frames taken
# from the same video can therefore land in both train and test, which may
# inflate the test score -- consider splitting by source video instead.
train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size], generator=split_generator
)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [None]:
# Model
# One output score per class found by load_embeddings_and_labels.
# NOTE(review): input_size=512 is assumed to equal the width of the stored
# embeddings -- confirm against the CLIP checkpoint in use.
model = LSTMNetwork(input_size=512, hidden_size=256, num_classes=len(class_label_mapping)).to(device)
# CrossEntropyLoss takes the raw scores from the network plus integer class ids
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    epoch_number = epoch + 1

    # Optimisation pass over the training split
    model.train()
    total_loss = 0
    for data, target in tqdm(train_loader, desc=f'Epoch {epoch_number}'):
        data = data.to(device)
        target = target.to(device)
        optimizer.zero_grad()
        loss = criterion(model(data), target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean of the per-batch losses
    mean_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch_number}, Training Loss: {mean_train_loss}")

    # Validation
    model.eval()
    total_correct, total = 0, 0
    with torch.no_grad():
        for data, target in val_loader:
            data = data.to(device)
            target = target.to(device)
            # Index of the highest score per row is the predicted class
            predicted = torch.max(model(data), 1).indices
            total_correct += (predicted == target).sum().item()
            total += target.shape[0]

    val_accuracy = total_correct / total
    print(f"Validation Accuracy: {val_accuracy:.2f}")

# Test
# A single gradient-free pass over the held-out split collects the labels and
# predictions that every metric below is computed from.
model.eval()
all_labels = []
all_preds = []
with torch.no_grad():
    for data, target in test_loader:
        data, target = data.to(device), target.to(device)
        output = model(data)
        _, predicted = torch.max(output, 1)
        all_labels.extend(target.tolist())
        all_preds.extend(predicted.tolist())

total = len(all_labels)
total_correct = sum(1 for pred, true in zip(all_preds, all_labels) if pred == true)
test_accuracy = total_correct / total
print(f"Test Accuracy: {test_accuracy:.2f}")

# Class ids and names in matching order (the mapping is built in insertion
# order). Passing them explicitly keeps the matrix square and the report
# aligned even when a class is missing from the test split.
class_ids = list(class_label_mapping.values())
class_names = list(class_label_mapping.keys())

#confusion matrix
conf_mat = confusion_matrix(all_labels, all_preds, labels=class_ids)
print("Confusion Matrix:")
print(conf_mat)

report = classification_report(all_labels, all_preds, labels=class_ids, target_names=class_names)
print("Classification Report:")
print(report)

Epoch 1: 100%|███████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 129.65it/s]


Epoch 1, Training Loss: 1.1114254243233626
Validation Accuracy: 0.70


Epoch 2: 100%|███████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 128.54it/s]


Epoch 2, Training Loss: 0.7452620232806486
Validation Accuracy: 0.72


Epoch 3: 100%|███████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 126.14it/s]


Epoch 3, Training Loss: 0.6369289685698116
Validation Accuracy: 0.74


Epoch 4: 100%|███████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 126.95it/s]


Epoch 4, Training Loss: 0.571181150043712
Validation Accuracy: 0.77


Epoch 5: 100%|███████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 128.28it/s]


Epoch 5, Training Loss: 0.5049134668181924
Validation Accuracy: 0.76


Epoch 6: 100%|███████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 128.22it/s]


Epoch 6, Training Loss: 0.4334707137416391
Validation Accuracy: 0.75


Epoch 7: 100%|███████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 125.83it/s]


Epoch 7, Training Loss: 0.4055499487063464
Validation Accuracy: 0.79


Epoch 8: 100%|███████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 127.81it/s]


Epoch 8, Training Loss: 0.3515821660266203
Validation Accuracy: 0.80


Epoch 9: 100%|███████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 130.12it/s]


Epoch 9, Training Loss: 0.33073876710499034
Validation Accuracy: 0.80


Epoch 10: 100%|██████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 127.68it/s]


Epoch 10, Training Loss: 0.29758980870246887
Validation Accuracy: 0.78


Epoch 11: 100%|██████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 129.34it/s]


Epoch 11, Training Loss: 0.3067941858487971
Validation Accuracy: 0.79


Epoch 12: 100%|██████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 129.05it/s]


Epoch 12, Training Loss: 0.260540679535445
Validation Accuracy: 0.84


Epoch 13: 100%|██████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 129.66it/s]


Epoch 13, Training Loss: 0.23355984863112955
Validation Accuracy: 0.84


Epoch 14: 100%|██████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 128.46it/s]


Epoch 14, Training Loss: 0.24350114911794662
Validation Accuracy: 0.83


Epoch 15: 100%|██████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 130.19it/s]


Epoch 15, Training Loss: 0.22601105930174098
Validation Accuracy: 0.86


Epoch 16: 100%|██████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 128.28it/s]


Epoch 16, Training Loss: 0.19718347051564386
Validation Accuracy: 0.85


Epoch 17: 100%|██████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 130.95it/s]


Epoch 17, Training Loss: 0.19537548983798309
Validation Accuracy: 0.86


Epoch 18: 100%|██████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 128.72it/s]


Epoch 18, Training Loss: 0.16276761495015202
Validation Accuracy: 0.83


Epoch 19: 100%|██████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 131.49it/s]


Epoch 19, Training Loss: 0.17241775288301356
Validation Accuracy: 0.87


Epoch 20: 100%|██████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 131.67it/s]

Epoch 20, Training Loss: 0.14270910620689392
Validation Accuracy: 0.87
Test Accuracy: 0.85
Confusion Matrix:
[[49  0  0  0]
 [ 0 65  6 14]
 [ 0  1  6  2]
 [ 2  5  4 70]]
Classification Report:
              precision    recall  f1-score   support

     defense       0.96      1.00      0.98        49
      lofted       0.92      0.76      0.83        85
  square_cut       0.38      0.67      0.48         9
       sweep       0.81      0.86      0.84        81

    accuracy                           0.85       224
   macro avg       0.77      0.82      0.78       224
weighted avg       0.87      0.85      0.85       224




