<a href="https://colab.research.google.com/github/singhvis29/Ads_by_Ini_Projects/blob/main/ABI_Video1_Impressions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the clip and frame paths used below
# live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import cv2
import numpy as np
import os

In [None]:
import math

from PIL import Image
import requests
import matplotlib.pyplot as plt
# Render inline matplotlib figures using the high-resolution 'retina' format.
%config InlineBackend.figure_format = 'retina'

import ipywidgets as widgets
from IPython.display import display, clear_output

import torch
from torch import nn
from torchvision.models import resnet50
import torchvision.transforms as T
# Disable autograd globally: the cells shown in this notebook only run
# inference, so no gradients are needed. The trailing ';' suppresses the repr.
torch.set_grad_enabled(False);

In [None]:
import os
import cv2
import torch
from transformers import DetrImageProcessor, DetrForObjectDetection
from collections import defaultdict
from tqdm import tqdm
from sort.sort import Sort   # pip install sort-tracker

### Read the video and save it as frames

In [None]:
# Source clip to analyse.
video_path = '/content/drive/MyDrive/Projects/Ads_by_Ini/clip_1.mp4'
# Directory the extracted frames are written to, and later read back from
# by the detection/tracking cell.
output_folder = '/content/drive/MyDrive/Projects/Ads_by_Ini/frames'

In [None]:


# Extract every frame of the clip at `video_path` into `output_folder` as
# 'frame_<N>.png', where N is the 1-based position reported by OpenCV after
# the frame has been read.

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Create a VideoCapture object
cap = cv2.VideoCapture(video_path)

# Fail loudly if the clip cannot be opened; otherwise the cell would finish
# with "Video processing complete." without having written a single frame.
if not cap.isOpened():
    raise IOError(f"Error opening video file: {video_path}")

try:
    # Report the clip metadata under the correct labels (the frame rate was
    # previously printed as "Number of Frames").
    print('FPS:', cap.get(cv2.CAP_PROP_FPS))
    print('Number of Frames:', int(cap.get(cv2.CAP_PROP_FRAME_COUNT)))

    # Read until the video is completed
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Construct the output file path from the current frame position
        frame_number = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
        output_path = os.path.join(output_folder, 'frame_' + str(frame_number) + '.png')

        # Save the frame; imwrite signals failure through its return value
        # rather than by raising, so check it explicitly.
        if not cv2.imwrite(output_path, frame):
            raise IOError(f"Could not write frame to {output_path}")

        # No cv2.waitKey() here: there is no GUI window in Colab, so polling
        # for a key press serves no purpose in this loop.
finally:
    # Always release the capture, even if writing a frame failed
    cap.release()

print("Video processing complete.")

59.94005994005994


### Load Models

In [None]:
# Load the pretrained 'detr_resnet50' entry point from the facebookresearch/detr
# torch.hub repo and switch it to eval mode.
# NOTE(review): `model` is reassigned to a Hugging Face DetrForObjectDetection
# in the Impressions cell below, so this instance is not used by any code
# shown in this notebook — confirm whether this cell is still needed.
model = torch.hub.load('facebookresearch/detr', 'detr_resnet50', pretrained=True)
model.eval();

### Impressions: detect, track, and summarize objects per class

In [None]:


# Load pretrained DETR (Hugging Face) together with its matching image processor
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

# Tracker
tracker = Sort(max_age=5, min_hits=2, iou_threshold=0.3)

# Minimum confidence score for a detection to be kept
THRESHOLD = 0.7

# Frames path
frames_folder = output_folder


def _frame_number(filename):
    """Return the integer N of a 'frame_N.png' filename, or None if the name does not match."""
    stem, ext = os.path.splitext(filename)
    prefix, _, digits = stem.rpartition('_')
    if ext.lower() != '.png' or prefix != 'frame' or not digits.isdecimal():
        return None
    return int(digits)


# Order frames by their numeric index. A plain sorted() is lexicographic and
# puts 'frame_10.png' before 'frame_2.png', which would feed the tracker frames
# out of temporal order. Files that are not 'frame_N.png' are ignored.
frame_files = sorted(
    (name for name in os.listdir(frames_folder) if _frame_number(name) is not None),
    key=_frame_number,
)

# Use the clip's real frame rate for timestamps (the extraction cell reported
# 59.94 for this clip, not 60). Falls back to 60.0 if the clip cannot be
# probed, in which case OpenCV reports 0.
_fps_probe = cv2.VideoCapture(video_path)
FPS = _fps_probe.get(cv2.CAP_PROP_FPS) or 60.0
_fps_probe.release()

# Store object tracks: (class name, track id) -> list of per-frame observations
object_tracks = defaultdict(list)

def _iou(box_a, box_b):
    """Return the intersection-over-union of two [x1, y1, x2, y2] boxes (0.0 if they are disjoint)."""
    inter_w = min(box_a[2], box_b[2]) - max(box_a[0], box_b[0])
    inter_h = min(box_a[3], box_b[3]) - max(box_a[1], box_b[1])
    if inter_w <= 0 or inter_h <= 0:
        return 0.0
    inter = inter_w * inter_h
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - inter
    return float(inter / union) if union > 0 else 0.0


# Run DETR on every frame, feed the detections to SORT, and record one
# observation per tracked object per frame in `object_tracks`.
for frame_id, file in tqdm(enumerate(frame_files), total=len(frame_files)):
    frame_path = os.path.join(frames_folder, file)
    frame = cv2.imread(frame_path)
    if frame is None:
        # cv2.imread returns None for unreadable / non-image files
        continue

    # OpenCV loads images as BGR, but the DETR processor expects RGB
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    inputs = processor(images=frame_rgb, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # (height, width) of the original frame, used to rescale boxes to pixels
    target_sizes = torch.tensor([frame.shape[:2]])
    results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=THRESHOLD)[0]

    detections = []    # SORT format: [x1, y1, x2, y2, score]
    det_classes = []   # class name of detections[i]

    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        xmin, ymin, xmax, ymax = box.tolist()
        detections.append([xmin, ymin, xmax, ymax, float(score)])
        det_classes.append(model.config.id2label[label.item()])

    # SORT must be updated on every frame, including frames with no detections
    # (as an empty (0, 5) array), so that existing tracks age and expire.
    tracked = tracker.update(np.array(detections, dtype=float).reshape(-1, 5))

    timestamp = frame_id / FPS

    for x1, y1, x2, y2, track_id in tracked:
        # Assign the class of the detection that overlaps the tracked box most.
        # Tracked boxes need not coincide exactly with a raw detection, so
        # comparing only the top-left corner is unreliable.
        cls = "unknown"
        best_overlap = 0.0
        for det, det_cls in zip(detections, det_classes):
            overlap = _iou((x1, y1, x2, y2), det[:4])
            if overlap > best_overlap:
                best_overlap = overlap
                cls = det_cls

        # Inverse box area as a crude proxy for distance (a larger box gives a smaller value)
        area = (x2 - x1) * (y2 - y1)
        distance_est = 1 / (area + 1e-6)

        object_tracks[(cls, int(track_id))].append({
            "frame": frame_id,
            "time": timestamp,
            "bbox": [float(x1), float(y1), float(x2), float(y2)],
            "distance_est": distance_est
        })

# --- Summarize grouped by class ---
# For every tracked object compute how long it was on screen and its mean
# distance proxy, then aggregate those per class.
summary = defaultdict(list)

for (cls, track_id), entries in object_tracks.items():
    # Inclusive frame span: a track seen on N consecutive frames lasts N / FPS.
    # (Differencing timestamps gave (N - 1) / FPS, which was inconsistent with
    # the 1 / FPS already assigned to single-frame tracks.)
    duration = (entries[-1]["frame"] - entries[0]["frame"] + 1) / FPS
    avg_distance = sum(e["distance_est"] for e in entries) / len(entries)

    summary[cls].append({
        "track_id": track_id,
        "duration_sec": duration,
        "avg_distance_est": avg_distance
    })

# Print final summary
print("\nSummary grouped by class:")
for cls, objs in summary.items():
    num_objs = len(objs)
    avg_duration = sum(o["duration_sec"] for o in objs) / num_objs
    avg_distance = sum(o["avg_distance_est"] for o in objs) / num_objs
    print(f"{cls}: {num_objs} objects, avg duration {avg_duration:.2f}s, avg distance proxy {avg_distance:.4f}")
