In [1]:
# Environment setup for the pipeline cell: Python packages first, then the
# Tesseract system package.
!pip install opencv-python-headless
!pip install pytesseract
!pip install pandas
# NOTE(review): youtube-dl and pafy are not imported by the code cells visible
# here (the download goes through pytube) - confirm whether they are still needed.
!pip install youtube-dl
!pip install pafy
# PyTorch wheels, with the CPU wheel index added as an extra package index.
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
# YOLOv5 pip package; the pipeline cell imports `YOLOv5` from it.
!pip install yolov5
# Install the Tesseract-OCR system package (pytesseract is imported by the pipeline cell).
!sudo apt install tesseract-ocr


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [4]:
# pytube provides the `YouTube` class the pipeline cell uses to download the source video.
!pip install pytube

Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 57.6/57.6 kB 1.7 MB/s eta 0:00:00
Installing collected packages: pytube
Successfully installed pytube-15.0.0


In [9]:
import cv2
import pandas as pd
from pytube import YouTube
from yolov5 import YOLOv5  # ensure using pip install yolov5
import pytesseract

# Function to load the YOLO model
def load_model(model_path="yolov5s.pt", device="cpu"):
    """Build the YOLOv5 detector used to locate regions to OCR.

    Args:
        model_path: Weights file handed to ``YOLOv5``. Defaults to
            ``"yolov5s.pt"``, the value that used to be hard-coded.
        device: Device string handed to ``YOLOv5``. Defaults to ``"cpu"``;
            pass ``"cuda"`` to run on a GPU.

    Returns:
        The constructed ``YOLOv5`` object.
    """
    return YOLOv5(model_path, device=device)

# Process frames to detect and annotate shop names
def process_frame(frame, model, fps, count, min_confidence=0.4):
    """Detect regions in one frame, OCR each region, and draw the results.

    Args:
        frame: Image array for the current video frame. It is annotated in
            place (rectangles and text are drawn onto it).
        model: Detector exposing ``predict(frame)`` whose result has an
            ``xyxy`` list; ``xyxy[0]`` yields
            ``(xmin, ymin, xmax, ymax, confidence, cls)`` rows.
        fps: Frames per second, used to turn ``count`` into whole seconds.
            Must be non-zero, otherwise the timestamp division raises
            ``ZeroDivisionError``.
        count: Index of this frame within the video.
        min_confidence: Detections must score strictly above this value to
            be OCR'd. Defaults to ``0.4``, the previously hard-coded value.

    Returns:
        ``(frame, annotations)`` where ``frame`` is the same (now annotated)
        array and ``annotations`` is a list of ``(text, "<N> seconds")``
        tuples, one per detection that produced non-empty OCR text.
    """
    results = model.predict(frame)

    # OCR must read untouched pixels: rectangles and labels drawn for earlier
    # detections would otherwise leak into later, overlapping crops.
    pristine = frame.copy()
    frame_height, frame_width = frame.shape[:2]

    annotations = []
    for xmin, ymin, xmax, ymax, confidence, cls in results.xyxy[0]:
        if not float(confidence) > min_confidence:
            continue

        # Convert once and clamp to the image: a negative coordinate would
        # wrap around under array slicing and select the wrong region.
        x1 = min(max(int(xmin), 0), frame_width)
        y1 = min(max(int(ymin), 0), frame_height)
        x2 = min(max(int(xmax), 0), frame_width)
        y2 = min(max(int(ymax), 0), frame_height)
        if x2 <= x1 or y2 <= y1:
            # Nothing left inside the frame to OCR.
            continue

        text = pytesseract.image_to_string(
            pristine[y1:y2, x1:x2], config='--psm 6'
        ).strip()
        if not text:
            continue

        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(frame, text, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (36, 255, 12), 2)
        annotations.append((text, f"{count//fps} seconds"))
    return frame, annotations

# Main function to download video, process frames, and save outputs
def main(
    url='https://www.youtube.com/watch?v=UKSR0XNSXSo',
    max_seconds=5 * 60,
    sample_every_seconds=10,
    video_out='annotated_video.mp4',
    excel_out='output.xlsx',
):
    """Download a video, OCR sampled frames, and save the annotated outputs.

    Every frame up to ``max_seconds`` is copied to ``video_out``; one frame
    per ``sample_every_seconds`` is first passed through ``process_frame``.
    The collected ``(text, timestamp)`` pairs are de-duplicated, sorted by
    timestamp and written to ``excel_out``.

    Args:
        url: YouTube URL to download. Defaults to the original hard-coded video.
        max_seconds: How much of the video to process (default: five minutes).
        sample_every_seconds: Gap between frames sent to detection/OCR.
        video_out: Path of the annotated video to write.
        excel_out: Path of the Excel sheet to write.
            NOTE(review): the install cell does not install an Excel writer
            engine for pandas - presumably one is already present; confirm.

    Raises:
        RuntimeError: If no mp4 stream is available or the downloaded file
            cannot be opened by OpenCV.
    """
    yt = YouTube(url)
    video = yt.streams.filter(file_extension='mp4').get_highest_resolution()
    if video is None:
        raise RuntimeError(f"No downloadable mp4 stream found for {url}")

    cap = cv2.VideoCapture(video.download(skip_existing=True))
    if not cap.isOpened():
        cap.release()
        raise RuntimeError("OpenCV could not open the downloaded video")

    out = None
    shop_names = []
    try:
        # The reported rate can be 0 (unknown) or fractional (e.g. 29.97).
        # Truncating it used to give a zero modulus or drifting timestamps.
        native_fps = cap.get(cv2.CAP_PROP_FPS)
        if not native_fps > 0:
            native_fps = 30.0  # assumed fallback when the container reports no rate
        fps = max(1, int(round(native_fps)))  # whole-number rate used for timestamps

        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Write at the source rate so the output plays at the original speed.
        out = cv2.VideoWriter(
            video_out,
            cv2.VideoWriter_fourcc(*'mp4v'),
            native_fps,
            (frame_width, frame_height),
        )

        yolo_model = load_model()
        frame_limit = int(max_seconds * native_fps)
        sample_interval = max(1, int(round(sample_every_seconds * native_fps)))

        print("Starting video processing...")  # Debug statement
        count = 0
        while count < frame_limit:
            ret, frame = cap.read()
            if not ret:
                break
            if count % sample_interval == 0:
                # Only sampled frames go through detection + OCR to limit compute.
                frame, texts = process_frame(frame, yolo_model, fps, count)
                shop_names.extend(texts)
                print(f"Processed {count//fps} seconds")  # Debug statement
            out.write(frame)
            count += 1
    finally:
        # Release the capture and writer even if processing fails part-way.
        cap.release()
        if out is not None:
            out.release()

    # Drop exact (text, timestamp) repeats, keeping first-seen order, then
    # order by the numeric part of the "<N> seconds" timestamp.
    shop_names = list(dict.fromkeys(shop_names))
    shop_names.sort(key=lambda entry: float(entry[1].split()[0]))

    # Save to Excel
    df = pd.DataFrame(shop_names, columns=['Shop Name', 'Time Stamp'])
    df.to_excel(excel_out, index=False)
    print("Video processing complete, data exported to Excel.")

# Run the pipeline when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()


Starting video processing...
Processed 0 seconds
Processed 10 seconds
Processed 20 seconds
Processed 30 seconds
Processed 40 seconds
Processed 50 seconds
Processed 60 seconds
Processed 70 seconds
Processed 80 seconds
Processed 90 seconds
Processed 100 seconds
Processed 110 seconds
Processed 120 seconds
Processed 130 seconds
Processed 140 seconds
Processed 150 seconds
Processed 160 seconds
Processed 170 seconds
Processed 180 seconds
Processed 190 seconds
Processed 200 seconds
Processed 210 seconds
Processed 220 seconds
Processed 230 seconds
Processed 240 seconds
Processed 250 seconds
Processed 260 seconds
Processed 270 seconds
Processed 280 seconds
Processed 290 seconds
Video processing complete, data exported to Excel.
