In [None]:
!pip install git+https://github.com/openai/whisper.git
!apt install tesseract-ocr
!apt install libtesseract-dev
!pip install Pillow
!pip install pytesseract

import torch
import cv2
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
from torchvision.transforms import Compose, Normalize, Resize, ToTensor
import pytesseract
import numpy as np
import whisper
import tempfile
import moviepy.editor as mp

# Initialize Midas model for depth estimation
def load_midas_model():
    """Load the ``MiDaS_small`` depth network from torch.hub in eval mode.

    Returns:
        The model object returned by ``torch.hub.load`` after ``.eval()``
        has been called on it.
    """
    model = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")
    # eval() switches the module to inference mode and returns the module itself.
    model.eval()
    return model

# Depth estimation function
def estimate_depth(img, midas, transform):
    """Run the depth model on a single frame and return its prediction.

    Args:
        img: Frame in OpenCV's BGR channel order (it is converted with
            ``cv2.COLOR_BGR2RGB`` before use).
        midas: Callable depth model; invoked on a batch of one image.
        transform: Callable applied to the PIL image; its result must
            support ``.unsqueeze(0)`` (i.e. be a tensor).

    Returns:
        The model output with singleton dimensions squeezed out, moved to
        the CPU and converted to a NumPy array.
    """
    rgb_pixels = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb_pixels)
    # Add a leading batch dimension so the model sees a batch of size one.
    batch = transform(pil_image).unsqueeze(0)
    with torch.no_grad():
        prediction = midas(batch)
    return prediction.squeeze().cpu().numpy()

# Get transform function for Midas
def get_transform():
    """Build the preprocessing pipeline for the ``MiDaS_small`` model.

    The MiDaS hub transforms for the small model resize to 256 pixels and
    normalise with the ImageNet channel statistics; the 384 / 0.5-0.5
    settings belong to the DPT variants and degrade ``MiDaS_small``
    predictions. 256 is a multiple of 32, as the network requires.

    Returns:
        A torchvision ``Compose`` that resizes a PIL image to 256x256,
        converts it to a tensor and normalises each channel.
    """
    return Compose([
        Resize((256, 256)),
        ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

# OCR function using Tesseract
def extract_text_with_ocr(frame):
    """Return the text Tesseract recognises in a video frame.

    Args:
        frame: Frame in OpenCV's BGR channel order; it is converted to
            grayscale before being handed to Tesseract.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY))

# Extract audio from video and transcribe it
def transcribe_audio_from_video(video_path):
    """Extract the audio track of a video and transcribe it with Whisper.

    Args:
        video_path: Path of the video file to read.

    Returns:
        The transcribed text, or an empty string when the video has no
        audio track.
    """
    video_clip = mp.VideoFileClip(video_path)
    try:
        # A clip without an audio track exposes ``audio`` as None; bail out
        # before paying for the (large) Whisper model load.
        if video_clip.audio is None:
            return ""

        # Load Whisper model
        model = whisper.load_model("large")

        # Write the audio to a temporary file that is removed on exit, and
        # transcribe it while the file still exists.
        with tempfile.NamedTemporaryFile(suffix=".mp3") as temp_audio:
            video_clip.audio.write_audiofile(temp_audio.name)
            result = model.transcribe(temp_audio.name)
        return result["text"]
    finally:
        # Release the ffmpeg readers held by the clip even if transcription fails.
        video_clip.close()

# Generate Amazon listing from captions, transcriptions, OCR text, and dimensions
def generate_amazon_listing(captions, transcription, ocr_texts, dimensions):
    """Assemble a product-listing dictionary from the analysis results.

    Args:
        captions: List of caption strings; the first becomes the title and
            the rest are appended to the description.
        transcription: Transcribed audio text, or a falsy value (``None`` /
            empty string) when no transcription is available.
        ocr_texts: List of OCR text snippets, joined with single spaces.
        dimensions: Iterable of indexable ``(low, high)`` numeric pairs, one
            per analysed frame.

    Returns:
        A dict with the keys ``title``, ``description``, ``key_features``,
        ``additional_text`` and ``transcription_summary`` (the first 150
        characters of the transcription, or a placeholder message).
    """
    title = captions[0] if captions else "Product Title"
    # rstrip() avoids a dangling "Title. " when there are no further captions.
    description = f"{title}. {' '.join(captions[1:])}".rstrip()
    # NOTE(review): the caller passes the min/max of a MiDaS depth map here;
    # the "m" (metres) unit in this label looks unjustified for that data --
    # confirm the intended unit before relying on these values.
    key_features = [
        f"Dimension Range: {dim[0]:.2f}m to {dim[1]:.2f}m" for dim in dimensions
    ]
    if transcription:
        transcription_summary = transcription[:150]
    else:
        transcription_summary = "No transcription available"
    return {
        "title": title,
        "description": description,
        "key_features": key_features,
        "additional_text": " ".join(ocr_texts),
        "transcription_summary": transcription_summary,
    }

# Video analysis function
def analyze_video(video_path, analyze_audio=True, frame_interval=2, ocr_interval=2):
    """Sample frames from a video and build a product listing from them.

    For every sampled frame the function generates a BLIP caption and a
    depth-map min/max pair; on frames that also fall on the OCR interval it
    runs Tesseract. Optionally the audio track is transcribed. The results
    are combined by ``generate_amazon_listing``, printed and returned.

    Args:
        video_path: Path of the video file to analyse.
        analyze_audio: When true, transcribe the audio track as well.
        frame_interval: Seconds between frames used for captioning/depth.
        ocr_interval: Seconds between frames eligible for OCR.

    Returns:
        The listing dict produced by ``generate_amazon_listing``.

    Raises:
        ValueError: If the video cannot be opened or reports a
            non-positive frame rate.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        # Validate the input before loading any of the heavy models.
        if not cap.isOpened():
            raise ValueError(f"Could not open video: {video_path}")
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            raise ValueError(f"Video reports an invalid frame rate: {fps}")

        # Keep the fractional frame rate (e.g. 29.97) when converting seconds
        # to frames, and never let a step collapse to zero, which would break
        # range() and the modulo below.
        frame_step = max(1, round(frame_interval * fps))
        ocr_step = max(1, round(ocr_interval * fps))

        # Fall back to the CPU when no GPU is present instead of crashing.
        device = "cuda" if torch.cuda.is_available() else "cpu"

        midas = load_midas_model()
        transform = get_transform()

        # Load BLIP model and processor for image captioning
        processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-base"
        ).to(device)

        captions = []
        dimensions = []
        ocr_texts = []

        for i in range(0, frame_count, frame_step):
            cap.set(cv2.CAP_PROP_POS_FRAMES, i)
            ret, frame = cap.read()
            if not ret:
                break

            # Caption the frame
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            prompt_inputs = processor(
                image, text="Describe the product in detail.", return_tensors="pt"
            ).to(device)
            output = model.generate(
                **prompt_inputs, max_length=100, num_beams=5, no_repeat_ngram_size=2
            )
            captions.append(processor.decode(output[0], skip_special_tokens=True))

            # Depth estimation for dimensions
            depth_map = estimate_depth(frame, midas, transform)
            dimensions.append((depth_map.min(), depth_map.max()))

            # Perform OCR on frames that fall on the OCR interval
            if i % ocr_step == 0:
                # Strip so whitespace-only output (e.g. Tesseract's trailing
                # form feed) is not recorded as text.
                ocr_text = extract_text_with_ocr(frame).strip()
                if ocr_text:
                    ocr_texts.append(ocr_text)
    finally:
        # Always free the capture handle, even when a model call fails.
        cap.release()

    # Transcribe audio if specified
    transcription = transcribe_audio_from_video(video_path) if analyze_audio else None

    # Generate final Amazon listing
    listing = generate_amazon_listing(captions, transcription, ocr_texts, dimensions)

    print("\nGenerated Amazon Listing:")
    print("Title:", listing["title"])
    print("Description:", listing["description"])
    print("Key Features:", listing["key_features"])
    print("Additional Text:", listing["additional_text"])
    print("Transcription Summary:", listing["transcription_summary"])

    return listing

# Run the analysis on one video. `video_path` selects the input file and
# `analyze_audio` controls whether the audio track is transcribed as well.
video_path = "/content/hp.mp4"  # Update with your video path
analyze_audio = False  # False skips audio transcription; set to True to transcribe the audio track
analyze_video(video_path, analyze_audio=analyze_audio)


Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-r64sx17g
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-r64sx17g
  Resolved https://github.com/openai/whisper.git to commit 5979f03701209bb035a0a466f14131aeb1116cbb
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libtesseract-dev is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


Using cache found in /root/.cache/torch/hub/intel-isl_MiDaS_master


Loading weights:  None


Using cache found in /root/.cache/torch/hub/rwightman_gen-efficientnet-pytorch_master
