In [1]:
!pip install timm keras-ocr yolov5 opencv-python transformers ultralytics



In [2]:
import cv2
import os
import csv
import uuid
from datetime import datetime
from skimage import metrics
import numpy as np
import pickle
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from PIL import Image
from transformers import pipeline, AutoImageProcessor, AutoModelForDepthEstimation
import torch
import keras_ocr
from transformers import BlipProcessor, BlipForConditionalGeneration
import math
from ultralytics import YOLO

  "class": algorithms.Blowfish,
  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [3]:
!pip install opencv-python



In [4]:
def create_csv_file(csv_filename, fieldnames=None):
    """Create (or truncate) a CSV file containing only a header row.

    Args:
        csv_filename: Path of the CSV file to write. An existing file at this
            path is overwritten because the file is opened in ``'w'`` mode.
        fieldnames: Optional sequence of column names for the header row.
            When omitted, the header is
            ``id, timestamp, path, caption, depth``.
    """
    if fieldnames is None:
        fieldnames = ['id', 'timestamp', 'path', 'caption', 'depth']

    # newline='' lets the csv module control line endings itself.
    with open(csv_filename, mode='w', newline='') as file:
        csv.writer(file).writerow(list(fieldnames))

In [12]:
def capture_and_store_frames(csv_filename, frames_folder, max_frames=10):
    """Grab frames from camera device 0 and persist each one via ``save_frame``.

    Capturing stops after ``max_frames`` frames or as soon as the camera fails
    to deliver a frame, whichever comes first.

    Args:
        csv_filename: CSV log file passed through to ``save_frame``.
        frames_folder: Folder passed through to ``save_frame``.
        max_frames: Maximum number of frames to capture (default 10).

    Returns:
        The number of frames that were captured and handed to ``save_frame``.
    """
    cap = cv2.VideoCapture(0)
    frame_count = 0  # Counter for captured frames

    try:
        while frame_count < max_frames:
            ret, frame = cap.read()
            if not ret:
                # No frame delivered (camera missing, busy or disconnected).
                break

            # Save the frame
            save_frame(frame, csv_filename, frames_folder)
            frame_count += 1
    finally:
        # Release the camera even when saving a frame raises, so the device
        # is not left locked for the rest of the kernel session.
        cap.release()

    return frame_count

In [6]:
def save_frame(frame, csv_filename, frames_folder):
    """Write one frame to disk as a JPEG and log it in the frames CSV.

    Args:
        frame: Image data handed to ``cv2.imwrite``.
        csv_filename: CSV file to which an ``[id, timestamp, path]`` row is
            appended.
        frames_folder: Folder in which the ``<uuid>.jpg`` file is written.

    Raises:
        OSError: If ``cv2.imwrite`` reports that the image was not written.
            No CSV row is appended in that case, so the log never points at
            a file that does not exist.
    """
    # Generate unique ID for the frame
    frame_id = str(uuid.uuid4())

    # Timestamp of the frame
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Write frame to the frames folder with unique ID
    frame_path = os.path.join(frames_folder, frame_id + '.jpg')
    # cv2.imwrite signals failure through its boolean return value rather
    # than an exception (e.g. when the target folder does not exist).
    if not cv2.imwrite(frame_path, frame):
        raise OSError(f"Could not write frame to {frame_path!r}")

    # Write frame details to CSV file
    with open(csv_filename, mode='a', newline='') as file:
        csv.writer(file).writerow([frame_id, timestamp, frame_path])

def is_similar(image1, image2, threshold=0.9):
    """Tell whether two images are structurally similar.

    Both inputs are converted with ``cv2.COLOR_BGR2GRAY`` and compared with
    ``metrics.structural_similarity``.

    Args:
        image1: First image, passed to ``cv2.cvtColor``.
        image2: Second image, passed to ``cv2.cvtColor``.
        threshold: Similarity score that must be strictly exceeded.

    Returns:
        The result of ``score > threshold``.
    """
    grayscale_pair = [
        cv2.cvtColor(picture, cv2.COLOR_BGR2GRAY) for picture in (image1, image2)
    ]
    return metrics.structural_similarity(*grayscale_pair) > threshold

In [7]:
def generate_image_caption(image_path, processor, model):
    """Generate a text caption for the image stored at ``image_path``.

    Args:
        image_path: Path of the image file to caption.
        processor: Object that is called as
            ``processor(images=..., return_tensors="pt")`` to build the model
            inputs and whose ``decode`` turns generated ids into text.
        model: Object whose ``generate(**inputs)`` returns a sequence of
            generated id sequences; only the first one is decoded.

    Returns:
        The decoded caption, with special tokens skipped.
    """
    # Open the image in a context manager so the underlying file handle is
    # closed once preprocessing has read the pixels; the loop in the main
    # cell would otherwise leave one open handle per frame.
    with Image.open(image_path) as image:
        # Preprocess the image
        inputs = processor(images=image, return_tensors="pt")

    # Generate caption
    outputs = model.generate(**inputs)

    # Decode the generated caption
    return processor.decode(outputs[0], skip_special_tokens=True)

In [8]:
# Object-detection pipelines already built, keyed by checkpoint name. Building
# one on every call repeats the model loading for every frame, so each
# checkpoint is instantiated once and then reused.
_OBJECT_DETECTORS = {}


def _get_object_detector(model_checkpoint):
    """Return the cached object-detection pipeline for ``model_checkpoint``."""
    if model_checkpoint not in _OBJECT_DETECTORS:
        _OBJECT_DETECTORS[model_checkpoint] = pipeline(
            "object-detection", model=model_checkpoint
        )
    return _OBJECT_DETECTORS[model_checkpoint]


def detect_objects(image_path, model_checkpoint="facebook/detr-resnet-50"):
    """Run object detection on an image file.

    Args:
        image_path: Path of the image file to analyse.
        model_checkpoint: Checkpoint name given to the ``"object-detection"``
            pipeline. Defaults to ``"facebook/detr-resnet-50"``.

    Returns:
        A list with one dict per detection, each holding ``'object'`` (the
        label), ``'score'`` and ``'coordinates'`` (a dict with ``xmin``,
        ``ymin``, ``xmax`` and ``ymax`` copied from the detection's box).
    """
    object_detector = _get_object_detector(model_checkpoint)

    # Context manager closes the image file once inference is done.
    with Image.open(image_path) as image:
        detections = object_detector(image)

    return [
        {
            'object': detection['label'],
            'score': detection['score'],
            'coordinates': {
                'xmin': detection['box']['xmin'],
                'ymin': detection['box']['ymin'],
                'xmax': detection['box']['xmax'],
                'ymax': detection['box']['ymax']
            }
        }
        for detection in detections
    ]

In [9]:
# Depth-estimation pipelines already built, keyed by checkpoint name, so the
# model is not reloaded for every frame.
_DEPTH_ESTIMATORS = {}


def _get_depth_estimator(checkpoint):
    """Return the cached depth-estimation pipeline for ``checkpoint``."""
    if checkpoint not in _DEPTH_ESTIMATORS:
        _DEPTH_ESTIMATORS[checkpoint] = pipeline("depth-estimation", model=checkpoint)
    return _DEPTH_ESTIMATORS[checkpoint]


def _mean_depth_in_box(depth_map, box):
    """Return the mean of ``depth_map`` inside ``box``, or NaN if the box is empty."""
    height, width = depth_map.shape[:2]
    # Clamp to the map bounds: a negative coordinate would otherwise be
    # interpreted by NumPy as an index counted from the end of the axis.
    xmin = min(max(int(box['xmin']), 0), width)
    xmax = min(max(int(box['xmax']), 0), width)
    ymin = min(max(int(box['ymin']), 0), height)
    ymax = min(max(int(box['ymax']), 0), height)

    depth_crop = depth_map[ymin:ymax, xmin:xmax]
    if depth_crop.size == 0:
        return float('nan')
    return float(np.mean(depth_crop))


def estimate_depth(image_path, threshold=100, checkpoint="vinvino02/glpn-nyu"):
    """Estimate how far away each detected object in an image is.

    A depth map is predicted for the image, ``detect_objects`` supplies the
    bounding boxes, and the depth map is averaged inside each box.

    Args:
        image_path: Path of the image file to analyse.
        threshold: Average depth below which an object is labelled ``'near'``;
            otherwise it is labelled ``'far'``. Defaults to 100.
            NOTE(review): the unit of this value is whatever scale the
            pipeline's ``"depth"`` output uses — confirm against the model.
        checkpoint: Checkpoint name given to the ``"depth-estimation"``
            pipeline. Defaults to ``"vinvino02/glpn-nyu"``.

    Returns:
        A list with one dict per detected object containing ``'object'``,
        ``'score'``, ``'coordinates'``, ``'avg_depth'`` (a Python float) and
        ``'distance'``. When a box does not overlap the depth map at all,
        ``'avg_depth'`` is NaN and ``'distance'`` is ``'unknown'``.
    """
    depth_estimator = _get_depth_estimator(checkpoint)

    # Context manager closes the image file once inference is done.
    with Image.open(image_path) as image:
        predictions = depth_estimator(image)
    depth_map = np.array(predictions["depth"])

    detected_objects = detect_objects(image_path)

    # Analyze the depth within bounding boxes
    analysis_results = []
    for obj in detected_objects:
        avg_depth = _mean_depth_in_box(depth_map, obj['coordinates'])

        # Determine if the object is near or far based on the threshold
        if np.isnan(avg_depth):
            distance = 'unknown'
        elif avg_depth < threshold:
            distance = 'near'
        else:
            distance = 'far'

        analysis_results.append({
            'object': obj['object'],
            'score': obj['score'],
            'coordinates': obj['coordinates'],
            'avg_depth': avg_depth,
            'distance': distance
        })
    return analysis_results

In [10]:
# OCR helper, currently disabled: the definition is wrapped in a string
# literal, so `perform_ocr` is never defined and this cell only echoes the
# text. The OCR lines in the main cell are commented out accordingly.
'''def perform_ocr(image_path, ocr_pipeline):
    image = keras_ocr.tools.read(image_path)
    predictions = ocr_pipeline.recognize([image])[0]
    ocr_text = " ".join([text for text, box in predictions])
    return ocr_text'''

'def perform_ocr(image_path, ocr_pipeline):\n    image = keras_ocr.tools.read(image_path)\n    predictions = ocr_pipeline.recognize([image])[0]\n    ocr_text = " ".join([text for text, box in predictions])\n    return ocr_text'

In [13]:
if __name__ == "__main__":
    # End-to-end run: capture frames from the camera, then caption every
    # stored frame and estimate the depth of the objects in it, logging the
    # results (and the time each step took) as it goes.
    csv_filename = 'frames_data.csv'
    results_csv_filename = 'results_data.csv'
    frames_folder = 'frames'

    os.makedirs(frames_folder, exist_ok=True)

    # The frames log is kept across runs; it is only created when missing.
    if not os.path.exists(csv_filename):
        create_csv_file(csv_filename)

    capture_and_store_frames(csv_filename, frames_folder)

    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

    # Initialize OCR pipeline
    #ocr_pipeline = keras_ocr.pipeline.Pipeline()

    # Create results CSV file with column names (overwrites earlier results)
    create_csv_file(results_csv_filename)

    # Generate captions and depth estimations for saved frames.
    # sorted() makes the processing order reproducible between runs.
    for frame in sorted(os.listdir(frames_folder)):
        frame_path = os.path.join(frames_folder, frame)

        # Only process the .jpg files written by save_frame; sub-directories
        # and stray files in the folder would make Image.open fail.
        if not (os.path.isfile(frame_path) and frame.lower().endswith('.jpg')):
            continue

        #ocr_results = perform_ocr(frame_path, ocr_pipeline)

        # Each model runs exactly once per frame: the timed call is also the
        # one whose result is stored.

        # Measure caption generation time
        start_time_caption = datetime.now()
        caption = generate_image_caption(frame_path, processor, model)
        end_time_caption = datetime.now()
        caption_response_time = (end_time_caption - start_time_caption).total_seconds()
        print(f"Caption response time for {frame}: {caption_response_time} seconds")

        # Measure depth estimation time
        start_time_depth = datetime.now()
        depth = estimate_depth(frame_path)
        end_time_depth = datetime.now()
        depth_response_time = (end_time_depth - start_time_depth).total_seconds()
        print(f"Depth estimation response time for {frame}: {depth_response_time} seconds")

        # Save results to the CSV file; appending per frame keeps the rows
        # already written if a later frame fails.
        with open(results_csv_filename, mode='a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([frame, datetime.now().strftime('%Y-%m-%d %H:%M:%S'), frame_path, caption, depth])

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


Caption response time for 3ba9d4d2-adc9-431b-b756-3bc9ab011852.jpg: 7.978343 seconds


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


Depth estimation response time for 3ba9d4d2-adc9-431b-b756-3bc9ab011852.jpg: 24.187975 seconds
