In [1]:
import base64
import requests
from io import BytesIO
from PIL import Image
import os
from dotenv import load_dotenv
import cv2
import easyocr
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import clear_output


load_dotenv()

def encode_image(img):
    buffered = BytesIO()
    img.save(buffered, format="PNG")
    encoded_string = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return encoded_string


def analyze_image(image_path):
    img = Image.open(image_path)
    base64_img = encode_image(img)

    api = "https://api.hyperbolic.xyz/v1/chat/completions"
    api_key = os.getenv("HYPERBOLIC_API_KEY")

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    payload = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is this image?"},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"},
                    },
                ],
            }
        ],
        "model": "meta-llama/Llama-3.2-90B-Vision-Instruct",
        "max_tokens": 2048,
        "temperature": 0.7,
        "top_p": 0.9,
    }

    response = requests.post(api, headers=headers, json=payload)
    return response.json()

# Example usage:
# result = analyze_image("path_to_your_image")
# print(result)


In [3]:

def euclidean_distance(box1, box2):
    """Calculate the Euclidean distance between the centers of two bounding boxes."""
    x1_center = (box1[0][0] + box1[2][0]) / 2
    y1_center = (box1[0][1] + box1[2][1]) / 2
    x2_center = (box2[0][0] + box2[2][0]) / 2
    y2_center = (box2[0][1] + box2[2][1]) / 2
    return ((x1_center - x2_center) ** 2 + (y1_center - y2_center) ** 2) ** 0.5


# Initialize the video capture object
cap = cv2.VideoCapture(0)  # Use 0 for the default camera

# Initialize EasyOCR reader (you can add more languages if needed)
reader = easyocr.Reader(lang_list=['en'])  # Initialize for English

previous_words = {}

detected_text_with_confidence = []
pure_text = ""



while True:
    # Capture frame from the camera
    ret, frame = cap.read()
    if not ret:
        break

    # Perform OCR on the frame directly
    results = reader.readtext(image=frame, allowlist='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,? ')
    # Extract and print the detected text with confidence
    confidence_threshold = 0.6  # Adjust this value as needed
    for bbox, text, conf in results:
        if conf > confidence_threshold:
            text = text.strip().lower()
            if text == "":
                continue


            (x0, y0), (x1, y1), (x2, y2), (x3, y3) = bbox
            cv2.rectangle(frame, (int(x0), int(y0)), (int(x2), int(y2)), (0, 255, 0), 5)
            cv2.putText(frame, text, (int(x0), int(y0)), cv2.FONT_HERSHEY_COMPLEX, 0.65, (255, 0, 0), 2)

            # check if word is in previous words
            if text in previous_words:
                continue
            previous_words[text] = True
            detected_text_with_confidence.append(f"(Confidence: {conf:.2f}) {text}")
            pure_text += f"{text} "

    # Clear the output
    clear_output(wait=True)

    # Print detected text with confidence
    if detected_text_with_confidence:
        print("pure_text", pure_text)
        print("Detected text:")
        for item in detected_text_with_confidence:
            print(item)

    # Break the loop if 'q' is pressed
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

    # write text to file
    with open("text.txt", "w") as f:
        f.write(pure_text)
    
    # Display the current frame
    plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    plt.axis('off')  # Turn off axis labels
    
    # Update the display without blocking
    plt.draw()
    plt.pause(0.000001)  # Small pause to allow the plot to update
# Release resources
cap.release()
cv2.destroyAllWindows()


pure_text h a p ay rt y ed b bl bitrithd r pipi i ply pipin birthda 
Detected text:
(Confidence: 0.96) h
(Confidence: 1.00) a
(Confidence: 0.89) p
(Confidence: 0.86) ay
(Confidence: 0.97) rt
(Confidence: 0.94) y
(Confidence: 0.84) ed
(Confidence: 1.00) b
(Confidence: 0.70) bl
(Confidence: 0.62) bitrithd
(Confidence: 1.00) r
(Confidence: 0.71) pipi
(Confidence: 0.60) i
(Confidence: 0.67) ply
(Confidence: 0.76) pipin
(Confidence: 0.86) birthda


KeyboardInterrupt: 

In [None]:
# Define the OCR function easyocr


# read image
image_path = 'image.jpg'

img = cv2.imread(image_path)

# instance text detector
reader = easyocr.Reader(['en'], gpu=True)

# detect text on image
text_ = reader.readtext(img)

print(text_)

# threshold = 0.25
# # draw bbox and text
# for t_, t in enumerate(text_):
#     print(t)

#     bbox, text, score = t

#     if score > threshold:
#         cv2.rectangle(img, bbox[0], bbox[2], (0, 255, 0), 5)
#         cv2.putText(img, text, bbox[0], cv2.FONT_HERSHEY_COMPLEX, 0.65, (255, 0, 0), 2)

plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
plt.show()

In [None]:
# Deepgram streaming

# take in text from the user and send it to deepgram
example_text = "Hello, how are you, what is your name?"



In [None]:
# get the camera feed from my computer

cap = cv2.VideoCapture(0)

import time

n = 5  # Set the interval in seconds
print("Starting image analysis")
last_analysis_time = time.time()

while True:
    ret, frame = cap.read()
    cv2.imshow('frame', frame)

    current_time = time.time()

    # Check if n seconds have passed since the last analysis
    if current_time - last_analysis_time >= n:
        print("Analyzing image")
        # export the image to a file
        cv2.imwrite("image.jpg", frame)

        # Analyze the image using the hyperbolic api
        result = analyze_image("image.jpg")
        print("Image analysis result:", result)

        # Update the last analysis time
        last_analysis_time = current_time

    # Break the loop if 'q' is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


In [None]:
# send text to deepgram