In [3]:
import whisper
import requests
import json
from PIL import Image, ImageDraw
from google.colab import files
import torch
from inference_sdk import InferenceHTTPClient

def install_dependencies():
    !pip install openai-whisper
    !pip install ffmpeg-python
    !apt update && apt install -y ffmpeg
    !pip install inference-sdk
    !pip install pillow  # Ensure PIL (Pillow) is installed

def transcribe_audio(audio_path, model_name="base"):
    """Transcribe an audio file with Whisper and return normalised text.

    Args:
        audio_path: Path of the audio file to transcribe.
        model_name: Name of the Whisper model to load. Defaults to ``"base"``.

    Returns:
        The transcription, lower-cased and with surrounding whitespace
        removed (Whisper's raw text typically starts with a space).
    """
    model = whisper.load_model(model_name)
    result = model.transcribe(audio_path)
    return result["text"].strip().lower()

def identify_instrument(transcription, instrument_keywords):
    """Return the first keyword that occurs in the transcription.

    Matching is a case-insensitive substring test, so plural forms such as
    "clamps" still match the keyword "clamp". Keywords are tried in list
    order and the first hit wins, regardless of where it sits in the text.

    Args:
        transcription: Text to search. ``None`` or ``""`` yields ``None``.
        instrument_keywords: Iterable of instrument names to look for.
            ``None`` is treated as an empty list.

    Returns:
        The matching keyword exactly as it appears in
        ``instrument_keywords``, or ``None`` when nothing matches.
    """
    if not transcription:
        return None

    haystack = transcription.lower()
    for instrument in instrument_keywords or ():
        # An empty keyword is a substring of every string; skip it so it
        # cannot shadow the real keywords that follow it.
        if instrument and instrument.lower() in haystack:
            return instrument
    return None

def query_roboflow(image_path, api_key, model_id):
    """Send one image to the hosted Roboflow detection endpoint.

    Args:
        image_path: Image to run inference on, passed straight to the client.
        api_key: Roboflow API key used to authenticate the client.
        model_id: Identifier of the Roboflow model to query.

    Returns:
        Whatever ``InferenceHTTPClient.infer`` returns for this image.
    """
    roboflow = InferenceHTTPClient(
        api_key=api_key,
        api_url="https://detect.roboflow.com",
    )
    return roboflow.infer(image_path, model_id=model_id)

def draw_bounding_boxes(image_path, detection_result):
    """Draw a labelled red rectangle on the image for every prediction.

    Each prediction's ``x``/``y`` is treated as the centre of its box and
    ``width``/``height`` as the full extent of the box.

    Args:
        image_path: Path of the image to open with PIL.
        detection_result: Mapping that may hold a ``"predictions"`` list.
            Every prediction needs ``x``, ``y``, ``width`` and ``height``;
            ``class`` is used as the label when present. Anything that is
            not a dict, or has no predictions, leaves the image unannotated.

    Returns:
        The PIL image with the boxes drawn on it. Nothing is written back
        to ``image_path``.
    """
    image = Image.open(image_path)
    # Named colours only render faithfully on colour images; promote
    # palette / greyscale / other modes so "red" is actually red.
    if image.mode not in ("RGB", "RGBA"):
        image = image.convert("RGB")
    draw = ImageDraw.Draw(image)

    predictions = []
    if isinstance(detection_result, dict):
        predictions = detection_result.get("predictions") or []

    for pred in predictions:
        # True division keeps the box at its full size; floor division
        # shrank boxes with odd or fractional extents.
        half_w = pred["width"] / 2
        half_h = pred["height"] / 2
        x1, y1 = pred["x"] - half_w, pred["y"] - half_h
        x2, y2 = pred["x"] + half_w, pred["y"] + half_h

        draw.rectangle([x1, y1, x2, y2], outline="red", width=3)
        # Put the label just above the box, but never above the top edge of
        # the image where it would be clipped away.
        label_y = max(y1 - 10, 0)
        draw.text((x1, label_y), str(pred.get("class", "")), fill="red")

    return image

if __name__ == "__main__":
    # Local imports keep this cell self-contained; both are standard library.
    import getpass
    import os

    install_dependencies()

    print("Upload your audio file")
    uploaded_audio = files.upload()
    if not uploaded_audio:
        # A cancelled upload returns an empty mapping; fail with a clear message.
        raise RuntimeError("No audio file was uploaded.")
    audio_file = next(iter(uploaded_audio))

    print("Upload your image file")
    uploaded_image = files.upload()
    if not uploaded_image:
        raise RuntimeError("No image file was uploaded.")
    image_file = next(iter(uploaded_image))

    # Never commit credentials to a notebook: read the key from the
    # environment and fall back to an interactive prompt.
    # NOTE(review): a key was previously hard-coded here and should be rotated.
    api_key = os.environ.get("ROBOFLOW_API_KEY") or getpass.getpass("Roboflow API key: ")
    model_id = "surgical-tools-dataset-wnqco-t10w9/1?confidence=0.8"  # Roboflow model ID

    instrument_keywords = ["scalpel", "forceps", "clamp", "retractor", "scissors"]  # Define known instruments

    # Step 1: Transcribe the audio command
    command_text = transcribe_audio(audio_file)
    print(f"Transcribed Command: {command_text}")

    # Step 2: Identify the requested instrument
    instrument = identify_instrument(command_text, instrument_keywords)
    if instrument:
        print(f"Identified Instrument: {instrument}")

        # Step 3: Query Roboflow to detect instruments in the image
        detection_result = query_roboflow(image_file, api_key, model_id)
        print("Detection Result:", detection_result)

        # Step 4: Draw bounding boxes and show the image
        # NOTE(review): every prediction is drawn, not only the requested
        # instrument - confirm whether filtering by `instrument` is intended.
        processed_image = draw_bounding_boxes(image_file, detection_result)
        processed_image.show()
    else:
        print("No matching instrument found in the command.")

ModuleNotFoundError: No module named 'google.colab'

In [1]:
!pip install -U openai-whisper
!pip install inference-sdk

Collecting inference-sdk
  Downloading inference_sdk-0.45.0-py3-none-any.whl.metadata (20 kB)
Collecting dataclasses-json~=0.6.0 (from inference-sdk)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting opencv-python<=4.10.0.84,>=4.8.1.78 (from inference-sdk)
  Using cached opencv_python-4.10.0.84-cp37-abi3-win_amd64.whl.metadata (20 kB)
Collecting supervision<=0.30.0,>=0.25.1 (from inference-sdk)
  Downloading supervision-0.25.1-py3-none-any.whl.metadata (14 kB)
Collecting backoff~=2.2.0 (from inference-sdk)
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json~=0.6.0->inference-sdk)
  Using cached marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json~=0.6.0->inference-sdk)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Downloading inference_sdk-0.45.0-py3-none-any.whl (43 kB)
Downloading backoff-2.2.1-py3-n

In [1]:
#Voice Control
import whisper
import requests
import json
from PIL import Image, ImageDraw
from google.colab import files
import torch
from inference_sdk import InferenceHTTPClient

def install_dependencies():
    !pip install openai-whisper
    !pip install ffmpeg-python
    !apt update && apt install -y ffmpeg
    !pip install inference-sdk
    !pip install pillow

def transcribe_audio(audio_path, model_name="base"):
    """Transcribe an audio file with Whisper and return normalised text.

    Args:
        audio_path: Path of the audio file to transcribe.
        model_name: Name of the Whisper model to load. Defaults to ``"base"``.

    Returns:
        The transcription, lower-cased and with surrounding whitespace
        removed (Whisper's raw text typically starts with a space).
    """
    model = whisper.load_model(model_name)
    result = model.transcribe(audio_path)
    return result["text"].strip().lower()

def identify_instrument(transcription, instrument_keywords):
    """Return the first keyword that occurs in the transcription.

    Matching is a case-insensitive substring test, so plural forms such as
    "clamps" still match the keyword "clamp". Keywords are tried in list
    order and the first hit wins, regardless of where it sits in the text.

    Args:
        transcription: Text to search. ``None`` or ``""`` yields ``None``.
        instrument_keywords: Iterable of instrument names to look for.
            ``None`` is treated as an empty list.

    Returns:
        The matching keyword exactly as it appears in
        ``instrument_keywords``, or ``None`` when nothing matches.
    """
    if not transcription:
        return None

    haystack = transcription.lower()
    for instrument in instrument_keywords or ():
        # An empty keyword is a substring of every string; skip it so it
        # cannot shadow the real keywords that follow it.
        if instrument and instrument.lower() in haystack:
            return instrument
    return None

if __name__ == "__main__":
    # Local imports keep this cell self-contained; both are standard library.
    import getpass
    import os

    install_dependencies()

    print("Upload your audio file")
    uploaded_audio = files.upload()
    if not uploaded_audio:
        # A cancelled upload returns an empty mapping; fail with a clear message.
        raise RuntimeError("No audio file was uploaded.")
    audio_file = next(iter(uploaded_audio))

    # Never commit credentials to a notebook: read the key from the
    # environment and fall back to an interactive prompt.
    # NOTE(review): a key was previously hard-coded here and should be rotated.
    # `api_key` and `model_id` are not used in this cell; they stay defined
    # in case other cells read them - confirm, then drop if not.
    api_key = os.environ.get("ROBOFLOW_API_KEY") or getpass.getpass("Roboflow API key: ")
    model_id = "surgical-tools-dataset-wnqco-t10w9/1?confidence=0.8"  # Roboflow model ID

    instrument_keywords = ["scalpel", "forceps", "clamp", "retractor", "scissors"]  # Define known instruments

    # Step 1: Transcribe the audio command
    command_text = transcribe_audio(audio_file)
    print(f"Transcribed Command: {command_text}")

    # Step 2: Identify the requested instrument
    instrument = identify_instrument(command_text, instrument_keywords)
    if instrument:
        print(f"Identified Instrument: {instrument}")
    else:
        # Report the miss explicitly instead of ending silently.
        print("No matching instrument found in the command.")

Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Installing collected packages: ffmpeg-python
Successfully installed ffmpeg-python-0.2.0
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,370 kB]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:9 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,235 kB]
Hit:10 https://ppa.launchpadcontent.net/graph

Saving Clark Hall.m4a to Clark Hall.m4a


100%|████████████████████████████████████████| 139M/139M [00:00<00:00, 164MiB/s]


Transcribed Command:  scalpel
Identified Instrument: scalpel
