In [21]:
import time
import os
import numpy as np
import whisper
import string
import librosa
import speech_recognition as sr

# Load a Whisper "base" checkpoint at module level.
# NOTE(review): `whisper_model` is not referenced anywhere else in the visible
# part of this notebook, and `Transcriber.__init__` loads its own model, so this
# may be a redundant second load - confirm before removing.
whisper_model = whisper.load_model("base")  # You can use "small", "medium", etc.

# Shared speech_recognition recognizer used by the listen_* functions.
recognizer = sr.Recognizer()

# Shared microphone source; the listen_* functions open it via `with microphone as source`.
microphone = sr.Microphone()

# Transcription Class using Whisper
class Transcriber:
    """Thin wrapper around a Whisper model that turns int16 PCM samples into text.

    Attributes are set in ``__init__``:
      * ``model``   - the object returned by ``whisper.load_model``.
      * ``prompts`` - value of the ``WHISPER_INITIAL_PROMPT`` environment
        variable (empty string when unset), forwarded as ``initial_prompt``.
    """

    def __init__(self, model) -> None:
        """Load the Whisper model called ``model`` and read the optional prompt."""
        print("Loading Whisper model...")
        self.model = whisper.load_model(model)
        print("Model loaded.")
        initial_prompt = os.environ.get("WHISPER_INITIAL_PROMPT", "")
        self.prompts = initial_prompt
        print(f"Using prompts: {self.prompts}")

    def transcribe(self, frames):
        """Transcribe a sequence of int16 samples as English speech.

        Args:
            frames: samples convertible to an ``np.int16`` array.

        Returns:
            The ``"text"`` entry of the model result, ``"No speech detected"``
            when the result has no such entry, or ``None`` if anything raised
            (the error is printed rather than propagated).
        """
        print("Transcribing audio...")
        try:
            # int16 PCM -> float32 scaled by 1/32768 before handing to the model.
            pcm = np.array(frames, dtype=np.int16)
            audio_data = pcm.astype(np.float32) / 32768.0
            result = self.model.transcribe(
                audio=audio_data,
                language="en",
                fp16=False,
                initial_prompt=self.prompts,
            )
            return result.get("text", "No speech detected")
        except Exception as e:
            print(f"Error during transcription: {e}")
            return None

# Module-level Transcriber instance used by process_audio_file; constructing it
# loads the Whisper "base" model (see Transcriber.__init__).
transcriber = Transcriber("base")  # Use any Whisper model variant

# Wake words/phrases, matched as lower-case substrings of the recognized text.
wake_words = ["astra"]

# Identify instrument from transcription
def identify_instrument(command):
    """Find the first known surgical instrument named in a spoken command.

    Each whitespace-separated word is stripped of ASCII punctuation and
    lower-cased, then compared against the known instrument names. The search
    stops at the first hit. The outcome is printed (as before) and is now also
    returned, so callers can act on it instead of only seeing console output.

    Args:
        command: transcribed text. ``None`` or an empty string is treated as
            "nothing said".

    Returns:
        The lower-case instrument name (``'forceps'``, ``'scalpel'``,
        ``'scissors'`` or ``'tweezers'``), or ``None`` when no instrument is
        mentioned.
    """
    instruments = {'forceps', 'scalpel', 'scissors', 'tweezers'}

    # Build the punctuation-stripping table once instead of once per word.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    instrument = None
    for word in (command or '').split():
        candidate = word.translate(strip_punctuation).lower()
        if candidate in instruments:
            instrument = candidate
            break  # Only the first instrument mentioned is reported

    if instrument is None:
        print("No instrument found")
    else:
        print(f"Instrument identified: {instrument}")

    return instrument

# Process audio file for wake word detection
def process_audio_file(file_path):
    """Transcribe a recorded audio file and, if a wake word is present, look for an instrument.

    The file is loaded with librosa at 16 kHz, converted to int16 samples,
    transcribed with the module-level ``transcriber``, and the transcription is
    checked (case-insensitively, as a substring) against ``wake_words``. When a
    wake word is found the transcription is passed to ``identify_instrument``.

    Args:
        file_path: path of the audio file to process.

    Returns:
        None. Progress and results are printed. Any exception raised along the
        way is caught and reported with a printed message (best-effort
        behaviour kept from the original).
    """
    try:
        # Load audio using librosa with resampling to 16kHz.
        # The local is called `sample_rate` so it does not shadow the
        # module-level `sr` alias for speech_recognition.
        audio_float, sample_rate = librosa.load(file_path, sr=16000)
        print(f"Audio loaded with sample rate: {sample_rate}")

        # Scale to the int16 range expected by Transcriber.transcribe. Clip
        # first: a sample at (or, after resampling, slightly beyond) +1.0 scales
        # to >= 32768, which does not fit in int16 and would otherwise wrap.
        audio_data = np.clip(audio_float * 32768, -32768, 32767).astype(np.int16)

        # Transcribe the audio
        transcription = transcriber.transcribe(audio_data)
        if transcription is None:
            print("Transcription failed.")
            return

        print(f"Transcription result: {transcription}")

        # Check for the first wake word contained in the transcription
        lowered = transcription.lower()
        detected = next((w for w in wake_words if w in lowered), None)
        if detected is None:
            print("No wake word detected. Skipping instrument identification.")
            return

        print(f"Wake word detected: {detected}")
        identify_instrument(transcription)  # Identify instrument from transcription

    except Exception as e:
        print(f"Error processing audio file: {e}")

# Function to listen for wake word
def listen_for_wake_word():
    """Block on the microphone until a recognized phrase contains a wake word.

    Opens the shared ``microphone``, calls ``adjust_for_ambient_noise`` once,
    then loops: capture a phrase, send it to ``recognize_google``, lower-case
    the text and look for any entry of ``wake_words`` inside it. Recognition
    errors are printed and the loop moves on to the next phrase.

    Returns:
        bool: ``True`` once a wake word is heard (the only way the loop exits).
    """
    print("Listening for wake word...")
    with microphone as source:
        recognizer.adjust_for_ambient_noise(source)
        while True:
            captured = recognizer.listen(source)
            try:
                query = recognizer.recognize_google(captured).lower()
                print(f"Recognized speech: {query}")

                for candidate in wake_words:
                    if candidate in query:
                        print(f"Wake word detected: {query}")
                        return True

            except Exception as e:
                # Report and fall through to the next loop iteration.
                print(f"Error recognizing speech: {e}")

# Function to listen for speech after wake word is detected
def listen_for_speech():
    """Capture one phrase from the microphone and return its recognized text.

    The shared ``microphone`` is opened only for the capture; recognition via
    ``recognize_google`` happens after the source has been released.

    Returns:
        The recognized text, or ``None`` if recognition raised (the error is
        printed).
    """
    print("Listening for command after wake word...")
    with microphone as source:
        recognizer.adjust_for_ambient_noise(source)
        captured = recognizer.listen(source)

    try:
        print("Recognizing speech...")
        recognized_text = recognizer.recognize_google(captured)
        print(f"Recognized text: {recognized_text}")
    except Exception as e:
        print(f"Error recognizing speech: {e}")
        return None
    return recognized_text

# Combined loop that listens for wake word and then for instrument command
def listen_and_transcribe():
    """Run forever: wait for a wake word, then hear one command and look for an instrument.

    A command that comes back empty or ``None`` is skipped and the loop returns
    to waiting for the wake word. This function never returns.
    """
    while True:
        if not listen_for_wake_word():
            continue
        spoken_command = listen_for_speech()
        if not spoken_command:
            continue
        identify_instrument(spoken_command)



# Driver: run the wake-word + instrument pipeline on one recorded file.
# The commented-out calls are earlier test recordings kept for reference.
# NOTE(review): these are absolute, machine-specific paths. The noise-generation
# cell writes "output_with_synthetic_noise_3.wav" to a relative path, so confirm
# the file actually lives at the Downloads location used below.
# process_audio_file("/Users/sreyaskanaparti/Downloads/Levering Hall.wav")
# process_audio_file("/Users/sreyaskanaparti/Downloads/Levering Hall 2.wav")
# process_audio_file("/Users/sreyaskanaparti/Downloads/Levering Hall 3.wav")
# process_audio_file('/Users/sreyaskanaparti/Downloads/Mason Hall.wav')
# process_audio_file('/Users/sreyaskanaparti/Downloads/Mason Hall 3.wav')
process_audio_file('/Users/sreyaskanaparti/Downloads/output_with_synthetic_noise_3.wav')

# listen_and_transcribe()  # Uncomment to listen on the microphone instead; this call loops forever

Loading Whisper model...
Model loaded.
Using prompts: 
Audio loaded with sample rate: 16000
Transcribing audio...
Transcription result:  Astra, give me scissors.
Wake word detected: astra
Instrument identified: scissors


In [17]:
from pydub import AudioSegment

def generate_white_noise(duration_ms, sample_rate=16000):
    """Return Gaussian white noise covering ``duration_ms`` milliseconds.

    Args:
        duration_ms: length of the noise in milliseconds.
        sample_rate: samples per second (default 16000).

    Returns:
        1-D array of ``int(duration_ms * sample_rate / 1000)`` values drawn from
        a normal distribution with mean 0 and standard deviation 1, using
        NumPy's global random state.
    """
    sample_count = int(duration_ms * sample_rate / 1000)
    return np.random.normal(loc=0, scale=1, size=sample_count)

def add_synthetic_noise(input_audio_path, output_audio_path, noise_level=0.1):
    """Mix Gaussian white noise into a WAV file and export the result as a new WAV.

    Args:
        input_audio_path: path of the WAV file to read.
        output_audio_path: path the noisy WAV is written to.
        noise_level: standard deviation of the added noise, expressed as a
            fraction of the input's peak absolute sample value (default 0.1).

    Returns:
        None. The noisy audio is written to ``output_audio_path`` and a
        confirmation line is printed.

    NOTE(review): the mixed signal is clipped to the int16 range and packed as
    int16 while ``sample_width`` is copied from the input, so this presumably
    expects 16-bit PCM input - confirm before using other bit depths.
    """
    # Load the original audio file
    original_audio = AudioSegment.from_wav(input_audio_path)

    # Get the original audio data as numpy array
    samples = np.array(original_audio.get_array_of_samples(), dtype=np.float32)

    # Draw exactly one noise value per sample. The previous approach converted
    # the sample count to milliseconds and back (len / rate * 1000, then
    # * rate / 1000 with int() truncation); that float round trip can come out
    # one sample short and make `samples + noise` fail with a shape mismatch.
    noise = np.random.normal(0, 1, len(samples))

    # Scale the noise relative to the loudest input sample. Empty audio has no
    # peak, so use 0 instead of letting np.max raise on an empty array.
    peak = np.max(np.abs(samples)) if len(samples) else 0.0
    noise *= noise_level * peak

    # Add the noise to the original audio
    noisy_samples = samples + noise

    # Clip to the int16 range so the cast below cannot wrap around
    noisy_samples = np.clip(noisy_samples, -32768, 32767)

    # Convert noisy samples back to audio with the original's format parameters
    noisy_audio = AudioSegment(
        noisy_samples.astype(np.int16).tobytes(),
        frame_rate=original_audio.frame_rate,
        sample_width=original_audio.sample_width,
        channels=original_audio.channels
    )

    # Export the noisy audio to a new file
    noisy_audio.export(output_audio_path, format="wav")
    print(f"Audio with synthetic background noise saved to {output_audio_path}")

# Example usage: add low-level noise (5% of peak amplitude) to one recording.
# NOTE(review): the input is an absolute, machine-specific path and the output
# is relative to the kernel's working directory - adjust both for your machine.
input_audio = "/Users/sreyaskanaparti/Downloads/Levering Hall 2.wav"
output_audio = "output_with_synthetic_noise_3.wav"

add_synthetic_noise(input_audio, output_audio, noise_level=0.05)

Audio with synthetic background noise saved to output_with_synthetic_noise_3.wav


In [23]:
# Yolo trained on roboflow dataset

from ultralytics import YOLO
import torch

# Load the YOLOv5 code from the ultralytics/yolov5 hub repo and wrap the local
# custom weights file. force_reload=True refreshes the hub cache on every run
# (the captured output shows the repository zip being downloaded).
# NOTE(review): `YOLO` from `ultralytics` is imported above but not used in the
# visible code - confirm whether that import is still needed.
model = torch.hub.load('ultralytics/yolov5', 'custom', path='./models/instrument_detector_model.pt', force_reload=True)  # load a custom model

# Run inference on a single evaluation image; `results` is consumed by the next cell.
results = model("./eval-images/test_instruments.jpg")  # predict on an image

# Disabled result-inspection snippet.
# NOTE(review): it reads `result.boxes`, which looks like the `ultralytics`
# package's Results API rather than the object torch.hub YOLOv5 returns -
# verify before re-enabling.
# # Access the results
# for result in results:
#     xywh = result.boxes.xywh  # center-x, center-y, width, height
#     xywhn = result.boxes.xywhn  # normalized
#     xyxy = result.boxes.xyxy  # top-left-x, top-left-y, bottom-right-x, bottom-right-y
#     xyxyn = result.boxes.xyxyn  # normalized
#     names = [result.names[cls.item()] for cls in result.boxes.cls.int()]  # class name of each box
#     confs = result.boxes.conf  # confidence score of each box

Downloading: "https://github.com/ultralytics/yolov5/zipball/master" to C:\Users\maxis/.cache\torch\hub\master.zip
YOLOv5  2025-4-4 Python-3.9.21 torch-2.6.0+cu118 CUDA:0 (NVIDIA GeForce RTX 3050 Laptop GPU, 4096MiB)

Fusing layers... 
Model summary: 267 layers, 46167513 parameters, 0 gradients, 107.8 GFLOPs
Adding AutoShape... 
  with amp.autocast(autocast):


In [24]:
# Replace the class-index -> name mapping on the results object from the
# previous cell with human-readable instrument names.
# presumably results.save() uses these names when labelling boxes - TODO confirm
results.names = {0: 'Standard Anatomical Tweezers',
 1: 'Slim Anatomical Tweezers',
 2: 'Surgical Tweezers',
 3: 'Splinter Tweezers',
 4: 'Scalpel Handle No. 3',
 5: 'Scalpel Handle No. 4',
 6: 'Clenched Scalpel',
 7: 'Narrow Scalpel',
 8: 'Surgical Scissors Sharp/Sharp',
 9: 'Surgical Scissors Sharp/Narrow',
 10: 'Standard Dissecting Scissors',
 11: 'Dissecting Needle'}

# Write the annotated image to disk (the captured output shows runs\detect\exp3).
results.save()

Saved 1 image to [1mruns\detect\exp3[0m


In [4]:
import os
import re

# Folder whose label files are scanned; the result file is written here too.
directory = 'C:/Users/maxis/Desktop/JHU/Term 2/Design Team/Data/HOSPI_Tools_small_yolov5/labels/train'  # Change this to your directory path

# File names are expected to look like "<label>_<digits>.txt" (case-insensitive).
# Group 1 keeps the label including its trailing underscore, e.g. "No4BP_Handle_".
pattern = re.compile(r'^(.*_)(\d+)\.txt$', re.IGNORECASE)

# Collect each distinct label prefix once.
unique_labels = set()
with os.scandir(directory) as entries:
    for entry in entries:
        if not entry.is_file():
            continue
        if not entry.name.lower().endswith('.txt'):
            continue
        match = pattern.match(entry.name)
        if match is None:
            continue
        unique_labels.add(match.group(1))

# Sort so the output file is stable between runs.
sorted_labels = sorted(unique_labels)

# One label per line, written next to the scanned files.
output_file = os.path.join(directory, 'unique_labels.txt')
with open(output_file, 'w') as f:
    f.writelines(label + '\n' for label in sorted_labels)

print(f"Unique labels saved to {output_file}")


Unique labels saved to C:/Users/maxis/Desktop/JHU/Term 2/Design Team/Data/HOSPI_Tools_small_yolov5/labels/train\unique_labels.txt


In [13]:
# Online hospitools trained model

from inference_sdk import InferenceHTTPClient

import os

# The Roboflow API key is a credential and must not live in the notebook.
# Read it from the environment and fail with a clear message when it is missing.
# NOTE(review): the key that was previously hard-coded here has been exposed in
# the notebook source and should be revoked/rotated.
ROBOFLOW_API_KEY = os.environ.get("ROBOFLOW_API_KEY")
if not ROBOFLOW_API_KEY:
    raise RuntimeError(
        "Set the ROBOFLOW_API_KEY environment variable before running this cell."
    )

# HTTP client for the hosted Roboflow detection endpoint.
CLIENT = InferenceHTTPClient(
    api_url="https://detect.roboflow.com",
    api_key=ROBOFLOW_API_KEY
)

# Run the hosted "hospitools-data/1" model on one evaluation image and display
# the raw response (the bare `result` expression is the cell output).
result = CLIENT.infer("./eval-images/inst-7.jpg", model_id="hospitools-data/1")
result

{'inference_id': '223ecda7-9992-4069-b2fc-20d634142d5f',
 'time': 0.03522089600119216,
 'image': {'width': 612, 'height': 408},
 'predictions': [{'x': 197.5,
   'y': 245.0,
   'width': 117.0,
   'height': 278.0,
   'confidence': 0.8790383338928223,
   'class': '6_Babcock_Tissue_Forceps',
   'class_id': 0,
   'detection_id': 'b6db08a7-032c-46d5-9841-e538ad6e00fa'},
  {'x': 69.5,
   'y': 221.5,
   'width': 131.0,
   'height': 333.0,
   'confidence': 0.8765726089477539,
   'class': '6_Babcock_Tissue_Forceps',
   'class_id': 0,
   'detection_id': 'aa40e352-8f26-4dce-a9df-058d59b341d5'},
  {'x': 317.0,
   'y': 273.5,
   'width': 94.0,
   'height': 221.0,
   'confidence': 0.7567417025566101,
   'class': '6_Babcock_Tissue_Forceps',
   'class_id': 0,
   'detection_id': '7ca93634-8cd0-4bb9-9087-9dac9d86b134'},
  {'x': 506.5,
   'y': 265.5,
   'width': 211.0,
   'height': 243.0,
   'confidence': 0.675891637802124,
   'class': '6_Babcock_Tissue_Forceps',
   'class_id': 0,
   'detection_id': 'dd7c