In [None]:
def non_max_suppression(boxes, probs=None, overlapThresh=0.65):
    if len(boxes) == 0:
        return []

    if boxes.dtype.kind == "i":
        boxes = boxes.astype("float")

    pick = []

    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]

    area = (x2 - x1 + 1) * (y2 - y1 + 1)
    idxs = y2

    if probs is not None:
        idxs = probs

    idxs = np.argsort(idxs)

    while len(idxs) > 0:
        last = len(idxs) - 1
        i = idxs[last]
        pick.append(i)

        xx1 = np.maximum(x1[i], x1[idxs[:last]])
        yy1 = np.maximum(y1[i], y1[idxs[:last]])
        xx2 = np.minimum(x2[i], x2[idxs[:last]])
        yy2 = np.minimum(y2[i], y2[idxs[:last]])

        w = np.maximum(0, xx2 - xx1 + 1)
        h = np.maximum(0, yy2 - yy1 + 1)

        overlap = (w * h) / area[idxs[:last]]

        idxs = np.delete(idxs, np.concatenate(([last], np.where(overlap > overlapThresh)[0])))

    return boxes[pick].astype("int")


In [None]:
import cv2

# Check for available camera devices
available_cameras = []
for i in range(10):  # Check indices 0-9
    cap = cv2.VideoCapture(i)
    if cap.isOpened():
        print(f"Camera index {i} is available.")
        available_cameras.append(i)
        cap.release()
    else:
        print(f"Camera index {i} is not available.")

print("Available cameras:", available_cameras)


In [2]:
import numpy as np
import cv2

# Initialize the HOG descriptor/person detector
hog = cv2.HOGDescriptor()
hog.setSVMDetector(cv2.HOGDescriptor_getDefaultPeopleDetector())

cv2.startWindowThread()

# Open webcam video stream
cap = cv2.VideoCapture(0)

# Output will be written to output.avi
out = cv2.VideoWriter(
    'output.mp4',
    cv2.VideoWriter_fourcc(*'MJPG'),
    15.,
    (640, 480))

while(True):
    # Capture frame-by-frame
    ret, frame = cap.read()
    
    # Resizing for faster detection
    frame = cv2.resize(frame, (640, 480))
    
    # Using a grayscale picture, also for faster detection
    # gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    gray = frame # setting to colour for now
    
    # Detect people in the image
    # Returns the bounding boxes for the detected objects
    boxes, weights = hog.detectMultiScale(frame, winStride=(4, 4), padding=(8, 8), scale=1.05, useMeanshiftGrouping=False)

    boxes = np.array([[x, y, x + w, y + h] for (x, y, w, h) in boxes])
    
    for (xA, yA, xB, yB) in boxes:
        # Display the detected boxes in the color picture
        cv2.rectangle(frame, (xA, yA), (xB, yB), (0, 255, 0), 2)
    # Write the output video
    out.write(frame.astype('uint8'))
    # Display the resulting frame
    cv2.imshow('frame', frame)
    
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# When everything is done, release the capture
cap.release()

# And release the output
out.release()


# Finally, close the window1q
cv2.destroyAllWindows()
cv2.waitKey(1)


OpenCV: FFMPEG: tag 0x47504a4d/'MJPG' is not supported with codec id 7 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'


-1

In [None]:
# Your existing HOG code
# ...

# Non-max suppression
boxes, weights = hog.detectMultiScale(frame, winStride=(8, 8))
if len(boxes) > 0:
    pick = non_max_suppression(boxes, probs=weights, overlapThresh=0.65)
    for (xA, yA, xB, yB) in pick:
        cv2.rectangle(frame, (xA, yA), (xB, yB), (0, 255, 0), 2)


In [4]:
import pyaudio
import numpy as np

# Initialize PyAudio
p = pyaudio.PyAudio()

# Open a streaming channel
stream = p.open(format=pyaudio.paInt16,  # 16-bit depth
                channels=1,  # Mono
                rate=44100,  # 44.1kHz sampling rate
                input=True,  # Input enabled
                frames_per_buffer=1024)  # Buffer size

while True:
    # Read audio stream
    audio_data = stream.read(1024)
    audio_array = np.frombuffer(audio_data, dtype=np.int16)

    # Here you can manipulate the audio_array, similar to how you manipulate video frames
    # ...

    # Uncomment the following lines if you want to hear the audio you're capturing
    # stream_out = p.open(format=pyaudio.paInt16,
    #                     channels=1,
    #                     rate=44100,
    #                     output=True)
    # stream_out.write(audio_array.tobytes())

    # Uncomment to exit the loop when you press 'q'
    # if cv2.waitKey(1) & 0xFF == ord('q'):
    #     break

# Close the stream
stream.stop_stream()
stream.close()

# Terminate PyAudio
p.terminate()


KeyboardInterrupt: 

In [6]:
import pyaudio
import wave
import numpy as np

# Initialize PyAudio
p = pyaudio.PyAudio()

# Setup
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 1024
RECORD_SECONDS = 5
WAVE_OUTPUT_FILENAME = "output.wav"

# Start Recording
stream = p.open(format=FORMAT, channels=CHANNELS,
                rate=RATE, input=True,
                frames_per_buffer=CHUNK)

print("Recording...")

frames = []

for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    audio_data = stream.read(CHUNK)
    frames.append(audio_data)

print("Finished recording.")

# Stop Recording
stream.stop_stream()
stream.close()
p.terminate()

# Save as a WAV file
wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(p.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))
wf.close()


1024

audio with saving

In [4]:
import pyaudio
import wave
import numpy as np

# Initialize PyAudio
p = pyaudio.PyAudio()

# Setup
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 1024
RECORD_SECONDS = 5
WAVE_OUTPUT_FILENAME = "output.wav"

# Start Recording
stream = p.open(format=FORMAT, channels=CHANNELS,
                rate=RATE, input=True,
                frames_per_buffer=CHUNK)

print("Recording...")

frames = []

for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    audio_data = stream.read(CHUNK)
    frames.append(audio_data)

print("Finished recording.")

# Stop Recording
stream.stop_stream()
stream.close()
p.terminate()

# Save as a WAV file
wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(p.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))
wf.close()


Recording...
Finished recording.


audio transcription

In [4]:
import sys

# Path to append
path_to_append = '/opt/homebrew/opt/python@3.11/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages'

# Check if the path is already in sys.path
if path_to_append not in sys.path:
    sys.path.append(path_to_append)

In [19]:
from dotenv import load_dotenv
import os

load_dotenv()  # This will load all the environment variables from .env file

openai_api_key = os.getenv("OPENAI_API_KEY")  # Now the API key is loaded in this variable
replicate_api_key = os.getenv("REPLICATE_API_TOKEN")  # Now the API key is loaded in this variable``

In [23]:
replicate_api_key

'r8_G6djbie2rIpglMJLKlkr2rS2vqzHUFl1KktHK'

In [5]:
# Note: you need to be using OpenAI Python v0.27.0 for the code below to work
import openai
audio_file= open("output.wav", "rb")
transcript = openai.Audio.transcribe("whisper-1", audio_file)
print(transcript)

{
  "text": "testing testing transcribe this accurately"
}


REPLICATE

In [11]:
import replicate 


In [30]:
REPLICATE_API_TOKEN = os.getenv("REPLICATE_API_TOKEN")
replicate.init(api_token=os.getenv("REPLICATE_API_TOKEN"))

AttributeError: module 'replicate' has no attribute 'init'

In [31]:
replicate = replicate.Client(api_token=REPLICATE_API_TOKEN)

In [40]:
replicate_prompt= "in today's podcaset, we are going to understand how and why ravi riley is such a fine and charming gentleman? "

In [41]:
replicate_output = replicate.run(
  "suno-ai/bark:b76242b40d67c76ab6742e987628a2a9ac019e11d56ab96c4e91ce03b79b2787",
  input={"prompt": replicate_prompt},
)

In [42]:
replicate_output

{'audio_out': 'https://pbxt.replicate.delivery/S0Tsh4Ap2yb1PRGUeIuNYfbxT37i9A9ei9NA1EUAklYJ18ljA/audio.wav'}