# Media File Processing with Hume and OpenAI APIs
This notebook demonstrates how to use the Hume API for processing batches of media files, and how to use OpenAI's GPT and Whisper models for advanced tasks like image understanding and audio transcription.

In [None]:
# Import necessary libraries
from hume import HumeBatchClient
from hume.models.config import FaceConfig
import os
from dotenv import load_dotenv
from openai import OpenAI
import requests
import base64
from pydub import AudioSegment


In [None]:
# Load environment variables
# load_dotenv() (python-dotenv) loads variables from a .env file into the process
# environment so the API keys never have to appear in the notebook source.
load_dotenv()

# os.getenv returns None when a variable is not set.
HUME_API_KEY = os.getenv("HUME_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Never echo the secrets themselves: cell output is stored with the notebook and
# is easily committed or shared. Report only whether each key was found.
print("HUME_API_KEY set:", bool(HUME_API_KEY))
print("OPENAI_API_KEY set:", bool(OPENAI_API_KEY))


In [None]:
# Initialize Hume and OpenAI clients
# Both clients are built from the API keys read via os.getenv earlier in the
# notebook; if a key was not set, None is passed through to the constructor here.
hume_client = HumeBatchClient(HUME_API_KEY)
openai_client = OpenAI(api_key=OPENAI_API_KEY)


In [None]:
# Submit a Hume batch job for two local files using a default FaceConfig
# The inputs are local paths passed through `files=`; the first positional
# argument is None — presumably the URL list, unused here (confirm against the Hume SDK).
filepaths = ["faces.zip", "david_hume.jpeg"]
config = FaceConfig()
job = hume_client.submit_job(None, [config], files=filepaths)
print(job)

# await_complete() presumably blocks until the job has finished (TODO confirm its
# timeout behaviour in the Hume SDK); afterwards the predictions are written to
# predictions.json in the current working directory.
# NOTE(review): `details` is not used again in this cell.
print("Running...")
details = job.await_complete()
job.download_predictions("predictions.json")
print("Predictions downloaded to predictions.json")


In [None]:
# Helper that converts an image file into base64 text
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string.

    The file is read in binary mode, base64-encoded, and the resulting bytes
    are decoded as UTF-8 so the caller receives a ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

# Encode the sample image as base64 text so it can be embedded in a request.
# The path is relative, so the file is looked up in the current working directory.
image_path = "david_hume.jpeg"
base64_image = encode_image(image_path)


In [None]:
# Using OpenAI's GPT to understand image contents
# The image is sent inline as a base64 data URL inside a chat-completions request.
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {OPENAI_API_KEY}",
}

payload = {
    "model": "gpt-4-turbo",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What’s in this image?"},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
            ],
        },
    ],
    "max_tokens": 300,
}

# requests applies no timeout unless one is given; without it a stalled
# connection would hang this cell indefinitely.
REQUEST_TIMEOUT_SECONDS = 120

response = requests.post(
    "https://api.openai.com/v1/chat/completions",
    headers=headers,
    json=payload,
    timeout=REQUEST_TIMEOUT_SECONDS,
)

# Make an HTTP-level failure obvious before showing the response body, which is
# still printed because it carries the API's error details.
if not response.ok:
    print(f"Request failed with HTTP status {response.status_code}")

print(response.json())


In [None]:
# Transcribe audio using OpenAI's Whisper
def transcribe(audio_path):
    """Send the audio file at ``audio_path`` to the ``whisper-1`` model.

    The file is opened in binary mode and passed to
    ``openai_client.audio.transcriptions.create`` with
    ``response_format="text"``; whatever that call returns is returned as-is.
    """
    request_options = {"model": "whisper-1", "response_format": "text"}
    with open(audio_path, "rb") as audio_stream:
        return openai_client.audio.transcriptions.create(
            file=audio_stream, **request_options
        )

# Transcribe one audio segment and print the result.
# NOTE(review): "segment_1.mp3" matches the file-name pattern written by
# segment_audio(), which is defined and called only in a later cell. On a fresh
# kernel this file must already exist on disk for this cell to succeed.
audio_path = "segment_1.mp3"
transcription = transcribe(audio_path)
print(transcription)


In [None]:
# Segment a longer audio into manageable parts
def segment_audio(audio_path, segment_duration_ms):
    """Split an MP3 file into consecutive MP3 segments of a fixed length.

    Each segment is exported to the current working directory as
    ``segment_<index>.mp3`` (index starting at 0); existing files with those
    names are overwritten. The final segment may be shorter than the others.

    Args:
        audio_path: Path of the MP3 file to split.
        segment_duration_ms: Length of each segment; must be a positive
            integer. pydub measures ``len(song)`` and slices in milliseconds.

    Returns:
        List of the exported segment file paths, in order. Empty if the
        audio has zero length.

    Raises:
        ValueError: If ``segment_duration_ms`` is zero or negative.
    """
    # Validate before decoding the audio: a negative step would silently yield
    # no segments, and zero would fail inside range() with an unrelated message.
    if segment_duration_ms <= 0:
        raise ValueError(
            f"segment_duration_ms must be a positive integer, got {segment_duration_ms!r}"
        )

    song = AudioSegment.from_mp3(audio_path)
    segments = []
    segment_starts = range(0, len(song), segment_duration_ms)
    for index, start_ms in enumerate(segment_starts):
        # Slicing past the end is safe; the last segment is simply shorter.
        segment = song[start_ms:start_ms + segment_duration_ms]
        segment_path = f"segment_{index}.mp3"
        segment.export(segment_path, format="mp3")
        segments.append(segment_path)
    return segments

# Example usage
# Splits long_audio.mp3 (looked up in the current working directory) and keeps
# the returned list of segment file paths for later use.
segmented_audio_paths = segment_audio("long_audio.mp3", 10 * 60 * 1000)  # 10 minutes in ms
print("Segmented audio into:", segmented_audio_paths)
