Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions examples/openai-stt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from scope3ai import Scope3AI
from openai import OpenAI
from pathlib import Path


def main(filename: Path, model: str, response_format: str):
    """Transcribe an audio file with OpenAI and print its Scope3AI impact.

    Args:
        filename: Path to the audio file to transcribe.
        model: OpenAI transcription model id (e.g. "whisper-1").
        response_format: Transcription response format (e.g. "json").
    """
    openai_client = OpenAI()
    scope3 = Scope3AI.init()

    # Trace the API call so the tracer can aggregate its impact.
    with scope3.trace() as tracer:
        transcript = openai_client.audio.transcriptions.create(
            model=model,
            file=filename,
            response_format=response_format,
        )
        print(transcript)
        print(transcript.scope3ai.request)

        impact = tracer.impact()
        print(impact)
        print(f"Total Energy Wh: {impact.total_energy_wh}")
        print(f"Total GCO2e: {impact.total_gco2e}")
        print(f"Total MLH2O: {impact.total_mlh2o}")


if __name__ == "__main__":
    import argparse

    # Command-line entry point: transcribe a single audio file.
    cli = argparse.ArgumentParser(description="OpenAI Speech to Text")
    cli.add_argument("--model", type=str, default="whisper-1", help="Model")
    cli.add_argument(
        "--response_format", type=str, default="json", help="Response format"
    )
    cli.add_argument("filename", type=Path, help="The path to the input file")

    main(**vars(cli.parse_args()))
14 changes: 14 additions & 0 deletions scope3ai/tracers/openai/instrument.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
openai_text_to_speech_wrapper,
openai_async_text_to_speech_wrapper,
)
from .speech_to_text import (
openai_async_speech_to_text_wrapper,
openai_speech_to_text_wrapper,
)


class OpenAIInstrumentor:
Expand All @@ -30,6 +34,16 @@ def __init__(self) -> None:
"name": "AsyncSpeech.create",
"wrapper": openai_async_text_to_speech_wrapper,
},
{
"module": "openai.resources.audio.transcriptions",
"name": "Transcriptions.create",
"wrapper": openai_speech_to_text_wrapper,
},
{
"module": "openai.resources.audio.transcriptions",
"name": "AsyncTranscriptions.create",
"wrapper": openai_async_speech_to_text_wrapper,
},
]

def instrument(self) -> None:
Expand Down
111 changes: 111 additions & 0 deletions scope3ai/tracers/openai/speech_to_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import logging
import time
from typing import Any, Callable, Optional, Union

import openai
import tiktoken
from openai.resources.audio.transcriptions import (
AsyncTranscriptions,
Transcriptions,
)
from openai.resources.audio.transcriptions import (
Transcription as _Transcription,
)
from openai.resources.audio.transcriptions import (
TranscriptionVerbose as _TranscriptionVerbose,
)

from scope3ai.api.types import ImpactRow, Model, Scope3AIContext, Task
from scope3ai.lib import Scope3AI

# Provider identifier reported to Scope3AI in every ImpactRow.
PROVIDER = "openai"

# Module-scoped logger for this tracer.
logger = logging.getLogger("scope3.tracers.openai.speech_to_text")


class AnnotatedStr(str):
    """A ``str`` subclass that can carry a Scope3AI impact context.

    Plain-text transcription responses are bare strings; this subclass lets
    the wrapper attach ``scope3ai`` metadata to them.
    """

    scope3ai: Optional[Scope3AIContext] = None


class Transcription(_Transcription):
    """OpenAI ``Transcription`` extended with a Scope3AI impact context."""

    scope3ai: Optional[Scope3AIContext] = None


class TranscriptionVerbose(_TranscriptionVerbose):
    """OpenAI ``TranscriptionVerbose`` extended with a Scope3AI impact context."""

    scope3ai: Optional[Scope3AIContext] = None


def _get_file_audio_duration(
    file: openai._types.FileTypes,
) -> Optional[float]:
    """Best-effort probe of an audio file's duration, in seconds.

    Returns ``None`` when mutagen is unavailable, the file cannot be parsed,
    or any other error occurs (the error is logged, never raised).
    """
    try:
        from mutagen import File

        # FileTypes may be a (filename, content[, content_type]) tuple;
        # the actual file object/path is the second element.
        if isinstance(file, (list, tuple)):
            file = file[1]

        parsed = File(file)
        if parsed is not None and parsed.info is not None:
            return parsed.info.length
    except Exception as e:
        logger.exception(f"Failed to get audio duration: {e}")
    return None


def _openai_speech_to_text_wrapper(
    response: Any, request_latency: float, kwargs: dict
) -> Union[Transcription, TranscriptionVerbose, str]:
    """Record the environmental impact of a transcription call.

    Estimates output tokens from the transcribed text, submits an
    ``ImpactRow`` to Scope3AI, and returns a copy of the response annotated
    with the resulting ``Scope3AIContext``.

    Args:
        response: The raw return value of ``Transcriptions.create``.
        request_latency: Wall-clock duration of the API call, in ms.
        kwargs: The keyword arguments the caller passed to ``create``
            (must include ``model`` and ``file``).

    Returns:
        An annotated copy of ``response``; the original object unchanged if
        its type is not recognized.
    """
    model = kwargs["model"]

    # Transcription responses carry no token usage, so approximate output
    # tokens by encoding the transcript text ourselves.
    if isinstance(response, (_Transcription, _TranscriptionVerbose)):
        text = response.text
    elif isinstance(response, str):
        text = response
    else:
        text = None

    if text is not None:
        # Load the encoding lazily: get_encoding() is expensive (may fetch
        # the encoding data) and is useless when there is no text to count.
        encoder = tiktoken.get_encoding("cl100k_base")
        output_tokens = len(encoder.encode(text))
    else:
        output_tokens = None

    options = {}
    # Only report audio duration when it could actually be determined.
    duration = _get_file_audio_duration(kwargs["file"])
    if duration is not None:
        options["input_audio_seconds"] = int(duration)

    scope3_row = ImpactRow(
        model=Model(id=model),
        provider=PROVIDER,
        output_tokens=output_tokens,
        request_duration_ms=request_latency,
        task=Task.speech_to_text,
        **options,
    )
    scope3_ctx = Scope3AI.get_instance().submit_impact(scope3_row)

    # Re-wrap the response in an annotatable subclass so callers can read
    # `result.scope3ai`.
    if isinstance(response, _Transcription):
        result = Transcription.model_construct(**response.model_dump())
    elif isinstance(response, _TranscriptionVerbose):
        result = TranscriptionVerbose.model_construct(**response.model_dump())
    elif isinstance(response, str):
        result = AnnotatedStr(response)
    else:
        logger.error(f"Unexpected response type: {type(response)}")
        return response
    result.scope3ai = scope3_ctx
    return result


def openai_speech_to_text_wrapper(
    wrapped: Callable, instance: Transcriptions, args: Any, kwargs: Any
) -> Union[Transcription, TranscriptionVerbose, str]:
    """Sync wrapper for ``Transcriptions.create``: time the call, then record impact."""
    started_at = time.perf_counter()
    response = wrapped(*args, **kwargs)
    latency_ms = (time.perf_counter() - started_at) * 1000
    return _openai_speech_to_text_wrapper(response, latency_ms, kwargs)


async def openai_async_speech_to_text_wrapper(
    wrapped: Callable, instance: AsyncTranscriptions, args: Any, kwargs: Any
) -> Union[Transcription, TranscriptionVerbose, str]:
    """Async wrapper for ``AsyncTranscriptions.create``: time the call, then record impact."""
    started_at = time.perf_counter()
    response = await wrapped(*args, **kwargs)
    latency_ms = (time.perf_counter() - started_at) * 1000
    return _openai_speech_to_text_wrapper(response, latency_ms, kwargs)
3 changes: 2 additions & 1 deletion scope3ai/tracers/openai/text_to_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import tiktoken
from openai.resources.audio.speech import AsyncSpeech, Speech, _legacy_response

from scope3ai.api.types import ImpactRow, Model, Scope3AIContext
from scope3ai.api.types import ImpactRow, Model, Scope3AIContext, Task
from scope3ai.lib import Scope3AI


Expand Down Expand Up @@ -81,6 +81,7 @@ def _openai_text_to_speech_submit(
request_duration_ms=request_latency,
provider=PROVIDER,
audio_output_seconds=duration,
task=Task.text_to_speech,
)

scope3_ctx = Scope3AI.get_instance().submit_impact(scope3_row)
Expand Down
Loading
Loading