In [1]:
from src.melvin.stream import Stream
from src.melvin.Transcriber import Transcriber
from src.melvin.StreamTranscriber import StreamTranscriber

from src.run.TimedStreamingTranscriber import TimedStreamingTranscriber
from src.run.Dataset import Dataset
from src.run.OutputHandler import OutputHandler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the transcriber backend once; it is handed to the Stream created in a
# later cell. NOTE(review): presumably this loads the "medium" model on GPU
# index 0 — confirm against StreamTranscriber.for_gpu.
w = StreamTranscriber.for_gpu("medium", [0])
# Sample source; a later cell pulls (id, X, y) triples from it with next().
dataset = Dataset()

In [3]:
# Advance through the dataset until a sample shorter than 100 seconds comes up.
# NOTE(review): 32000 is used as the number of len(X) units per second of
# audio (presumably bytes of 16 kHz, 16-bit mono PCM) — confirm against Dataset.
id, X, y = next(dataset)
while len(X)/32000 >= 100:
    id, X, y = next(dataset)
print(id)
print(len(X)/32000)

121-123859-0000
93.24


In [4]:
from src.helper.logger import init_logger, get_logger_with_id

# Smoke-test the project logger: configure it at DEBUG, then emit one record
# per severity level so formatting and filtering can be checked in the output.
init_logger(level="DEBUG")
# The second argument ("test") appears in parentheses after the logger name in
# the recorded output of this cell.
logger = get_logger_with_id(__name__, "test")
logger.debug("This is a debug message")
logger.info("This is an info message")
# NOTE(review): the recorded output of this cell contains no WARNING record —
# check whether the logger setup drops that level or the output was edited.
logger.warning("This is a warning message")
logger.error("This is an error message")
logger.critical("This is a critical message")

[2025-08-06 12:57:42,564 __main__ (test):5] DEBUG This is a debug message
[2025-08-06 12:57:42,565 __main__ (test):6] INFO This is an info message
[2025-08-06 12:57:42,566 __main__ (test):8] ERROR This is an error message
[2025-08-06 12:57:42,567 __main__ (test):9] CRITICAL This is a critical message


In [5]:
# Wire up the streaming pipeline. The same OutputHandler instance is passed to
# both the Stream and the timed driver; later cells read results from `out`.
out = OutputHandler()
# NOTE(review): the second argument (0) is presumably the stream id shown as
# "(0)" in the src.melvin.stream log records — confirm against Stream.
stream = Stream(w, 0, out)
# 50 ms chunks; the recorded log reports 1600 bytes per chunk for this setting.
transcriber = TimedStreamingTranscriber(stream, out, chunk_length_ms=50)
# Top-level await: this only works in a kernel with autoawait (IPython/Jupyter).
y_pred = await transcriber.transcribe(X)

[2025-08-06 12:57:42,575 src.run.TimedStreamingTranscriber:38] INFO Transcribing audio data with 2983680 bytes
[2025-08-06 12:57:42,576 src.run.TimedStreamingTranscriber:39] DEBUG Bytes per chunk: 1600
[2025-08-06 12:57:42,577 src.run.OutputHandler:24] INFO OutputHandler timer initialized at 5157.540641 with offset -0.050000
[2025-08-06 12:57:43,528 src.melvin.stream (0):85] DEBUG Starting transcription task for window of length: 32000
[2025-08-06 12:57:43,600 faster_whisper:839] INFO Processing audio with duration 00:01.000
[2025-08-06 12:57:43,632 faster_whisper:853] INFO VAD filter removed 00:00.240 of audio
[2025-08-06 12:57:43,633 faster_whisper:859] DEBUG VAD filter kept the following audio segments: [00:00.240 -> 00:01.000]
[2025-08-06 12:57:44,034 faster_whisper:906] INFO Detected language 'en' with probability 0.91
[2025-08-06 12:57:44,038 faster_whisper:1141] DEBUG Processing segment at 00:00.000
[2025-08-06 12:57:44,295 src.melvin.stream (0):204] DEBUG Partial transcription 

In [6]:
# Display the word-level results held by the OutputHandler; per the recorded
# output each entry is a dict with 'conf', 'start', 'end' and 'word' keys.
out.final_words

[{'conf': 0.737793, 'start': 0.24, 'end': 0.98, 'word': 'You'},
 {'conf': 0.990234, 'start': 0.98, 'end': 1.3, 'word': 'are'},
 {'conf': 0.982422, 'start': 1.32, 'end': 1.66, 'word': 'my'},
 {'conf': 0.930664, 'start': 1.66, 'end': 1.94, 'word': 'all'},
 {'conf': 0.812988, 'start': 1.94, 'end': 2.22, 'word': 'the'},
 {'conf': 0.996094, 'start': 2.22, 'end': 2.66, 'word': 'world'},
 {'conf': 0.970703, 'start': 3.4, 'end': 3.54, 'word': 'and'},
 {'conf': 0.986328, 'start': 3.54, 'end': 3.8, 'word': 'I'},
 {'conf': 0.99707, 'start': 3.8, 'end': 4.1, 'word': 'must'},
 {'conf': 0.98877, 'start': 4.14, 'end': 4.66, 'word': 'strive'},
 {'conf': 0.958984, 'start': 4.66, 'end': 5.42, 'word': 'to'},
 {'conf': 0.996582, 'start': 5.42, 'end': 5.64, 'word': 'know'},
 {'conf': 0.969727, 'start': 5.64, 'end': 6.02, 'word': 'my'},
 {'conf': 0.919678, 'start': 6.04, 'end': 6.56, 'word': 'shames'},
 {'conf': 0.98584, 'start': 6.56, 'end': 6.88, 'word': 'and'},
 {'conf': 0.994385, 'start': 6.88, 'end': 7

In [7]:
# Print one line per partial prediction: the window bounds it covers, the time
# at which it was observed, and the text recognised for that window.
for m in out.partial_predictions:
    print(f"Window {m['window'][0]:.2f} - {m['window'][1]:.2f} observed at {m['observation_time']:.4f} :  { m['result']['text']}")

Window 0.00 - 1.00 observed at 1.7682 :  You are
Window 0.00 - 2.00 observed at 2.5166 :  You are my old -
Window 0.00 - 3.00 observed at 3.5360 :  You are my all the world
Window 0.00 - 4.00 observed at 4.5509 :  You are my all the world and I must
Window 0.00 - 5.00 observed at 5.5796 :  You are my all the world, and I must strive.
Window 0.00 - 6.00 observed at 6.6576 :  must strive to know my sh -
Window 0.00 - 7.00 observed at 7.6712 :  must strive to know my shames and praise.
Window 0.00 - 8.00 observed at 8.7043 :  must strive to know my shames and praises from your time.
Window 0.00 - 9.00 observed at 9.6842 :  and praises from your tongue.
Window 0.00 - 10.00 observed at 10.7236 :  and praises from your tongue, none else to me.
Window 0.00 - 11.00 observed at 11.7500 :  and praises from your tongue. None else to me, nor...
Window 0.00 - 12.00 observed at 12.7702 :  me, nor I to none alone.
Window 0.00 - 12.05 observed at 13.5665 :  me, nor I to none alike.
Window 0.00 - 12.80

In [8]:
from pydub.utils import re

def __norm_word(word) -> str:
    """Return ``word`` reduced to its lowercase ASCII letters.

    The text is lowercased and every character outside ``a-z`` is removed, so
    punctuation, digits, whitespace and non-ASCII letters all disappear:
    ``"Tongue,"`` and ``"tongue"`` both normalise to ``"tongue"``.

    Args:
        word: Raw token text.

    Returns:
        The normalised key; an empty string when ``word`` contains no ASCII
        letters.
    """
    # The substitution already leaves nothing but a-z, so no further
    # lower()/strip() pass is needed on the result.
    return re.sub(r"[^a-z]", "", word.lower())

def remove_duplicates(words):
    """Drop words that repeat the previously kept word and overlap it in time.

    A word is dropped when its normalised text (see ``__norm_word``) equals
    that of the last word kept so far AND its ``"start"`` lies strictly before
    that word's ``"end"``. Everything else is kept in its original order.

    The input sequence is not modified; a new list is returned. The word dicts
    themselves are shared with the input, not copied.

    Args:
        words: Sequence of dicts carrying at least the keys ``"word"``,
            ``"start"`` and ``"end"``.

    Returns:
        A new list with the overlapping repeats removed.
    """
    deduped = []
    for word in words:
        if deduped:
            # Compare against the last *kept* word, so a run of several
            # overlapping repeats collapses onto its first occurrence.
            previous = deduped[-1]
            if (
                __norm_word(word["word"]) == __norm_word(previous["word"])
                and word["start"] < previous["end"]
            ):
                continue
        deduped.append(word)
    return deduped

In [9]:
# Gather the word entries of every final transcription into one flat list.
y_gold = []
for result in stream.final_transcriptions:
    y_gold.extend(result["result"])

# Remove overlapping repeats, then show the joined text followed by the
# word list itself as the cell's output.
y_gold = remove_duplicates(y_gold)
y_gold_text = " ".join(w["word"] for w in y_gold)
print(y_gold_text)
y_gold

You are my all the world and I must strive to know my shames and praises from your tongue, None else to me, nor I to none alive, That my steeled sense or changes right or wrong. Oh, tis the first, Tis flattery in my seeing. And my great mind most kingly drinks it up. Mine eye well knows what with his gust is green. And to his pallet doth prepare the cup, If it be poisoned, tis the lesser sin. That mine eye loves it, and doth first begin. But reckoning time, whose million accidents Creep in, twixt vows, and changed decrees of kings, Tan sacred beauty, Blunt the sharpest intents, Diverts strong minds to the course of altering things, Alas, why fearing of time's tyranny, Might I not then say, now I love you best? When I was certain o 'er uncertainty, Crowning the present, Doubting of the rest? love is a babe? Then might I not say so? To give full growth to that which still doth grow? so I return rebuked to my content, And gain by ill thrice more than I have spent.


[{'conf': 0.737793, 'start': 0.24, 'end': 0.98, 'word': 'You'},
 {'conf': 0.990234, 'start': 0.98, 'end': 1.3, 'word': 'are'},
 {'conf': 0.982422, 'start': 1.32, 'end': 1.66, 'word': 'my'},
 {'conf': 0.930664, 'start': 1.66, 'end': 1.94, 'word': 'all'},
 {'conf': 0.812988, 'start': 1.94, 'end': 2.22, 'word': 'the'},
 {'conf': 0.996094, 'start': 2.22, 'end': 2.66, 'word': 'world'},
 {'conf': 0.970703, 'start': 3.4, 'end': 3.54, 'word': 'and'},
 {'conf': 0.986328, 'start': 3.54, 'end': 3.8, 'word': 'I'},
 {'conf': 0.99707, 'start': 3.8, 'end': 4.1, 'word': 'must'},
 {'conf': 0.98877, 'start': 4.14, 'end': 4.66, 'word': 'strive'},
 {'conf': 0.958984, 'start': 4.66, 'end': 5.42, 'word': 'to'},
 {'conf': 0.996582, 'start': 5.42, 'end': 5.64, 'word': 'know'},
 {'conf': 0.969727, 'start': 5.64, 'end': 6.02, 'word': 'my'},
 {'conf': 0.919678, 'start': 6.04, 'end': 6.56, 'word': 'shames'},
 {'conf': 0.98584, 'start': 6.56, 'end': 6.88, 'word': 'and'},
 {'conf': 0.994385, 'start': 6.88, 'end': 7