In [None]:
import os
import logging
from telegram.ext import Updater, MessageHandler, Filters
import IPython.display as ipd

import librosa
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import io


In [None]:
with open("Telegram-bot-token.txt", "r") as f:
    TOKEN = f.read().strip() # strip() removes the trailing "\n" if it exists

HF_MODEL = "cantillation/whisper-medium-he-teamim-aviv-base"
model = WhisperForConditionalGeneration.from_pretrained(HF_MODEL).to("cuda")
processor = WhisperProcessor.from_pretrained(HF_MODEL, language="hebrew", task="transcribe")
SR = processor.feature_extractor.sampling_rate

In [None]:
def extract_features(audio):
    feature = processor.feature_extractor(audio, sampling_rate=SR,).input_features[0]
    return torch.tensor(feature).unsqueeze(0)


In [31]:
def transcribe(audio):

    # preprocess the audio file
    inputs = extract_features(audio).to("cuda")
    
    # generate the text
    generated_ids = model.generate(inputs, max_length=225, num_beams=4, early_stopping=True, return_dict_in_generate=True)
    print(generated_ids.keys())
    # return_dict_in_generate=True so we need to access the "sequences" key
    generated_ids = generated_ids.sequences
    transcription = processor.decode(generated_ids[0], skip_special_tokens=False)
    
    return transcription


In [32]:
# Enable logging
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                     level=logging.INFO)

# Define a function to handle audio messages
def handle_audio(update, context):
    audio_message = update.message.voice or update.message.audio
    # Get the audio file
    file = context.bot.get_file(audio_message.file_id)
    audio = file.download_as_bytearray()
    audio = librosa.load(io.BytesIO(audio), sr=SR, mono=True)[0]
    
    # Send a message to the user
    context.bot.send_message(chat_id=update.message.chat_id, text="קיבלתי את הקובץ, אני מתחיל להמיר אותו לטקסט עם טעמים...")
    
    # Audio to text with cantillations
    transcription = str(transcribe(audio))
    
    # Send the transcription to the user
    context.bot.send_message(chat_id=update.message.chat_id, text=f"זה מוכן!: \n {transcription}")
    
    
    
    
def main():
    # Create an instance of the Updater class
    updater = Updater(TOKEN, use_context=True)

    # Get the dispatcher to register handlers
    dispatcher = updater.dispatcher

    # Register a handler for audio messages
    audio_handler = MessageHandler(Filters.audio, handle_audio)
    dispatcher.add_handler(audio_handler)
    
    # Register a handler for voice messages
    voice_handler = MessageHandler(Filters.voice, handle_audio)
    dispatcher.add_handler(voice_handler)
    
    # Start the bot
    updater.start_polling()
    updater.idle()

if __name__ == '__main__':
    main()

2024-03-11 11:58:34,109 - apscheduler.scheduler - INFO - Scheduler started
2024-03-11 11:58:44,110 - telegram.ext.dispatcher - ERROR - No error handlers are registered, logging exception.
Traceback (most recent call last):
  File "/home/user_7542/.local/lib/python3.10/site-packages/telegram/ext/dispatcher.py", line 442, in process_update
    handler.handle_update(update, self, check, context)
  File "/home/user_7542/.local/lib/python3.10/site-packages/telegram/ext/handler.py", line 160, in handle_update
    return self.callback(update, context)
  File "/tmp/ipykernel_3493536/2650588767.py", line 17, in handle_audio
    transcription = str(transcribe(audio))
  File "/tmp/ipykernel_3493536/101296119.py", line 11, in transcribe
    transcription = processor.decode(generated_ids[1], skip_special_tokens=False)
IndexError: index 1 is out of bounds for dimension 0 with size 1


odict_keys(['sequences', 'past_key_values'])


2024-03-11 11:59:06,689 - telegram.ext.updater - INFO - Received signal 2 (SIGINT), stopping...
2024-03-11 11:59:06,690 - apscheduler.scheduler - INFO - Scheduler has been shut down
2024-03-11 11:59:06,690 - apscheduler.scheduler - INFO - Scheduler has been shut down
