## Text to Speech Speecht5

In [1]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
from datasets import load_dataset
import torch
import soundfile as sf

  warn(


In [2]:
processor = SpeechT5Processor.from_pretrained("../Models/speecht5_tts")

In [3]:
model = SpeechT5ForTextToSpeech.from_pretrained("../Models/speecht5_tts")

In [4]:
from transformers import SpeechT5HifiGan

In [13]:
vocoder = SpeechT5HifiGan.from_pretrained("../Models/speecht5_hifigan")

In [14]:
inputs = processor(text="I love Pakistan and Hindustan and Germany and Japan.", return_tensors="pt")

In [15]:
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

In [16]:
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

In [17]:
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

In [18]:
sf.write("speech.wav", speech.numpy(), samplerate=16000)

In [19]:
from IPython.display import Audio

Audio(speech, rate=16000)

## Speech-to-speech for voice conversion (Cloning)

In [20]:
from transformers import SpeechT5ForSpeechToSpeech

In [21]:
processor = SpeechT5Processor.from_pretrained("../Models/speecht5_vc")
model = SpeechT5ForSpeechToSpeech.from_pretrained("../Models/speecht5_vc")

Some weights of SpeechT5ForSpeechToSpeech were not initialized from the model checkpoint at ../Models/speecht5_vc and are newly initialized: ['speecht5.encoder.prenet.pos_sinusoidal_embed.weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Load an input speech example:
dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
dataset = dataset.sort("id")
example = dataset[40]

In [23]:
Audio(example["audio"]["array"], rate=16000)

In [24]:
# Preprocess the speech input:
sampling_rate = dataset.features["audio"].sampling_rate
inputs = processor(audio=example["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

In [25]:
# Load the speaker embedding for the target speaker's voice:
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

In [26]:
# Generate the speech:
speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)

In [28]:
Audio(speech, rate=16000)

In [29]:
import soundfile as sf
sf.write("speech_converted.wav", speech.numpy(), samplerate=16000)

## Automatic speech recognition (using the model)

In [30]:
from transformers import SpeechT5ForSpeechToText

In [31]:
processor = SpeechT5Processor.from_pretrained("../Models/speecht5_asr")
model = SpeechT5ForSpeechToText.from_pretrained("../Models/speecht5_asr")

Some weights of SpeechT5ForSpeechToText were not initialized from the model checkpoint at ../Models/speecht5_asr and are newly initialized: ['speecht5.encoder.prenet.pos_sinusoidal_embed.weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
# example = "./harvard.wav"
example = dataset[40]

In [33]:
# Preprocess the input speech example:
sampling_rate = dataset.features["audio"].sampling_rate
inputs = processor(audio=example["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

In [34]:
# Generate text from the speech input:
predicted_ids = model.generate(**inputs, max_length=100)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

In [35]:
transcription[0]

'a man said to the universe sir i exist'

In [59]:
import torch
from transformers import SpeechT5Processor, SpeechT5ForSpeechToText
import soundfile as sf
import speech_recognition as sr
import numpy as np
import librosa
import warnings

In [37]:
def load_model_and_processor(model_path):
    processor = SpeechT5Processor.from_pretrained(model_path)
    model = SpeechT5ForSpeechToText.from_pretrained(model_path)
    return processor, model

In [60]:
def resample_if_necessary(audio_array, orig_sr, target_sr=16000):
    if orig_sr != target_sr:
        audio_array = librosa.resample(audio_array, orig_sr=orig_sr, target_sr=target_sr)
    return audio_array

In [61]:
def transcribe_audio(audio_array, sampling_rate, processor, model):
    audio_array = resample_if_necessary(audio_array, sampling_rate)
    
    # Ensure the audio is long enough (pad if necessary)
    if len(audio_array) < 16000:
        audio_array = np.pad(audio_array, (0, 16000 - len(audio_array)))
    
    inputs = processor(audio=audio_array, sampling_rate=16000, return_tensors="pt")
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        predicted_ids = model.generate(**inputs, max_length=100)
    
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]

In [62]:
def transcribe_from_file(file_path, processor, model):
    audio_array, sampling_rate = sf.read(file_path)
    return transcribe_audio(audio_array, sampling_rate, processor, model)

In [64]:
model_path = "../Models/speecht5_asr"
processor, model = load_model_and_processor(model_path)

Some weights of SpeechT5ForSpeechToText were not initialized from the model checkpoint at ../Models/speecht5_asr and are newly initialized: ['speecht5.encoder.prenet.pos_sinusoidal_embed.weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [65]:
file_path = "harvard.wav"  # Replace with your audio file path
print("Transcribing from file:")
print(transcribe_from_file(file_path, processor, model))

Transcribing from file:


RuntimeError: Calculated padded input size per channel: (1). Kernel size: (10). Kernel size can't be greater than actual input size

In [66]:
def transcribe_from_mic(processor, model):
    r = sr.Recognizer()
    with sr.Microphone() as source:
        print("Say something!")
        audio = r.listen(source)
    
    # Use recognize_google for comparison
    try:
        google_transcription = r.recognize_google(audio)
        print("Google Speech Recognition thinks you said: ", google_transcription)
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from Google Speech Recognition service; {0}".format(e))

    # Convert audio to numpy array
    audio_data = np.frombuffer(audio.get_raw_data(), dtype=np.int16)
    audio_float = audio_data.astype(np.float32) / 32768.0

    return transcribe_audio(audio_float, audio.sample_rate, processor, model)

In [70]:
print("\nTranscribing from microphone:")
print("SpeechT5 thinks you said:", transcribe_from_mic(processor, model))


Transcribing from microphone:
Say something!
Google Speech Recognition thinks you said:  I told the universe sir I exist
i restore to universe serve i exist i exist i restore the universe


In [77]:
def transcribe_from_mic(processor, model):
    r = sr.Recognizer()
    with sr.Microphone() as source:
        print("Say something!")
        audio = r.listen(source)
    
    # Use recognize_google for comparison
    try:
        google_transcription = r.recognize_google(audio)
        print("Google Speech Recognition thinks you said:", google_transcription)
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from Google Speech Recognition service; {0}".format(e))

    # try:
    #     whisper_transcription = r.recognize_whisper(audio)
    #     print("Whisper Speech Recognition thinks you said:", whisper_transcription)
    # except sr.UnknownValueError:
    #     print("Whisper Speech Recognition could not understand audio")
    # except sr.RequestError as e:
    #     print("Could not request results from Whisper Speech Recognition service; {0}".format(e))

    # Convert audio to numpy array
    audio_data = np.frombuffer(audio.get_raw_data(), dtype=np.int16)
    
    # Resample to 16000 Hz
    audio_resampled = librosa.resample(audio_data.astype(np.float32), 
                                       orig_sr=audio.sample_rate, 
                                       target_sr=16000)
    
    # Normalize the audio
    audio_normalized = librosa.util.normalize(audio_resampled)

    # Process with SpeechT5
    inputs = processor(audio=audio_normalized, sampling_rate=16000, return_tensors="pt")
    predicted_ids = model.generate(**inputs)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

    return transcription[0]


In [78]:
model_path = "../Models/speecht5_asr"
processor, model = load_model_and_processor(model_path)

print("\nTranscribing from microphone:")
print("SpeechT5 thinks you said:", transcribe_from_mic(processor, model))

Some weights of SpeechT5ForSpeechToText were not initialized from the model checkpoint at ../Models/speecht5_asr and are newly initialized: ['speecht5.encoder.prenet.pos_sinusoidal_embed.weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Transcribing from microphone:
Say something!


ModuleNotFoundError: No module named 'whisper'