# Initial Setup
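Set up the environment: install SpeechRecognition, upgrade SciPy, clone the Tortoise TTS repository, install its requirements together with the pinned supporting packages, and install flask-ngrok. These packages are required by the text-to-speech application in the rest of the notebook.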

In [1]:
!pip install SpeechRecognition
!pip3 install -U scipy
!git clone https://github.com/jnordberg/tortoise-tts.git
%cd tortoise-tts
!pip3 install -r requirements.txt
!pip3 install transformers==4.19.0 einops==0.5.0 rotary_embedding_torch==0.1.5 unidecode==1.3.5
!python3 setup.py install
!pip install flask-ngrok

Collecting SpeechRecognition
  Downloading SpeechRecognition-3.10.4-py2.py3-none-any.whl.metadata (28 kB)
Downloading SpeechRecognition-3.10.4-py2.py3-none-any.whl (32.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.10.4
Collecting scipy
  Downloading scipy-1.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Downloading scipy-1.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.1/41.1 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scipy
  Attempting uninstall: scipy
    Found existing installation: scipy 1.13.1
    Uninstalling scipy-1.13.1:
 

# Import the necessary packages

The following code block imports the necessary packages for speech recognition, audio processing, neural networks, and text-to-speech functionalities. These components are crucial for the upcoming tasks.


In [2]:
import speech_recognition as sr
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
import IPython
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice, load_voices
import os
from google.colab import files
import io

# Audio Transcription using Speech Recognition
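The following code uploads an audio file and transcribes it with Google's Web Speech API through the SpeechRecognition library. Upload a single WAV, AIFF, or FLAC file (the formats `sr.AudioFile` can read); if several files are uploaded, only the first one is transcribed. The transcript is stored in `text` for use in the text-to-speech steps below.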

In [None]:
# Create a recognizer instance
recognizer = sr.Recognizer()

# Reset any transcript left over from an earlier run, so that a failed
# transcription cannot silently hand stale text to the cells below.
text = None

# Upload the audio file; the result maps each uploaded file name to its bytes
uploaded_files = files.upload()
if not uploaded_files:
    # Fail with a clear message instead of an IndexError on the lookup below
    raise ValueError("No audio file was uploaded.")

# Only the first uploaded file is transcribed
audio_file_name = next(iter(uploaded_files))
audio_data = uploaded_files[audio_file_name]

# Save the audio data to a file in the working directory so that
# SpeechRecognition can open it by name
with open(audio_file_name, 'wb') as temp_audio_file:
    temp_audio_file.write(audio_data)

# Read the audio file using SpeechRecognition
with sr.AudioFile(audio_file_name) as source:
    audio = recognizer.record(source)  # Record the entire audio file

# Use Google Web Speech API for transcription.
# On failure a message is printed and `text` stays None.
try:
    text = recognizer.recognize_google(audio)
    print("Transcript: {}".format(text))
except sr.UnknownValueError:
    print("Speech Recognition could not understand audio")
except sr.RequestError as e:
    print("Could not request results from Google Web Speech API; {0}".format(e))


# Text-to-Speech: Generating Speech Output
The following code initializes a Tortoise `TextToSpeech` instance with its default settings. This instance is used later to generate the speech output.

In [None]:
# Create the Tortoise text-to-speech engine with its default arguments; the
# generation cell below calls tts.tts_with_preset() on this instance.
# NOTE(review): construction presumably loads (and on first use downloads) the
# model weights, so this cell may be slow — confirm.
tts = TextToSpeech()

## Custom Speech Generation Configuration

Set up the parameters for custom speech generation using Tortoise's text-to-speech capabilities: define the text to be spoken, select a preset mode for the desired quality, and upload the reference audio clips that define the custom voice.


In [None]:
# This is the text that will be spoken.
text_convert = text

# Pick a "preset mode" to determine quality. Options: {"ultra_fast", "fast" (default), "standard", "high_quality"}. See docs in api.py
preset = "high_quality"

# Name of the custom voice; its reference clips are stored in
# tortoise/voices/<CUSTOM_VOICE_NAME> (relative to the working directory).
CUSTOM_VOICE_NAME = "custom"
custom_voice_folder = f"tortoise/voices/{CUSTOM_VOICE_NAME}"

# exist_ok=True keeps this cell re-runnable: without it a second run raises
# FileExistsError because the folder was created by the first run.
os.makedirs(custom_voice_folder, exist_ok=True)

# Upload at least 2 audio clips. They must be WAV files, 6-10 seconds long.
# Each clip is saved as <index>.wav, so a re-run overwrites clips with the same
# index; clips with a higher index from an earlier run are left in place.
for i, file_data in enumerate(files.upload().values()):
    with open(os.path.join(custom_voice_folder, f'{i}.wav'), 'wb') as f:
        f.write(file_data)

## Speech Generation and Output

The following code generates speech based on the provided text and custom voice configuration. The generated speech is saved as an audio file which can be played back or downloaded.


In [None]:
# Load the reference clips (and any conditioning latents) of the custom voice
voice_samples, conditioning_latents = load_voice(CUSTOM_VOICE_NAME)

# Generate speech for the configured text. `text_convert` is the value set in
# the configuration cell, so editing it there changes what is spoken here.
gen = tts.tts_with_preset(text_convert, voice_samples=voice_samples, conditioning_latents=conditioning_latents,
                          preset=preset)

# Save the result next to the notebook's working directory.
# NOTE(review): 24000 is the sample rate written to the file; presumably it
# matches Tortoise's output rate — confirm.
output_path = f'generated-{CUSTOM_VOICE_NAME}.wav'
torchaudio.save(output_path, gen.squeeze(0).cpu(), 24000)

# Last expression of the cell: renders an inline audio player for the file
IPython.display.Audio(output_path)