In [None]:
# the scipy version packaged with colab is not tolerant of misformated WAV files.
# install the latest version.
!pip3 install -U scipy

!git clone https://github.com/jnordberg/tortoise-tts.git
%cd tortoise-tts
!pip3 install -r requirements.txt
!pip3 install transformers==4.19.0 einops==0.5.0 rotary_embedding_torch==0.1.5 unidecode==1.3.5
!python3 setup.py install

In [None]:
# Imports used through the rest of the notebook.
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F

import IPython

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice, load_voices

# Create the Tortoise text-to-speech engine once; the speech-generation cell
# further down reuses this `tts` object for every text segment.
# This will download all the models used by Tortoise from the HuggingFace hub.
tts = TextToSpeech()

### 1. Upload your audio sample

In [None]:
# Optionally, use your own voice by running the next two cells. I recommend
# you upload at least 2 audio clips. Each must be a WAV file, 6-10 seconds long.
CUSTOM_VOICE_NAME = "martin"

import os
from google.colab import files

custom_voice_folder = f"tortoise/voices/{CUSTOM_VOICE_NAME}"
# exist_ok=True keeps this cell re-runnable: without it a second run fails with
# FileExistsError because the voice folder was already created by the first run.
os.makedirs(custom_voice_folder, exist_ok=True)
# Each uploaded file's contents are written as <index>.wav in upload order.
# Re-running the cell restarts numbering at 0, so earlier clips with the same
# index are overwritten.
for i, file_data in enumerate(files.upload().values()):
  with open(os.path.join(custom_voice_folder, f'{i}.wav'), 'wb') as f:
    f.write(file_data)

### 2. Extract text from a PDF

In [None]:
import PyPDF2

def extract_text_from_pdf(pdf_path, output_txt_path, start_page=14, end_page=400):
    """Extract text from a range of PDF pages into a plain-text file.

    Each page that yields text is written as one line: tabs and newlines are
    replaced with spaces and double spaces are collapsed (single pass) before
    the line is written. Pages that yield no text are reported on stdout and
    skipped.

    Args:
        pdf_path: Path of the PDF file to read.
        output_txt_path: Path of the text file to create (overwritten if it
            already exists). Written as UTF-8.
        start_page: Zero-based index of the first page to extract.
        end_page: Zero-based index one past the last page to extract. Values
            beyond the end of the document are clamped to the page count, so
            short PDFs no longer raise IndexError.
    """
    # Open the PDF file
    with open(pdf_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Never index past the last page of the document.
        last_page = min(end_page, len(pdf_reader.pages))

        # Create a text file for output. UTF-8 is explicit so the result does
        # not depend on the platform's default encoding.
        with open(output_txt_path, "w", encoding="utf-8") as output_file:
            # Extract text from each page and write it to the text file
            for page_num in range(start_page, last_page):
                page = pdf_reader.pages[page_num]
                text = page.extract_text()
                if text:
                    text = text.replace('\t', ' ').replace('\n', ' ').replace('  ', ' ')
                    output_file.write(text + "\n")
                else:
                    # Report 1-based page numbers, as a PDF viewer would show them.
                    print(f"No text found on page {page_num + 1}")

# Extract the text of the PDF into a plain-text file. Both paths are relative
# to the current working directory.
# NOTE(review): the output name is spelled "antfragile.txt" (no "i"), and the
# speech-generation cell reads "../speech.txt" rather than this file — confirm
# which file is meant to feed the text-to-speech step.
pdf_path = "antifragile.pdf"  # Path to your PDF file
output_txt_path = "antfragile.txt"  # Path for the output text file
extract_text_from_pdf(pdf_path, output_txt_path)

### 3. Generate speech from a text file

In [None]:
from tortoise.utils.text import split_and_recombine_text
from time import time
import os

# Directory under which per-voice results are written.
outpath = "results/longform/"

# NOTE(review): the PDF-extraction cell writes a different file than the one
# read here — confirm this is the intended input.
textfile_path = "../speech.txt"

# Sample rate passed to torchaudio.save for every clip written by this cell.
SAMPLE_RATE = 24000

# Process text: read the whole file, then split it into segments that are
# synthesized one at a time.
with open(textfile_path, 'r', encoding='utf-8') as f:
    text = ' '.join(f.readlines())

if '|' in text:
    print("Found the '|' character in your text, which I will use as a cue for where to split it up. If this was not "
          "your intent, please remove all '|' characters from the input.")
    texts = text.split('|')
else:
    texts = split_and_recombine_text(text)

# Skip whitespace-only segments (e.g. produced by a trailing '|') and fail
# early with a clear message instead of an opaque torch.cat error later on.
texts = [segment for segment in texts if segment.strip()]
if not texts:
    raise ValueError(f"No text to synthesize was found in {textfile_path!r}")

# The same time-based seed is passed to every segment of this run.
seed = int(time())

voice_outpath = os.path.join(outpath, CUSTOM_VOICE_NAME)
os.makedirs(voice_outpath, exist_ok=True)

voice_samples, conditioning_latents = load_voice(CUSTOM_VOICE_NAME)

# Synthesize each segment, saving it as <index>.wav and keeping it for the
# combined file.
all_parts = []
for part_index, segment in enumerate(texts):
    gen = tts.tts_with_preset(segment, voice_samples=voice_samples, conditioning_latents=conditioning_latents,
                              preset="fast", k=1, use_deterministic_seed=seed)
    gen = gen.squeeze(0).cpu()
    torchaudio.save(os.path.join(voice_outpath, f'{part_index}.wav'), gen, SAMPLE_RATE)
    all_parts.append(gen)

# Concatenate all clips along the last dimension into one file and play it.
combined_path = os.path.join(voice_outpath, 'combined.wav')
full_audio = torch.cat(all_parts, dim=-1)
torchaudio.save(combined_path, full_audio, SAMPLE_RATE)
IPython.display.Audio(combined_path)