In [1]:
!pip install TTS

Collecting TTS
  Downloading TTS-0.22.0-cp311-cp311-manylinux1_x86_64.whl.metadata (21 kB)
Collecting scikit-learn>=1.3.0 (from TTS)
  Downloading scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Collecting anyascii>=0.3.0 (from TTS)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pysbd>=0.3.4 (from TTS)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting pandas<2.0,>=1.4 (from TTS)
  Downloading pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting trainer>=0.0.32 (from TTS)
  Downloading trainer-0.0.36-py3-none-any.whl.metadata (8.1 kB)
Collecting coqpit>=0.0.16 (from TTS)
  Downloading coqpit-0.0.17-py3-none-any.whl.metadata (11 kB)
Collecting pypinyin (from TTS)
  Downloading pypinyin-0.54.0-py2.py3-none-any.whl.metadata (12 kB)
Collecting hangul-romanize (from TTS)
  Downloading hangul_romanize-0.1.0-py3-none-any.whl.metadata (1.2 kB)
Collectin

In [2]:
!wget "https://www.dropbox.com/scl/fi/1snfy7wz4zh6yk4blnxqk/gen_speech_checkpoints.zip?rlkey=x2dvhmdcjzlmr8hqe41j2zhwy&st=r2hqv0s3&dl=0" -O gen_speech_checkpoints.zip 

--2025-06-10 10:26:30--  https://www.dropbox.com/scl/fi/1snfy7wz4zh6yk4blnxqk/gen_speech_checkpoints.zip?rlkey=x2dvhmdcjzlmr8hqe41j2zhwy&st=r2hqv0s3&dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.85.18, 2620:100:6035:18::a27d:5512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.85.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://ucf56e2e8cd10fae36bedb32a33a.dl.dropboxusercontent.com/cd/0/inline/CrUK_7rRsjtlR-8uIPEteZlzXyCYJZVscN1iu6dLvM1vKcNsQunUN99FN0UJqzzPx2kyBjwngXGVPtu7P4lxoCJzVJpN-kfQROl1aIUYiZyq464DU09vt8d7EjJ87VKcH80lvOmBbLhMpcfZUsOgH-Nn/file# [following]
--2025-06-10 10:26:30--  https://ucf56e2e8cd10fae36bedb32a33a.dl.dropboxusercontent.com/cd/0/inline/CrUK_7rRsjtlR-8uIPEteZlzXyCYJZVscN1iu6dLvM1vKcNsQunUN99FN0UJqzzPx2kyBjwngXGVPtu7P4lxoCJzVJpN-kfQROl1aIUYiZyq464DU09vt8d7EjJ87VKcH80lvOmBbLhMpcfZUsOgH-Nn/file
Resolving ucf56e2e8cd10fae36bedb32a33a.dl.dropboxusercontent.com (ucf56e2e8cd10fae36bedb32a33a.dl.dropboxuserco

In [3]:
ls

gen_speech_checkpoints.zip  [0m[01;34mlightning_logs[0m/  test_sentences.zip
[01;34mgen_speech_checpoints[0m/      [01;34mtest_sentences[0m/


In [4]:
!unzip gen_speech_checkpoints.zip

Archive:  gen_speech_checkpoints.zip
replace gen_speech_checpoints/last.ckpt? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [11]:
ls 

gen_speech_checkpoints.zip  [0m[01;34mlightning_logs[0m/  test_sentences.zip
[01;34mgen_speech_checpoints[0m/      [01;34mtest_sentences[0m/


# TextToSpecConverter

In [12]:
import torch
import torchaudio
from TTS.api import TTS
from TTS.tts.utils.synthesis import synthesis


class TextToSpecConverter:
    def __init__(self, model_name: str = "tts_models/en/ljspeech/fast_pitch", device: str = "cpu"):
        self.model_name = model_name
        self.device = device
        self.tts_handler = TTS(model_name=model_name)
        self.model = self.tts_handler.synthesizer.tts_model.to(device)
        self.config = self.tts_handler.synthesizer.tts_config
        self.use_cuda = device == "cuda"
        print(f"Model {model_name} loaded on {device}")
    
    def text2spec(self, text: str):
        """
        Convert text to mel spectrogram using pretrained TTS model.
        Args:
            text (str): Input text to convert to mel spectrogram
        Returns:
            mel_spec (numpy.ndarray): Mel spectrogram of the input text
                with shape [C, T] = [num_mel_channels, num_frames]
        """
        outputs = synthesis(
            self.model,
            text,
            self.config,
            self.use_cuda,
            use_griffin_lim=False,
            do_trim_silence=False
        )
        mel_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy()
        # denormalize tts output based on the tts audio config
        mel_spec = self.model.ap.denormalize(mel_spec.T).T
        return mel_spec

# Data preprocessing

In [None]:
!mkdir /kaggle/processed_data

In [None]:
import os
import torch
import torchaudio
from tqdm import tqdm
from torchaudio.datasets import LJSPEECH


def preprocess_ljspeech(dataset, out_dir, device="cpu"):
    """
    Preprocess LJSpeech dataset and save preprocessed files.
    
    Args:
        dataset: LJSpeech dataset instance
        out_dir (str): Path to save preprocessed files
        device (str): Device to use for processing ('cpu' or 'cuda')
    """
    # Initialize text to spec converter
    t2s = TextToSpecConverter(device=device)
    
    # Create output directory
    os.makedirs(out_dir, exist_ok=True)
    
    # Process each file
    for i in tqdm(range(len(dataset)), desc="Preprocessing LJSpeech"):
        # Load audio and text
        waveform, sample_rate, text, normalized_text = dataset[i]
        
        # Convert to mono if stereo
        if waveform.shape[0] > 1:
            waveform = waveform[0:1]
        waveform = waveform.squeeze(0)
        
        # Generate mel spectrogram from text
        mel = t2s.text2spec(text)
        mel = torch.tensor(mel, dtype=torch.float32)  # Shape: [T, C]
        
        # Create sample dictionary
        sample = {
            "mel": mel,  # [T, C]
            "audio": waveform,  # [T]
            "text": text,
        }
        
        # Save preprocessed file
        save_path = os.path.join(out_dir, f"sample_{i:06d}.pt")
        torch.save(sample, save_path)

    del t2s
    torch.cuda.empty_cache()


base_dir = "/kaggle/"
processed_dir = os.path.join(base_dir, "processed_data")

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Download and load dataset
print("Downloading LJSpeech dataset...")
dataset = LJSPEECH(root=base_dir, download=True)
print(f"Dataset downloaded successfully! Number of samples: {len(dataset)}")

# Run preprocessing
preprocess_ljspeech(dataset, processed_dir, device)
print("Preprocessing completed!")

In [None]:
ls /kaggle/processed_data | tail -10

# Data Module

In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


class LJSpeechDataset(Dataset):
    def __init__(self, preprocessed_dir):
        self.preprocessed_dir = preprocessed_dir
        self.files = sorted(
            [f for f in os.listdir(preprocessed_dir) if f.endswith(".pt")]
        )

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        data = torch.load(os.path.join(self.preprocessed_dir, self.files[idx]))
        
        # Normalize audio
        audio = data["audio"].float()
        audio = audio / audio.abs().max()

        return {
            "mel": data["mel"],
            "audio": audio,
            "text": data["text"],
            "audio_length": audio.shape[0]
        }


class LJSpeechDataModule(pl.LightningDataModule):
    def __init__(
        self,
        data_dir: str = "/kaggle/processed_data",
        batch_size: int = 16,
        num_workers: int = 4,
        pin_memory: bool = True
    ):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.pin_memory = pin_memory

    def setup(self, stage=None):

        self.train_dataset = LJSpeechDataset(self.data_dir)

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
            collate_fn=self.collate_fn
        )

    def val_dataloader(self):
        # Add validation dataloader if needed
        pass

    @staticmethod
    def collate_fn(batch):
        # Extract data from batch
        mels = [item["mel"] for item in batch]
        audios = [item["audio"] for item in batch]
        texts = [item["text"] for item in batch]
        audio_lengths = [item["audio_length"] for item in batch]

        # Get max lengths
        max_mel_len = max(m.shape[0] for m in mels)
        num_mels = mels[0].shape[1]
        max_audio_len = max(audio_lengths)

        # Create padded tensors
        padded_mels = torch.zeros(len(mels), num_mels, max_mel_len)
        padded_audios = torch.zeros(len(audios), max_audio_len)

        # Fill padded tensors
        for i, (mel, audio) in enumerate(zip(mels, audios)):
            padded_mels[i, :, :mel.shape[0]] = mel.T
            padded_audios[i, :audio.shape[0]] = audio

        return {
            "mel": padded_mels,
            "audio": padded_audios,
            "text": texts,
            "audio_length": audio_lengths
        }


data_module = LJSpeechDataModule(
    data_dir="/kaggle/processed_data",
    batch_size=16,
    num_workers=4
)

# Setup data
data_module.setup()

# Get a batch
batch = next(iter(data_module.train_dataloader()))
print("Batch shapes:")
print(f"Mel spectrograms: {batch['mel'].shape}")
print(f"Audio: {batch['audio'].shape}")
print(f"Number of texts: {len(batch['text'])}")
print(f"Audio lengths: {len(batch['audio_length'])}")

# Model

In [13]:
import torch
import torchaudio
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
import matplotlib.pyplot as plt
import pytorch_lightning as pl
from torch.nn.utils import weight_norm
import numpy as np
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from torch.optim.lr_scheduler import ReduceLROnPlateau


def weights_init(m):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(0.0, 0.02)
    elif classname.find("BatchNorm2d") != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)


def WNConv1d(*args, **kwargs):
    return weight_norm(nn.Conv1d(*args, **kwargs))


def WNConvTranspose1d(*args, **kwargs):
    return weight_norm(nn.ConvTranspose1d(*args, **kwargs))


class ResnetBlock(nn.Module):
    def __init__(self, dim, dilation=1):
        super().__init__()
        self.block = nn.Sequential(
            nn.LeakyReLU(0.2),
            nn.ReflectionPad1d(dilation),
            WNConv1d(dim, dim, kernel_size=3, dilation=dilation),
            nn.LeakyReLU(0.2),
            WNConv1d(dim, dim, kernel_size=1),
        )
        self.shortcut = WNConv1d(dim, dim, kernel_size=1)

    def forward(self, x):
        return self.shortcut(x) + self.block(x)


class Generator(nn.Module):
    def __init__(self, input_size, ngf, n_residual_layers):
        super().__init__()
        ratios = [8, 8, 2, 2]
        self.hop_length = np.prod(ratios)
        mult = int(2 ** len(ratios))

        model = [
            nn.ReflectionPad1d(3),
            WNConv1d(input_size, mult * ngf, kernel_size=7, padding=0),
        ]

        for i, r in enumerate(ratios):
            model += [
                nn.LeakyReLU(0.2),
                WNConvTranspose1d(
                    mult * ngf,
                    mult * ngf // 2,
                    kernel_size=r * 2,
                    stride=r,
                    padding=r // 2 + r % 2,
                    output_padding=r % 2,
                ),
            ]

            for j in range(n_residual_layers):
                model += [ResnetBlock(mult * ngf // 2, dilation=3 ** j)]

            mult //= 2

        model += [
            nn.LeakyReLU(0.2),
            nn.ReflectionPad1d(3),
            WNConv1d(ngf, 1, kernel_size=7, padding=0),
            nn.Tanh(),
        ]

        self.model = nn.Sequential(*model)
        self.apply(weights_init)

    def forward(self, x):
        return self.model(x)


class NLayerDiscriminator(nn.Module):
    def __init__(self, ndf, n_layers, downsampling_factor):
        super().__init__()
        model = nn.ModuleDict()

        model["layer_0"] = nn.Sequential(
            nn.ReflectionPad1d(7),
            WNConv1d(1, ndf, kernel_size=15),
            nn.LeakyReLU(0.2, True),
        )

        nf = ndf
        stride = downsampling_factor
        for n in range(1, n_layers + 1):
            nf_prev = nf
            nf = min(nf * stride, 1024)

            model["layer_%d" % n] = nn.Sequential(
                WNConv1d(
                    nf_prev,
                    nf,
                    kernel_size=stride * 10 + 1,
                    stride=stride,
                    padding=stride * 5,
                    groups=nf_prev // 4,
                ),
                nn.LeakyReLU(0.2, True),
            )

        nf = min(nf * 2, 1024)
        model["layer_%d" % (n_layers + 1)] = nn.Sequential(
            WNConv1d(nf_prev, nf, kernel_size=5, stride=1, padding=2),
            nn.LeakyReLU(0.2, True),
        )

        model["layer_%d" % (n_layers + 2)] = WNConv1d(
            nf, 1, kernel_size=3, stride=1, padding=1
        )

        self.model = model

    def forward(self, x):
        results = []
        for key, layer in self.model.items():
            x = layer(x)
            results.append(x)
        return results


class Discriminator(nn.Module):
    def __init__(self, num_D, ndf, n_layers, downsampling_factor):
        super().__init__()
        self.model = nn.ModuleDict()
        for i in range(num_D):
            self.model[f"disc_{i}"] = NLayerDiscriminator(
                ndf, n_layers, downsampling_factor
            )

        self.downsample = nn.AvgPool1d(4, stride=2, padding=1, count_include_pad=False)
        self.apply(weights_init)

    def forward(self, x):
        results = []
        for key, disc in self.model.items():
            results.append(disc(x))
            x = self.downsample(x)
        return results


In [14]:
class MelGAN(pl.LightningModule):
    def __init__(
        self,
        input_size=80,
        ngf=32,
        ndf=16,
        n_residual_layers=2,
        n_disc_layers=4,
        num_D=1,
        downsampling_factor=4,
        lr_g=2e-4,
        lr_d=2e-4,
        betas=(0.5, 0.9),
        lambda_feat=10.0,  # Weight for feature matching loss
        lr_scheduler_patience=5,
        lr_scheduler_factor=0.5,
    ):
        super().__init__()
        self.save_hyperparameters()
        
        # Initialize models
        self.generator = Generator(input_size, ngf, n_residual_layers)
        self.discriminator = Discriminator(num_D, ndf, n_disc_layers, downsampling_factor)
        
        # Initialize optimizers
        self.lr_g = lr_g
        self.lr_d = lr_d
        self.betas = betas
        self.lambda_feat = lambda_feat
        
        # For logging
        self.automatic_optimization = False

    def forward(self, x):
        return self.generator(x)

    def adversarial_loss(self, real_outputs, fake_outputs):
        loss = 0
        for real, fake in zip(real_outputs, fake_outputs):
            # Get the minimum length to avoid shape mismatch
            min_len = min(real[-1].size(-1), fake[-1].size(-1))
            
            # Truncate both outputs to the minimum length
            real_output = real[-1][..., :min_len]
            fake_output = fake[-1][..., :min_len]
            
            loss += F.relu(1 + fake_output).mean()
            loss += F.relu(1 - real_output).mean()
        return loss

    def generator_loss(self, fake_outputs):
        loss = 0
        for fake in fake_outputs:
            loss += -fake[-1].mean()
        return loss

    def feature_matching_loss(self, real_outputs, fake_outputs):
        loss = 0
        feat_weights = 4.0 / (self.hparams.n_disc_layers + 1)
        D_weights = 1.0 / self.hparams.num_D
        wt = D_weights * feat_weights
        
        for i in range(self.hparams.num_D):
            for j in range(len(fake_outputs[i]) - 1):
                # Get the current feature maps
                real_feat = real_outputs[i][j]
                fake_feat = fake_outputs[i][j]
                
                # Get the minimum length to avoid shape mismatch
                min_len = min(real_feat.size(-1), fake_feat.size(-1))
                
                # Truncate both feature maps to the minimum length
                real_feat = real_feat[..., :min_len]
                fake_feat = fake_feat[..., :min_len]
                
                # Calculate L1 loss
                feat_loss = F.l1_loss(fake_feat, real_feat.detach())
                loss += wt * feat_loss
        
        return loss

    def training_step(self, batch, batch_idx):
        # Get optimizers
        opt_g, opt_d = self.optimizers()
        
        # Get data from dictionary
        mel = batch["mel"]
        real_audio = batch["audio"].unsqueeze(1)  # Add channel dimension
        
        # Train discriminator
        # Generate fake audio
        fake_audio = self.generator(mel)
        
        # Get discriminator outputs
        real_outputs = self.discriminator(real_audio)
        fake_outputs = self.discriminator(fake_audio.detach())
        
        # Calculate discriminator loss
        d_loss = self.adversarial_loss(real_outputs, fake_outputs)
        
        # Update discriminator
        opt_d.zero_grad()
        d_loss.backward()
        opt_d.step()
        
        # Train generator
        # Get discriminator outputs for generator
        fake_outputs = self.discriminator(fake_audio)
        
        # Calculate generator losses
        g_loss = self.generator_loss(fake_outputs)
        feat_loss = self.feature_matching_loss(real_outputs, fake_outputs)
        
        # Total generator loss
        total_g_loss = g_loss + self.lambda_feat * feat_loss
        
        # Update generator
        opt_g.zero_grad()
        total_g_loss.backward()
        opt_g.step()
        
        # Log losses
        self.log_dict({
            "g_loss": g_loss,
            "d_loss": d_loss,
            "feat_loss": feat_loss,
            "total_g_loss": total_g_loss,
        }, prog_bar=True)
        
        return {
            "g_loss": g_loss,
            "d_loss": d_loss,
            "feat_loss": feat_loss,
            "total_g_loss": total_g_loss
        }

    def configure_optimizers(self):
        opt_g = optim.Adam(
            self.generator.parameters(),
            lr=self.lr_g,
            betas=self.betas
        )
        opt_d = optim.Adam(
            self.discriminator.parameters(),
            lr=self.lr_d,
            betas=self.betas
        )
        
        # Learning rate schedulers
        scheduler_g = {
            "scheduler": ReduceLROnPlateau(
                opt_g,
                mode='min',
                factor=self.hparams.lr_scheduler_factor,
                patience=self.hparams.lr_scheduler_patience,
                verbose=True
            ),
            "monitor": "total_g_loss"
        }
        
        scheduler_d = {
            "scheduler": ReduceLROnPlateau(
                opt_d,
                mode='min',
                factor=self.hparams.lr_scheduler_factor,
                patience=self.hparams.lr_scheduler_patience,
                verbose=True
            ),
            "monitor": "total_g_loss"
        }
        
        return [opt_g, opt_d], [scheduler_g, scheduler_d]

In [None]:
data_module = LJSpeechDataModule(
    data_dir="/kaggle/processed_data",
    batch_size=4,
    num_workers=4
)
data_module.setup()

# Create model
model = MelGAN(
    lambda_feat=8.0,  # Set feature matching loss weight
    lr_scheduler_patience=5,
    lr_scheduler_factor=0.5
)
# checkpoint_path = "/kaggle/working/melgan-epoch=02-total_g_loss=12.29.ckpt"
# model = MelGAN.load_from_checkpoint(
#     checkpoint_path,
#     input_size=80,
#     ngf=32,
#     ndf=16,
#     n_residual_layers=2,
#     n_disc_layers=4,
#     num_D=1,
#     downsampling_factor=4,
#     lambda_feat=5.0,
# )

# Create checkpoint callback
checkpoint_callback = ModelCheckpoint(
    dirpath='/kaggle/working',
    filename='melgan-{epoch:02d}-{total_g_loss:.2f}',
    save_top_k=-1,
    monitor='g_loss',
    mode='min',
    save_last=True,
    save_on_train_epoch_end=True
)

# Create learning rate monitor
lr_monitor = LearningRateMonitor(logging_interval='epoch')

# Create trainer
trainer = pl.Trainer(
    max_epochs=30,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    callbacks=[checkpoint_callback, lr_monitor],
)

# Train model
trainer.fit(model, train_dataloaders=data_module.train_dataloader())
# trainer.fit(model, train_dataloaders=data_module.train_dataloader(), ckpt_path=checkpoint_path)

# Test samples

In [15]:
import torch
import torchaudio
import os
from tqdm import tqdm


def load_model(checkpoint_path, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Load model from checkpoint"""
    # Initialize model with the same hyperparameters as training
    model = MelGAN(
        input_size=80,  # mel spectrogram dimension
        ngf=32,
        ndf=16,
        n_residual_layers=2,
        n_disc_layers=4,
        num_D=1,
        downsampling_factor=4,
        lambda_feat=10.0
    )
    
    # Load checkpoint
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['state_dict'])
    model = model.to(device)
    model.eval()
    
    return model

def text_to_wav(model, text, output_path, sample_rate=22050):
    """Convert text to waveform and save to file"""
    # Convert text to mel spectrogram using your text-to-spec model
    # This is a placeholder - you'll need to implement your text-to-spec conversion
    t2s = TextToSpecConverter()  # Your text-to-spec model
    mel_spec = t2s.text2spec(text)
    
    # Convert mel spectrogram to tensor and add batch dimension
    mel_spec = torch.from_numpy(mel_spec).float().unsqueeze(0)
    
    # Move to device
    device = next(model.parameters()).device
    mel_spec = mel_spec.to(device)
    
    # Generate waveform
    with torch.no_grad():
        print(f"mel_spec shape: {mel_spec.transpose(1, 2).shape}")
        waveform = model(mel_spec.transpose(1, 2))
    
    # Remove batch dimension and move to CPU
    waveform = waveform.squeeze(0).cpu()
    
    # Save to file
    torchaudio.save(
        output_path,
        waveform,
        sample_rate
    )
    
    return output_path


In [26]:
ls gen_speech_checpoints

 last.ckpt  'melgan-epoch=22-total_g_loss=26.41.ckpt'


In [27]:
!rm -r test_sentences

In [28]:
!rm test_sentences.zip

In [29]:
!mkdir test_sentences

In [32]:
test_sentences = [
    "Tourists found the octagonal lighthouse after a two-mile hike through fog",
    "In twenty forty-nine, neural implants became standard for city workers",
    "He whispered something in Icelandic I could not quite translate",
    "Lucia balanced a porcelain vase on her elbow without flinching",
    "They gathered quietly beneath the turbine as the wind picked up speed",
]


checkpoint = "/kaggle/working/gen_speech_checpoints/last.ckpt"
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Loading model from {checkpoint}...")
model = load_model(checkpoint, device)

# Generate audio
for idx, sentence in enumerate(test_sentences):
    print(f"Generating audio for text: {sentence}")
    output_path = text_to_wav(model, sentence, f"/kaggle/working/test_sentences/test_sentences_{idx}.wav")

    print(f"Audio saved to: {output_path}")

Loading model from /kaggle/working/gen_speech_checpoints/last.ckpt...
Generating audio for text: Tourists found the octagonal lighthouse after a two-mile hike through fog
 > tts_models/en/ljspeech/fast_pitch is already downloaded.
 > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.
 > Using model: fast_pitch
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_

In [None]:
!rm -r test_sentences

In [17]:
!mkdir test_sentences

In [22]:
!mv test_sentences_4.wav test_sentences/

In [34]:
ls test_sentences

test_sentences_0.wav  test_sentences_2.wav  test_sentences_4.wav
test_sentences_1.wav  test_sentences_3.wav


In [36]:
!zip -r test_sentences.zip test_sentences/

updating: test_sentences/ (stored 0%)
updating: test_sentences/test_sentences_0.wav (deflated 15%)
updating: test_sentences/test_sentences_1.wav (deflated 16%)
updating: test_sentences/test_sentences_4.wav (deflated 17%)
updating: test_sentences/test_sentences_3.wav (deflated 17%)
updating: test_sentences/test_sentences_2.wav (deflated 16%)
