<a href="https://colab.research.google.com/github/shivam-sultania/Interacts-AI-TTS-model/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch torchaudio librosa numpy websockets ffmpeg-python
!apt-get install -y ffmpeg

!git clone https://github.com/jaywalnut310/vits.git
%cd vits

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 59 not upgraded.
fatal: destination path 'vits' already exists and is not an empty directory.
/content/vits


In [2]:
# Mount Google Drive so the notebook can read the dataset from, and write checkpoints to, Drive.
from google.colab import drive
drive.mount('/content/drive')

# Drive locations referenced by the later cells.
DATASET_PATH = "/content/drive/MyDrive/AI4Bharat_dataset"  # Adjust this path to the dataset's location in Google Drive
CHECKPOINT_PATH = "/content/drive/MyDrive/checkpoints_vits"  # Directory to save model checkpoints


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
from sklearn.model_selection import train_test_split

def prepare_dataset():
    """Split the transcript into train/val/test metadata files.

    Reads ``{DATASET_PATH}/transcript.txt``. A line containing ``|`` starts a
    new ``filename|text`` record; a line without ``|`` is treated as a
    continuation of the current record and appended to it with a single space.
    Blank lines are ignored. Records that do not split into exactly two fields
    are dropped.

    The records are split 80/10/10 with a fixed ``random_state`` and written as
    ``filename|text`` lines to ``{DATASET_PATH}/<split>/metadata.txt`` for the
    ``train``, ``val`` and ``test`` splits. Split directories are created when
    missing and existing metadata files are overwritten.
    """
    transcripts_file = f"{DATASET_PATH}/transcript.txt"
    data = []

    # Read and clean data. UTF-8 is requested explicitly so non-ASCII
    # transcripts decode the same way regardless of the platform locale.
    with open(transcripts_file, "r", encoding="utf-8") as f:
        current_line = ""
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines carry no text; skipping them avoids padding
                # the current record with stray spaces.
                continue
            if "|" in line:
                if current_line:
                    # Split only once so a "|" inside the text stays in the text
                    data.append(current_line.split("|", 1))
                current_line = line
            else:
                current_line += " " + line

        # Flush the final record, which has no following "|" line to trigger it
        if current_line:
            data.append(current_line.split("|", 1))

    data = [entry for entry in data if len(entry) == 2]

    # Split data into 80/10/10 for train/val/test: hold out 20%, then halve it
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)

    # Save split files
    splits = {"train": train_data, "val": val_data, "test": test_data}
    for split_name, split_data in splits.items():
        split_dir = f"{DATASET_PATH}/{split_name}"
        # open(..., "w") does not create parent directories
        os.makedirs(split_dir, exist_ok=True)
        with open(f"{split_dir}/metadata.txt", "w", encoding="utf-8") as f:
            for filename, text in split_data:
                f.write(f"{filename}|{text}\n")

# Generate the split metadata files now; the config cell points at the train/val ones.
prepare_dataset()

In [4]:
import json

# Training configuration: top-level training settings plus the "data" and
# "model" sections. Key order is kept stable so the JSON output is unchanged.
config = dict(
    sampling_rate=16000,
    batch_size=16,
    epochs=1000,
    learning_rate=0.0001,
    data=dict(
        training_files=f"{DATASET_PATH}/train/metadata.txt",
        validation_files=f"{DATASET_PATH}/val/metadata.txt",
        sampling_rate=16000,
        n_mel_channels=80,
        n_symbols=40,
        text_cleaners=["basic_cleaners"],
    ),
    model=dict(
        hidden_channels=192,
        n_flow=4,
        n_group=8,
        n_layers=3,
    ),
)

# Serialise the config to config.json in the current working directory
with open("config.json", "w") as config_file:
    config_file.write(json.dumps(config, indent=2))

In [None]:
# Switch to the cloned repository, then print the working directory and its
# contents as a sanity check.
%cd /content/vits
!pwd
!ls

In [None]:
# Install the compiler toolchain and Python packages used to build the
# monotonic_align Cython extension.
!apt-get update
!apt-get install -y build-essential
!pip install Cython
!pip install numpy
# NOTE(review): the repository is already cloned by the first cell. Because this
# clone and the following %cd are relative, running them from /content/vits
# presumably produces a nested /content/vits/vits checkout — confirm which
# checkout the rest of the cell is meant to use.
!git clone https://github.com/jaywalnut310/vits.git
%cd vits

# Install the repository's own requirements plus unidecode.
!pip install -r requirements.txt
!pip install unidecode

# First build: run the repository's setup.py inside monotonic_align, then step back out.
%cd monotonic_align
!python setup.py build_ext --inplace
%cd ..

from setuptools import setup
from Cython.Build import cythonize
import numpy

# Second build of the same extension, driven from Python with absolute paths.
# script_args stands in for the command line, and --build-lib places the build
# output under /content/vits/monotonic_align.
setup(
    name='monotonic_align',
    ext_modules=cythonize("/content/vits/monotonic_align/core.pyx"),
    include_dirs=[numpy.get_include()],
    script_args=["build_ext", "--build-lib", "/content/vits/monotonic_align"]
)
# Make both the repository root and the extension directory importable.
import sys
sys.path.append('/content/vits')
sys.path.append('/content/vits/monotonic_align')

%cd /content/vits/monotonic_align

!mkdir -p build
# Third build (in place), then list the directory to check that the compiled .so file exists
!python setup.py build_ext --inplace
!ls /content/vits/monotonic_align
# from Cython.Build import cythonize
# import numpy
# import pyximport
# Smoke test: the import only succeeds if one of the builds above produced the extension.
from monotonic_align.core import maximum_path_c
print("Import successful!")


# from setuptools import setup, Extension
# from torch.utils.cpp_extension import BuildExtension, CUDAExtension

# !nvcc --version
# from torch.utils.cpp_extension import load
# import os

# # Compile the extension
# core = load(name="core",
#             sources=["/content/vits/monotonic_align/core.cpp", "/content/vits/monotonic_align/core.cu"],
#             verbose=True)

# !ls
# import sys
# sys.path.append('/content/vits/vits/monotonic_align')

# # Now try importing the required modules
# import numpy as np
# import torch

# # Adjust import based on the location of maximum_path_c

In [None]:
import sys
sys.path.append('/content/vits')

import json
from torch.utils.data import DataLoader
from vits.vits import VITS
from vits.trainer import Trainer
from vits.dataset import TextAudioLoader

def train_vits():
    """Train a VITS model using the settings stored in ``config.json``.

    The config is read from the current working directory. The file lists under
    ``data.training_files`` and ``data.validation_files`` are wrapped in
    ``TextAudioLoader`` datasets and batched with ``batch_size``; only the
    training loader shuffles. A ``VITS`` model and a ``Trainer`` are then built
    from the same config and training runs for ``epochs`` epochs.
    """
    with open("config.json") as config_file:
        settings = json.load(config_file)

    # Build both datasets first, then wrap each in its loader
    file_lists = settings["data"]
    training_set = TextAudioLoader(file_lists["training_files"], settings)
    validation_set = TextAudioLoader(file_lists["validation_files"], settings)
    loaders = {
        "train_loader": DataLoader(training_set, batch_size=settings["batch_size"], shuffle=True),
        "val_loader": DataLoader(validation_set, batch_size=settings["batch_size"], shuffle=False),
    }

    network = VITS(settings)
    trainer = Trainer(model=network, **loaders, config=settings)

    trainer.train(epochs=settings["epochs"])

# Start training. train_vits() reads config.json from the current working directory.
train_vits()


In [None]:
import soundfile as sf

def load_model(checkpoint_path):
    """Load a model from ``checkpoint_path`` and switch it to evaluation mode.

    NOTE(review): this relies on a module-level name ``vits`` that exposes
    ``load_from_checkpoint``. No import visible in this notebook binds that
    name, so confirm where it is expected to come from.
    """
    restored = vits.load_from_checkpoint(checkpoint_path)
    restored.eval()
    return restored

def synthesize_text(model, text, config):
    """Synthesize audio for ``text`` with an already-loaded model.

    Args:
        model: Object providing ``text_to_sequence(text, cleaners)`` and
            ``infer(sequence)``.
        text: The text to synthesize.
        config: Mapping whose ``["data"]["text_cleaners"]`` entry is passed to
            ``model.text_to_sequence``.

    Returns:
        Whatever ``model.infer`` returns for the converted text sequence.
    """
    # Imported here because the notebook never binds the name ``torch``
    # (only ``torch.utils.data.DataLoader`` is imported), which made the
    # ``torch.no_grad()`` call below raise NameError.
    import torch

    text_sequence = model.text_to_sequence(text, config["data"]["text_cleaners"])
    # Inference only: run without gradient tracking
    with torch.no_grad():
        audio = model.infer(text_sequence)
    return audio

# Checkpoint to load for inference, taken from the Drive checkpoint directory.
checkpoint_path = f"{CHECKPOINT_PATH}/last_checkpoint.pth"

model = load_model(checkpoint_path)
sample_text = "Sample text for synthesis"
# NOTE(review): `config` is not defined in this cell; it presumably comes from
# the earlier config cell — confirm that cell has been run in this kernel.
output_audio = synthesize_text(model, sample_text, config)

# Save generated audio as a WAV file
# NOTE(review): assumes output_audio is in a form soundfile can write — confirm
# what model.infer returns.
sf.write("output.wav", output_audio, config["sampling_rate"])


In [None]:
import shutil

def save_checkpoint(epoch, source="last_checkpoint.pth", checkpoint_dir=None):
    """Copy the latest checkpoint file to an epoch-stamped file.

    Args:
        epoch: Epoch number embedded in the destination file name
            (``checkpoint_epoch_<epoch>.pth``).
        source: Path of the checkpoint file to copy. Defaults to
            ``last_checkpoint.pth`` in the current working directory.
        checkpoint_dir: Directory to copy into. Defaults to the notebook's
            ``CHECKPOINT_PATH`` when omitted.

    Returns:
        The destination path of the copied checkpoint.

    Raises:
        FileNotFoundError: If ``source`` does not exist.
    """
    # Function-scope import so this cell does not depend on an earlier cell
    # having imported os.
    import os

    if checkpoint_dir is None:
        checkpoint_dir = CHECKPOINT_PATH
    # shutil.copy does not create missing parent directories
    os.makedirs(checkpoint_dir, exist_ok=True)

    checkpoint_name = f"checkpoint_epoch_{epoch}.pth"
    model_checkpoint = f"{checkpoint_dir}/{checkpoint_name}"
    shutil.copy(source, model_checkpoint)
    return model_checkpoint

# Archive the current last_checkpoint.pth as the epoch-100 snapshot.
save_checkpoint(100)
