In [None]:
!pip install librosa soundfile tqdm



In [46]:
import os
import re
import numpy as np
import librosa
import soundfile as sf
from scipy.signal import butter, lfilter
from tqdm import tqdm
import IPython.display as ipd

In [None]:
LJSPEECH_URL = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2"
VCTK_URL = "https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"


In [None]:
!wget $LJSPEECH_URL
!wget $VCTK_URL

--2025-12-13 14:48:25--  https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
Resolving data.keithito.com (data.keithito.com)... 169.150.207.210, 2400:52e0:1500::1179:1
Connecting to data.keithito.com (data.keithito.com)|169.150.207.210|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2748572632 (2.6G) [text/plain]
Saving to: ‘LJSpeech-1.1.tar.bz2’


2025-12-13 14:48:53 (94.1 MB/s) - ‘LJSpeech-1.1.tar.bz2’ saved [2748572632/2748572632]

--2025-12-13 14:48:53--  https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip
Resolving datashare.ed.ac.uk (datashare.ed.ac.uk)... 129.215.67.172
Connecting to datashare.ed.ac.uk (datashare.ed.ac.uk)|129.215.67.172|:443... connected.
HTTP request sent, awaiting response... 200 200
Length: 11747302977 (11G) [application/zip]
Saving to: ‘VCTK-Corpus-0.92.zip’


2025-12-13 15:10:14 (8.76 MB/s) - ‘VCTK-Corpus-0.92.zip’ saved [11747302977/11747302977]



In [None]:
!tar -xjf LJSpeech-1.1.tar.bz2
!unzip VCTK-Corpus-0.92.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: wav48_silence_trimmed/p341/p341_320_mic2.flac  
  inflating: wav48_silence_trimmed/p341/p341_116_mic2.flac  
  inflating: wav48_silence_trimmed/p341/p341_266_mic2.flac  
  inflating: wav48_silence_trimmed/p341/p341_099_mic2.flac  
  inflating: wav48_silence_trimmed/p341/p341_050_mic2.flac  
  inflating: wav48_silence_trimmed/p341/p341_376_mic2.flac  
  inflating: wav48_silence_trimmed/p341/p341_189_mic2.flac  
  inflating: wav48_silence_trimmed/p341/p341_140_mic2.flac  
  inflating: wav48_silence_trimmed/p341/p341_230_mic2.flac  
  inflating: wav48_silence_trimmed/p341/p341_006_mic2.flac  
  inflating: wav48_silence_trimmed/p341/p341_195_mic1.flac  
  inflating: wav48_silence_trimmed/p341/p341_283_mic2.flac  
  inflating: wav48_silence_trimmed/p341/p341_410_mic2.flac  
  inflating: wav48_silence_trimmed/p341/p341_126_mic1.flac  
  inflating: wav48_silence_trimmed/p341/p341_310_mic1.flac  
  inflating: wav48_s

In [None]:
def normalize_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [41]:
PHONEME_MAP = {
    "a":"AH","b":"B","c":"K","d":"D","e":"EH","f":"F","g":"G",
    "h":"HH","i":"IH","j":"JH","k":"K","l":"L","m":"M","n":"N",
    "o":"OW","p":"P","q":"K","r":"R","s":"S","t":"T","u":"UH",
    "v":"V","w":"W","x":"KS","y":"Y","z":"Z"
}

def text_to_phonemes(text):
    return [PHONEME_MAP[c] for c in text if c in PHONEME_MAP]

In [42]:
LJ_PATH = "LJSpeech-1.1"
metadata = []

with open(f"{LJ_PATH}/metadata.csv", encoding="utf-8") as f:
    for line in f.readlines()[:300]:
        wav_id, _, text = line.strip().split("|")
        metadata.append((wav_id, normalize_text(text)))

train_data = metadata[:250]
test_data  = metadata[250:]

In [43]:
def extract_pitch(audio, sr):
    pitches, _ = librosa.piptrack(y=audio, sr=sr)
    p = pitches[pitches > 0]
    return np.mean(p) if len(p) else 0

In [44]:
def train_tts(train_data):
    pitches = []
    durations = []

    for wav_id, text in tqdm(train_data):
        audio, sr = librosa.load(f"{LJ_PATH}/wavs/{wav_id}.wav", sr=22050)
        pitch = extract_pitch(audio, sr)
        if pitch > 0:
            pitches.append(pitch)

        phonemes = text_to_phonemes(text)
        durations.append(len(audio) / sr / max(len(phonemes), 1))

    return {
        "base_pitch": np.mean(pitches),
        "phoneme_duration": np.mean(durations)
    }

tts_params = train_tts(train_data)
tts_params

100%|██████████| 250/250 [00:04<00:00, 52.38it/s]


{'base_pitch': np.float32(1372.1963),
 'phoneme_duration': np.float64(0.08289072341101336)}

In [45]:
def glottal_source(f0, duration, sr=22050):
    t = np.linspace(0, duration, int(sr * duration))
    return 2 * (t * f0 - np.floor(0.5 + t * f0))

In [47]:
def bandpass(signal, low, high, sr=22050):
    nyq = 0.5 * sr
    b, a = butter(2, [low/nyq, high/nyq], btype="band")
    return lfilter(b, a, signal)


In [48]:
PHONEME_FORMANTS = {
    "AH":[700,1200,2600],
    "EH":[530,1850,2500],
    "IH":[400,2000,2550],
    "OW":[570,840,2410],
    "UH":[440,1020,2240],
    "L":[400,2400,3000],
    "M":[300,1200,2100],
    "N":[300,1500,2500],
    "R":[300,1300,1700]
}

VOICED = set(PHONEME_FORMANTS.keys())


In [49]:
def amplitude_envelope(n):
    t = np.linspace(0, 1, n)
    attack = np.minimum(t / 0.1, 1.0)
    decay = np.exp(-3 * t)
    return attack * decay


In [50]:
def synthesize_phoneme(phoneme, pitch, duration, age, gender):
    sr = 22050
    n = int(sr * duration)

    if phoneme in VOICED:
        source = glottal_source(pitch, duration, sr)
        formants = PHONEME_FORMANTS[phoneme]
    else:
        source = np.random.randn(n) * 0.4
        return source * amplitude_envelope(n)

    speech = np.zeros_like(source)
    for f in formants:
        speech += bandpass(source, f-100, f+100, sr)

    speech *= amplitude_envelope(n)
    return speech / (np.max(np.abs(speech)) + 1e-6)


In [51]:
def overlap_add(signals, overlap=0.35):
    output = signals[0]
    for s in signals[1:]:
        ov = int(len(s) * overlap)
        output[-ov:] += s[:ov]
        output = np.concatenate([output, s[ov:]])
    return output


In [52]:
class ParametricTTS:
    def __init__(self, params):
        self.base_pitch = params["base_pitch"]
        self.duration = params["phoneme_duration"]

    def profile(self, age, gender):
        pitch = self.base_pitch
        speed = 1.0

        if gender == "male": pitch *= 0.6
        if age < 13: pitch *= 1.4
        if age > 60: speed = 0.85

        return pitch, speed

    def synthesize(self, text, age, gender):
        phonemes = text_to_phonemes(normalize_text(text))
        pitch, speed = self.profile(age, gender)

        signals = []
        for ph in phonemes:
            signals.append(
                synthesize_phoneme(ph, pitch, self.duration, age, gender)
            )

        audio = overlap_add(signals)
        return librosa.effects.time_stretch(audio, rate=speed)


In [53]:
tts = ParametricTTS(tts_params)

audio = tts.synthesize(
    "hello how are you",
    age=25,
    gender="male"
)

sf.write("final_tts.wav", audio, 22050)
ipd.Audio("final_tts.wav")
