In [1]:
! pip install librosa
! pip install sounddevice



In [86]:
import librosa
import numpy as np
import sounddevice as sd
import random
import os

## Module 1

In [79]:
def speak_text(text, voice_profile):
    text_splits = text.split(" ")
    print(text_splits)
    
    concatenated_audio = np.array([])
    for i in range(len(text_splits)):
        clip_path = "clips/" + voice_profile + "/" + text_splits[i] + ".wav"
        audio, sr = librosa.load(clip_path, sr=8000)
        concatenated_audio = np.concatenate([concatenated_audio, audio])
    
    sd.play(concatenated_audio, 8000)
    sd.wait()


def speak_text_multiple(text_array, voice_profile):
    grand_concatenated_audio = np.array([])
    for i in range(len(text_array)):
        text = text_array[i]
    
        text_splits = text.split(" ")
        
        concatenated_audio = np.array([])
        for i in range(len(text_splits)):
            clip_path = "clips/" + voice_profile + "/" + text_splits[i] + ".wav"
            audio, sr = librosa.load(clip_path, sr=8000)
            concatenated_audio = np.concatenate([concatenated_audio, audio])
    
        grand_concatenated_audio = np.concatenate([grand_concatenated_audio, np.array([point for point in range(int(0.5 * 8000))]), concatenated_audio])
    
    sd.play(grand_concatenated_audio, 8000)
    sd.wait()

In [80]:
text = "ball is not cat and cat is not ball"
voice_profile = "pradeep"
speak_text(text, voice_profile)

['ball', 'is', 'not', 'cat', 'and', 'cat', 'is', 'not', 'ball']


In [81]:
text_array = [
    "this is a cat", 
    "that is a ball", 
    "cat is not ball", 
    "this cat is not that ball", 
    "ball is not cat and cat is not ball",
    "a ball is not a cat and that is"
]
voice_profile = "pradeep" 
speak_text_multiple(text_array, voice_profile)

## Module 2

In [82]:
def speak_text_with_conjuction_times(text, voice_profile, conjuction_times_with_probabilities):
    conjuction_times = conjuction_times_with_probabilities[0]
    randomization = conjuction_times_with_probabilities[1]
    
    text_splits = text.split(" ")
    
    conjuction_times_list = [0]
    text_splits_with_conjuction_times = ["cnj_0"]
    
    for split in text_splits:
        text_splits_with_conjuction_times.append(split)
        conjuction_time = float(np.random.choice(conjuction_times, p=randomization))
        conjuction_times_list.append(conjuction_time)
        text_splits_with_conjuction_times.append("cnj_" + str(conjuction_time))
    
    conjuction_times_list.pop()
    text_splits_with_conjuction_times.pop()
    
    # Writing out a script that will use conjuction time while doing concatenation.
    concatenated_audio = np.array([])
    for i in range(len(text_splits)):
        clip_path = "clips/" + voice_profile + "/" + text_splits[i] + ".wav"
        audio, sr = librosa.load(clip_path, sr=8000)
        conjuction_time = float(conjuction_times_list[i])
        
        if conjuction_time == 0:
            concatenated_audio = np.concatenate([concatenated_audio, audio])
    
        if conjuction_time > 0:
            concatenated_audio = np.concatenate([concatenated_audio, np.zeros(int(conjuction_time*8000)), audio])
    
        if conjuction_time < 0:
            concatenated_audio = np.concatenate([
                concatenated_audio[:len(concatenated_audio)-int(-conjuction_time*8000)],
                concatenated_audio[len(concatenated_audio)-int(-conjuction_time*8000):] + audio[:int(-conjuction_time*8000)],
                audio[int(-conjuction_time*8000):]
            ])
            
    print(text_splits)
    print(conjuction_times_list)
    print(text_splits_with_conjuction_times)
    
    sd.play(concatenated_audio, 8000)
    sd.wait()

In [83]:
text = "that is a ball and this is not a cat"
voice_profile = "sahil"
conjuction_times_with_probabilities = [[-0.05, 0.1], [0.8, 0.2]]

for i in range(1):
    speak_text_with_conjuction_times(text, voice_profile, conjuction_times_with_probabilities)

['that', 'is', 'a', 'ball', 'and', 'this', 'is', 'not', 'a', 'cat']
[0, -0.05, -0.05, -0.05, -0.05, -0.05, -0.05, -0.05, -0.05, -0.05]
['cnj_0', 'that', 'cnj_-0.05', 'is', 'cnj_-0.05', 'a', 'cnj_-0.05', 'ball', 'cnj_-0.05', 'and', 'cnj_-0.05', 'this', 'cnj_-0.05', 'is', 'cnj_-0.05', 'not', 'cnj_-0.05', 'a', 'cnj_-0.05', 'cat']


## Module 3

In [170]:
def speak_text_with_fluctuations(text, voice_profile, conjuction_times_with_probabilities, speeding_factors_with_probabilities):
    conjuction_times = conjuction_times_with_probabilities[0]
    conjuction_times_randomization = conjuction_times_with_probabilities[1]

    speeding_factors = speeding_factors_with_probabilities[0]
    speeding_factors_randomization = speeding_factors_with_probabilites[1]
    
    text_splits = text.split(" ")
    
    conjuction_times_list = [0]
    text_splits_with_conjuction_times = ["cnj_0"]

    speeding_factors_list = []
    text_splits_with_speeding_factors = []
    
    for split in text_splits:
        text_splits_with_conjuction_times.append(split)
        conjuction_time = float(np.random.choice(conjuction_times, p=conjuction_times_randomization))
        conjuction_times_list.append(conjuction_time)
        text_splits_with_conjuction_times.append("cnj_" + str(conjuction_time))

        speeding_factor = float(np.random.choice(speeding_factors, p=speeding_factors_randomization))
        speeding_factors_list.append(speeding_factor)
        text_splits_with_speeding_factors.append(split + "_spd_" + str(speeding_factor))
    
    conjuction_times_list.pop()
    text_splits_with_conjuction_times.pop()
    
    # Writing out a script that will use conjuction time while doing concatenation.
    concatenated_audio = np.array([])
    for i in range(len(text_splits)):
        clip_path = "clips/" + voice_profile + "/" + text_splits[i] + ".wav"
        speeded_clip_path = "temporary/" + text_splits[i] + ".wav"
        cmd = "ffmpeg -i {} -af atempo={} {}".format(clip_path, speeding_factors_list[i], speeded_clip_path)
        os.system(cmd)
        
        audio, sr = librosa.load(speeded_clip_path, sr=8000)
        conjuction_time = float(conjuction_times_list[i])
        
        if conjuction_time == 0:
            concatenated_audio = np.concatenate([concatenated_audio, audio])
    
        if conjuction_time > 0:
            concatenated_audio = np.concatenate([concatenated_audio, np.zeros(int(conjuction_time*8000)), audio])
    
        if conjuction_time < 0:
            concatenated_audio = np.concatenate([
                concatenated_audio[:len(concatenated_audio)-int(-conjuction_time*8000)],
                concatenated_audio[len(concatenated_audio)-int(-conjuction_time*8000):] + audio[:int(-conjuction_time*8000)],
                audio[int(-conjuction_time*8000):]
            ])

    os.system("rm -rf temporary/*")
    
    print(text_splits)
    print(conjuction_times_list)
    print(text_splits_with_conjuction_times, end="\n\n")

    print(text_splits)
    print(speeding_factors_list)
    print(text_splits_with_speeding_factors)
    
    sd.play(concatenated_audio, 8000)
    sd.wait()

In [174]:
text = "that is a ball and this is a cat"
voice_profile = "sahil"
conjuction_times_with_probabilities = [[-0.1, 0.2], [0.7, 0.3]]
speeding_factors_with_probabilites = [[1.3, 0.7], [0.7, 0.3]]

for i in range(4):
    speak_text_with_fluctuations(text, voice_profile, conjuction_times_with_probabilities, speeding_factors_with_probabilites)

ffmpeg version 6.1.1-3ubuntu5 Copyright (c) 2000-2023 the FFmpeg developers
  built with gcc 13 (Ubuntu 13.2.0-23ubuntu3)
  configuration: --prefix=/usr --extra-version=3ubuntu5 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --disable-omx --enable-gnutls --enable-libaom --enable-libass --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libharfbuzz --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --ena

['that', 'is', 'a', 'ball', 'and', 'this', 'is', 'a', 'cat']
[0, 0.2, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1]
['cnj_0', 'that', 'cnj_0.2', 'is', 'cnj_-0.1', 'a', 'cnj_-0.1', 'ball', 'cnj_-0.1', 'and', 'cnj_-0.1', 'this', 'cnj_-0.1', 'is', 'cnj_-0.1', 'a', 'cnj_-0.1', 'cat']

['that', 'is', 'a', 'ball', 'and', 'this', 'is', 'a', 'cat']
[1.3, 1.3, 1.3, 1.3, 1.3, 1.3, 0.7, 1.3, 1.3]
['that_spd_1.3', 'is_spd_1.3', 'a_spd_1.3', 'ball_spd_1.3', 'and_spd_1.3', 'this_spd_1.3', 'is_spd_0.7', 'a_spd_1.3', 'cat_spd_1.3']


ffmpeg version 6.1.1-3ubuntu5 Copyright (c) 2000-2023 the FFmpeg developers
  built with gcc 13 (Ubuntu 13.2.0-23ubuntu3)
  configuration: --prefix=/usr --extra-version=3ubuntu5 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --disable-omx --enable-gnutls --enable-libaom --enable-libass --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libharfbuzz --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --ena

['that', 'is', 'a', 'ball', 'and', 'this', 'is', 'a', 'cat']
[0, -0.1, -0.1, -0.1, -0.1, 0.2, -0.1, -0.1, 0.2]
['cnj_0', 'that', 'cnj_-0.1', 'is', 'cnj_-0.1', 'a', 'cnj_-0.1', 'ball', 'cnj_-0.1', 'and', 'cnj_0.2', 'this', 'cnj_-0.1', 'is', 'cnj_-0.1', 'a', 'cnj_0.2', 'cat']

['that', 'is', 'a', 'ball', 'and', 'this', 'is', 'a', 'cat']
[0.7, 1.3, 1.3, 0.7, 0.7, 0.7, 1.3, 1.3, 0.7]
['that_spd_0.7', 'is_spd_1.3', 'a_spd_1.3', 'ball_spd_0.7', 'and_spd_0.7', 'this_spd_0.7', 'is_spd_1.3', 'a_spd_1.3', 'cat_spd_0.7']


ffmpeg version 6.1.1-3ubuntu5 Copyright (c) 2000-2023 the FFmpeg developers
  built with gcc 13 (Ubuntu 13.2.0-23ubuntu3)
  configuration: --prefix=/usr --extra-version=3ubuntu5 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --disable-omx --enable-gnutls --enable-libaom --enable-libass --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libharfbuzz --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --ena

['that', 'is', 'a', 'ball', 'and', 'this', 'is', 'a', 'cat']
[0, -0.1, 0.2, -0.1, -0.1, 0.2, -0.1, 0.2, -0.1]
['cnj_0', 'that', 'cnj_-0.1', 'is', 'cnj_0.2', 'a', 'cnj_-0.1', 'ball', 'cnj_-0.1', 'and', 'cnj_0.2', 'this', 'cnj_-0.1', 'is', 'cnj_0.2', 'a', 'cnj_-0.1', 'cat']

['that', 'is', 'a', 'ball', 'and', 'this', 'is', 'a', 'cat']
[1.3, 1.3, 1.3, 1.3, 1.3, 1.3, 1.3, 0.7, 1.3]
['that_spd_1.3', 'is_spd_1.3', 'a_spd_1.3', 'ball_spd_1.3', 'and_spd_1.3', 'this_spd_1.3', 'is_spd_1.3', 'a_spd_0.7', 'cat_spd_1.3']


ffmpeg version 6.1.1-3ubuntu5 Copyright (c) 2000-2023 the FFmpeg developers
  built with gcc 13 (Ubuntu 13.2.0-23ubuntu3)
  configuration: --prefix=/usr --extra-version=3ubuntu5 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --disable-omx --enable-gnutls --enable-libaom --enable-libass --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libharfbuzz --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --ena

['that', 'is', 'a', 'ball', 'and', 'this', 'is', 'a', 'cat']
[0, -0.1, 0.2, 0.2, 0.2, -0.1, -0.1, -0.1, -0.1]
['cnj_0', 'that', 'cnj_-0.1', 'is', 'cnj_0.2', 'a', 'cnj_0.2', 'ball', 'cnj_0.2', 'and', 'cnj_-0.1', 'this', 'cnj_-0.1', 'is', 'cnj_-0.1', 'a', 'cnj_-0.1', 'cat']

['that', 'is', 'a', 'ball', 'and', 'this', 'is', 'a', 'cat']
[0.7, 1.3, 1.3, 0.7, 1.3, 1.3, 1.3, 1.3, 1.3]
['that_spd_0.7', 'is_spd_1.3', 'a_spd_1.3', 'ball_spd_0.7', 'and_spd_1.3', 'this_spd_1.3', 'is_spd_1.3', 'a_spd_1.3', 'cat_spd_1.3']
