In [None]:
# Copyright 2020 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# FastPitch: Voice Modification with Custom Transformations

## Model overview

The [FastPitch](https://arxiv.org/abs/2006.06873) model is based on the [FastSpeech](https://arxiv.org/abs/1905.09263) model. Similarly to [FastSpeech2](https://arxiv.org/abs/2006.04558), which has been developed concurrently, it learns to predict the pitch contour and conditions the generation on such contour.

The simple mechanism of predicting the pitch on grapheme-level (rather than frame-level, as FastSpeech2 does) makes it easy to alter the pitch during synthesis. FastPitch can thus change the perceived emotional state of the speaker, or slightly emphasise certain lexical units.

## Requirements

Run the notebook inside the container. By default the container forwards port `8888`.
```
bash scripts/docker/interactive.sh

# inside the container
cd notebooks
jupyter notebook --ip='*' --port=8888
```
Please refer to the Requirements section in `README.md` for more details and for instructions on running outside the container.

In [None]:
import os
# The cells below use paths relative to the current directory (e.g. '../inference.py'),
# so fail early, with an explanatory message, when the notebook is started elsewhere.
# os.path.basename is used instead of splitting on '/' so the check does not depend
# on the platform's path separator.
assert os.path.basename(os.getcwd()) == 'notebooks', (
    f"This notebook must be run from the 'notebooks' directory; "
    f"current directory is {os.getcwd()!r}"
)

## Generate audio samples

Training a FastPitch model from scratch takes 3 to 27 hours depending on the type and number of GPUs; performance numbers can be found in Section "Training performance results" in `README.md`. Therefore, to save time when running this notebook, we recommend downloading the pretrained FastPitch checkpoints from NGC for inference.

You can find the FP32 checkpoint at [NGC](https://ngc.nvidia.com/catalog/models/nvidia:fastpitch_pyt_fp32_ckpt_v1/files), and the AMP (Automatic Mixed Precision) checkpoint at [NGC](https://ngc.nvidia.com/catalog/models/nvidia:fastpitch_pyt_amp_ckpt_v1/files).

To synthesize audio, you will need a WaveGlow model, which generates waveforms based on mel-spectrograms generated by FastPitch. You can download a pre-trained WaveGlow AMP model at [NGC](https://ngc.nvidia.com/catalog/models/nvidia:waveglow256pyt_fp16).

You can perform inference using the respective checkpoints that are passed as `--fastpitch` and `--waveglow` arguments. Next, you will use the FastPitch model to generate audio samples for input text, including the basic version and the variations in pace, fade out, and pitch transforms, etc.

In [None]:
import IPython

# Checkpoint locations, kept in short helper variables so they can be edited in one place.
fastp = '../output_fastpitch/FastPitch_checkpoint_1000.pt'
waveg = '../output_waveglow/checkpoint_WaveGlow_450.pt'

# Command-line options shared by every inference call below (interpolated as {flags}).
flags = ' '.join([
    '--cuda',
    f'--fastpitch {fastp}',
    f'--waveglow {waveg}',
    '--wn-channels 256',
    '--p-arpabet 0.0',
])

### 1. Basic speech synthesis

You need to create an input file with some text, or just enter the text in the cell below:

In [None]:
%%writefile text.txt
В выходны+е со мной бы+ли Мару+ся, Ка+тя и Мари+на.

In [None]:
# Run the inference script on text.txt with the shared {flags} and no custom pitch
# transform; the script's stdout is discarded via the redirect to /dev/null.
# NOTE(review): -i / -o are presumably the input text file and the output directory —
# confirm against inference.py.
!python ../inference.py {flags} -i text.txt -o ../output/original_ik4 > /dev/null

# Show an audio player for the first generated file as the cell's output.
IPython.display.Audio("../output/original_ik4/audio_0.wav")

### 2. Bryzgunova

In [None]:
%%writefile ../fastpitch/pitch_transform.py
import torch
import numpy as np


def is_sound(symb):
    """Return True if ``symb`` is a Cyrillic letter, in either case.

    ``symb`` must be a single character (``ord`` raises ``TypeError`` otherwise).

    The Russian letters 'А'..'я' occupy one contiguous code-point range
    (uppercase first, then lowercase), and 'ё' follows shortly after 'я'.
    'Ё', however, is encoded *before* 'А', so a range written as 'А'..'Ё' is
    empty and matches nothing; it has to be tested separately.
    """
    code = ord(symb)
    # Upper bound stays at 'ё' so every character accepted previously is still accepted.
    return ord('А') <= code <= ord('ё') or code == ord('Ё')


def _shift_around_accent(contour, text, positions, near_shifts, far_shift):
    """Add pitch offsets to ``positions``, walking away from an accent.

    While ``k`` sounds (as judged by ``is_sound``) have been passed and
    ``k < len(near_shifts)``, each visited position receives ``near_shifts[k]``;
    afterwards every remaining position receives ``far_shift``.
    ``contour`` is modified in place.
    """
    passed_sounds = 0
    for i in positions:
        if passed_sounds < len(near_shifts):
            contour[0, i] += near_shifts[passed_sounds]
            if is_sound(text[i]):
                passed_sounds += 1
        else:
            contour[0, i] += far_shift


def _fill_non_sounds(contour, text):
    """Copy the pitch of the nearest sound onto every non-sound position.

    A non-sound takes the pitch of the closest sound to its left; if there is
    none (leading non-sounds), it takes the pitch of the first sound to its
    right. If the text contains no sound at all, nothing is changed.
    ``contour`` is modified in place.
    """
    first_sound = next((j for j, symb in enumerate(text) if is_sound(symb)), None)
    if first_sound is None:
        return

    last_sound_pitch = None
    for i, symb in enumerate(text):
        if is_sound(symb):
            # Sound positions are never written in this loop, so this value stays valid.
            last_sound_pitch = contour[0, i]
        elif last_sound_pitch is not None:
            contour[0, i] = last_sound_pitch
        else:
            contour[0, i] = contour[0, first_sound]


def pitch_transform_custom(pitch, pitch_lens, text, ik_index):
    """Reshape the pitch contour of the first phrase around its accent marker(s).

    The accent position is the character immediately before a '++' marker in
    ``text[0]``. Profiles 1-4 and 6 use a single accent; profile 5 uses two
    (the first and the last '++').

    Steps, all applied to ``pitch[0]`` in place:
      1. compress the contour towards its median (deviations divided by 1.5);
      2. smooth it with a 3-point moving average (edges are clamped);
      3. add the offsets of the selected profile before, at and after the accent;
      4. raise every value below the lower pitch border up to that border;
      5. give each non-sound character the pitch of the nearest sound.

    Args:
        pitch: indexable whose first element is indexed as ``[0, i]`` for every
            character position ``i`` of ``text[0]`` — presumably a
            ``(1, n_chars)`` float tensor per phrase; confirm against the caller.
        pitch_lens: not used by this function.
        text: indexable whose first element is the phrase string.
        ik_index: key of the offset profile to apply (1-6).

    Returns:
        The same ``pitch`` object, modified in place.

    Raises:
        KeyError: if ``ik_index`` is not one of the defined profiles.
        ValueError: if the number of accent markers does not match the profile
            (two for profile 5, one otherwise).
    """
    phrase = text[0]
    n_symbols = len(phrase)

    # NOTE(review): when the phrase has no '++' marker, find()/rfind() return -1 and
    # both positions become -2, which indexes from the end of the contour. This is
    # left as it was; confirm that callers always mark an accent.
    accent = phrase.find('++') - 1
    second_accent = phrase.rfind('++') - 1

    scale = 20
    # Lower bound applied to the final contour. (An upper bound of 300 was declared
    # alongside it but never applied, so none is enforced here either.)
    low_border = 80

    pitch_change = {
        1: {
            'beginning': 0,
            'before': [0, 0, -.2 * scale],
            'accent': -.5 * scale,
            'after': [-1 * scale, -1 * scale],
            'end': 0
        },
        2: {
            'beginning': 0,
            'before': [0, 1 * scale, 1.5 * scale],
            'accent': -1 * scale,
            'after': [-0.5 * scale, -0.5 * scale],
            'end': -0.5 * scale,
        },
        3: {
            'beginning': 0,
            'before': [-1 * scale, -1 * scale, 2 * scale],
            'accent': 2 * scale,
            'after': [1 * scale, 1 * scale],
            'end': 0,
        },
        4: {
            'beginning': 0,
            'before': [scale, scale, -1.5 * scale],
            'accent': -1.5 * scale,
            'after': [1.5 * scale, 1.5 * scale],
            'end': 0,
        },
        5: {
            'beginning': 0,
            'before': [0, 0, 2 * scale],
            'accent_1': 3 * scale,
            'accent_2': -1 * scale,
            'after': [0 * scale, 0 * scale],
            'end': 0,
        },
        6: {
            'beginning': 0,
            'before': [0, - scale, scale],
            'accent': 2.5 * scale,
            'after': [2 * scale, 2 * scale],
            'end': 2 * scale,
        }
    }

    # Validate before touching the contour so a rejected call leaves `pitch` intact.
    shifts = pitch_change[ik_index]
    single_accent = accent == second_accent
    if single_accent and ik_index == 5:
        # Profile 5 only defines 'accent_1'/'accent_2', so it needs two markers.
        raise ValueError('Этот тип интонационной конструкции подразумевает два акцента. Пожалуйста, поставьте два акцента во фразе.')
    if not single_accent and ik_index != 5:
        # All other profiles only define a single 'accent' offset.
        raise ValueError('Этот тип интонационной конструкции подразумевает один акцент. Пожалуйста, поставьте один акцент во фразе.')

    # 1. Compress the contour towards its median.
    average = torch.median(pitch[0])
    pitch[0] = (pitch[0] - average) / 1.5 + average
    contour = pitch[0]

    # 2. 3-point moving average. All averages are computed from the unsmoothed
    #    values first and written back afterwards; the neighbours of the first and
    #    last positions are clamped to the contour's edges.
    smoothed_values = [
        (contour[0, max(0, i - 1)] + contour[0, i] + contour[0, min(i + 1, n_symbols - 1)]) / 3
        for i in range(n_symbols)
    ]
    for i, value in enumerate(smoothed_values):
        contour[0, i] = value

    # 3. Offsets at the accent(s).
    if single_accent:
        contour[0, accent] += shifts['accent']
        last_accent = accent
    else:
        # From the first accent up to (not including) the second, the added offset
        # starts at 'accent_1' and decreases by 3 per position.
        decay = 0
        for i in range(accent, second_accent):
            contour[0, i] += shifts['accent_1'] - decay
            decay += 3
        contour[0, second_accent] += shifts['accent_2']
        last_accent = second_accent

    # 'before' is stored farthest-first, so reverse it for the walk away from the accent.
    _shift_around_accent(contour, phrase, range(accent - 1, -1, -1),
                         shifts['before'][::-1], shifts['beginning'])
    _shift_around_accent(contour, phrase, range(last_accent + 1, n_symbols),
                         shifts['after'], shifts['end'])

    # 4. Enforce the lower pitch border.
    for i in range(n_symbols):
        contour[0, i] = max(low_border, contour[0, i])

    # 5. Smooth non-sounds.
    _fill_non_sounds(contour, phrase)

    return pitch

In [None]:
# Run the inference script on the same text.txt with the shared {flags}, this time
# passing --pitch-transform-custom 4 and writing to a separate output directory.
# NOTE(review): the value 4 is presumably forwarded as `ik_index` to
# pitch_transform_custom in ../fastpitch/pitch_transform.py — confirm in inference.py.
!python ../inference.py {flags} -i text.txt -o ../output/modified_ik4/ \
    --pitch-transform-custom 4

# Show an audio player for the first generated file as the cell's output.
IPython.display.Audio("../output/modified_ik4/audio_0.wav")