# Speech analysis and re-synthesis

This notebook demonstrates how to analyze speech and re-synthesize speech waveforms from speech parameters using [pysptk](https://github.com/r9y9/pysptk) (and other useful speech/audio/music analysis packages). Synthesized audio examples are provided so that you can compare synthesis filters in your browser.

## Requirements

- pysptk: https://github.com/r9y9/pysptk
- scipy
- librosa: https://github.com/bmcfee/librosa
- seaborn: https://github.com/mwaskom/seaborn

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import matplotlib
import seaborn

# Dark seaborn theme for every figure drawn in this notebook.
seaborn.set(style="dark")
# Go through the matplotlib module instead of the bare `rcParams` name that
# `%pylab inline` injects, so this cell does not depend on that magic having run.
matplotlib.rcParams['figure.figsize'] = (16, 5)

In [3]:
from IPython.display import Audio
import IPython.display

In [4]:
import numpy as np
import sys
import librosa
import librosa.display
import pysptk
from scipy.io import wavfile

## Data Load

In [5]:
import copy
import fnmatch
import os
import random
import re

import pandas as pd
import json

def find_files(directory, pattern='*.wav'):
    '''Walk ``directory`` and return the paths of all files whose name matches ``pattern``.

    ``pattern`` is an ``fnmatch``-style glob applied to the bare file name
    (not the full path). Paths are returned joined onto the directory that
    ``os.walk`` reported them in, in walk order.
    '''
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in fnmatch.filter(names, pattern)
    ]

## Mel-frequency cepstrum extraction

In [64]:
def generate_mfcc(directory, sample_rate, lc_dir_name, n_mfcc, lc_ext_name=".csv", overwrite=False):
    '''Compute MFCCs for every ``*.wav`` under ``directory`` and save one CSV per file.

    Args:
        directory: Root directory searched recursively for ``*.wav`` files.
        sample_rate: Rate passed to ``librosa.load`` (as ``sr``) and to the
            MFCC computation.
        lc_dir_name: Text substituted for every occurrence of the substring
            ``"wav"`` in the output path (after the extension swap).
        n_mfcc: Number of MFCC coefficients requested from librosa.
        lc_ext_name: Extension that replaces a trailing ``.wav``.
        overwrite: When False, files whose output CSV already exists are skipped.

    Each CSV holds the transposed MFCC matrix (one row per frame), written
    without header or index. Progress is printed and the cell output is
    cleared after every file.

    Caveat: the ``"wav"`` -> ``lc_dir_name`` replacement is a plain substring
    replacement over the whole path, so it also rewrites ``"wav"`` inside
    directory or file names (e.g. ``wav48``). This is kept as-is so existing
    output locations do not move.
    '''
    files = find_files(directory, pattern="*.wav")
    total = len(files)
    for index, filename in enumerate(files):
        print(str(index)+"/"+str(total))
        print(filename)

        # Derive the output path: swap the extension, then redirect the "wav" part.
        lc_filename = filename
        if lc_filename.endswith('.wav'):
            lc_filename = lc_filename[:-4] + lc_ext_name
        lc_filename = lc_filename.replace("wav", lc_dir_name)

        if os.path.isfile(lc_filename) and not overwrite:
            IPython.display.clear_output(wait=True)
            continue

        audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
        mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc, n_fft=1024, hop_length=128) # shape = (n_mfcc, t)
        frames = mfcc.T  # one row per frame in the CSV
        print(str(frames.shape))

        # exist_ok=True covers the "directory appeared in the meantime" race
        # without needing errno; other OSErrors still propagate.
        out_dir = os.path.dirname(lc_filename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        print(lc_filename)
        pd.DataFrame(frames).to_csv(lc_filename, sep=',', header=None, index=None)
        IPython.display.clear_output(wait=True)

In [65]:
def generate_lc_map(directory, old_name, new_name, json_name):
    '''Write a JSON lookup from each wav path to its feature-CSV path.

    Every ``*.wav`` found under ``directory`` contributes one entry. The key is
    the part of the path that follows ``directory``; the value is that key with
    ``old_name`` replaced by ``new_name`` and ``.wav`` replaced by ``.csv``.
    The mapping is dumped to ``directory + "/" + json_name`` and also printed.
    '''
    wav_paths = find_files(directory, pattern="*.wav")
    print("files length: {}".format(len(wav_paths)))

    # Paths relative to `directory` (text after the last occurrence of it).
    relative_wavs = [wav_path.split(directory)[-1] for wav_path in wav_paths]
    mapping = {
        rel: rel.replace(old_name, new_name).replace(".wav", ".csv")
        for rel in relative_wavs
    }

    json_path = directory+"/"+json_name
    with open(json_path, "w") as handle:
        print("Export to "+json_path)
        json.dump(mapping, handle)
    print(mapping)

In [66]:
# Extract 20 MFCCs per frame at a 16000 Hz sample rate for every wav under
# ../../cmu_us_slt_arctic; "wav" in each path is replaced by "mfcc20_16k" to form
# the CSV location, and overwrite=True regenerates CSVs that already exist.
generate_mfcc("../../cmu_us_slt_arctic", 16000, "mfcc20_16k", 20, lc_ext_name=".csv", overwrite=True)

1131/1132
../../cmu_us_slt_arctic/wav/arctic_a0166.wav


In [54]:
# Build the wav -> csv lookup for the corpus and write it as JSON inside the corpus directory.
# "/wav/" is replaced by "mfcc20_16k/" (no leading slash), so values lose the leading "/" the keys have.
# NOTE(review): the JSON file is named mfcc40_48k.json although the features live in mfcc20_16k,
# and the saved output below shows mfcc40_48k/ values, so it appears to come from an earlier run — confirm.
generate_lc_map("../../cmu_us_slt_arctic",  "/wav/", "mfcc20_16k/", "mfcc40_48k.json")

files length: 1132
Export to ../../cmu_us_slt_arctic/mfcc40_48k.json
{'/wav/arctic_a0340.wav': 'mfcc40_48k/arctic_a0340.csv', '/wav/arctic_b0166.wav': 'mfcc40_48k/arctic_b0166.csv', '/wav/arctic_b0300.wav': 'mfcc40_48k/arctic_b0300.csv', '/wav/arctic_b0463.wav': 'mfcc40_48k/arctic_b0463.csv', '/wav/arctic_a0004.wav': 'mfcc40_48k/arctic_a0004.csv', '/wav/arctic_b0491.wav': 'mfcc40_48k/arctic_b0491.csv', '/wav/arctic_a0475.wav': 'mfcc40_48k/arctic_a0475.csv', '/wav/arctic_b0350.wav': 'mfcc40_48k/arctic_b0350.csv', '/wav/arctic_b0504.wav': 'mfcc40_48k/arctic_b0504.csv', '/wav/arctic_b0241.wav': 'mfcc40_48k/arctic_b0241.csv', '/wav/arctic_b0209.wav': 'mfcc40_48k/arctic_b0209.csv', '/wav/arctic_b0204.wav': 'mfcc40_48k/arctic_b0204.csv', '/wav/arctic_a0057.wav': 'mfcc40_48k/arctic_a0057.csv', '/wav/arctic_a0425.wav': 'mfcc40_48k/arctic_a0425.csv', '/wav/arctic_b0104.wav': 'mfcc40_48k/arctic_b0104.csv', '/wav/arctic_a0109.wav': 'mfcc40_48k/arctic_a0109.csv', '/wav/arctic_b0092.wav': 'mfcc40_4

In [43]:
# generate_lc_map("../../VCTK-Corpus/wav48/p225")

files length: 231
{'/p225_144.wav': '/p225_144.csv', '/p225_225.wav': '/p225_225.csv', '/p225_323.wav': '/p225_323.csv', '/p225_084.wav': '/p225_084.csv', '/p225_287.wav': '/p225_287.csv', '/p225_024.wav': '/p225_024.csv', '/p225_012.wav': '/p225_012.csv', '/p225_244.wav': '/p225_244.csv', '/p225_018.wav': '/p225_018.csv', '/p225_300.wav': '/p225_300.csv', '/p225_366.wav': '/p225_366.csv', '/p225_116.wav': '/p225_116.csv', '/p225_037.wav': '/p225_037.csv', '/p225_151.wav': '/p225_151.csv', '/p225_211.wav': '/p225_211.csv', '/p225_053.wav': '/p225_053.csv', '/p225_020.wav': '/p225_020.csv', '/p225_322.wav': '/p225_322.csv', '/p225_325.wav': '/p225_325.csv', '/p225_254.wav': '/p225_254.csv', '/p225_277.wav': '/p225_277.csv', '/p225_089.wav': '/p225_089.csv', '/p225_030.wav': '/p225_030.csv', '/p225_171.wav': '/p225_171.csv', '/p225_201.wav': '/p225_201.csv', '/p225_210.wav': '/p225_210.csv', '/p225_265.wav': '/p225_265.csv', '/p225_350.wav': '/p225_350.csv', '/p225_038.wav': '/p225_038.c

In [29]:
# A = np.zeros((1000))
# mfcc = librosa.feature.mfcc(y=A, sr=16000) # shape = (n_mfcc, t)
# print(mfcc)


[[-1.13137085e+03 -1.13137085e+03]
 [-1.27731159e-13 -1.27731159e-13]
 [ 7.25530747e-14  7.25530747e-14]
 [ 6.37268016e-14  6.37268016e-14]
 [ 7.73825448e-14  7.73825448e-14]
 [-1.13409282e-13 -1.13409282e-13]
 [ 7.18869408e-14  7.18869408e-14]
 [-1.35336187e-13 -1.35336187e-13]
 [ 8.72080186e-14  8.72080186e-14]
 [-3.10307335e-14 -3.10307335e-14]
 [ 4.64628336e-14  4.64628336e-14]
 [-2.10942375e-14 -2.10942375e-14]
 [ 1.13964393e-13  1.13964393e-13]
 [-7.40518757e-14 -7.40518757e-14]
 [ 7.03881398e-14  7.03881398e-14]
 [ 2.25708341e-13  2.25708341e-13]
 [-9.75886039e-14 -9.75886039e-14]
 [-1.17350574e-13 -1.17350574e-13]
 [ 3.23630012e-14  3.23630012e-14]
 [ 6.42819131e-14  6.42819131e-14]]


In [61]:
# Sanity check: feed an all-zero signal of 5000 samples to pysptk's MFCC.
A = np.zeros((5000))
frame_length = 1024
hop_length = 80
# Cut the signal into overlapping frames, cast to float64 and transpose.
# NOTE(review): presumably one frame per row (n_frames, frame_length) after .T — confirm against librosa.util.frame.
frames = librosa.util.frame(A, frame_length=frame_length, hop_length=hop_length).astype(np.float64).T
# NOTE(review): czero=True and power=True presumably add c0 and power terms to the `order` coefficients — confirm in pysptk docs.
mfcc = pysptk.sptk.mfcc(frames, order=20, fs=16000, 
                        alpha=0.97, eps=1.0, window_len=None, frame_len=None, 
                        num_filterbanks=20, cepslift=22, use_dft=False, use_hamming=False, 
                        czero=True, power=True) # original note: shape = (n_mfcc, t) — NOTE(review): with row-wise frames this is presumably (t, n_coeffs); confirm
print(mfcc)


[[ 0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00
   0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00 -1.e+10]
 [ 0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00
   0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00 -1.e+10]
 [ 0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00
   0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00 -1.e+10]
 [ 0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00
   0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00 -1.e+10]
 [ 0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00
   0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00 -1.e+10]
 [ 0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00
   0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00 -1.e+10]
 [ 0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00
   0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00 -1.e+10]
 [ 0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.e+00  0.