In [2]:
import numpy as np
import pandas as pd
from scipy.io.wavfile import read, write
import os, glob, shutil
from shutil import copyfile
import pinyin
import librosa

### parameters

In [3]:
path = '/home/guandao/data/avaocado_data20181222'
out_path = '/home/guandao/data/avaocado_data20181222_all/guoguo'
out_wav_path = os.path.join(out_path, 'wavs')
speaker = '果果'

### clean output directory

In [4]:
shutil.rmtree(out_path, ignore_errors = True)
shutil.rmtree(out_wav_path, ignore_errors = True)
os.mkdir(out_path)
os.mkdir(out_wav_path)

### rename and subsample wavs 

In [5]:
data = []
index = 0
n_time = 0.0
prefix = 'avocado-' + pinyin.get(speaker,format="strip") + '-'
for excel in glob.glob(os.path.join(path, '**','*.xls'),recursive=True):
    dir_name = os.path.dirname(excel)

    df = pd.read_excel(excel).dropna();    
    df = df[df['people'] == speaker] 
    
    for idx, row in df.iterrows():
        text = row[1]
        original_wav_name = os.path.join(dir_name, 'clips', str(int(row[0])) + ' ' + row[1].strip() + '.wav')
        wav_name = os.path.join(out_wav_path, prefix + str(index) + '.wav') 

        if not os.path.exists(original_wav_name):
            for f in os.listdir(os.path.join(dir_name, 'clips')):
                if f.startswith(str(int(row[0])) + ' '):
                    original_wav_name = os.path.join(dir_name,'clips', f)
                    v_text = ' '.join(f.split(' ')[1:])[:-4]
                    print(dir_name)
                    print("fix text:" + text + " to:" + v_text)
                    text = v_text
                    break
                    
        rate, wav = read(original_wav_name) 
        if len(wav.shape) == 2:
            # 2 channels
            wav = wav.mean(axis=1)
        subsampled = librosa.core.resample(wav.astype(float), rate, 16000).astype(np.int16)
        write(wav_name,16000, subsampled)
        n_time += subsampled.size / 16000
        data.append([wav_name, pinyin.get(text,format="numerical", delimiter=" "), len(subsampled)/16000])
        index +=1

print("total length of waves: " + str(n_time))
print("total number of sentences: " + str(index))

/home/guandao/data/avaocado_data20181222/20181119-201812/20181119-201812/雾中的黑影
fix text:它的危害很大吗? to:它的危害很大吗？
/home/guandao/data/avaocado_data20181222/20181119-201812/20181119-201812/舞会里的病毒
fix text:是会让人鼻子生病的病毒吗? to:是会让人鼻子生病的病毒吗？
/home/guandao/data/avaocado_data20181222/20181119-201812/20181119-201812/石峁古城游记
fix text:这不是比YoYo说的两千八百多年还早嘛? to:这不是比YoYo说的两千八百多年还早嘛？
/home/guandao/data/avaocado_data20181222/201810/1029-1102（60篇）/光明也害人
fix text:他，他看不见我们吗？ to:他看不见我们吗？
/home/guandao/data/avaocado_data20181222/201810/1015-1019（60篇）/果果校长的烦恼
fix text:现在我们可以通过什么捐款啦、科研啦、投资啦之类的方法来获得收入了！ to:现在我们可以通过什么捐款啦、科研啦、投资啦之类的方式来获得收入了！
/home/guandao/data/avaocado_data20181222/201810/1015-1019（60篇）/拍照的奇遇
fix text:我，我果果大侠会保护你们的。 to:我果果大侠会保护你们的。
/home/guandao/data/avaocado_data20181222/201810/1015-1019（60篇）/奇妙的乐园
fix text:怎，怎么会这么苦啊， to:怎么会这么苦啊，
/home/guandao/data/avaocado_data20181222/201810/1015-1019（60篇）/害人的酒精
fix text:要不是我果果大侠身手矫捷， to:要不是我果果大侠身手矫健，
/home/guandao/data/avaocado_data20181222/201810/1015-1019（60篇）/金枪鱼

In [6]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,0,1,2
0,/home/guandao/data/avaocado_data20181222_all/g...,Y o Y o ！,0.491
1,/home/guandao/data/avaocado_data20181222_all/g...,Y o Y o ！,0.62
2,/home/guandao/data/avaocado_data20181222_all/g...,wo3 he2 niu2 niu2 ge1 dao4 le5 ，,1.24
3,/home/guandao/data/avaocado_data20181222_all/g...,ni3 kuai4 kai1 men2 ya1 ！,1.474
4,/home/guandao/data/avaocado_data20181222_all/g...,Y o Y o ni3 zen3 yao1 le5 ？,1.965062


In [16]:
df = df[df[2] < 20]

In [19]:
df.describe()

Unnamed: 0,2
count,18254.0
mean,2.258195
std,1.198522
min,0.109063
25%,1.379063
50%,2.071531
75%,2.911766
max,12.427


### generate train and eval txts

In [20]:
r = np.random.rand(df.shape[0])
train = df[r < 0.9]
evalu = df[r > 0.9]

In [26]:
train[2].sum() / df[2].sum()

0.9006705233885157

In [24]:
# for small data experiment
train.to_csv(os.path.join(out_path, "train.txt"), sep='|',index=False, header=None)
evalu.to_csv(os.path.join(out_path, "eval.txt"), sep='|',index=False, header=None)