In [1]:
import numpy as np
import pandas as pd
from scipy.io.wavfile import read, write
import os, glob, shutil
from shutil import copyfile
from chinese_process import expand, get_pinyin, combine_with_zero, combine, seperate
import librosa
import re

### parameters

In [2]:
# Speaker whose rows are selected from the transcript spreadsheets.
speaker = '果果'

# Root folder of the raw recordings; it is searched recursively for *.xls sheets.
path = '/home/guandao/data/avaocado_data20181222'

# Destination of the processed dataset; the converted clips go into its
# `wavs` sub-folder.
out_path = '/home/guandao/data/avaocado_data20181222_all/guoguo22050_expanded'
out_wav_path = os.path.join(out_path, 'wavs')

### clean output directory

In [3]:
# Start from an empty output tree so clips left over from an earlier run cannot
# end up in the new dataset.  `out_wav_path` lives inside `out_path`, so a
# single rmtree removes both; ignore_errors covers the first run, when the
# folder does not exist yet.
shutil.rmtree(out_path, ignore_errors=True)

# makedirs creates `out_path`, its `wavs` sub-folder and any missing parent
# directory in one call (os.mkdir raises FileNotFoundError when the parent is
# absent).  exist_ok is deliberately left off: if the rmtree above silently
# failed, this raises instead of mixing old and new files.
os.makedirs(out_wav_path)

### rename and subsample wavs 

In [4]:
def expand_text(sentence):
    """Rewrite the digit runs of `sentence` before pinyin conversion.

    A handful of known sentences are matched verbatim and get a dedicated
    treatment of their digits; every other sentence is delegated to `expand`.

    Args:
        sentence: the transcript sentence to rewrite.

    Returns:
        The rewritten sentence.
    """
    # Sentences whose digit runs are passed through `seperate`
    # (presumably a digit-by-digit reading -- TODO confirm in chinese_process).
    seperate_cases = (
        '你打119干嘛？',
        '只要998，',
        '只要998……',
        '什么C什么19？',
        '我……我要回家研究研究这个C919，',
        '啊！没想到这个G20峰会原来那么重要，',
        '保质期这里写着611，',
        '把119倒着看成611了！',
    )
    # Sentences whose digit runs are passed through `combine_with_zero`.
    combine_cases = ('这都62年了啊！',)
    # Sentences whose digit runs are replaced by the character '两'.
    liang_cases = (
        '我们约在明天下午2点半见面可以吗？',
        '到比赛的城市坐高铁也至少要2个多小时，',
    )

    def _via_seperate(match):
        return seperate(match.group())

    def _via_combine(match):
        return combine_with_zero(match.group())

    rules = (
        (seperate_cases, _via_seperate),
        (combine_cases, _via_combine),
        (liang_cases, '两'),
    )
    for known_sentences, replacement in rules:
        if sentence in known_sentences:
            return re.sub('[0-9]+', replacement, sentence)

    # One sentence with a clock time is replaced wholesale.
    if sentence == '我总是7：00到学校去跑步。':
        return '我总是七点到学校去跑步。'

    return expand(sentence)



In [5]:
# Walk every transcript sheet below `path`, convert each clip spoken by
# `speaker` to a mono 22050 Hz int16 wav under `out_wav_path`, and collect one
# record [output wav path, pinyin text, duration in seconds] per clip.
data = []
index = 0      # running clip counter; also the numeric suffix of the output file
n_time = 0.0   # accumulated duration of the written clips, in seconds
prefix = 'avocado-guoguo-'
target_rate = 22050
int16_limits = np.iinfo(np.int16)

# glob yields files in arbitrary order; sorting keeps the index -> clip
# assignment identical from one run to the next.
for excel in sorted(glob.glob(os.path.join(path, '**', '*.xls'), recursive=True)):
    dir_name = os.path.dirname(excel)
    clips_dir = os.path.join(dir_name, 'clips')

    sheet = pd.read_excel(excel).dropna()
    sheet = sheet[sheet['people'] == speaker]

    for _, row in sheet.iterrows():
        # The first two columns are read by position (.iloc); plain row[0] on a
        # label-indexed Series relies on a deprecated positional fallback.
        clip_no = str(int(row.iloc[0]))
        text = row.iloc[1]
        original_wav_name = os.path.join(clips_dir, clip_no + ' ' + text.strip() + '.wav')
        wav_name = os.path.join(out_wav_path, prefix + str(index) + '.wav')

        if not os.path.exists(original_wav_name):
            # The sheet text does not match the file name: fall back to the
            # first file (in sorted order) starting with "<clip number> " and
            # take the sentence from its name, dropping the 4-character
            # extension.
            for f in sorted(os.listdir(clips_dir)):
                if f.startswith(clip_no + ' '):
                    original_wav_name = os.path.join(clips_dir, f)
                    text = ' '.join(f.split(' ')[1:])[:-4]
                    break

        # Raises FileNotFoundError if no matching clip was found above.
        rate, wav = read(original_wav_name)

        if wav.ndim == 2:
            # Two-dimensional data means several channels: average them to mono.
            wav = wav.mean(axis=1)

        # Sample rates are passed by keyword, which works on both old and
        # current librosa (they are keyword-only in recent releases).
        resampled = librosa.core.resample(wav.astype(float), orig_sr=rate, target_sr=target_rate)
        # Resampling can overshoot the original peaks; clip before the cast so
        # those samples saturate instead of wrapping around in int16.
        # NOTE(review): this assumes the source clips are 16-bit PCM -- confirm.
        subsampled = np.clip(resampled, int16_limits.min, int16_limits.max).astype(np.int16)
        write(wav_name, target_rate, subsampled)

        t = subsampled.size / target_rate
        n_time += t
        text = get_pinyin(expand_text(text))
        data.append([wav_name, text, t])
        index += 1

print("total length of waves: " + str(n_time))
print("total number of sentences: " + str(index))

total length of waves: 42177.56662131507
total number of sentences: 18257


In [6]:
# Collect the per-clip records into a frame.  Columns 0, 1 and 2 hold the
# output wav path, the pinyin text and the clip duration in seconds.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,0,1,2
0,/home/guandao/data/avaocado_data20181222_all/g...,YoYo！,0.49102
1,/home/guandao/data/avaocado_data20181222_all/g...,YoYo！,0.62
2,/home/guandao/data/avaocado_data20181222_all/g...,wo3 he2 niu2 niu2 ge1 dao4 le ，,1.24
3,/home/guandao/data/avaocado_data20181222_all/g...,ni3 kuai4 kai1 men2 ya ！,1.474014
4,/home/guandao/data/avaocado_data20181222_all/g...,YoYo ni3 zen3 me le ？,1.965034


In [7]:
# Keep only the clips shorter than 20 seconds (column 2 is the duration).
shorter_than_limit = df[2] < 20
df = df.loc[shorter_than_limit]

In [8]:
# remove too short: a clip is dropped when it lasts under 1 second, or when it
# lasts under 3 seconds and its text splits into fewer than 4 space-separated
# tokens.
token_count = df[1].map(lambda s: len(s.split(' ')))
too_short = ((token_count < 4) & (df[2] < 3)) | (df[2] < 1)
# `~` is the element-wise NOT of a boolean mask; unary minus is an arithmetic
# negation, which NumPy rejects for boolean arrays.
df = df[~too_short]

In [9]:
# Preview the first rows that survived the duration filters.
df.head()

Unnamed: 0,0,1,2
2,/home/guandao/data/avaocado_data20181222_all/g...,wo3 he2 niu2 niu2 ge1 dao4 le ，,1.24
3,/home/guandao/data/avaocado_data20181222_all/g...,ni3 kuai4 kai1 men2 ya ！,1.474014
4,/home/guandao/data/avaocado_data20181222_all/g...,YoYo ni3 zen3 me le ？,1.965034
5,/home/guandao/data/avaocado_data20181222_all/g...,ni3 zen3 me kan4 qi3 lai2 na4 me sheng1 qi4 ne ？,2.432018
6,/home/guandao/data/avaocado_data20181222_all/g...,ni3 dou1 kuai4 bian4 cheng2 mao4 hei1 qi4 de d...,3.614014


In [10]:
# Summary statistics of the numeric column (2, the clip duration in seconds).
df.describe()

Unnamed: 0,2
count,15817.0
mean,2.489386
std,1.117886
min,1.0
25%,1.644989
50%,2.272018
75%,3.061995
max,12.426984


In [11]:
# Column totals.  Only column 2 (total seconds of audio kept) is meaningful;
# the string columns 0 and 1 are simply concatenated.
df.sum()

0    /home/guandao/data/avaocado_data20181222_all/g...
1    wo3 he2 niu2 niu2 ge1 dao4 le ，ni3 kuai4 kai1 ...
2                                              39374.6
dtype: object

### generate train and eval txts

In [12]:
# Random split by row, roughly 90% train / 10% eval.  A fixed seed makes the
# split reproducible across kernel restarts, and the eval set is the exact
# complement of the train set so no row is lost (separate `< 0.9` and `> 0.9`
# tests would drop a draw equal to 0.9).
r = np.random.RandomState(0).rand(df.shape[0])
in_train = r < 0.9
train = df[in_train]
evalu = df[~in_train]

In [13]:
# Share of the total audio duration (column 2) that landed in the training split.
train[2].sum() / df[2].sum()

0.8954304553645492

In [14]:
# for small data experiment
# Write both splits as pipe-separated rows without index or header:
#   wav path|pinyin text|duration in seconds
# header=False is the documented way to omit the header row.
train.to_csv(os.path.join(out_path, "train.txt"), sep='|', index=False, header=False)
evalu.to_csv(os.path.join(out_path, "eval.txt"), sep='|', index=False, header=False)