可以将 wav 音频识别成音素,并对音素的时间位置进行校准;输出的音素去掉了声调。
Data source: 中文语音数据 (Chinese speech data), http://www.data-baker.com/open_source.html
0
0.335
"sil"
0.335
0.44716666358556956
"i2"
0.44716666358556956
0.4791972882013558
"g"
0.4791972882013558
0.5988041034411818
"e4"
0.5988041034411818
0.7845117609567029
"ve4"
0.7845117609567029
0.8576356247418923
"h"
0.8576356247418923
1.055
"ou4"
1.055
1.1463078796848405
"sp1"
1.1463078796848405
1.2032778206706618
"d"
1.2032778206706618
1.3256834492826761
"i3"
1.3256834492826761
1.384927467968385
"zh"
1.384927467968385
1.4392608155296476
"iii4"
1.4392608155296476
1.4855054815523305
"l"
1.4855054815523305
1.6360173128079154
"ian2"
1.6360173128079154
1.671243486080499
"m"
1.671243486080499
1.8158445405638317
"eng2"
1.8158445405638317
2.0444386227512936
"ua2"
2.0444386227512936
2.134321276719004
"j"
2.134321276719004
2.369155146691041
"ie3"
2.369155146691041
2.685
"sil"
(0, 336, 'sil')
(336, 448, 'i')
(448, 488, 'g')
(488, 600, 'e')
(600, 784, 've')
(784, 864, 'h')
(864, 1056, 'ou')
(1056, 1152, 'sil')
(1152, 1208, 'd')
(1208, 1328, 'i')
(1328, 1392, 'zh')
(1392, 1448, 'iii')
(1448, 1488, 'l')
(1488, 1624, 'ian')
(1624, 1672, 'm')
(1672, 1792, 'eng')
(1792, 2048, 'ua')
(2048, 2136, 'j')
(2136, 2368, 'ie')
(2368, 2680, 'sil')