In [2]:
# !pip install gtts pydub audioop-lts
from gtts import gTTS
from pydub import AudioSegment
import time
import os
import pandas as pd
import glob
from collections import defaultdict
pd.options.display.max_columns = 100

# 1. Load data

In [14]:
# Columns of the vocabulary sheet that are kept; any other sheet column is dropped.
cols_keep = [
    # the entry itself
    'chinese', 'pinyin', 'english',
    # bookkeeping / filtering fields
    'type', 'priority', 'known', 'known_pinyin_prompt', 'known_english_prompt',
    'phonetic', 'category1', 'quality',
    # component words and their glosses
    'word1', 'word1_english', 'word2', 'word2_english',
    'word3', 'word3_english', 'word4', 'word4_english',
    # example sentence
    'sentence', 'sentence_pinyin', 'sentence_english', 'date',
]

# Turn the sheet's "edit" link into its CSV export link so pandas can read it directly.
sheet_url = (
    'https://docs.google.com/spreadsheets/d/1pw9EAIvtiWenPDBFBIf7pwTh0FvIbIR0c3mY5gJwlDk/edit#gid=0'
    .replace('/edit#gid=', '/export?format=csv&gid=')
)

# Read the sheet, keep the selected columns, drop rows lacking the word or its
# translation, and fill missing `known_english_prompt` values with 6.
df = (
    pd.read_csv(sheet_url)[cols_keep]
    .dropna(subset=['chinese', 'english'])
    .assign(known_english_prompt=lambda frame: frame['known_english_prompt'].fillna(6))
)
print(df.shape)
df.head()

(4565, 23)


Unnamed: 0,chinese,pinyin,english,type,priority,known,known_pinyin_prompt,known_english_prompt,phonetic,category1,quality,word1,word1_english,word2,word2_english,word3,word3_english,word4,word4_english,sentence,sentence_pinyin,sentence_english,date
0,后备箱,hòu bèi xiāng,trunk;boot (of car),combo,1.0,5.0,1.0,3.0,,travel,1.0,后,back,设备,equipment,箱,box,,,把行李放进后备箱,Bǎ xínglǐ fàng jìn hòubèixiāng,Put the luggage in the trunk,
1,爆竹,bào zhú,firecracker,combo,1.0,3.0,2.0,5.0,,china,1.0,爆,explode,竹,bamboo,,,,,春节的时候我们放了很多爆竹。,Chūnjié de shíhou wǒmen fàng le hěn duō bàozhú.,We set off many firecrackers during Spring Fes...,
2,周一,zhōu yī,monday,combo,1.0,1.0,1.0,1.0,,general,1.0,周,week,一,one,,,,,,,,
3,一月,yī yuè,january,combo,1.0,1.0,1.0,1.0,,general,1.0,一,one,月,month,,,,,,,,
4,黄金,huáng jīn,gold,combo,1.0,1.0,1.0,1.0,,industry,1.0,黄,yellow,金,metal,,,,,,,,


# 2. Select data

In [15]:
# Which data to use this time?
# Keep rows with priority <= 2 and known_english_prompt >= 2 whose type is not
# one of the multi-word entry types listed below.
types_not_allowed = ['phrase', 'sentence', 'part sent', 'phrase_save']

priority_ok = df['priority'].le(2)
prompt_ok = df['known_english_prompt'].ge(2)
type_ok = ~df['type'].isin(types_not_allowed)

df_this = df[priority_ok & prompt_ok & type_ok]
df_this = df_this.sort_values(['category1', 'pinyin'], ascending=[True, True])
# Every selected word needs at least two component words with English glosses,
# because the later cells read word1/word2 unconditionally.
df_this = df_this.dropna(subset=['word1', 'word1_english', 'word2', 'word2_english'])
df_this = df_this.reset_index(drop=True)

print(df_this.shape)
df_this.head(10)

(912, 23)


Unnamed: 0,chinese,pinyin,english,type,priority,known,known_pinyin_prompt,known_english_prompt,phonetic,category1,quality,word1,word1_english,word2,word2_english,word3,word3_english,word4,word4_english,sentence,sentence_pinyin,sentence_english,date
0,标准,biāo zhǔn,standard,combo,1.0,5.0,1.0,3.0,,adjective,3.0,目标,target,准备,preparation,,,,,这家店的服务很标准,Zhè jiā diàn de fúwù hěn biāozhǔn,The service here is very standard,
1,出名,chū míng,famous,combo,2.0,5.0,2.0,2.0,,adjective,2.0,出,to go out,名,name,,,,,他因为短视频很快出名,Tā yīnwèi duǎn shìpín hěn kuài chūmíng,He became famous quickly because of short videos,
2,大声,dà shēng,loud,combo,2.0,4.0,5.0,2.0,,adjective,1.0,大,big,声,voice,,,,,请不要大声说话,Qǐng búyào dàshēng shuōhuà,Please do not speak loudly,2025-06-15
3,角度,jiǎo​ dù,angle,combo,2.0,3.0,1.0,5.0,,adjective,2.0,角,horn,程度,degree,,,,,换个角度看看,Huàn gè jiǎodù kànkan,Look at it from another angle,
4,主动,zhǔ dòng,initiative;voluntarily,combo,2.0,5.0,5.0,5.0,,adjective,2.0,主意,plan,动作,movement,,,,,她主动打了招呼,Tā zhǔdòng dǎ le zhāohu,She greeted me first,
5,主观,zhǔ guān,subjective,combo,2.0,5.0,5.0,5.0,,adjective,2.0,主义,ideology,观点,point of view,,,,,这是你的主观感受,Zhè shì nǐ de zhǔguān gǎnshòu,This is your subjective feeling,
6,自愿,zì yuàn,voluntary,combo,2.0,5.0,5.0,5.0,,adjective,2.0,自,self,愿意,wish,,,,,我自愿帮忙,Wǒ zìyuàn bāngmáng,I volunteer to help,
7,百万,bǎi wàn,million,combo,2.0,2.0,1.0,2.0,,amount,3.0,百,hundred,万,ten thousand,,,,,他中了彩票奖金是百万,Tā zhòng le cǎipiào jiǎngjīn shì bǎiwàn,He won a million from the lottery,
8,差不多,chà bù duō,almost;more or less,combo,2.0,2.0,1.0,3.0,,amount,3.0,差异,difference,不,no,多,much,,,我们俩的想法差不多,Wǒmen liǎ de xiǎngfǎ chàbuduō,Our ideas are pretty much the same,
9,差点儿,chà diǎn er,almost,combo,2.0,2.0,2.0,2.0,,amount,2.0,差,lacking,点,bit,儿,Beijing er,,,我差点儿错过了火车,Wǒ chàdiǎnr cuòguò le huǒchē,I almost missed the train,


# 3. Generate text-to-speech (TTS) clips

In [32]:
def create_tts_file(tts_type, content_str, lang_name, last_timestamp,
                    row_label=None, retry_waits=(52, 278)):
    """Synthesize ``content_str`` with gTTS and cache the result as an mp3.

    The clip is stored at ``audio_files/<tts_type>/<content_str>.mp3``. When
    that file already exists no request is made and only a log line is printed.

    Parameters
    ----------
    tts_type : str
        Sub-folder of ``audio_files`` to store the clip in. The value
        ``'zh_slow'`` additionally switches gTTS to slow mode.
    content_str : str
        Text to speak; it is also used verbatim as the file name.
    lang_name : str
        Language code handed to gTTS as ``lang``.
    last_timestamp : float
        A ``time.time()`` reference; the seconds elapsed since it are printed.
    row_label : optional
        Row identifier shown in the log lines. When omitted, the notebook-level
        loop variable ``i_row`` is used if it exists, otherwise ``'?'``.
    retry_waits : sequence of numbers, optional
        Seconds to sleep after each failed attempt before trying again, so
        ``len(retry_waits) + 1`` attempts are made in total.

    Raises
    ------
    Exception
        Whatever the final gTTS attempt raised once all retries are used up.
    """
    if row_label is None:
        # Backward compatibility: earlier versions read the global loop
        # variable directly and crashed with NameError when it was missing.
        row_label = globals().get('i_row', '?')

    slow_mode = tts_type == 'zh_slow'
    new_file_path = f"audio_files/{tts_type}/{content_str}.mp3"

    if os.path.exists(new_file_path):
        print(f"{(time.time()-last_timestamp):.3f}s, ALREADY EXISTS, row{row_label}, {tts_type}, {content_str}")
        return

    target_dir = os.path.dirname(new_file_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Download into a scratch file and rename only on success, so a failed
    # request can never leave a truncated mp3 that a later run would mistake
    # for a finished clip.
    tmp_path = new_file_path + ".part"
    waits = list(retry_waits)
    for attempt in range(len(waits) + 1):
        try:
            gTTS(content_str, lang=lang_name, slow=slow_mode).save(tmp_path)
            os.replace(tmp_path, new_file_path)
            break
        except Exception:
            # `Exception` (not a bare except) keeps Ctrl-C working during the long waits.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            if attempt >= len(waits):
                raise
            wait_s = waits[attempt]
            if attempt == 0:
                print(f"!!!!!!! FAILURE, wait {wait_s} seconds, row{row_label}, {tts_type}, {content_str} !!!!!!!")
            else:
                print(f"!!!!!!!!!! FAILURE AGAIN, wait {wait_s} seconds, row{row_label}, {tts_type}, {content_str} !!!!!!!!!!")
            time.sleep(wait_s)

    print(f"{(time.time()-last_timestamp):.3f}s, row{row_label}, {tts_type}, {content_str}")


# Clips requested for every row, as (tts_type, source column, gTTS language),
# listed in the order they are generated.
_base_clips = [
    ('zh', 'chinese', 'zh-cn'),
    ('zh_slow', 'chinese', 'zh-cn'),
    ('english', 'english', 'en'),
    ('zh', 'word1', 'zh-cn'),
    ('zh', 'word2', 'zh-cn'),
    ('english', 'word1_english', 'en'),
    ('english', 'word2_english', 'en'),
]
# Third and fourth component words are optional and only voiced when present.
_optional_word_cols = ['word3', 'word4']

# NOTE: the loop variable must stay named `i_row`; the logging in
# create_tts_file refers to it.
for i_row, row in df_this.iterrows():
    _clips = list(_base_clips)
    for _word_col in _optional_word_cols:
        if pd.notna(row[_word_col]):
            _clips.append(('zh', _word_col, 'zh-cn'))
            _clips.append(('english', f'{_word_col}_english', 'en'))

    for _kind, _col, _lang in _clips:
        create_tts_file(tts_type=_kind, content_str=row[_col], lang_name=_lang, last_timestamp=time.time())

0.000s, ALREADY EXISTS, row0, zh, 标准
0.000s, ALREADY EXISTS, row0, zh_slow, 标准
0.000s, ALREADY EXISTS, row0, english, standard
0.000s, ALREADY EXISTS, row0, zh, 目标
0.000s, ALREADY EXISTS, row0, zh, 准备
0.000s, ALREADY EXISTS, row0, english, target
0.000s, ALREADY EXISTS, row0, english, preparation
0.000s, ALREADY EXISTS, row1, zh, 出名
0.000s, ALREADY EXISTS, row1, zh_slow, 出名
0.000s, ALREADY EXISTS, row1, english, famous
0.000s, ALREADY EXISTS, row1, zh, 出
0.000s, ALREADY EXISTS, row1, zh, 名
0.000s, ALREADY EXISTS, row1, english, to go out
0.000s, ALREADY EXISTS, row1, english, name
0.000s, ALREADY EXISTS, row2, zh, 大声
0.000s, ALREADY EXISTS, row2, zh_slow, 大声
0.000s, ALREADY EXISTS, row2, english, loud
0.000s, ALREADY EXISTS, row2, zh, 大
0.000s, ALREADY EXISTS, row2, zh, 声
0.000s, ALREADY EXISTS, row2, english, big
0.000s, ALREADY EXISTS, row2, english, voice
0.000s, ALREADY EXISTS, row3, zh, 角度
0.000s, ALREADY EXISTS, row3, zh_slow, 角度
0.000s, ALREADY EXISTS, row3, english, angle
0.000

# 4. Combine files for individual vocab words

In [33]:
recording_id = '006'
recording_name = '0815_word_p2_k2_ecombo'


def _load_clip(folder, text):
    """Load the cached clip stored at audio_files/<folder>/<text>.mp3."""
    return AudioSegment.from_mp3(f"audio_files/{folder}/{text}.mp3")


# Build one mp3 per vocabulary row; rows whose file already exists are skipped.
for i_row, row in df_this.iterrows():
    start_time = time.time()
    new_file_path = f"audio_files/rows/{recording_id}_{row['chinese']}.mp3"

    if os.path.exists(new_file_path):
        print(f"{(time.time()-start_time):.2f} seconds, row {i_row}, {row['chinese']} ALREADY EXISTS")
        continue

    # Headword clips
    chinese_audio = _load_clip('zh', row['chinese'])
    chinese_slow_audio = _load_clip('zh_slow', row['chinese'])
    english_audio = _load_clip('english', row['english'])

    # Component-word clips: the first two are always present, the rest optional
    word1_audio = _load_clip('zh', row['word1'])
    word2_audio = _load_clip('zh', row['word2'])
    word1e_audio = _load_clip('english', row['word1_english'])
    word2e_audio = _load_clip('english', row['word2_english'])
    extra_word_pairs = []
    for word_col in ('word3', 'word4'):
        if not pd.isna(row[word_col]):
            extra_word_pairs.append((
                _load_clip('zh', row[word_col]),
                _load_clip('english', row[f'{word_col}_english']),
            ))

    pause_100ms = AudioSegment.silent(duration=100)
    pause_500ms = AudioSegment.silent(duration=500)
    pause_1000ms = AudioSegment.silent(duration=1000)

    # concat words audio: "<word> (short pause) <gloss>", with a longer pause between words
    component_words_audio = word1_audio + pause_100ms + word1e_audio + pause_500ms + word2_audio + pause_100ms + word2e_audio
    for extra_zh_audio, extra_en_audio in extra_word_pairs:
        component_words_audio += pause_500ms + extra_zh_audio + pause_100ms + extra_en_audio

    # Earlier layouts, kept for reference:
    # 001
    # combined = chinese_audio + pause_500ms + chinese_slow_audio + pause_500ms + english_audio + pause_500ms + sent_audio + pause_500ms + sent_english_audio + pause_500ms + sent_audio + pause_1000ms
    # 002
    # combined = chinese_audio + pause_500ms + chinese_slow_audio + pause_500ms + english_audio + pause_500ms + sent_audio + pause_500ms + sent_audio + pause_1000ms

    # 003: word, its components, the word again slowly, then the English meaning
    combined = chinese_audio + pause_500ms + component_words_audio + pause_500ms + chinese_slow_audio + pause_500ms + english_audio + pause_500ms

    combined.export(new_file_path, format="mp3")
    print(f"{(time.time()-start_time):.2f} seconds, row {i_row}, {row['chinese']}")



0.00 seconds, row 0, 标准 ALREADY EXISTS
0.00 seconds, row 1, 出名 ALREADY EXISTS
0.00 seconds, row 2, 大声 ALREADY EXISTS
0.00 seconds, row 3, 角度 ALREADY EXISTS
0.00 seconds, row 4, 主动 ALREADY EXISTS
0.00 seconds, row 5, 主观 ALREADY EXISTS
0.00 seconds, row 6, 自愿 ALREADY EXISTS
0.00 seconds, row 7, 百万 ALREADY EXISTS
0.00 seconds, row 8, 差不多 ALREADY EXISTS
0.00 seconds, row 9, 差点儿 ALREADY EXISTS
0.00 seconds, row 10, 差距 ALREADY EXISTS
0.00 seconds, row 11, 大部分 ALREADY EXISTS
0.00 seconds, row 12, 大多数 ALREADY EXISTS
0.00 seconds, row 13, 大概 ALREADY EXISTS
0.00 seconds, row 14, 大小 ALREADY EXISTS
0.00 seconds, row 15, 概率 ALREADY EXISTS
0.00 seconds, row 16, 高度 ALREADY EXISTS
0.00 seconds, row 17, 好几 ALREADY EXISTS
0.00 seconds, row 18, 几十 ALREADY EXISTS
0.00 seconds, row 19, 毛重 ALREADY EXISTS
0.00 seconds, row 20, 难得 ALREADY EXISTS
0.00 seconds, row 21, 频率 ALREADY EXISTS
0.00 seconds, row 22, 平方米 ALREADY EXISTS
0.00 seconds, row 23, 十亿 ALREADY EXISTS
0.00 seconds, row 24, 数量 ALREADY EXISTS
0.00 

In [None]:
# os.remove('audio_files/zh/经纪.mp3')

# 5. Combine individual vocab into final audio recording

In [34]:
# Combine the per-word audio files (in df_this order) into a single recording.
all_audio_files = []
missing_rows = []
for i_row, row in df_this.iterrows():
    audio_file = f"audio_files/rows/{recording_id}_{row['chinese']}.mp3"
    if os.path.exists(audio_file):
        all_audio_files.append(AudioSegment.from_mp3(audio_file))
    else:
        # Rows without a clip are still left out, but no longer silently.
        missing_rows.append(row['chinese'])

if missing_rows:
    print(f"WARNING: {len(missing_rows)} row clip(s) missing and skipped, e.g. {missing_rows[:10]}")

# Fail with an actionable message instead of an IndexError on an empty list.
if not all_audio_files:
    raise FileNotFoundError(
        f"No row clips found for recording '{recording_id}' in audio_files/rows/ - run the per-word cell first"
    )

combined = all_audio_files[0]
for audio in all_audio_files[1:]:
    combined += audio

os.makedirs("audio_files/products", exist_ok=True)
product_path = f"audio_files/products/{recording_id}_{recording_name}.mp3"
# export() hands back the open output file object. Close it explicitly so it
# does not become the cell's Out[] value and stay open for the kernel's lifetime.
combined.export(product_path, format="mp3").close()
print(f"Wrote {product_path} from {len(all_audio_files)} row clip(s)")

<_io.BufferedRandom name='audio_files/products/006_0815_word_p2_k2_ecombo.mp3'>

# 6. 