In [None]:
from utils.text_to_speech import TextToSpeech, VOICES
import os
import pandas as pd
from tqdm import tqdm
import yaml

In [None]:
# Read the category / sub-category selection from the STEP_0 YAML file.
# The file is parsed once and the handle is closed deterministically by the
# `with` block (previously it was opened twice and never closed).
with open("STEP_0_define_category_sub_category.yaml", "r", encoding="utf-8") as config_file:
    category_config = yaml.safe_load(config_file)

CATEGORY = category_config['category']
SUB_CATEGORY = category_config['sub_category']
print(f"Category: {CATEGORY}")
print(f"Sub Category: {SUB_CATEGORY}")

In [None]:
# Load the phrase table for the selected category / sub-category into a dataframe.
csv_path = f"category_data/individual_dataframes/{CATEGORY}_{SUB_CATEGORY}.csv"
df = pd.read_csv(csv_path)

In [None]:
# Quick look at the loaded table: number of rows and the available columns.
df.shape[0], df.columns

In [None]:
# Create the TextToSpeech helper whose `synthesize` method is called for every
# phrase in the synthesis loop below.
tts = TextToSpeech()

In [None]:
# Build one JSON-ready record per dataframe row:
#   * for each language, the phrase text plus the metadata returned by
#     `tts.synthesize` (hash, audio file, voice, speaking rate, pitch);
#   * every remaining column of the row, with missing values stored as "".
LANGUAGES = ('english', 'italian')
SYNTH_FIELDS = ('hash', 'audio_file', 'voice_name', 'speaking_rate', 'pitch')

json_obj = []
for row_label, row in tqdm(df.iterrows(), total=len(df)):
    obj = {}
    for lang in LANGUAGES:
        synth_obj = tts.synthesize(
            text=row[lang],
            voice_name=VOICES[lang]['male'],
            # Italian is synthesized at a slower speaking rate than English.
            speaking_rate=0.75 if lang == 'italian' else 1.0,
            pitch=0,
            verbose=False,
        )
        obj[lang] = {"text": row[lang]}
        obj[lang].update({field: synth_obj[field] for field in SYNTH_FIELDS})

    for key, value in row.items():
        if key in LANGUAGES:
            continue
        # pd.isna recognises NaN / None / pd.NA directly; the previous
        # `str(value) == 'nan'` comparison missed None and pd.NA and would
        # also blank a field whose actual text is "nan".
        obj[key] = "" if pd.isna(value) else value

    json_obj.append(obj)

In [None]:
# Display the assembled records for a visual check before they are written to disk.
json_obj

In [None]:
import json

# Persist the step-1 records as JSON.
# - The output directory is created if it does not exist yet.
# - The file is written through a `with` block so it is flushed and closed
#   before a later cell reads it back.
# - ensure_ascii=False emits accented characters verbatim, so the encoding is
#   pinned to UTF-8 instead of depending on the platform default.
step_1_path = f"category_data/step_1/{CATEGORY}_{SUB_CATEGORY}.json"
os.makedirs(os.path.dirname(step_1_path), exist_ok=True)
with open(step_1_path, "w", encoding="utf-8") as step_1_file:
    json.dump(json_obj, step_1_file, indent=2, ensure_ascii=False)

In [None]:
import json
import os
from pydub import AudioSegment
from tqdm import tqdm
import yaml

In [None]:
# Multiplier applied to each Italian clip's duration to size the silence that is
# inserted between clips when the combined audio file is assembled.
SILENCE_RATE = 2

In [None]:
# Reload the step-1 records. The `with` block closes the handle once parsing is
# done, and the encoding is pinned to UTF-8 because the file may contain
# non-ASCII characters (it is written with ensure_ascii=False).
with open(f'category_data/step_1/{CATEGORY}_{SUB_CATEGORY}.json', encoding='utf-8') as step_1_file:
    data = json.load(step_1_file)

In [None]:
def get_duration(path):
    """Return the duration of the audio file at ``path`` in seconds.

    The file is loaded with ``AudioSegment.from_file``. Any exception raised
    while loading it or reading its duration is reported with ``print`` and
    ``None`` is returned instead of a number.
    """
    try:
        return AudioSegment.from_file(path).duration_seconds
    except Exception as err:
        print(f"Error reading audio file: {err}")
    return None

In [None]:
# Annotate every language entry with the duration of its audio file in
# milliseconds. Failures are reported with the offending path instead of a bare
# assertion or a TypeError from multiplying None.
for sample_obj in tqdm(data, total=len(data)):
    for lang in 'english', 'italian':
        path = sample_obj[lang]['audio_file']
        if not os.path.exists(path):
            raise FileNotFoundError(f"Audio file {path} does not exist.")
        duration = get_duration(path)
        if duration is None:
            # get_duration returns None when the file could not be read.
            raise RuntimeError(f"Could not determine the duration of {path}.")
        sample_obj[lang]['duration_ms'] = int(duration * 1000)

In [None]:
def combine_audio_files(input_paths, output_path):
    """Concatenate audio files in order and export the result as an MP3.

    Args:
        input_paths: Iterable of audio file paths, joined in the given order.
        output_path: Destination path of the exported MP3 file.

    Raises:
        ValueError: If ``input_paths`` is empty. Previously this case failed
            with an AttributeError when exporting ``None``.
    """
    # Materialise once so generators are supported and emptiness can be checked.
    paths = list(input_paths)
    if not paths:
        raise ValueError("combine_audio_files requires at least one input path.")

    combined_audio = AudioSegment.from_file(paths[0])
    for path in paths[1:]:
        combined_audio += AudioSegment.from_file(path)
    combined_audio.export(output_path, format="mp3")

In [None]:
# For every sample, assemble one combined MP3 (English once, Italian twice, each
# clip followed by a pause) and record its path on the sample.
for sample_i, sample_obj in tqdm(enumerate(data), total=len(data)):
    english_path = sample_obj['english']['audio_file']
    italian_path = sample_obj['italian']['audio_file']
    italian_duration = sample_obj['italian']['duration_ms']

    # Pause length: Italian duration scaled by SILENCE_RATE, floored to a
    # multiple of 100 so it matches one of the pre-generated silence files.
    scaled_duration = italian_duration * SILENCE_RATE
    silence_duration = int((scaled_duration // 100) * 100)
    silence_path = f'data/audio/silence/{silence_duration}ms.mp3'
    assert os.path.exists(silence_path), f"Silence file {silence_path} does not exist."

    combined_audio_path = f'category_data/audio/{CATEGORY}_{SUB_CATEGORY}/{sample_i}.mp3'
    os.makedirs(os.path.dirname(combined_audio_path), exist_ok=True)

    if os.path.exists(combined_audio_path):
        # An existing output is reused rather than regenerated.
        print(f"Skipping {combined_audio_path} as it already exists.")
    else:
        audio_files = [english_path, silence_path] + [italian_path, silence_path] * 2
        combine_audio_files(audio_files, combined_audio_path)

    sample_obj['combined_audio_file'] = combined_audio_path

In [None]:
# Persist the step-2 records (durations and combined audio paths added).
# The directory is created if needed, the handle is closed by the `with` block,
# and the encoding is pinned to UTF-8 because ensure_ascii=False writes
# non-ASCII characters verbatim.
step_2_path = f"category_data/step_2/{CATEGORY}_{SUB_CATEGORY}.json"
os.makedirs(os.path.dirname(step_2_path), exist_ok=True)
with open(step_2_path, 'w', encoding='utf-8') as step_2_file:
    json.dump(data, step_2_file, indent=2, ensure_ascii=False)

In [None]:
import json
import os
import shutil
from tqdm import tqdm
import yaml

In [None]:
# Root of the data-pipeline repository. The default is the original
# machine-specific location; set the LANGUAGE_APP_REPO_DIR environment variable
# to run the notebook from a different checkout.
REPO_DIR = os.environ.get('LANGUAGE_APP_REPO_DIR', '/Users/stevie/repos/language_app')
assert os.path.exists(REPO_DIR), f"REPO_DIR {REPO_DIR} does not exist."

# Per-category destination folder inside the frontend's asset tree.
# NOTE: the frontend asset root is also spelled out literally in later cells,
# so it is intentionally left as the same literal here.
DEST_DIR = os.path.join('/Users/stevie/repos/language-app-frontend/assets/category_data', f"{CATEGORY}_{SUB_CATEGORY}")
os.makedirs(DEST_DIR, exist_ok=True)

In [None]:
# Load the step-2 records from the repository. The handle is closed by the
# `with` block and the file is decoded as UTF-8 (it is written with
# ensure_ascii=False, so it may contain non-ASCII characters).
json_path = os.path.join(REPO_DIR, f'category_data/step_2/{CATEGORY}_{SUB_CATEGORY}.json')
with open(json_path, encoding='utf-8') as step_2_file:
    data = json.load(step_2_file)

In [None]:
# Sanity check on the first record: its combined audio path is stored relative
# to REPO_DIR and must resolve to an existing file.
path_rel = data[0]['combined_audio_file']
print(path_rel)
first_combined_path = os.path.join(REPO_DIR, path_rel)
assert os.path.exists(first_combined_path)

In [None]:
def _copy_to_frontend(source_rel_path, rel_new_path):
    """Copy one audio file into DEST_DIR and return its ``require(...)`` marker.

    ``source_rel_path`` is resolved against REPO_DIR and must exist.
    ``rel_new_path`` is the location inside DEST_DIR; the copy is skipped when
    that file is already present. The returned string ends with ``:::`` so the
    surrounding quotes can be stripped when the JS file is generated.
    """
    source_path = os.path.join(REPO_DIR, source_rel_path)
    assert(os.path.exists(source_path))
    target_path = os.path.join(DEST_DIR, rel_new_path)
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    if not os.path.exists(target_path):
        shutil.copyfile(source_path, target_path)
    return f'require("{rel_new_path}"):::'


# Copy every sample's audio into the frontend folder and replace the stored
# paths with require(...) markers.
for sample_obj in tqdm(data, total=len(data)):
    # combined audio file
    combined_src = sample_obj['combined_audio_file']
    sample_obj['combined_audio_file'] = _copy_to_frontend(
        combined_src,
        f'./audio/combined/{os.path.basename(combined_src)}',
    )

    # individual audio files
    for language in 'english', 'italian':
        individual_src = sample_obj[language]['audio_file']
        sample_obj[language]['audio_file'] = _copy_to_frontend(
            individual_src,
            f'./audio/individual/{os.path.basename(individual_src)}',
        )

In [None]:
# Echo the repository root that the paths in the surrounding cells are joined onto.
REPO_DIR

In [None]:
# Generate the JS module for this category from the in-memory records.
js_path = os.path.join(REPO_DIR, f"category_data/step_2/{CATEGORY}_{SUB_CATEGORY}.js")
text = json.dumps(data, indent=2, ensure_ascii=False)

# Turn the quoted markers  "require(\"<path>\"):::"  into bare  require("<path>")
# calls. Only the marker boundaries are rewritten; the previous blanket
# replacement of every \" also unescaped quotes inside ordinary text values,
# which produced invalid JavaScript for any phrase containing a double quote.
text = text.replace('"require(\\"', 'require("').replace('\\"):::"', '")')

# Declare `samples` with const so the module does not rely on an implicit global.
text = f"const samples = {text};\nexport default samples;"

# Written through a `with` block so the file is closed before it is copied.
with open(js_path, 'w', encoding='utf-8') as js_file:
    js_file.write(text)

In [None]:
# Publish the generated step-2 artifacts into the frontend folder under fixed
# names: the JS module as data.js and the JSON file as data.json.
artifacts = [
    (f"category_data/step_2/{CATEGORY}_{SUB_CATEGORY}.js", 'data.js'),
    (f"category_data/step_2/{CATEGORY}_{SUB_CATEGORY}.json", 'data.json'),
]
for source_rel_path, target_name in artifacts:
    shutil.copyfile(
        os.path.join(REPO_DIR, source_rel_path),
        os.path.join(DEST_DIR, target_name),
    )

In [None]:
import os
import json
import hashlib

In [None]:
# For every category folder in the frontend asset tree, rewrite data.json so that
# audio paths are relative to the asset root (./<category>/audio/...), and add a
# `combined_hash` derived from the Italian and English hashes.
category_dir = '/Users/stevie/repos/language-app-frontend/assets/category_data'
for category_sub_category in os.listdir(category_dir):
    if not os.path.isdir(os.path.join(category_dir, category_sub_category)):
        continue

    data_json_path = os.path.join(category_dir, category_sub_category, 'data.json')
    # Read through a `with` block so the handle is closed before the same file
    # is reopened for writing below.
    with open(data_json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Short progress line instead of dumping the whole dataset into the output.
    print(f"{category_sub_category}: {len(data)} items")

    for item in data:
        # Only the file name is kept; the directory part is rebuilt.
        item['combined_audio_file'] = os.path.join(
            f'./{category_sub_category}/audio/combined',
            os.path.basename(item['combined_audio_file'])
        )

        # SHA-256 over "<italian hash>_<english hash>".
        param_str = f"{item['italian']['hash']}_{item['english']['hash']}"
        item['combined_hash'] = hashlib.sha256(param_str.encode('utf-8')).hexdigest()

        for lang in 'english', 'italian':
            item[lang]['audio_file'] = os.path.join(
                f'./{category_sub_category}/audio/individual',
                os.path.basename(item[lang]['audio_file'])
            )

    # write the modified data back to the file
    with open(data_json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)

In [None]:
from copy import deepcopy

# Collect every category's data.json into two dictionaries keyed by folder name:
#   combined_json - the records exactly as stored on disk;
#   combined_js   - the same records with audio paths wrapped in require(...)
#                   markers (terminated by ":::") for later conversion to JS.
category_sub_category_dir = '/Users/stevie/repos/language-app-frontend/assets/category_data'
combined_js = {}
combined_json = {}

for category_sub_category in sorted(os.listdir(category_sub_category_dir)):
    if not os.path.isdir(os.path.join(category_sub_category_dir, category_sub_category)):
        continue
    path = os.path.join(category_sub_category_dir, category_sub_category, 'data.json')
    # `with` closes the handle as soon as the file has been parsed.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Snapshot before the records are mutated below.
    combined_json[category_sub_category] = deepcopy(data)

    for item in data:
        path = item['combined_audio_file']
        item['combined_audio_file'] = f'require("{path}"):::'
        for lang in 'english', 'italian':
            path = item[lang]['audio_file']
            item[lang]['audio_file'] = f'require("{path}"):::'
    combined_js[category_sub_category] = data

In [None]:
category_sub_category_dir = '/Users/stevie/repos/language-app-frontend/assets/category_data'

# Write both combined dictionaries next to the category folders. At this point
# combined_data.js still holds plain JSON text with require(...) markers.
outputs = (
    ('combined_data.js', combined_js),
    ('combined_data.json', combined_json),
)
for file_name, payload in outputs:
    path = os.path.join(category_sub_category_dir, file_name)
    with open(path, 'w') as f:
        json.dump(payload, f, indent=2)

In [None]:
# Convert combined_data.js from JSON text into a JS module in place.
# NOTE: this rewrites the file it reads, so it expects the plain-JSON version of
# the file; running it on an already converted file would add the header again.
category_sub_category_dir = '/Users/stevie/repos/language-app-frontend/assets/category_data'
path = os.path.join(category_sub_category_dir, 'combined_data.js')
with open(path, encoding='utf-8') as f:
    text = f.read()

# Turn the quoted markers  "require(\"<path>\"):::"  into bare  require("<path>")
# calls. Only the marker boundaries are rewritten; the previous blanket
# replacement of every \" also unescaped quotes inside ordinary text values,
# which produced invalid JavaScript for any phrase containing a double quote.
text = text.replace('"require(\\"', 'require("').replace('\\"):::"', '")')

# Declare `samples` with const so the module does not rely on an implicit global.
text = f"const samples = {text};\nexport default samples;"

with open(path, 'w', encoding='utf-8') as f:
    f.write(text)