In [None]:
import os
# Point PATH_TO_REPO at the local checkout of the lingo_kit_data repository.
# The commented-out line underneath is an alternative location
# (presumably a different machine -- confirm before switching).
os.environ.update({"PATH_TO_REPO": "/Users/stevie/repos/lingo_kit_data"})
# os.environ["PATH_TO_REPO"] = "/home/ubuntu/busy_bees/lingo_kit_data"

In [None]:
# Read the repository root from the environment.
import os

PATH_TO_REPO = os.environ.get('PATH_TO_REPO')
# Fail fast: later cells add this directory to sys.path and build file paths from it.
assert PATH_TO_REPO is not None, "Please set PATH_TO_REPO environment variable"

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import yaml
import requests
import hashlib
import uuid

import sys
sys.path.append(PATH_TO_REPO)
from utils.audio.text_to_speech import TextToSpeech, VOICES
from utils.s3.upload_to_s3 import upload_file
from utils.csv_helper import get_all_csv_files_rec

In [None]:
# Directory inside the repo that holds the CSV dataframes
# (presumably grouped by part of speech, going by the folder name -- confirm).
df_path = os.path.join(PATH_TO_REPO, 'dataframes/dataframes_by_pos')
# Paths of the CSV files to work on. Presumably a recursive search under
# df_path -- confirm against utils.csv_helper.get_all_csv_files_rec.
all_csv_files = get_all_csv_files_rec(df_path)

In [None]:
# Text-to-speech helper shared by the processing loop, which calls
# tts.synthesize(...) for each missing clip and tts.save() after each file.
tts = TextToSpeech()

In [None]:
def get_term_hash(english_text, italian_text):
    """Return a deterministic identifier for an English/Italian term pair.

    The two texts are joined as "<english>-<italian>" and converted into a
    name-based UUID (version 5) under the DNS namespace, so the same pair
    always maps to the same identifier.

    Args:
        english_text: English side of the pair.
        italian_text: Italian side of the pair.

    Returns:
        The UUID rendered as a string.
    """
    pair_key = f"{english_text}-{italian_text}"
    return str(uuid.uuid5(uuid.NAMESPACE_DNS, pair_key))


# Quick sanity check: show the identifier produced for a sample pair.
print(get_term_hash('person', 'persone'))

In [None]:
def parse_topics(topics_str):
    """Split a bracketed, comma-separated topic string into a list of topics.

    Example: "[food, travel]" -> ["food", "travel"].

    Args:
        topics_str: String that starts with '[' and ends with ']'.

    Returns:
        List of topic strings with surrounding whitespace removed. An empty
        bracket pair ("[]" or "[ ]") yields an empty list.

    Raises:
        AssertionError: If topics_str is not wrapped in square brackets
            (this includes the empty string).
    """
    # startswith/endswith are safe on "", where indexing [0] raised IndexError
    # before the bracket check could report the real problem.
    assert topics_str.startswith('[') and topics_str.endswith(']'), \
        f"Expected a bracketed topic list, got: {topics_str!r}"
    inner = topics_str[1:-1].strip()  # remove brackets
    if not inner:
        # "".split(',') returns [''], i.e. one bogus empty topic.
        return []
    # Strip each entry so "[a, b]" does not produce ' b' with a leading space.
    return [topic.strip() for topic in inner.split(',')]

def get_language_key(lang):
    """Map a full language name to its two-letter key.

    Args:
        lang: Either 'english' or 'italian'.

    Returns:
        'en' for 'english', 'it' for 'italian'.

    Raises:
        ValueError: For any other value.
    """
    known_languages = (('english', 'en'), ('italian', 'it'))
    for name, key in known_languages:
        if lang == name:
            return key
    raise ValueError(f"Unknown language: {lang}")

def get_audio_hash(text, voice_name, speaking_rate, pitch):
    """Compute the SHA-256 hex digest that identifies one synthesized clip.

    The digest covers the text, the voice name, and the speaking rate and
    pitch (each formatted with two decimals), concatenated without separators
    and encoded as UTF-8. The processing loop asserts that this value equals
    the hash reported by the synthesizer, so the format must stay in sync
    with it.

    Args:
        text: Text that is spoken.
        voice_name: Name of the voice used for synthesis.
        speaking_rate: Numeric speaking rate.
        pitch: Numeric pitch.

    Returns:
        Hexadecimal SHA-256 digest as a string.
    """
    fingerprint = f"{text}{voice_name}{speaking_rate:.2f}{pitch:.2f}"
    return hashlib.sha256(fingerprint.encode('utf-8')).hexdigest()

In [None]:
def is_csv_processed(df):
    """Tell whether a dataframe already has complete audio metadata.

    A dataframe counts as processed when, for both 'english' and 'italian',
    the '<lang>_audio_hash' and '<lang>_duration_ms' columns exist and contain
    no missing values.

    Args:
        df: DataFrame loaded from one of the term CSV files.

    Returns:
        True if every required column is present and fully populated,
        False otherwise.
    """
    required_columns = [
        f'{lang}_{suffix}'
        for lang in ('english', 'italian')
        for suffix in ('audio_hash', 'duration_ms')
    ]
    if any(column not in df.columns for column in required_columns):
        return False
    # One vectorised pass: is there any missing cell in any required column?
    return not df[required_columns].isna().any().any()

# Narrow the work list to the CSV files that still lack audio hashes or durations.
new_all_csv_files = [
    csv_file
    for csv_file in all_csv_files
    if not is_csv_processed(pd.read_csv(csv_file))
]
all_csv_files = new_all_csv_files
# Number of files left to process (shown as the cell output).
len(all_csv_files)

In [None]:
# Per-language synthesis settings: the CSV column holding the text to speak,
# the voice gender looked up in VOICES, and the speaking rate handed to the
# synthesizer. Iteration order (english, then italian) is preserved.
SYNTHESIS_SETTINGS = {
    'english': {'text_column': 'translation_english', 'gender': 'female', 'speaking_rate': 0.92},
    'italian': {'text_column': 'term_italian', 'gender': 'male', 'speaking_rate': 0.7},
}

# For every pending CSV: synthesize audio for each row/language that lacks it,
# upload the audio file, and record the audio hash and duration in the CSV.
for csv_path in tqdm(all_csv_files):
    if not os.path.exists(csv_path):
        print(f"File no longer exists, skipping: {csv_path}")
        continue
    print(f"Processing file: {csv_path}")
    file_cost = 0.0
    df = pd.read_csv(csv_path)
    if is_csv_processed(df):
        print(f"File already processed: {csv_path}")
        continue

    # A hash column that exists but is completely empty is parsed as float64.
    # Cast it to object so string hashes can be written into it without an
    # incompatible-dtype assignment (deprecated since pandas 2.1).
    for lang in SYNTHESIS_SETTINGS:
        hash_column = f'{lang}_audio_hash'
        if hash_column in df.columns:
            df[hash_column] = df[hash_column].astype(object)

    try:
        for i, (index, row) in enumerate(df.iterrows()):
            for lang, settings in SYNTHESIS_SETTINGS.items():
                hash_column = f'{lang}_audio_hash'
                duration_column = f'{lang}_duration_ms'
                has_hash = hash_column in df.columns and not pd.isna(df.loc[index, hash_column])
                has_duration = duration_column in df.columns and not pd.isna(df.loc[index, duration_column])
                if has_hash and has_duration:
                    # This row already has audio metadata for this language.
                    continue

                text = row[settings['text_column']]
                voice_name = VOICES[lang][settings['gender']]
                speaking_rate = settings['speaking_rate']

                # NOTE(review): an earlier comment here described a workaround so that
                # short words like "a" or "the" followed by a parenthesis do not sound
                # too short, but no such adjustment exists in this cell -- confirm
                # whether it now lives inside TextToSpeech.synthesize.
                synth_obj, cost = tts.synthesize(
                    text=text,
                    voice_name=voice_name,
                    speaking_rate=speaking_rate,
                    verbose=False,
                )
                file_cost += cost

                # Recompute the hash locally and make sure it agrees with the one
                # reported by the synthesizer before recording it.
                local_hash = get_audio_hash(
                    text=text,
                    voice_name=voice_name,
                    speaking_rate=speaking_rate,
                    pitch=synth_obj['pitch'],
                )
                assert local_hash == synth_obj['hash'], f"Hash mismatch: {local_hash} != {synth_obj['hash']}"

                # upload audio file to s3
                file_path = os.path.join(PATH_TO_REPO, synth_obj['audio_file'])
                assert os.path.exists(file_path), f"File does not exist: {file_path}"
                upload_file(file_path=file_path, verbose=False)

                # Only record the metadata once the upload call has returned.
                df.loc[index, hash_column] = local_hash
                df.loc[index, duration_column] = synth_obj['duration_ms']
            print(f"Processing {i+1}/{len(df)}: {row['term_italian']} / {row['translation_english']} -> {df.loc[index, 'italian_audio_hash']} / {df.loc[index, 'english_audio_hash']}")
    finally:
        # Persist whatever was completed even if a row failed or the run was
        # interrupted: the per-row skip above lets the next run resume, so audio
        # that was already synthesized is not requested (and paid for) again.
        print(f"Total cost for file {csv_path}: ${file_cost:.6f}")
        df.to_csv(csv_path, index=False)
        # Presumably persists the synthesizer's own state -- confirm in utils.audio.text_to_speech.
        tts.save()