In [None]:
# load in environment variable
# PATH_TO_REPO is appended to sys.path by the import cell and used as the base
# directory for the dataframe CSVs and synthesized audio files in later cells.
import os
PATH_TO_REPO = os.getenv('PATH_TO_REPO')
# os.getenv returns None when the variable is unset; stop here with a clear message.
assert PATH_TO_REPO is not None, "Please set PATH_TO_REPO environment variable"

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import yaml
import requests
import hashlib
import uuid

import sys
sys.path.append(PATH_TO_REPO)
from utils.audio.text_to_speech import TextToSpeech, VOICES
from utils.s3.upload_to_s3 import upload_file

In [None]:
# Load the source term table; the synthesis loop further down reads its
# 'translation_english' and 'term_italian' columns.
df = pd.read_csv(os.path.join(PATH_TO_REPO, 'dataframes/combined_and_reorganized.csv'))
# Show the row count and the column names as a quick sanity check.
len(df), df.columns

In [None]:
# Create the TextToSpeech helper (imported from utils.audio.text_to_speech)
# that the synthesis loop calls for every term.
tts = TextToSpeech()

In [None]:
def get_term_hash(english_text, italian_text):
    """Return a deterministic identifier for an English/Italian term pair.

    The two texts are joined with a hyphen and fed to a name-based UUID
    (version 5, DNS namespace), so the same pair always maps to the same
    36-character string.
    """
    term_key = f"{english_text}-{italian_text}"
    return str(uuid.uuid5(uuid.NAMESPACE_DNS, term_key))
print(get_term_hash('person', 'persone'))

In [None]:
def parse_topics(topics_str):
    """Parse a bracketed, comma-separated topic string into a list of topics.

    Args:
        topics_str: Text such as ``"[food,travel]"`` or ``"[food, travel]"``.

    Returns:
        The topic names in order, with surrounding whitespace removed.
        An empty list ``"[]"`` yields ``[]`` rather than ``['']``.

    Raises:
        AssertionError: If the text is not wrapped in square brackets
            (this includes the empty string).
    """
    # startswith/endswith (instead of indexing) so that an empty string fails
    # the same assertion as any other malformed input instead of IndexError.
    assert topics_str.startswith('[') and topics_str.endswith(']'), \
        f"Expected a bracketed topic list, got: {topics_str!r}"
    inner = topics_str[1:-1]  # remove brackets
    # Strip the padding left by ", " separators and drop empty entries.
    stripped = (topic.strip() for topic in inner.split(','))
    return [topic for topic in stripped if topic]

def get_language_key(lang):
    """Map a language name ('english' or 'italian') to its two-letter key.

    Raises:
        ValueError: If ``lang`` is not one of the supported names.
    """
    supported = (('english', 'en'), ('italian', 'it'))
    for name, key in supported:
        if lang == name:
            return key
    raise ValueError(f"Unknown language: {lang}")

def get_audio_hash(text, voice_name, speaking_rate, pitch):
    """Return the SHA-256 hex digest identifying one synthesized audio clip.

    The digest is computed over the text, the voice name, and the speaking
    rate and pitch (each formatted with two decimals), concatenated in that
    order with no separator and encoded as UTF-8.
    """
    fingerprint = "".join((
        f"{text}",
        f"{voice_name}",
        f"{speaking_rate:.2f}",
        f"{pitch:.2f}",
    ))
    return hashlib.sha256(fingerprint.encode('utf-8')).hexdigest()

In [None]:
# Per-language synthesis settings: (voice gender, speaking rate, source column).
synthesis_settings = {
    'english': ('female', 0.92, 'translation_english'),
    'italian': ('male', 0.7, 'term_italian'),
}

# For every row, synthesize the English and Italian audio, upload each file
# to S3 and record its hash in the '<lang>_audio_hash' column.
for i, row in tqdm(df.iterrows(), total=len(df)):
    for lang, (gender, speaking_rate, text_column) in synthesis_settings.items():
        text = row[text_column]
        voice_name = VOICES[lang][gender]

        # NOTE(review): an earlier comment here referred to a workaround for
        # very short words (e.g. "a", "the") followed by parentheses sounding
        # too short; no such workaround is present in this cell.
        synth_obj = tts.synthesize(
            text=text,
            voice_name=voice_name,
            speaking_rate=speaking_rate,
            verbose=False,
        )

        # Recompute the hash locally (using the pitch reported by the
        # synthesizer) and make sure it agrees with the one it returned.
        local_hash = get_audio_hash(
            text=text,
            voice_name=voice_name,
            speaking_rate=speaking_rate,
            pitch=synth_obj['pitch'],
        )
        assert local_hash == synth_obj['hash'], f"Hash mismatch: {local_hash} != {synth_obj['hash']}"

        # upload audio file to s3
        file_path = os.path.join(PATH_TO_REPO, synth_obj['audio_file'])
        assert os.path.exists(file_path), f"File does not exist: {file_path}"
        upload_file(file_path=file_path, verbose=False)

        df.loc[i, f'{lang}_audio_hash'] = local_hash

In [None]:
# Call the TextToSpeech object's save(); presumably this persists its internal
# state (e.g. tts.df) to disk — TODO confirm in utils.audio.text_to_speech.
tts.save()

In [None]:
df = pd.read_csv(os.path.join(PATH_TO_REPO, 'dataframes/dataframe_with_audio.csv'))

In [None]:
df.to_csv(os.path.join(PATH_TO_REPO, 'dataframes/dataframe_with_audio.csv'), index=False)

In [None]:
# Number of rows that have no Italian audio hash.
df['italian_audio_hash'].isna().sum()

In [None]:
# Number of rows that have no English audio hash.
df['english_audio_hash'].isna().sum()

In [None]:
# Display the English audio-hash column for a visual check.
df['english_audio_hash']

In [None]:
# Number of rows in the TextToSpeech object's dataframe (tts.df).
len(tts.df)

In [None]:
# NOTE(review): same expression as the previous cell; one of the two can be removed.
len(tts.df)

In [None]:
# List the column names of the current dataframe.
df.columns

In [None]:
# Smoke test: recompute the audio hash for one row and download the matching
# mp3 for each language from S3 into test_<lang>.mp3.
# Example hash: '9cb7f04b82b5485738f6606c67a1cd43dceadc2c42e006eec2f6dc254ec95d71'
row = df.iloc[2]
print(row)
for lang in 'english', 'italian':
    if lang == 'english':
        gender = 'female'
        speaking_rate = 0.92
    else:
        assert(lang == 'italian')
        gender = 'male'
        speaking_rate = 0.7

    # NOTE(review): the synthesis loop reads 'translation_english' and
    # 'term_italian' and hashes with the pitch reported by the synthesizer,
    # while this cell reads f'{lang}_term' and assumes pitch 0 — confirm both
    # against the columns/values actually stored in dataframe_with_audio.csv.
    # Named audio_hash (not hash) so the builtin hash() is not shadowed for
    # the rest of the kernel session.
    audio_hash = get_audio_hash(
        text=row[f'{lang}_term'],
        voice_name=VOICES[lang][gender],
        speaking_rate=speaking_rate,
        pitch=0,
    )
    url = f"https://steviedale-language-app.s3.us-east-1.amazonaws.com/{audio_hash}.mp3"
    response = requests.get(url, timeout=30)
    # Fail loudly on 403/404 instead of writing the error body into an .mp3 file.
    response.raise_for_status()
    with open(f'test_{lang}.mp3', 'wb') as f:
        f.write(response.content)

In [None]:

# url = f"https://steviedale-language-app.s3.us-east-1.amazonaws.com/1ed73913-49ff-52ef-898e-eface7fd9f89.mp3"

In [None]:
# Fetch the term list from the locally running API server.
response = requests.get(
    'http://127.0.0.1:8000/terms',
    timeout=10,  # do not hang the kernel if the local server is not responding
)
# Surface HTTP errors directly instead of trying to JSON-decode an error page.
response.raise_for_status()
terms = response.json()
len(terms)