In [2]:
import os
import pandas as pd
from tqdm import tqdm
import yaml
import requests

import sys
sys.path.append("/Users/stevie/repos/language_app")
from utils.text_to_speech import TextToSpeech, VOICES
from utils.upload_to_s3 import upload_file

In [3]:
# Load the combined foundational-words table; each row pairs an English term
# with its Italian counterpart. The cell displays the row count and columns.
df = pd.read_csv("/Users/stevie/repos/language_app/foundational_words/dataframes/combined.csv")
df.shape[0], df.columns

(288,
 Index(['english', 'italian', 'base_term', 'is_base', 'topic',
        'part_of_speech'],
       dtype='object'))

In [4]:
# Text-to-speech helper imported from utils.text_to_speech; the ingestion loop
# below calls tts.synthesize() on it to produce the audio for each term.
tts = TextToSpeech()

In [5]:
# term_obj = {
#     'english_term': 'person',
#     'italian_term': 'persone',
#     'base_term': 'person',
#     'is_base': True
# }
# term_obj = requests.post(
#     'http://127.0.0.1:8000/create-term',
#     json=term_obj,
# ).json()

In [6]:
import uuid
def get_term_hash(english_text, italian_text):
    """Return a deterministic UUIDv5 string identifying an English/Italian pair.

    The two texts are joined with a hyphen and hashed under the DNS namespace,
    so the same pair always maps to the same identifier.

    Args:
        english_text: The English side of the term.
        italian_text: The Italian side of the term.

    Returns:
        The canonical 36-character string form of the UUID.
    """
    pair_key = f"{english_text}-{italian_text}"
    term_uuid = uuid.uuid5(uuid.NAMESPACE_DNS, pair_key)
    return str(term_uuid)
print(get_term_hash('person', 'persone'))

3cd240e7-7dc2-5554-a776-8111b16dba14


In [None]:
def get_language_key(lang):
    """Translate a language name into the short code sent to the API.

    Args:
        lang: Either 'english' or 'italian'.

    Returns:
        'en' for 'english', 'it' for 'italian'.

    Raises:
        ValueError: If ``lang`` is any other value.
    """
    known_languages = (('english', 'en'), ('italian', 'it'))
    for language_name, language_code in known_languages:
        if lang == language_name:
            return language_code
    raise ValueError(f"Unknown language: {lang}")

# For every row of `df`: register the term with the local API, then synthesize,
# register and upload one audio clip per language for each newly created term.
for _, row in tqdm(df.iterrows(), total=len(df)):
    # Fields sent to /create-term:
    #   hash, english_term, italian_term, base_term, is_base, part_of_speech, topics
    term_hash = get_term_hash(row['english'], row['italian'])
    term_obj = {
        'hash': term_hash,
        'english_term': row['english'],
        'italian_term': row['italian'],
        'base_term': row['base_term'],
        'is_base': row['is_base'],
        'part_of_speech': row['part_of_speech'],
        'topics': ['Foundational Words'],
    }
    response = requests.post(
        'http://127.0.0.1:8000/create-term',
        json=term_obj,
    )
    if response.status_code != 201:
        # The only tolerated failure is "this term already exists" (e.g. when the
        # cell is re-run); such rows are skipped entirely. Anything else aborts
        # with the server's reply. An explicit raise is used instead of `assert`
        # so the check is not stripped when Python runs with -O.
        try:
            errors = response.json()
        except ValueError:
            # Error body was not JSON; fall through to the RuntimeError below.
            errors = None
        hash_errors = errors.get('hash') if isinstance(errors, dict) else None
        if hash_errors and hash_errors[0] == 'term with this hash already exists.':
            continue
        raise RuntimeError(
            f"create-term failed for {row['english']!r} / {row['italian']!r} "
            f"(status {response.status_code}): {response.text}"
        )
    # From here on term_obj is the server's representation of the created term.
    term_obj = response.json()

    for lang in 'english', 'italian':
        # Insert a full stop before an opening parenthesis ("x (y)" -> "x. (y)").
        # This is a workaround for very short words such as "a" or "the" that
        # are followed by a parenthetical and otherwise sound too short.
        text = row[lang].replace(' (', '. (')
        synth_obj = tts.synthesize(
            # Pass the adjusted text; previously the unmodified row value was
            # sent, which left the workaround above without any effect.
            text=text,
            voice_name=VOICES[lang]['male'],
            # speaking_rate=0.75 if lang == 'italian' else 1.0,
            speaking_rate=0.75,
            pitch=0,
            verbose=False,
        )

        # Fields sent to /create-audio-file:
        #   term, language, hash, speaking_rate, voice_name
        # (duration_ms is not sent.)
        audio_file_obj = {
            'term': term_obj['hash'],
            'language': get_language_key(lang),
            'hash': synth_obj['hash'],
            'speaking_rate': synth_obj['speaking_rate'],
            'voice_name': synth_obj['voice_name'],
        }
        # NOTE(review): the status of this request is not checked, so a rejected
        # audio record is silently ignored and the file is still uploaded below.
        # Confirm whether that is intended.
        audio_file_obj = requests.post(
            'http://127.0.0.1:8000/create-audio-file',
            json=audio_file_obj,
        ).json()
        file_path = os.path.join('/Users/stevie/repos/language_app', synth_obj['audio_file'])
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File does not exist: {file_path}")
        upload_file(file_path=file_path)

100%|██████████| 288/288 [04:02<00:00,  1.19it/s]


In [8]:
# Save the text-to-speech helper's state; the output below reports the CSV
# path that was written (presumably printed by save() itself).
tts.save()

Dataframe saved to /Users/stevie/repos/language_app/data/dataframe.csv


In [13]:
# Sanity check: number of rows currently held in the TTS helper's dataframe.
len(tts.df)

1435

In [9]:
# Spot-check the audio file path recorded in the first row of the TTS dataframe.
tts.df.iloc[0]['audio_file']

'/Users/stevie/repos/language_app/data/audio/70ac481b9b3267885ff18cfa685971fc0274c476758e89dad59a29fcac70b907.mp3'

In [10]:
# Spot-check the audio file path recorded in the last row of the TTS dataframe.
tts.df.iloc[-1]['audio_file']

'/Users/stevie/repos/language_app/data/audio/ac97dd5e420032640676c224300d47c444b0ee5eeade8017d26f9d1d26dcc479.mp3'

In [11]:
# # get the audio file from s3
# hash = '9cb7f04b82b5485738f6606c67a1cd43dceadc2c42e006eec2f6dc254ec95d71'
# url = f"https://steviedale-language-app.s3.us-east-1.amazonaws.com/{hash}.mp3"
# file = requests.get(url)
# with open('test.mp3', 'wb') as f:
#     f.write(file.content)

In [12]:

# url = f"https://steviedale-language-app.s3.us-east-1.amazonaws.com/1ed73913-49ff-52ef-898e-eface7fd9f89.mp3"