In [None]:
import os
import pandas as pd
from tqdm import tqdm
import yaml
import requests

import sys
sys.path.append("/Users/stevie/repos/lingo_kit_data")
from utils.text_to_speech import TextToSpeech, VOICES
from utils.upload_to_s3 import upload_file

In [None]:
# Read the three source vocabularies, in the order they are concatenated in
# the combine step further down.
category_data_df, foundational_words_df, spotify_lessons_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/Users/stevie/repos/lingo_kit_data/dataframes/category_data.csv',
        '/Users/stevie/repos/lingo_kit_data/dataframes/foundational_words.csv',
        '/Users/stevie/repos/lingo_kit_data/dataframes/spotify_lessons.csv',
    )
]

In [None]:
# Columns kept from the combined vocabulary frame, in output order.
columns = [
    # the term pair itself
    'english_term', 'italian_term',
    # per-term metadata
    'part_of_speech', 'topics',
    # base forms of each term
    'english_base_term', 'italian_base_term',
]

In [None]:
# Stack the three sources into one frame with a fresh 0..n-1 index.
df = pd.concat(
    [category_data_df, foundational_words_df, spotify_lessons_df],
    ignore_index=True,
)
print(len(df))

# Keep the first occurrence of every (english_term, italian_term) pair.
df = df.drop_duplicates(subset=['english_term', 'italian_term'])
print(len(df))

# Renumber the surviving rows and keep only the selected columns.
df = df.reset_index(drop=True)[columns]
len(df), df.columns

In [None]:
# Single TextToSpeech instance shared by the rest of the notebook; later cells
# call its synthesize() and save() methods and read its `df` attribute.
tts = TextToSpeech()

In [None]:
# term_obj = {
#     'english_term': 'person',
#     'italian_term': 'persone',
#     'base_term': 'person',
#     'is_base': True
# }
# term_obj = requests.post(
#     'http://127.0.0.1:8000/create-term',
#     json=term_obj,
# ).json()

In [None]:
import uuid
def get_term_hash(english_text, italian_text):
    """Return the deterministic UUIDv5 string for an English/Italian term pair.

    The UUID is derived from the name ``"<english_text>-<italian_text>"`` in
    the DNS namespace, so the same pair always yields the same string.
    """
    pair_name = f"{english_text}-{italian_text}"
    return str(uuid.uuid5(uuid.NAMESPACE_DNS, pair_name))
# Smoke check: print the hash produced for one sample English/Italian pair.
print(get_term_hash('person', 'persone'))

In [None]:
def parse_topics(topics_str):
    """Parse a bracketed, comma-separated topics string into a list of topics.

    Args:
        topics_str: Text of the form ``"[topic_a,topic_b]"``. Whitespace
            around the whole string and around each topic is ignored.

    Returns:
        list[str]: The topics in their original order. Empty entries are
        dropped, so ``"[]"`` yields ``[]`` rather than ``['']``.

    Raises:
        ValueError: If ``topics_str`` is not a string (e.g. a missing CSV
            cell read as NaN) or is not wrapped in ``[`` and ``]``.
    """
    if not isinstance(topics_str, str):
        raise ValueError(f"Topics must be a string, got: {topics_str!r}")

    stripped = topics_str.strip()
    # Explicit check instead of `assert`, so validation still runs under
    # `python -O` and an empty string no longer fails with IndexError.
    if len(stripped) < 2 or stripped[0] != '[' or stripped[-1] != ']':
        raise ValueError(f"Topics must be wrapped in brackets: {topics_str!r}")

    inner = stripped[1:-1]  # remove brackets
    # Trim each entry so "[a, b]" gives 'b' rather than ' b', and skip blanks.
    return [topic.strip() for topic in inner.split(',') if topic.strip()]

def get_language_key(lang):
    """Map a language name ('english' or 'italian') to its two-letter key.

    Raises:
        ValueError: If ``lang`` is neither of the supported names.
    """
    for language_name, language_key in (('english', 'en'), ('italian', 'it')):
        if lang == language_name:
            return language_key
    raise ValueError(f"Unknown language: {lang}")

In [None]:
import time

# Endpoint and path roots used by this cell, spelled once instead of being
# repeated in every request below.
API_BASE_URL = 'http://127.0.0.1:8000'
DATA_REPO_ROOT = '/Users/stevie/repos/lingo_kit_data'
# Upper bound (seconds) on any single API call, so a stalled server raises
# instead of hanging the loop indefinitely.
REQUEST_TIMEOUT_S = 30

# df = df[:5]  # uncomment to try the loop on the first five rows only
for i, row in tqdm(df.iterrows(), total=len(df)):
    term_hash = get_term_hash(row['english_term'], row['italian_term'])

    # Payload sent to /create-term when the term is not registered yet.
    term_obj = {
        'hash': term_hash,
        'english_term': row['english_term'],
        'italian_term': row['italian_term'],
        'english_base_term': row['english_base_term'],
        'italian_base_term': row['italian_base_term'],
        'part_of_speech': row['part_of_speech'],
        'topics': parse_topics(row['topics']),
    }

    # Look the term up by hash; create it only when the API answers 404.
    response = requests.get(
        f'{API_BASE_URL}/term/{term_hash}',
        timeout=REQUEST_TIMEOUT_S,
    )
    if response.status_code != 200:
        assert response.status_code == 404, response.text
        response = requests.post(
            f'{API_BASE_URL}/create-term',
            json=term_obj,
            timeout=REQUEST_TIMEOUT_S,
        )
        assert response.status_code == 201, response.text
    # Either the existing term (200) or the freshly created one (201).
    term_obj = response.json()

    for lang in 'english', 'italian':
        text = row[f"{lang}_term"]
        # Workaround from the original author: short words like "a" or "the"
        # followed by a parenthesis were sounding too short, so a period is
        # inserted before the opening parenthesis.
        text = text.replace(' (', '. (')
        synth_obj = tts.synthesize(
            text=text,
            voice_name=VOICES[lang]['male'],
            # speaking_rate=0.75 if lang == 'italian' else 1.0,
            speaking_rate=0.75,
            pitch=0,
            verbose=False,
        )

        # Payload sent to /create-audio-file; everything except the text and
        # language key is copied from the synthesis result.
        audio_file_obj = {
            'text': text,
            'language': get_language_key(lang),
            'hash': synth_obj['hash'],
            'speaking_rate': synth_obj['speaking_rate'],
            'voice_name': synth_obj['voice_name'],
            'pitch': synth_obj['pitch'],
            'duration_ms': synth_obj['duration_ms'],
        }

        # upload audio file to s3
        # NOTE(review): synth_obj['audio_file'] is presumably a path relative
        # to the data repo root -- confirm against TextToSpeech.
        file_path = os.path.join(DATA_REPO_ROOT, synth_obj['audio_file'])
        assert os.path.exists(file_path), f"File does not exist: {file_path}"
        upload_file(file_path=file_path, verbose=False)

        # Register the audio file only when the API does not know its hash.
        response = requests.get(
            f"{API_BASE_URL}/audio-file/{synth_obj['hash']}",
            timeout=REQUEST_TIMEOUT_S,
        )

        if response.status_code != 200:
            assert response.status_code == 404, response.text
            resp = requests.post(
                f'{API_BASE_URL}/create-audio-file',
                json=audio_file_obj,
                timeout=REQUEST_TIMEOUT_S,
            )
            # Check the status before decoding: a non-JSON error body would
            # otherwise raise a decode error and hide the server's message.
            assert resp.status_code == 201, resp.text
            audio_file_obj = resp.json()

    # Periodic checkpoint: call tts.save() on every 100th row index.
    if i % 100 == 0:
        tts.save()

In [None]:
# Final save after the loop: the loop only calls save() when the row index is
# a multiple of 100, so rows after the last such index have not triggered one.
tts.save()

In [None]:
# Number of rows currently held in tts.df.
len(tts.df)

In [None]:
# 'audio_file' value of the first row in tts.df.
tts.df.iloc[0]['audio_file']

In [None]:
# 'audio_file' value of the last row in tts.df.
tts.df.iloc[-1]['audio_file']

In [None]:
# # get the audio file from s3
# hash = '9cb7f04b82b5485738f6606c67a1cd43dceadc2c42e006eec2f6dc254ec95d71'
# url = f"https://steviedale-language-app.s3.us-east-1.amazonaws.com/{hash}.mp3"
# file = requests.get(url)
# with open('test.mp3', 'wb') as f:
#     f.write(file.content)

In [None]:

# url = f"https://steviedale-language-app.s3.us-east-1.amazonaws.com/1ed73913-49ff-52ef-898e-eface7fd9f89.mp3"

In [None]:
# Fetch everything the local API returns from /terms and show how many
# entries came back.
terms_response = requests.get('http://127.0.0.1:8000/terms')
terms = terms_response.json()
len(terms)