In [115]:
import csv
import os

import genanki
import requests

## Read in the Google Sheet
`ingest_sheet` reads a manually created vocab list (saved as a tab-separated file) with the columns
`word_de | word_en | sentence1_id_de | sentence2_id_de`. The last column may be empty. The words should be unique. The file is assumed to start with a header row.

In [131]:
def ingest_sheet(filepath_str):
    """Read the vocab sheet (TSV) into a dict keyed by the German word.

    Expected columns, in order:
    ``word_de | word_en | sentence1_id_de | sentence2_id_de``.
    The first line is treated as a header and skipped.

    Rows with fewer than four columns are padded with empty strings, so a
    line whose optional trailing column(s) were dropped on export still
    parses. Completely blank rows are ignored. Surrounding whitespace is
    stripped from the two sentence ids so that later exact-match lookups
    do not fail on a stray space.

    Parameters
    ----------
    filepath_str : str
        Path to the tab-separated vocab file (UTF-8).

    Returns
    -------
    dict
        ``{word_de: {'word_en': ..., 'sentence1_id_de': ...,
        'sentence2_id_de': ...}}``. If a German word occurs more than
        once, the last row wins.
    """
    n_columns = 4
    words = {}
    with open(filepath_str, newline='', encoding='utf-8') as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter='\t')
        next(tsvreader, None)  # skip the header; tolerate an empty file
        for row in tsvreader:
            if not any(cell.strip() for cell in row):
                continue  # blank line or row of empty cells
            # Pad short rows: the trailing (optional) columns may be absent.
            padded = (row + [''] * n_columns)[:n_columns]
            word_de, word_en, sentence1_id_de, sentence2_id_de = padded
            words[word_de] = {'word_en': word_en,
                              'sentence1_id_de': sentence1_id_de.strip(),
                              'sentence2_id_de': sentence2_id_de.strip()}
    return words

In [132]:
# Parse the vocab sheet into a dict keyed by the German word.
vocab = ingest_sheet('./NW_7_Days-and-times.tsv')

In [133]:
# Inspect the parsed vocabulary.
vocab

{'der Arzt, die Ärzte': {'word_en': 'doctor (male)',
  'sentence1_id_de': '458886',
  'sentence2_id_de': ''},
 'die Ärztin, die Ärztinnen': {'word_en': 'doctor (female)',
  'sentence1_id_de': '428477',
  'sentence2_id_de': ''},
 'der Babysitter, die Babysitter': {'word_en': 'babysitter (male)',
  'sentence1_id_de': '3751325',
  'sentence2_id_de': ''},
 'die Babysitterin, die Babysitterinnen': {'word_en': 'babysitter (female)',
  'sentence1_id_de': '8074468',
  'sentence2_id_de': ''},
 'beginnen': {'word_en': 'to begin; to start',
  'sentence1_id_de': '722917',
  'sentence2_id_de': ''},
 'krank': {'word_en': 'sick; ill',
  'sentence1_id_de': '722917',
  'sentence2_id_de': ''},
 'die Lösung, die Lösungen': {'word_en': 'solution',
  'sentence1_id_de': '1205',
  'sentence2_id_de': ''},
 'die Mittagspause, die Mittagspausen': {'word_en': 'lunch break',
  'sentence1_id_de': '7552388',
  'sentence2_id_de': ''},
 'die Schule, die Schulen': {'word_en': 'school',
  'sentence1_id_de': '135753',
 

## Add the Tatoeba data
For each word, find the Tatoeba sentence it links to. Get the text and the audio.
What order?

Not all German sentences have translations.

Not all German sentences have audio.

So get the text of the German sentence first.

Then check for translations.

Then check for audio.

### Get the German sentences from the sentence_id

In [88]:
# Path to the tab-separated file of German sentences (id in the first column, text in the third).
SENTENCES_DE_PATH = './deu_sentences.tsv'

In [89]:
def ingest_tsv(filepath_str):
    """Load a whole tab-separated file into memory.

    The file is decoded as UTF-8 and a leading byte-order mark, if present,
    is dropped (``utf-8-sig``).

    Parameters
    ----------
    filepath_str : str
        Path of the tab-separated file to read.

    Returns
    -------
    list of list of str
        One inner list per row, in file order; no header handling is done.
    """
    with open(filepath_str, encoding='utf-8-sig', newline='') as handle:
        return list(csv.reader(handle, delimiter='\t'))

In [121]:
def get_sentence_from_id(sentence_id, tsv_data):
    """Look up the text of a sentence by its id.

    Parameters
    ----------
    sentence_id : str
        Id to search for; compared for equality with the first column.
    tsv_data : iterable of sequences
        Rows whose first element is the sentence id and whose third
        element is the sentence text.

    Returns
    -------
    str
        The text of the first row whose id matches, or ``''`` when no row
        matches.
    """
    matching_texts = (row[2] for row in tsv_data if row[0] == sentence_id)
    return next(matching_texts, '')

In [122]:
# Load every German sentence row into memory as a list of rows.
sentences_de = ingest_tsv(SENTENCES_DE_PATH)

In [127]:
def validate_id(sentence_id, sentences_data):
    """Check that ``sentence_id`` occurs in the first column of the data.

    Parameters
    ----------
    sentence_id : str
        Sentence id to look for. An empty id is never considered valid.
    sentences_data : iterable of sequences
        Rows whose first element is a sentence id. Empty rows are ignored.

    Returns
    -------
    None
        When the id is present.

    Raises
    ------
    AssertionError
        When the id is empty or not present in ``sentences_data``. This
        includes the case where ``sentences_data`` itself is empty.
    """
    # Single pass over the rows; any() stops at the first match.
    found = bool(sentence_id) and any(
        row and row[0] == sentence_id for row in sentences_data
    )
    if not found:
        raise AssertionError(
            f'Could not find sentence with id {sentence_id} in data')
    return None

In [134]:
# Fail early if any sentence id referenced by the vocab sheet is unknown.
# The first sentence id is always checked; the second one is optional and
# only checked when it is non-empty.
for entry in vocab.values():
    validate_id(entry['sentence1_id_de'], sentences_de)
    second_id = entry['sentence2_id_de']
    if not second_id:
        continue
    validate_id(second_id, sentences_de)

In [135]:
# Attach the German text of each referenced sentence to its vocab entry.
# A missing or empty id yields '' because get_sentence_from_id returns ''
# when nothing matches.
for entry in vocab.values():
    entry['sentence1_text'] = get_sentence_from_id(entry['sentence1_id_de'], sentences_de)
    entry['sentence2_text'] = get_sentence_from_id(entry['sentence2_id_de'], sentences_de)

In [136]:
# Inspect the vocabulary after the sentence texts were added.
vocab

{'der Arzt, die Ärzte': {'word_en': 'doctor (male)',
  'sentence1_id_de': '458886',
  'sentence2_id_de': '',
  'sentence1_text': 'Ist sein Vater Arzt?',
  'sentence2_text': ''},
 'die Ärztin, die Ärztinnen': {'word_en': 'doctor (female)',
  'sentence1_id_de': '428477',
  'sentence2_id_de': '',
  'sentence1_text': 'Ist sie Ärztin?',
  'sentence2_text': ''},
 'der Babysitter, die Babysitter': {'word_en': 'babysitter (male)',
  'sentence1_id_de': '3751325',
  'sentence2_id_de': '',
  'sentence1_text': 'Ich benötige keinen Babysitter.',
  'sentence2_text': ''},
 'die Babysitterin, die Babysitterinnen': {'word_en': 'babysitter (female)',
  'sentence1_id_de': '8074468',
  'sentence2_id_de': '',
  'sentence1_text': 'Als Jugendliche jobbte Maria als Babysitterin.',
  'sentence2_text': ''},
 'beginnen': {'word_en': 'to begin; to start',
  'sentence1_id_de': '722917',
  'sentence2_id_de': '',
  'sentence1_text': 'Sollen wir beginnen?',
  'sentence2_text': ''},
 'krank': {'word_en': 'sick; ill',


### Process the translations

In [137]:
# Path to the German-English sentence-pair export (tab-separated).
PAIRS_FILEPATH = './Sentence pairs in German-English - 2024-12-13.tsv'

In [138]:
# Load all sentence-pair rows into memory as a list of rows.
pairs_data = ingest_tsv(PAIRS_FILEPATH)

In [139]:
# Attach up to MAX_TRANSLATIONS English translations of the first example
# sentence to each vocab entry, under the keys translation1_en,
# translation2_en, ... Entries whose sentence has no row in the pairs data
# get no translation keys at all.
# Pair rows are read as: column 0 = German sentence id, column 3 = English text.
MAX_TRANSLATIONS = 3

for entry in vocab.values():
    matching_rows = [row for row in pairs_data
                     if row[0] == entry['sentence1_id_de']]
    for i, row in enumerate(matching_rows[:MAX_TRANSLATIONS], start=1):
        entry[f'translation{i}_en'] = row[3]

In [140]:
# Inspect the vocabulary after the translations were added.
vocab

{'der Arzt, die Ärzte': {'word_en': 'doctor (male)',
  'sentence1_id_de': '458886',
  'sentence2_id_de': '',
  'sentence1_text': 'Ist sein Vater Arzt?',
  'sentence2_text': '',
  'translation1_en': 'Is his father a doctor?'},
 'die Ärztin, die Ärztinnen': {'word_en': 'doctor (female)',
  'sentence1_id_de': '428477',
  'sentence2_id_de': '',
  'sentence1_text': 'Ist sie Ärztin?',
  'sentence2_text': '',
  'translation1_en': 'Is she a doctor?'},
 'der Babysitter, die Babysitter': {'word_en': 'babysitter (male)',
  'sentence1_id_de': '3751325',
  'sentence2_id_de': '',
  'sentence1_text': 'Ich benötige keinen Babysitter.',
  'sentence2_text': ''},
 'die Babysitterin, die Babysitterinnen': {'word_en': 'babysitter (female)',
  'sentence1_id_de': '8074468',
  'sentence2_id_de': '',
  'sentence1_text': 'Als Jugendliche jobbte Maria als Babysitterin.',
  'sentence2_text': '',
  'translation1_en': 'Mary worked as a babysitter when she was a teenager.'},
 'beginnen': {'word_en': 'to begin; to st

### Get the audio id
For the sentences with audio

In [141]:
# Path to the list of sentences that have audio. Despite the .csv extension
# it is read as tab-separated, because it is loaded with ingest_tsv.
AUDIO_SENTENCES_PATH = './sentences_with_audio.csv'

In [142]:
# Load all sentence/audio rows into memory as a list of rows.
audio_data = ingest_tsv(AUDIO_SENTENCES_PATH)

In [143]:
# Inspect the loaded audio rows.
audio_data

[['61', '1', 'fucongcong', '', ''],
 ['68', '2', 'fucongcong', '', ''],
 ['78', '754915', 'mramosch', '', ''],
 ['85',
  '566395',
  'driini',
  'CC BY-NC 4.0',
  'https://tatoeba.org/deu/user/profile/driini'],
 ['88',
  '592881',
  'driini',
  'CC BY-NC 4.0',
  'https://tatoeba.org/deu/user/profile/driini'],
 ['91', '754916', 'mramosch', '', ''],
 ['99', '758277', 'mramosch', '', ''],
 ['104', '395121', 'Oblomov', '', ''],
 ['110', '3', 'BraveSentry', '', ''],
 ['113', '395120', 'Oblomov', '', ''],
 ['119',
  '566396',
  'driini',
  'CC BY-NC 4.0',
  'https://tatoeba.org/deu/user/profile/driini'],
 ['120',
  '790509',
  'driini',
  'CC BY-NC 4.0',
  'https://tatoeba.org/deu/user/profile/driini'],
 ['122', '4', 'BraveSentry', '', ''],
 ['125', '5', 'BraveSentry', '', ''],
 ['127', '6', 'BraveSentry', '', ''],
 ['128', '7', 'BraveSentry', '', ''],
 ['129', '8', 'BraveSentry', '', ''],
 ['130', '9', 'BraveSentry', '', ''],
 ['131', '10', 'BraveSentry', '', ''],
 ['132', '11', 'BraveSentr

In [144]:
def get_audio_id(sentence_id, audio_data):
    """Find the audio id recorded for a sentence.

    Parameters
    ----------
    sentence_id : str
        Sentence id to search for; compared for equality with the first
        column of each row.
    audio_data : iterable of sequences
        Rows whose first element is a sentence id and whose second element
        is the corresponding audio id.

    Returns
    -------
    str
        The audio id of the first matching row, or ``''`` if the sentence
        has no row in ``audio_data``.
    """
    matching_audio_ids = (row[1] for row in audio_data if row[0] == sentence_id)
    return next(matching_audio_ids, '')

In [145]:
# Record the audio id of each example sentence on its vocab entry.
#
# 'sentence1_audio_id' is always set ('' when the sentence has no audio).
# 'sentence2_audio_id' is set only when a second sentence exists AND audio
# was found for it. It is stored under its own key so that it no longer
# overwrites the first sentence's audio id. The key is left absent otherwise,
# so code that treats a missing key as "no second audio" never sees an
# empty id.
for entry in vocab.values():
    entry['sentence1_audio_id'] = get_audio_id(entry['sentence1_id_de'], audio_data)

    sentence2_id = entry['sentence2_id_de']
    if sentence2_id:
        sentence2_audio_id = get_audio_id(sentence2_id, audio_data)
        if sentence2_audio_id:
            entry['sentence2_audio_id'] = sentence2_audio_id


In [146]:
# Inspect the vocabulary after the audio ids were added.
vocab

{'der Arzt, die Ärzte': {'word_en': 'doctor (male)',
  'sentence1_id_de': '458886',
  'sentence2_id_de': '',
  'sentence1_text': 'Ist sein Vater Arzt?',
  'sentence2_text': '',
  'translation1_en': 'Is his father a doctor?',
  'sentence1_audio_id': '35269'},
 'die Ärztin, die Ärztinnen': {'word_en': 'doctor (female)',
  'sentence1_id_de': '428477',
  'sentence2_id_de': '',
  'sentence1_text': 'Ist sie Ärztin?',
  'sentence2_text': '',
  'translation1_en': 'Is she a doctor?',
  'sentence1_audio_id': '33768'},
 'der Babysitter, die Babysitter': {'word_en': 'babysitter (male)',
  'sentence1_id_de': '3751325',
  'sentence2_id_de': '',
  'sentence1_text': 'Ich benötige keinen Babysitter.',
  'sentence2_text': '',
  'sentence1_audio_id': ''},
 'die Babysitterin, die Babysitterinnen': {'word_en': 'babysitter (female)',
  'sentence1_id_de': '8074468',
  'sentence2_id_de': '',
  'sentence1_text': 'Als Jugendliche jobbte Maria als Babysitterin.',
  'sentence2_text': '',
  'translation1_en': 'Mar

## Download the audio files

In [147]:
# Directory the downloaded MP3 files are written to (default target of download_mp3).
AUDIO_DIRECTORY = './days_and_times/emmas-tag'

In [148]:
def download_mp3(audio_id,
                 audio_url_template='https://tatoeba.org/audio/download/{0}',
                 audio_directory=AUDIO_DIRECTORY,
                 timeout=30):
    """Download one audio file and save it as ``<audio_directory>/<audio_id>.mp3``.

    Parameters
    ----------
    audio_id : str
        Id of the audio to fetch; substituted into ``audio_url_template``
        and used as the file name.
    audio_url_template : str, optional
        URL template with a ``{0}`` placeholder for the audio id.
    audio_directory : str, optional
        Directory the file is written to. It is created if it is missing.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot hang the notebook indefinitely.

    Raises
    ------
    ValueError
        If ``audio_id`` is empty.
    requests.HTTPError
        If the server answers with an error status. Nothing is written in
        that case, so an error page is never saved as an ``.mp3`` file.
    """
    if not audio_id:
        raise ValueError('audio_id must be a non-empty audio id')
    request_url = audio_url_template.format(audio_id)
    response = requests.get(request_url, timeout=timeout)
    response.raise_for_status()  # fail loudly instead of saving an error body
    os.makedirs(audio_directory, exist_ok=True)
    target_path = os.path.join(audio_directory, f'{audio_id}.mp3')
    with open(target_path, 'wb') as mp3file:
        mp3file.write(response.content)

In [149]:
# Download the audio of every example sentence that has an audio id.
# A missing key and an empty id both mean "no audio" and are skipped.
# Lookups use .get() rather than try/except KeyError so that a KeyError
# raised inside download_mp3 itself is not silently swallowed.
for entry in vocab.values():
    for audio_key in ('sentence1_audio_id', 'sentence2_audio_id'):
        audio_id = entry.get(audio_key)
        if audio_id:
            download_mp3(audio_id)