In [1]:
# need this import so Term and Sentence
# can talk about each other
from __future__ import annotations

from collections import defaultdict
import csv
import os
import pprint
from typing import NamedTuple

import requests

## Define the `Term` and `Sentence` objects

In [2]:
class Term(NamedTuple):
    """All the data needed to generate an Anki flashcard for one
    word / phrase in the target language.

    The tuple itself is immutable, but ``sentences`` is a list: it is
    passed in empty when the vocab list is read and appended to in
    place once the example sentences have been looked up.
    """
    text: str  # the word / phrase in the target language.
    translation: str  # the English translation of the word / phrase.
    sentence_ids: list[str]  # list of Tatoeba sentence ids for
                             # the example sentences that go with this term.
                             # This should maybe be a list of ints instead of strs.
    sentences: list[Sentence]  # Objects representing the example sentences.

# This being a named tuple is actually a problem
# bc when it's initialized it doesn't have all of
# its data. Like I want to add the audio id later
# This could be a types.SimpleNamespace but I want to
# enforce a specific constructor.
# TODO: better print function
class Sentence:
    tatoeba_id: str  # Tatoeba sentence id for this sentence.
                     # Should maybe be a str.
    text: str  # The text of the sentence.
    translations: list[str] | None  # English translations of this sentence.
                                    # Not all sentences will have translations,
                                    # so this value could be None.
    audio_id: str | None  # Tatoeba audio id for this sentence. Not all
                   # sentences will have audios. If there is no
                   # audio, this will be None. This should maybe be an int.
    parent_term: Term  # A reference to the vocab term that this sentence
                       # serves as an example for. This is useful in the
                       # data generation part. TODO: This actually isn't necessary
    
    def __init__(self, tatoeba_id, text, parent_term):
        self.tatoeba_id = tatoeba_id
        self.text = text
        self.parent_term = parent_term

        self.translations = []
        self.audio_id = None

    def print(self):
        #TODO make this reasonable
        attr_dict = {
            'tatoeba_id': self.tatoeba_id,
            'text': self.text,
            'translations': self.translations,
            'audio_id': self.audio_id
        }
        return pprint.pp(attr_dict)

In [3]:
def process_tsv(process_fn, filepath_str, encoding='utf-8-sig'):
    """Open a tab-separated file and hand its csv reader to PROCESS_FN.

    PROCESS_FN is called exactly once, with a ``csv.reader`` over the
    open file, and whatever it returns is returned from here. The file
    is closed as soon as PROCESS_FN returns, so it has to finish its
    reading before then.
    """
    with open(filepath_str, encoding=encoding, newline='') as tsv_file:
        result = process_fn(csv.reader(tsv_file, delimiter='\t'))
    return result


## Turn the vocab list `tsv` file into a list of `Term`s

In [4]:
def process_vocab_list(tsv_reader, skip_header=True, validation=True):
    """Build a list of Terms from the rows of a vocab-list TSV.

    Each row is read as: target-language text, English translation,
    then any number of Tatoeba sentence ids (empty cells are dropped).
    Rows in which every cell is empty (e.g. a trailing blank line) are
    skipped. Every Term starts with its own empty ``sentences`` list.

    Args:
        tsv_reader: iterator yielding one list of cell strings per row.
        skip_header: discard the first row before reading terms.
        validation: check that every term has a translation and at
            least one sentence id.

    Returns:
        The list of Terms, in file order.

    Raises:
        ValueError: a non-blank row has fewer than two columns.
        AssertionError: validation is on and a term is missing its
            translation or has no sentence ids.
    """
    if skip_header:
        # The default keeps an empty file from raising StopIteration.
        next(tsv_reader, None)

    terms = []
    for row_number, row in enumerate(tsv_reader, start=1):
        if not any(row):
            continue
        if len(row) < 2:
            raise ValueError(
                f"data row {row_number} needs a term and a translation, got {row!r}"
            )
        word_de, word_en = row[:2]
        sentence_ids = [s for s in row[2:] if s]
        terms.append(Term(word_de, word_en, sentence_ids, []))

    if validation:
        # Raised explicitly rather than via `assert` so the checks
        # still run under `python -O` and name the offending term.
        for term in terms:
            if not term.sentence_ids:
                raise AssertionError(f"found no sentence ids for {term}")
            if not term.translation:
                raise AssertionError(f"found no translation for {term}")
    return terms

In [5]:
# Vocab list TSV: process_vocab_list reads each row as
# term, English translation, then Tatoeba sentence ids.
vocab_list_file = './NW_7_Days-and-times - Tageszeiten.tsv'
terms = process_tsv(process_vocab_list, vocab_list_file)

In [6]:
terms

[Term(text='der Abend, die Abende', translation='evening', sentence_ids=['998'], sentences=[]),
 Term(text='das Abendessen, die Abendessen', translation='dinner', sentence_ids=['341418'], sentences=[]),
 Term(text='am Morgen', translation='in the morning', sentence_ids=['347433'], sentences=[]),
 Term(text='jemanden an|rufen', translation='to call someone', sentence_ids=['873078'], sentences=[]),
 Term(text='der Apfel, die Äpfel', translation='apple', sentence_ids=['367095'], sentences=[]),
 Term(text='etwas auf|räumen', translation='to clean up something; to tidy up something', sentence_ids=['618818'], sentences=[]),
 Term(text='aus|gehen', translation='to go out', sentence_ids=['882822'], sentences=[]),
 Term(text='auf|stehen', translation='to get up', sentence_ids=['365619'], sentences=[]),
 Term(text='etwas ein|kaufen', translation='to buy/shop for something', sentence_ids=['3567432'], sentences=[]),
 Term(text='fast immer', translation='almost always', sentence_ids=['2571282'], se

## Now add the sentence data to the `Terms`

In [7]:
def add_sentences(tsvreader, terms, validate=True):
    """Attach a Sentence to every Term that lists its Tatoeba id.

    Rows are read as (sentence id, <unused>, sentence text, ...). For
    each row whose id appears in some term's ``sentence_ids``, a new
    Sentence is created per matching term and appended to that term's
    ``sentences`` list in place. Rows with fewer than three columns
    (e.g. blank lines) are skipped.

    Args:
        tsvreader: iterable yielding one list of cell strings per row.
        terms: the Terms to fill in.
        validate: after reading, check that every term received at
            least one sentence and that each sentence is well formed.

    Returns:
        None. ``terms`` is modified in place.

    Raises:
        AssertionError: validate is on and a check fails.
    """
    # a simple many-to-one mapping of
    # tatoeba sentence id -> [Terms]
    terms_by_id = defaultdict(list)
    for t in terms:
        for sentence_id in t.sentence_ids:
            terms_by_id[sentence_id].append(t)

    for row in tsvreader:
        if len(row) < 3:
            continue
        sentence_id, text = row[0], row[2]
        # .get() so a miss does not insert an empty list into the defaultdict
        for t in terms_by_id.get(sentence_id, ()):
            t.sentences.append(Sentence(sentence_id, text, t))
        # could possibly speed up performance here by
        # deleting the key (sentence_id) after a hit
        # and stopping once there are no keys left.

    if validate:
        # Raised explicitly rather than via `assert` so the checks
        # still run under `python -O`.
        for t in terms:
            # all terms should have at least one sentence
            if not t.sentences:
                raise AssertionError(f"found no sentences for {t}")
            for s in t.sentences:
                if not s.tatoeba_id:
                    raise AssertionError(f"sentence without an id on term {t.text!r}")
                if not s.text:
                    raise AssertionError(f"sentence {s.tatoeba_id!r} has no text")
                if s.tatoeba_id not in t.sentence_ids:
                    raise AssertionError(
                        f"sentence {s.tatoeba_id!r} is not listed for term {t.text!r}"
                    )
    return

In [8]:
# Sentences TSV: add_sentences reads column 0 as the sentence id
# and column 2 as the sentence text.
sentences_file = 'deu_sentences.tsv'

In [9]:
# NOTE: add_sentences appends to each term's list, so re-running
# this cell without rebuilding `terms` adds the sentences again.
process_tsv(
    lambda reader: add_sentences(reader, terms, validate=True),
    sentences_file,
)

## Add translation data to the `Terms`
Provided that there is translation data

In [10]:
def add_translations(tsvreader, terms, limit=2):
    """Append English translations to the sentences of TERMS, in place.

    Every row is read as (sentence id, <unused>, <unused>, translation).
    When the id matches one or more Sentence objects, the translation
    is appended to each of them that still holds fewer than LIMIT
    translations. Returns None.
    """
    targets = defaultdict(list)
    for sentence in (s for t in terms for s in t.sentences):
        targets[sentence.tatoeba_id].append(sentence)

    for row in tsvreader:
        source_id, english = row[0], row[3]
        if source_id not in targets:
            continue
        for sentence in targets[source_id]:
            if len(sentence.translations) >= limit:
                continue
            sentence.translations.append(english)

In [11]:
# Sentence-pairs TSV: add_translations reads column 0 as the sentence id
# and column 3 as the English translation.
translations_file = 'Sentence pairs in German-English - 2024-12-13.tsv'

In [12]:
process_tsv(
    lambda reader: add_translations(reader, terms),
    translations_file,
)

## Add audio data to the `Terms`

In [13]:
def add_audio_ids(tsvreader, terms):
    """Set ``audio_id`` on the sentences of TERMS, in place.

    Rows are read as (sentence id, audio id, ...). The first audio id
    found for a sentence id is assigned to every Sentence object that
    carries that id; later rows for the same sentence id are ignored.
    Rows with fewer than two columns (e.g. blank lines) are skipped.
    Sentences with no matching row keep their current ``audio_id``.

    Args:
        tsvreader: iterable yielding one list of cell strings per row.
        terms: objects whose ``sentences`` hold the Sentences to update.

    Returns:
        None.
    """
    # Several terms may share a Tatoeba sentence, each through its own
    # Sentence object, so keep a list per id rather than one object.
    pending = {}
    for t in terms:
        for s in t.sentences:
            pending.setdefault(s.tatoeba_id, []).append(s)

    for row in tsvreader:
        if not pending:
            # every sentence already has its audio id
            break
        if len(row) < 2:
            continue
        sentence_id, audio_id = row[0], row[1]
        # pop() drops the key in case the data file has
        # multiple audio ids for this sentence
        for s in pending.pop(sentence_id, ()):
            s.audio_id = audio_id
    return

In [14]:
# Despite the .csv extension this file is read as tab-delimited
# (it goes through process_tsv); add_audio_ids reads column 0 as the
# sentence id and column 1 as the audio id.
audio_id_file = 'sentences_with_audio.csv'

In [15]:
process_tsv(
    lambda reader: add_audio_ids(reader, terms),
    audio_id_file,
)

In [16]:
terms[0].sentences[0].print()

{'tatoeba_id': '998',
 'text': 'Ich dusche gewöhnlich abends.',
 'translations': ['I usually take a shower in the evening.',
                  'I usually shower at night.'],
 'audio_id': '634'}


In [17]:
# Terms that have at least one example sentence without an audio id.
no_audios = [
    term
    for term in terms
    if any(not sentence.audio_id for sentence in term.sentences)
]

In [18]:
print(no_audios)

[Term(text='am Morgen', translation='in the morning', sentence_ids=['347433'], sentences=[<__main__.Sentence object at 0x000002053DE91340>]), Term(text='auf|stehen', translation='to get up', sentence_ids=['365619'], sentences=[<__main__.Sentence object at 0x000002053DE913A0>]), Term(text='etwas ein|kaufen', translation='to buy/shop for something', sentence_ids=['3567432'], sentences=[<__main__.Sentence object at 0x000002053DE92210>]), Term(text='fast immer', translation='almost always', sentence_ids=['2571282'], sentences=[<__main__.Sentence object at 0x000002053DE92270>]), Term(text='die Flöte, die Flöten', translation='flute', sentence_ids=['11932738'], sentences=[<__main__.Sentence object at 0x000002053DE924E0>]), Term(text='das Mittagessen, die Mittagessen', translation='lunch', sentence_ids=['10513933'], sentences=[<__main__.Sentence object at 0x000002053DE92300>]), Term(text='das Müsli, die Müslis', translation='granola', sentence_ids=['10919429'], sentences=[<__main__.Sentence o

In [19]:
no_audios[0].sentences[0].print()

{'tatoeba_id': '347433',
 'text': 'Könnte ich am Morgen duschen?',
 'translations': ['May I take a shower in the morning?'],
 'audio_id': None}


In [20]:
no_audios[1].sentences[0].print()

{'tatoeba_id': '365619',
 'text': 'Ich stehe meistens um acht auf.',
 'translations': ['I usually get up at eight.',
                  "I usually get up at eight o'clock."],
 'audio_id': None}


## Write terms to Anki-readable `.tsv`
We'll only take two translations per sentence.
The file should look something like
```
word_de | word_en | tags | sentence1_de | audio1_id | sentence1_tr1 | sentence1_tr2 | sentence2_de | audio2_id | sentence2_tr1 | sentence2_tr2
```

In [21]:
def create_anki_tsv(tsvwriter,
                    terms,
                    max_translations=2,  # max translations per example sentence
                    max_sentences=2,  # max example sentences per term
                    format_audio=True,
                    headers=None,
                    tags=None):
    """Write a header row and then one row per term to TSVWRITER.

    Row layout: term text, term translation, tags, then for each of the
    first MAX_SENTENCES example sentences: sentence text, audio cell,
    and MAX_TRANSLATIONS translation cells. Missing translations and
    missing sentences are padded with empty cells, so every row has
    ``3 + max_sentences * (2 + max_translations)`` cells.

    Args:
        tsvwriter: object with a ``writerow(list)`` method, e.g. a
            ``csv.writer``.
        terms: objects with ``text``, ``translation`` and ``sentences``;
            each sentence has ``text``, ``audio_id`` and ``translations``.
        max_translations: translation cells per example sentence.
        max_sentences: example sentences per term.
        format_audio: write the audio cell as ``[sound:<id>.mp3]``
            instead of the bare audio id.
        headers: header row to write. When empty or None, one is
            generated to match MAX_SENTENCES and MAX_TRANSLATIONS.
        tags: value for the tags cell of every row. A string is written
            as given; any other iterable is joined with single spaces;
            None gives an empty cell.

    Returns:
        None.
    """
    if not headers:
        headers = ['word_de', 'word_en', 'tags']
        for sentence_no in range(1, max_sentences + 1):
            headers.append(f'sentence{sentence_no}_de')
            headers.append(f'audio{sentence_no}_id')
            headers.extend(f'sentence{sentence_no}_tr{tr_no}'
                           for tr_no in range(1, max_translations + 1))
    tsvwriter.writerow(headers)

    # csv would write a list as its Python repr ("['a b']"), so the
    # tags are flattened to one space-separated string up front.
    if tags is None:
        tags_cell = ''
    elif isinstance(tags, str):
        tags_cell = tags
    else:
        tags_cell = ' '.join(str(tag) for tag in tags)

    cells_per_sentence = 2 + max_translations
    for t in terms:
        row = [t.text, t.translation, tags_cell]  # one row per term
        used_sentences = t.sentences[:max_sentences]
        for s in used_sentences:
            row.append(s.text)
            if not s.audio_id:
                row.append('')
            elif format_audio:
                row.append('[sound:{}.mp3]'.format(s.audio_id))
            else:
                row.append(s.audio_id)

            translations = list(s.translations or [])[:max_translations]
            row.extend(translations)
            # need row padding in case there are fewer translations
            # than the maximum allowed
            row.extend([''] * (max_translations - len(translations)))
        # pad for sentences the term does not have, so that every
        # row lines up with the header
        missing_sentences = max(0, max_sentences - len(used_sentences))
        row.extend([''] * (missing_sentences * cells_per_sentence))
        tsvwriter.writerow(row)
    return

In [26]:
# Destination of the Anki import file written in the next cell.
out_tsv = './tageszeiten_test.tsv'

In [27]:
with open(out_tsv, 'w', newline='', encoding='utf-8-sig') as f:
    tsvwriter = csv.writer(f, delimiter='\t')
    # One space-separated string: this value is what ends up in the
    # `tags` cell of every row.
    tags = 'python::refactor-test nw::days-and-times::tagezeiten'
    create_anki_tsv(tsvwriter, terms, tags=tags)

## Download Audio files

In [80]:
# Default directory that download_mp3s writes the `<audio_id>.mp3` files into.
AUDIO_DIRECTORY = './days_and_times/tageszeiten'

In [76]:
def download_mp3s(terms,
                 audio_url_template='https://tatoeba.org/audio/download/{0}',
                 audio_directory=AUDIO_DIRECTORY,
                 timeout=30):
    """Download the audio file of every sentence that has an audio id.

    Each file is fetched from AUDIO_URL_TEMPLATE formatted with the
    audio id and saved as ``<audio_directory>/<audio_id>.mp3``,
    overwriting any existing file. An audio id shared by several
    sentences is downloaded once.

    Args:
        terms: objects whose ``sentences`` carry an ``audio_id``.
        audio_url_template: format string with one positional slot
            for the audio id.
        audio_directory: target directory; created if it is missing.
        timeout: seconds to wait for the server, per request.

    Returns:
        None.

    Raises:
        requests.HTTPError: the server answered with an error status.
            Nothing is written for that audio id, and the remaining
            downloads are not attempted.
    """
    os.makedirs(audio_directory, exist_ok=True)

    # dict.fromkeys keeps first-seen order while dropping repeats
    audio_ids = dict.fromkeys(
        s.audio_id for t in terms for s in t.sentences if s.audio_id
    )
    for audio_id in audio_ids:
        request_url = audio_url_template.format(audio_id)
        response = requests.get(request_url, timeout=timeout)
        # without this check an HTML error page would be saved as an .mp3
        response.raise_for_status()
        with open(f'{audio_directory}/{audio_id}.mp3', 'wb') as mp3file:
            mp3file.write(response.content)

In [77]:
download_mp3s(terms)