In [69]:
# Postpone evaluation of annotations so that Term and Sentence
# can refer to each other in their field annotations.
from __future__ import annotations

from typing import NamedTuple
import csv
import pprint

In [84]:
class Term(NamedTuple):
    """All the data needed to generate an Anki flashcard for one
    word / phrase in the target language.
    """
    text: str  # The word / phrase in the target language.
    translation: str  # The English translation of the word / phrase.
    sentence_ids: list[str]  # Tatoeba sentence ids of the example sentences
                             # that go with this term. Kept as strs for now;
                             # a list of ints might be a better fit.
    sentences: list[Sentence]  # Objects representing the example sentences.

class Sentence(NamedTuple):
    """One Tatoeba example sentence that illustrates a vocab Term."""
    tatoeba_id: str  # Tatoeba sentence id for this sentence.
                     # Already a str; whether it should be an int
                     # instead is an open question.
    text: str  # The text of the sentence.
    translations: list[str] | None  # English translations of this sentence.
                                    # Not all sentences will have translations,
                                    # so this value could be None.
    audio_id: str | None  # Tatoeba audio id for this sentence. Not all
                          # sentences will have audio; if there is none,
                          # this is None. This should maybe be an int.
    parent_term: Term  # A reference to the vocab term that this sentence
                       # serves as an example for. This is useful in the
                       # data generation part.

In [85]:
def process_tsv(process_fn, filepath_str, encoding='utf-8-sig'):
    """Open a TSV file and hand a reader over its rows to PROCESS_FN.

    PROCESS_FN is called exactly once, with a ``csv.reader`` configured for
    tab-delimited input, and whatever it returns is returned to the caller.
    The file is closed once PROCESS_FN returns, so PROCESS_FN has to finish
    consuming the rows before it returns.

    The default encoding, 'utf-8-sig', drops a leading byte-order mark if
    the file has one.
    """
    with open(filepath_str, encoding=encoding, newline='') as tsv_file:
        rows = csv.reader(tsv_file, delimiter='\t')
        result = process_fn(rows)
    return result

In [86]:
def process_vocab_list(tsv_reader, skip_header=True):
    """Build a list of Terms from the rows of a vocab-list TSV.

    Each data row is expected to hold the word / phrase in the target
    language, then its English translation, then zero or more Tatoeba
    sentence ids (empty id cells are ignored).

    Args:
        tsv_reader: iterable yielding each row as a list of strs,
            e.g. a ``csv.reader``.
        skip_header: when true (the default), the first row is treated as
            a header and discarded; when false, it is parsed as data.

    Returns:
        A list with one Term per non-blank row, in input order. Each
        Term's ``sentences`` list starts out empty.

    Raises:
        ValueError: if a non-blank row has fewer than two columns.
    """
    rows = iter(tsv_reader)
    if skip_header:
        # The None default makes an empty file produce [] instead of
        # letting StopIteration escape.
        next(rows, None)

    terms = []
    for row in rows:
        # csv.reader yields [] for a blank line; a row of only empty cells
        # carries no term either, so skip both.
        if not any(row):
            continue
        word_de, word_en = row[:2]
        sentence_ids = [s for s in row[2:] if s]
        terms.append(Term(word_de, word_en, sentence_ids, []))
    return terms

In [87]:
# Relative path of the vocab-list TSV to load.
vocab_list_file = './NW_7_Days-and-times - Tageszeiten.tsv'
# Parse the TSV into a list of Term objects (their sentences are still empty).
terms = process_tsv(process_vocab_list, vocab_list_file)

In [88]:
# Bare expression so the notebook echoes the parsed list for inspection.
terms

[Term(text='der Abend, die Abende', translation='evening', sentence_ids=['998'], sentences=[]),
 Term(text='das Abendessen, die Abendessen', translation='dinner', sentence_ids=['341418'], sentences=[]),
 Term(text='am Morgen', translation='in the morning', sentence_ids=['347433'], sentences=[]),
 Term(text='jemanden an|rufen', translation='to call someone', sentence_ids=['873078'], sentences=[]),
 Term(text='def Apfel, die Äpfel', translation='apple', sentence_ids=['367095'], sentences=[]),
 Term(text='etwas auf|räumen', translation='to clean up something; to tidy up something', sentence_ids=['618818'], sentences=[]),
 Term(text='aus|gehen', translation='to go out', sentence_ids=['882822'], sentences=[]),
 Term(text='auf|stehen', translation='to get up', sentence_ids=['365619'], sentences=[]),
 Term(text='etwas ein|kaufen', translation='to buy/shop for something', sentence_ids=['3567432'], sentences=[]),
 Term(text='fast immer', translation='almost always', sentence_ids=['2571282'], se