In [2]:
import os

# Default checkout location of the lingo_kit_data repository. setdefault keeps
# any PATH_TO_REPO already exported in the environment, so the notebook can be
# pointed at another checkout without editing this cell.
os.environ.setdefault('PATH_TO_REPO', '/Users/stevie/repos/lingo_kit_data')

In [3]:
# Resolve the data repository from the environment and fail fast, with a
# readable message, when that directory is not present on this machine.
repo_path = os.environ['PATH_TO_REPO']
assert os.path.exists(repo_path), f'PATH_TO_REPO does not exist: {repo_path}'

# Update Term Pronunciation

Loads every CSV under `dataframes/dataframes_by_pos`, collects the unique `term_italian` values together with their `pronunciation`, then PATCHes each Term through the API endpoint for exact `term_italian` lookups (by default `/api/terms/by-term-italian/`).

In [4]:
from pathlib import Path
from collections import defaultdict
import pandas as pd
import requests
from tqdm import tqdm
import os

# Notebook display limits: render at most 100 rows and 50 columns per frame.
for option_name, option_value in (
    ('display.max_rows', 100),
    ('display.max_columns', 50),
):
    pd.set_option(option_name, option_value)


## Configuration

In [5]:
# Directory containing per-POS CSVs
DATA_DIR = Path(os.path.join(repo_path, 'dataframes/dataframes_by_pos'))

# Base URL of your API (no trailing slash)
API_BASE_URL = os.getenv('LINGOKIT_API_BASE_URL', 'http://127.0.0.1:8000')
# Endpoint path for patching by exact term_italian
TERMS_BY_TERM_ITALIAN_PATH = os.getenv('LINGOKIT_TERMS_BY_TERM_PATH', '/api/terms/by-term-italian/')

# Token/JWT authentication is intentionally unused; requests authenticate with
# HTTP Basic (USERNAME / PASSWORD below). Both values stay empty.
AUTH_TOKEN = ''
AUTH_SCHEME_OVERRIDE = ''

# Safety toggle. Set to False to actually PATCH the API.
DRY_RUN = False

# HTTP Basic credentials are read from the environment so that no secret is
# stored in (or committed with) the notebook.
USERNAME = os.getenv('LINGOKIT_USERNAME', 'stevie')
PASSWORD = os.getenv('LINGOKIT_PASSWORD', '')
if not DRY_RUN and not PASSWORD:
    # Stop here rather than sending thousands of requests without a password.
    raise RuntimeError(
        'Set the LINGOKIT_PASSWORD environment variable before running with DRY_RUN = False.'
    )

def build_session():
    """Create the requests.Session used for the API calls.

    The session carries JSON ``Accept`` and ``Content-Type`` headers and
    authenticates with HTTP Basic using the module-level USERNAME and PASSWORD.

    Returns:
        The configured ``requests.Session``.
    """
    api_session = requests.Session()
    api_session.headers.update({
        'Accept': 'application/json',
        'Content-Type': 'application/json',
    })
    # Always use HTTP Basic with explicit credentials
    api_session.auth = (USERNAME, PASSWORD)
    return api_session

def terms_patch_url(base_url=None, path=None):
    """Return the full URL of the "PATCH by term_italian" endpoint.

    Args:
        base_url: API root. Defaults to the module-level API_BASE_URL.
        path: Endpoint path. Defaults to the module-level
            TERMS_BY_TERM_ITALIAN_PATH.

    Returns:
        ``base_url`` and ``path`` joined by exactly one slash, whether or not
        the base ends with a slash or the path starts with one. A trailing
        slash on ``path`` is preserved.
    """
    base_url = API_BASE_URL if base_url is None else base_url
    path = TERMS_BY_TERM_ITALIAN_PATH if path is None else path
    return base_url.rstrip('/') + '/' + path.lstrip('/')

# Echo the resolved endpoint settings so the cell output records them.
API_BASE_URL, TERMS_BY_TERM_ITALIAN_PATH


('http://127.0.0.1:8000', '/api/terms/by-term-italian/')

## Discover CSV files

In [6]:
# Gather every CSV below DATA_DIR (recursively) in sorted path order, then
# show the total and the first five paths as a sanity check.
csv_files = list(DATA_DIR.rglob('*.csv'))
csv_files.sort()
len(csv_files), csv_files[:5]


(801,
 [PosixPath('/Users/stevie/repos/lingo_kit_data/dataframes/dataframes_by_pos/adjective/addolorato.csv'),
  PosixPath('/Users/stevie/repos/lingo_kit_data/dataframes/dataframes_by_pos/adjective/adulto.csv'),
  PosixPath('/Users/stevie/repos/lingo_kit_data/dataframes/dataframes_by_pos/adjective/aereo.csv'),
  PosixPath('/Users/stevie/repos/lingo_kit_data/dataframes/dataframes_by_pos/adjective/aggiunto.csv'),
  PosixPath('/Users/stevie/repos/lingo_kit_data/dataframes/dataframes_by_pos/adjective/alto.csv')])

## Build term -> pronunciation mapping

In [7]:
term_to_pron = {}     # term_italian -> pronunciation (first non-empty value seen wins)
conflicts = {}        # term_italian -> set of differing pronunciations
missing_pron = set()  # terms for which no row supplied a pronunciation

for path in tqdm(csv_files, desc='Scanning CSVs'):
    try:
        # Read every column as text and keep empty cells as '' instead of NaN.
        df = pd.read_csv(path, dtype=str, keep_default_na=False, na_filter=False, encoding='utf-8')
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the rest.
        print(f'[WARN] Failed to read {path}: {e}')
        continue

    if 'term_italian' not in df.columns:
        print(f'[WARN] term_italian column missing in {path}')
        continue
    if 'pronunciation' not in df.columns:
        print(f'[WARN] pronunciation column missing in {path}')
        continue

    for term, pron in zip(df['term_italian'], df['pronunciation']):
        term_clean = (term or '').strip()
        pron_clean = (pron or '').strip()
        if not term_clean:
            continue
        if not pron_clean:
            # Provisional: pruned after the scan if another row has a value.
            missing_pron.add(term_clean)
            continue
        # setdefault stores the first pronunciation and returns the stored one.
        first_seen = term_to_pron.setdefault(term_clean, pron_clean)
        if first_seen != pron_clean:
            # record conflict; keep the first seen
            conflicts.setdefault(term_clean, set()).update({first_seen, pron_clean})

# A term is only "missing" when no row in any file supplied a pronunciation;
# without this step a term could be reported as missing and still be patched.
missing_pron.difference_update(term_to_pron)

len(term_to_pron), len(conflicts), len(missing_pron)


Scanning CSVs: 100%|██████████| 801/801 [00:00<00:00, 1439.84it/s]


(8608, 100, 0)

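If the second number above (the conflict count) is non-zero, some terms have more than one pronunciation across the CSVs; for each of those terms the first pronunciation seen is the one that will be used.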

## Patch API

In [8]:
# Create the shared HTTP session and echo how and where requests will be sent.
session = build_session()
# Report the configured user instead of a hardcoded name.
print(f'Auth method: Basic (user={USERNAME})')
url = terms_patch_url()
print('PATCH URL:', url)

def patch_term_pronunciation(term_italian: str, pronunciation: str):
    """PATCH one term's pronunciation, or describe the request when DRY_RUN is set.

    Args:
        term_italian: Sent as the ``term_italian`` query parameter.
        pronunciation: Sent as the ``pronunciation`` field of the JSON body.

    Returns:
        With DRY_RUN enabled, a dict with the keys ``dry_run``,
        ``term_italian`` and ``pronunciation`` (no request is made).
        Otherwise the response object returned by ``session.patch``.
    """
    if DRY_RUN:
        return {'dry_run': True, 'term_italian': term_italian, 'pronunciation': pronunciation}
    return session.patch(
        url,
        params={'term_italian': term_italian},
        json={'pronunciation': pronunciation},
        timeout=30,
    )

# quick smoke test (disabled by default)
# _ = patch_term_pronunciation('il treno', 'il tré-no')


Auth method: Basic (user=stevie)
PATCH URL: http://127.0.0.1:8000/api/terms/by-term-italian/


## Execute updates

In [9]:
results = {
    'updated': 0,
    'skipped_missing_pron': 0,
    'errors': 0,
}
# One dict per failed term: HTTP status plus a body excerpt, or the exception text.
errors = []

for term, pron in tqdm(term_to_pron.items(), desc='Patching terms'):
    if not pron:
        results['skipped_missing_pron'] += 1
        continue
    try:
        resp = patch_term_pronunciation(term, pron)
        if DRY_RUN:
            # Nothing was sent; count the term as one that would be updated.
            results['updated'] += 1
            continue
        # Any 2xx reply is a success (e.g. 204 No Content as well as 200/202).
        if 200 <= resp.status_code < 300:
            results['updated'] += 1
        else:
            results['errors'] += 1
            # Keep only the first 500 characters of the body.
            errors.append({'term': term, 'status': resp.status_code, 'body': resp.text[:500]})
    except Exception as e:
        # Best effort: a failure on one term must not abort the whole run.
        results['errors'] += 1
        errors.append({'term': term, 'error': str(e)})

results


Patching terms: 100%|██████████| 8608/8608 [4:02:22<00:00,  1.69s/it]     


{'updated': 4098, 'skipped_missing_pron': 0, 'errors': 4510}

## Summary

In [10]:
print('Conflicts:', len(conflicts))
# Show up to five examples. The candidates are stored in a set, so they are
# sorted here to make the printed order reproducible between runs.
for conflict_term, candidates in list(conflicts.items())[:5]:
    print('-', conflict_term, '->', sorted(candidates))
print('Missing pronunciations (not patched):', len(missing_pron))
print('Errors:', len(errors))
# Print at most the first ten error records.
for error_record in errors[:10]:
    print(error_record)


Conflicts: 100
- certo -> ['CHEHR-toh', 'CHER-toh']
- lontano -> ['lon-TAH-noh', 'lohn-TAH-noh']
- pronto -> ['PRON-toh', 'PROHN-toh']
- spesso -> ['SPEHS-soh', 'SPEH-soh']
- però -> ['pehr-OH', 'peh-ROH']
Missing pronunciations (not patched): 0
Errors: 4510
{'term': 'aereo', 'status': 404, 'body': '{"detail":"No Term matches the given query."}'}
{'term': 'aerea', 'status': 404, 'body': '{"detail":"No Term matches the given query."}'}
{'term': 'aerei', 'status': 404, 'body': '{"detail":"No Term matches the given query."}'}
{'term': 'aeree', 'status': 404, 'body': '{"detail":"No Term matches the given query."}'}
{'term': "dall'altro", 'status': 404, 'body': '{"detail":"No Term matches the given query."}'}
{'term': "dall'altra", 'status': 404, 'body': '{"detail":"No Term matches the given query."}'}
{'term': "d'altro", 'status': 404, 'body': '{"detail":"No Term matches the given query."}'}
{'term': "l'altra", 'status': 404, 'body': '{"detail":"No Term matches the given query."}'}
{'term'