In [56]:
import fitz
import os
import pandas as pd
import re
import json
import uuid
import ast
import warnings
from datetime import timedelta
import dateparser


# Folder holding the input Excel workbook and the person index CSV
# (relative path, resolved against the current working directory).
IN_FOLDER= '../data/excel'
# Root folder that receives the generated JSON fragments.
OUT_FOLDER= '../data/json'

In [57]:
# Read the cleaned programme workbook from IN_FOLDER and preview its first rows.
file="clean-csv.xlsx"
path= os.path.join(IN_FOLDER,file)
df=pd.read_excel(path)
df.head()

Unnamed: 0,ID,page_start,page_end,titre,sous-titre,dates,salle,auteur,adaptation,mise en scène,...,production,coproduction,soutien,aide,source,coréalisation,other,works,date_start,date_end
0,od_1,4,5,Le passé,Les cieux s’ouvrirent et… ici prit fin l’histo...,13 septembre—4 octobre,Odéon paris 6,['Léonid Andréïev'],['Julien Gosselin'],['Julien Gosselin'],...,['Si vous pouviez lécher mon coeur'],"['Odéon théâtre de l’europe', 'Le phénix — scè...",['Ministère de la culture'],"['Montévidéo — centre d’art', 'T2g théâtre de ...",programme de saison Odéon 25/26,,,['Ékatérina Ivanovna'],13 September 2025,04 October 2025
1,od_2,7,7,Musée duras,,9—30 novembre,Berthier paris 17,['Marguerite Duras'],,['Julien Gosselin'],...,"['Odéon théâtre de l’europe', 'Conservatoire n...",,,['Jeune théâtre national'],programme de saison Odéon 25/26,,,,09 November 2025,30 November 2025
2,od_3,8,8,Honda romance,,14—26 octobre,Odéon paris 6,,,['Vimala Pons'],...,"['Tout ça / que ça', 'Comédie de genève']","['Odéon théâtre de l’europe', 'Mc2 : maison de...",['Fondation bnp paribas'],"['""plateforme 2 pôles cirque en normandie — la...",programme de saison Odéon 25/26,['Festival d’Automne'],Festival d’Automne,,14 October 2025,26 October 2025
3,od_4,9,9,Pallaksch pallaksch!,Pièces élémentaires,26 novembre—14 février,Petit odéon paris 6,"['D. H. Lawrence', 'Arthur Schnitzler', 'Hugo ...",,['Marie-José Malis'],...,['Compagnie la llevantina'],"['Odéon théâtre de l’europe', 'Comédie de genè...",,,programme de saison Odéon 25/26,,,"['La Coccinelle', 'Le Voile de Pierrette', 'Le...",26 November 2025,14 February 2026


In [55]:
def load_json(path):
    """Read and decode the JSON document stored at ``path``.

    Returns the decoded object, or None when nothing exists at ``path``.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    return None

def save_json(data, path):
    """Write ``data`` to ``path`` as UTF-8 JSON, indented by two spaces.

    Non-ASCII characters are written as-is rather than escaped.
    """
    with open(path, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=2, ensure_ascii=False)

def random_person_uri(base="https://data.stage.org/auth/"):
    """Return ``base`` followed by the hex form of a freshly generated UUID4."""
    token = uuid.uuid4().hex
    return base + token

def is_empty_cell(x):
    """Check whether a spreadsheet cell value should be treated as empty.

    A value is empty when it is None, any pandas/NumPy missing marker
    (NaN, ``pd.NA``, ``pd.NaT``), or a string that is blank or the literal
    ``"[]"`` once stripped. Containers (lists, dicts, ...) are never empty
    here, matching the previous behaviour.

    Returns:
        bool: True when the cell carries no usable value.
    """
    if x is None:
        return True
    if isinstance(x, str):
        return x.strip() in ("", "[]")
    # pd.isna is only well-defined as a single bool for scalars; on a list it
    # would return an array, so containers are excluded explicitly.
    if pd.api.types.is_scalar(x):
        return bool(pd.isna(x))
    return False

def parse_listlike_cell(val):
    """Parse an Excel cell holding a list, or a list written as a string.

    Accepted inputs:
      * a real list -> returned without its None items;
      * a string such as ``"['A', 'B']"`` -> evaluated as a Python literal,
        None items removed;
      * any other non-blank string -> a one-element list with the stripped text;
      * anything else (None, NaN, numbers, blank strings) -> ``[]``.

    A bracketed string that is not a valid Python list literal still yields
    ``[]``, but a warning is emitted so that the dropped cell is visible
    instead of vanishing silently.

    Returns:
        list: the parsed items (possibly empty).
    """
    if val is None:
        return []
    if isinstance(val, list):
        return [item for item in val if item is not None]
    if not isinstance(val, str):
        return []

    text = val.strip()
    if not text:
        return []
    if not (text.startswith("[") and text.endswith("]")):
        return [text]

    try:
        parsed = ast.literal_eval(text)
    except Exception as exc:
        # Best effort: keep going, but make the data loss observable.
        warnings.warn(
            f"[parse_listlike_cell] Could not parse list-like cell {text!r}: {exc}"
        )
        return []

    if not isinstance(parsed, list):
        return []
    # Same None filtering as for real lists, so both input forms agree.
    return [item for item in parsed if item is not None]

def parse_names_cell(val):
    """Return the non-blank, stripped string items found in a list-like cell."""
    cleaned = []
    for item in parse_listlike_cell(val):
        if not isinstance(item, str):
            continue
        stripped = item.strip()
        if stripped:
            cleaned.append(stripped)
    return cleaned

def parse_date_any(s):
    """Parse a cell value into a naive ``datetime``.

    Accepted inputs:
      * ``datetime`` (including ``pandas.Timestamp``) -> returned as a plain
        naive ``datetime``; timezone-aware values are converted to UTC first;
      * ``date`` -> midnight of that day;
      * non-blank string -> parsed by ``dateparser`` with French and English
        enabled;
      * anything else (None, NaN, NaT, blank strings) -> None.

    Returns:
        datetime | None: the parsed value, or None when nothing could be parsed.
    """
    # Local import: the file-level import only brings in ``timedelta``.
    from datetime import date, datetime, timezone

    if s is None:
        return None

    if isinstance(s, datetime):
        # pandas may hand back Timestamp/NaT for date-typed Excel columns.
        if pd.isna(s):
            return None
        if s.tzinfo is not None:
            s = s.astimezone(timezone.utc)
        # Rebuild as a plain naive datetime so callers get one consistent type.
        return datetime(s.year, s.month, s.day, s.hour, s.minute, s.second, s.microsecond)

    if isinstance(s, date):
        return datetime(s.year, s.month, s.day)

    if not isinstance(s, str) or not s.strip():
        return None

    return dateparser.parse(
        s.strip(),
        languages=["fr", "en"],
        settings={"TIMEZONE": "UTC", "RETURN_AS_TIMEZONE_AWARE": False},
    )

def parse_year_from_date_start(date_str):
    """Extract the calendar year from a ``date_start`` cell.

    Accepts either a date string (parsed through ``parse_date_any``) or an
    already-typed ``date``/``datetime``/``pandas.Timestamp`` value, which
    pandas can return for date-formatted Excel columns.

    Returns:
        int | None: the year, or None for missing/blank/unparseable input.
    """
    # Local import: the file-level import only brings in ``timedelta``.
    from datetime import date

    # ``datetime`` and ``pandas.Timestamp`` are subclasses of ``date``.
    if isinstance(date_str, date):
        if pd.isna(date_str):  # NaT
            return None
        return int(date_str.year)

    if not isinstance(date_str, str) or not date_str.strip():
        return None
    dt = parse_date_any(date_str)
    return dt.year if dt else None

def to_iso_utc_midnight(dt):
    """Format the date part of ``dt`` as ``YYYY-MM-DDT00:00:00Z``.

    The time of day carried by ``dt`` is ignored; the output always shows midnight.
    """
    day_part = dt.strftime("%Y-%m-%d")
    return day_part + "T00:00:00Z"

def daterange_inclusive(start_dt, end_dt):
    """Yield ``start_dt``, then one value per following day, up to and including ``end_dt``.

    Nothing is yielded when ``start_dt`` is later than ``end_dt``.
    """
    one_day = timedelta(days=1)
    current = start_dt
    while not current > end_dt:
        yield current
        current = current + one_day

def load_person_index_csv(index_csv_path):
    """Load the person index CSV as a ``person_name -> person_id`` dict.

    The CSV is expected to have ``person_name`` and ``person_id`` columns.
    A missing or completely empty file yields an empty dict, so a first run
    (before any index has been saved) starts from scratch instead of
    crashing. Rows lacking a name or an id are skipped so that NaN never
    ends up as a key or as an identifier.

    Returns:
        dict: mapping of person name to person id.
    """
    if not os.path.exists(index_csv_path):
        return {}
    try:
        df_idx = pd.read_csv(index_csv_path)
    except pd.errors.EmptyDataError:
        # Zero-byte file: nothing indexed yet.
        return {}
    df_idx = df_idx.dropna(subset=["person_name", "person_id"])
    return dict(zip(df_idx["person_name"], df_idx["person_id"]))

def save_person_index_csv(name_to_id, index_csv_path):
    """Write the ``name -> id`` mapping to CSV with columns ``person_id, person_name``.

    The DataFrame index is not written.
    """
    records = [
        (person_id, person_name)
        for person_name, person_id in name_to_id.items()
    ]
    table = pd.DataFrame(records, columns=["person_id", "person_name"])
    table.to_csv(index_csv_path, index=False)

def get_or_create_person_id(name, name_to_id):
    """Return the id registered for ``name``, minting one when absent.

    When ``name`` has no (truthy) id in ``name_to_id``, a random person URI
    is generated, stored in the mapping, and a warning is emitted.
    """
    known_id = name_to_id.get(name)
    if known_id:
        return known_id

    new_id = random_person_uri()
    name_to_id[name] = new_id
    warnings.warn(f"[index] Missing person '{name}' -> generated new id: {new_id}")
    return new_id

In [13]:
def build_fragment_a(work_id, title, director_names, name_to_id):
    """Build Fragment A: the work (PropositionalObject) with its directors.

    Args:
        work_id: identifier appended to ``https://data.stage.org/works/``.
        title: title of the work, used as the Name content and in the label.
        director_names: iterable of director names; each is resolved to a
            person id through ``get_or_create_person_id``.
        name_to_id: mutable ``name -> id`` index, updated for unknown names.

    Returns:
        dict: the Linked Art JSON-LD structure for the work.
    """
    names = list(director_names)

    carried_out_by = [
        {
            "id": get_or_create_person_id(name, name_to_id),
            "type": "Person",
            "_label": name,
        }
        for name in names
    ]

    # Without directors the label would end in a dangling "as conceived by ".
    label = f"{title} as conceived by {', '.join(names)}" if names else title

    return {
        "@context": "https://linked.art/ns/v1/linked-art.json",
        "id": f"https://data.stage.org/works/{work_id}",
        "type": "PropositionalObject",
        "_label": label,
        "classified_as": [{
            "id": "https://vocab.getty.edu/aat/300387357",
            "type": "Type",
            "_label": "works (general, creative)",
        }],
        "identified_by": [{
            "type": "Name",
            "classified_as": [{
                "id": "http://vocab.getty.edu/aat/300404670",
                "type": "Type",
                "_label": "Title",
            }],
            "content": title,
            "language": [{
                "id": "http://vocab.getty.edu/aat/300388306",
                "type": "Language",
                # Was "label": every other node here uses "_label".
                "_label": "French",
            }],
        }],
        "created_by": [{
            "type": "Creation",
            "part": [{
                "type": "Production",
                "technique": [{
                    # Concept URI; the former ".../page/aat/..." form is the
                    # HTML page about the concept, not the concept itself.
                    "id": "http://vocab.getty.edu/aat/300404387",
                    "type": "Type",
                    "_label": "Creating",
                }],
                "carried_out_by": carried_out_by,
            }],
        }],
    }

def build_fragment_a_2(row, name_to_id, works_col, auteur_col):
    """Build Fragment A2: what the work was influenced by (source texts or authors).

    When the ``works_col`` cell is empty, each author becomes a Person
    reference. Otherwise each listed source work becomes a LinguisticObject
    whose id is built from the percent-encoded work title.

    Author attribution in the work label:
      * same number of authors and works -> paired by position;
      * otherwise -> the first author is used for every work (as before);
      * no author -> the bare work title.

    Args:
        row: mapping-like row exposing ``.get``.
        name_to_id: mutable ``name -> id`` index, updated for unknown authors.
        works_col: column holding the list of source works.
        auteur_col: column holding the list of authors.

    Returns:
        dict: ``{"influenced_by": [...]}``.
    """
    from urllib.parse import quote  # local import: not among the file-level imports

    influenced_by = []

    works_val = row.get(works_col)
    authors = parse_names_cell(row.get(auteur_col))

    # If no works, use author persons
    if is_empty_cell(works_val):
        for author in authors:
            influenced_by.append({
                "id": get_or_create_person_id(author, name_to_id),
                "type": "Person",
                "_label": author,
            })
        return {"influenced_by": influenced_by}

    # If works present, create linguistic objects
    works = parse_names_cell(works_val)
    # NOTE(review): positional pairing assumes both lists are in the same
    # order when they have the same length - confirm against the spreadsheet.
    paired = len(authors) == len(works)
    for position, work_name in enumerate(works):
        if paired:
            author_name = authors[position]
        elif authors:
            author_name = authors[0]
        else:
            author_name = ""

        influenced_by.append({
            # Titles contain spaces and accents; encode them so the id is a valid URI.
            "id": f"https://data.stage.org/text/{quote(work_name, safe='')}",
            "type": "LinguisticObject",
            "_label": f"{work_name} de {author_name}".strip() if author_name else work_name,
            "classified_as": [{
                # Concept URI rather than the ".../page/aat/..." HTML page.
                "id": "http://vocab.getty.edu/aat/300410356",
                "type": "Type",
                "_label": "Adaptation",
            }],
        })

    return {"influenced_by": influenced_by}

def build_fragment_a_3(row, date_start_col, source_id_col):
    """Build Fragment A3: year timespan and source attribution.

    The timespan is derived from the year of the ``date_start_col`` cell and
    is None when no year can be extracted. The attribution points at the
    programme named in ``source_id_col``; its id is built from the
    percent-encoded source string. A missing source yields an empty
    ``attributed_by`` list rather than a reference to ``.../programs/nan``.

    Args:
        row: mapping-like row exposing ``.get``.
        date_start_col: column holding the start date.
        source_id_col: column holding the source programme identifier.

    Returns:
        dict: ``{"timespan": dict | None, "attributed_by": list}``.
    """
    from urllib.parse import quote  # local import: not among the file-level imports

    source_id = row.get(source_id_col)
    date_start = row.get(date_start_col)

    year = parse_year_from_date_start(date_start)
    ts = None
    if year:
        ts = {
            "type": "TimeSpan",
            "_label": str(year),
            "begin_of_the_begin": f"{year}-01-01T00:00:00Z",
        }

    attributed_by = []
    if not is_empty_cell(source_id):
        source_label = str(source_id).strip()
        attributed_by.append({
            "type": "AttributeAssignment",
            "identified_by": [{
                "type": "Type",
                "id": "https://vocab.getty.edu/aat/300027216",
                "_label": "show programme",
                "classified_as": [{
                    "id": "https://vocab.getty.edu/aat/300311936",
                    "type": "Type",
                    "_label": "primary source",
                }],
            }],
            "assigned_property": "takes_information_from",
            "assigned": [{
                "type": "HumanMadeObject",
                # The source is free text (spaces, accents, "/"); encode it so
                # the id is a valid URI with a single path segment.
                "id": f"https://data.stage.org/programs/{quote(source_label, safe='')}",
                "_label": source_label,
            }],
        })

    return {
        "timespan": ts,
        "attributed_by": attributed_by
    }

def process_fragment_a_row(row, name_to_id, id_col, title_col, director_col, 
                          works_col, auteur_col, date_start_col, source_id_col, out_path):
    """Write the three Fragment A files for a single spreadsheet row.

    Rows with an empty id or title are skipped. Output goes to
    ``<out_path>/<row id>/`` as ``fragment_a.json``, ``fragment_a_2.json``
    and ``fragment_a_3.json``, built and saved in that order.
    """
    raw_id = row.get(id_col)
    raw_title = row.get(title_col)
    if is_empty_cell(raw_id) or is_empty_cell(raw_title):
        return

    work_id = str(raw_id).strip()
    work_title = str(raw_title).strip()

    target_dir = os.path.join(out_path, work_id)
    os.makedirs(target_dir, exist_ok=True)

    def _write(filename, payload):
        save_json(payload, os.path.join(target_dir, filename))

    # Main work; dict.fromkeys drops duplicate directors while keeping order.
    unique_directors = list(dict.fromkeys(parse_names_cell(row.get(director_col))))
    _write("fragment_a.json", build_fragment_a(work_id, work_title, unique_directors, name_to_id))

    # Influences (source works or authors).
    _write("fragment_a_2.json", build_fragment_a_2(row, name_to_id, works_col, auteur_col))

    # Timespan and source attribution.
    _write("fragment_a_3.json", build_fragment_a_3(row, date_start_col, source_id_col))

In [None]:
def assemble_from_folder(row_id, title, folder_path):
    """Assemble the Fragment B production document from its component files.

    Component files are read from ``<folder_path>/<row_id>/b-production/``;
    any that is missing is simply left out of the result.
    """
    production_dir = os.path.join(folder_path, str(row_id), "b-production")

    def _fragment(filename):
        return load_json(os.path.join(production_dir, filename))

    title_frag = _fragment("b-title.json")
    place_frag = _fragment("b-place.json")
    creative_frag = _fragment("b-creative.json")
    prod_frag = _fragment("b-prod.json")
    timespan_frag = _fragment("b-timespan.json")

    genre = {
        "id": "https://vocab.getty.edu/aat/300417582",
        "type": "Type",
        "_label": "Theater (genre)",
    }
    classification = {
        "id": "https://vocab.getty.edu/aat/300069200",
        "type": "Type",
        "_label": "Performances (creative events)",
        "classified_as": [genre],
    }
    final = {
        "@context": "https://linked.art/ns/v1/linked-art.json",
        "id": f"https://data.stage.org/prod/{row_id}",
        "type": "Activity",
        "_label": title,
        "classified_as": [classification],
        "produced_by": [{"type": "Production", "part": []}],
    }

    # Title and place fragments are merged at the top level.
    for top_level in (title_frag, place_frag):
        if isinstance(top_level, dict):
            final.update(top_level)

    # Creative and production fragments feed the Production's parts: either a
    # ready-made "part" list, or a single part recognised by its keys.
    for candidate in (creative_frag, prod_frag):
        if not isinstance(candidate, dict):
            continue
        nested_parts = candidate.get("part") if "part" in candidate else None
        if isinstance(nested_parts, list):
            final["produced_by"][0]["part"].extend(nested_parts)
        elif "technique" in candidate and "carried_out_by" in candidate:
            final["produced_by"][0]["part"].append(candidate)

    if isinstance(timespan_frag, dict):
        final["timespan"] = timespan_frag

    return final

def process_fragment_b_row(row, id_col, title_col, folder_path):
    """Assemble and save the Fragment B document for a single row.

    Rows with an empty id are skipped. The result is written to
    ``<folder_path>/<row id>/b-production/b-final.json``; that directory is
    created when absent, so a row without any component fragment no longer
    fails on save. The id is stripped, as in ``process_fragment_a_row``, so
    both functions target the same row folder.

    Args:
        row: mapping-like row exposing ``.get``.
        id_col: column holding the row identifier.
        title_col: column holding the title (non-string values become "").
        folder_path: root output folder.
    """
    row_id = row.get(id_col)
    if is_empty_cell(row_id):
        return
    row_id = str(row_id).strip()

    title = row.get(title_col)
    title = title if isinstance(title, str) else ""

    production_dir = os.path.join(folder_path, row_id, "b-production")
    os.makedirs(production_dir, exist_ok=True)

    final = assemble_from_folder(row_id, title, folder_path)
    save_json(final, os.path.join(production_dir, "b-final.json"))

In [15]:
def build_C_for_date(row_id, title, odeon_id, iso_date, duration_minutes):
    """Build Fragment C: one performance Activity for a specific date.

    Args:
        row_id: production identifier; combined with the date for the show id.
        title: production title, used in the labels.
        odeon_id: source programme identifier. It is percent-encoded inside
            the programme URI; when missing (None/NaN/blank) no attribution
            is emitted.
        iso_date: ISO timestamp whose first 10 characters are ``YYYY-MM-DD``.
        duration_minutes: duration in minutes, or None when unknown, in which
            case the timespan carries no ``duration`` (instead of a null value).

    Returns:
        dict: the Linked Art JSON-LD structure for the dated performance.
    """
    from urllib.parse import quote  # local import: not among the file-level imports

    date_suffix = iso_date[:10].replace("-", "")  # YYYYMMDD

    timespan = {
        "type": "TimeSpan",
        "_label": "Date",
        "begin_of_the_begin": iso_date,
    }
    if duration_minutes is not None:
        timespan["duration"] = {
            "type": "Dimension",
            "value": duration_minutes,
            "unit": {
                "id": "http://vocab.getty.edu/aat/300379240",
                "type": "MeasurementUnit",
                "_label": "minutes",
            },
        }

    # Missing source cells arrive as None/NaN; str() would turn them into "nan".
    source_missing = pd.api.types.is_scalar(odeon_id) and bool(pd.isna(odeon_id))
    source_label = "" if source_missing else str(odeon_id).strip()

    attributed_by = []
    if source_label:
        attributed_by.append({
            "type": "AttributeAssignment",
            "identified_by": [{
                "type": "Type",
                "id": "https://vocab.getty.edu/aat/300027216",
                "_label": "show programme",
                "classified_as": [{
                    "id": "https://vocab.getty.edu/aat/300311936",
                    "type": "Type",
                    "_label": "primary source",
                }],
            }],
            "assigned_property": "takes_information_from",
            "assigned": [{
                "type": "HumanMadeObject",
                # Free-text source (spaces, accents, "/") -> valid single-segment URI.
                "id": f"https://data.stage.org/programs/{quote(source_label, safe='')}",
                "_label": source_label,
            }],
        })

    return {
        "@context": "https://linked.art/ns/v1/linked-art.json",
        "id": f"https://data.stage.org/shows/{row_id}_{date_suffix}",
        "type": "Activity",
        "_label": f"Specific Dates for {title} at Odéon",
        "part_of": [{
            "id": f"https://data.stage.org/prod/{row_id}",
            "type": "Activity",
            "_label": f"Production {title} at Odéon",
        }],
        "classified_as": [{
            # TODO: placeholder id kept as-is until the concept is chosen.
            "id": "http://vocab.getty.edu/page/aat/300XXXXXX",
            "type": "Type",
            "_label": "performance (performing arts show) to be defined in Getty",
        }],
        "timespan": timespan,
        "attributed_by": attributed_by,
    }

def process_fragment_c_row(row, id_col, title_col, duration_col, odeon_id_col, 
                          start_col, end_col, out_dir):
    """Write Fragment C (one entry per day of the run) for a single row.

    The row is skipped when its id or title is empty, or when either the
    start or the end date cannot be parsed. A duration that cannot be
    converted to an int becomes None. The list of per-day entries is saved
    to ``<out_dir>/c_<row id>.json``.
    """
    row_id = row.get(id_col)
    if is_empty_cell(row_id):
        return

    title = row.get(title_col)
    if is_empty_cell(title):
        return

    odeon_id = row.get(odeon_id_col)

    # Duration is optional: anything int() rejects is recorded as unknown.
    try:
        minutes = int(row.get(duration_col))
    except Exception:
        minutes = None

    first_day = parse_date_any(row.get(start_col))
    last_day = parse_date_any(row.get(end_col))
    if not (first_day and last_day):
        return

    # Drop the time of day so the range walks whole days.
    midnight = {"hour": 0, "minute": 0, "second": 0, "microsecond": 0}
    first_day = first_day.replace(**midnight)
    last_day = last_day.replace(**midnight)

    shows = [
        build_C_for_date(
            row_id=str(row_id),
            title=str(title).strip(),
            odeon_id=odeon_id,
            iso_date=to_iso_utc_midnight(day),
            duration_minutes=minutes,
        )
        for day in daterange_inclusive(first_day, last_day)
    ]

    save_json(shows, os.path.join(out_dir, f"c_{row_id}.json"))

In [16]:
def process_all_fragments(df, out_path, index_csv_path):
    """Generate fragments A, B and C for every row of ``df``.

    Args:
        df: DataFrame of programme rows (columns ``ID``, ``titre``,
            ``mise en scène``, ``works``, ``auteur``, ``date_start``,
            ``date_end``, ``source`` and ``durée`` are read).
        out_path: root output folder, created when absent.
        index_csv_path: path of the person index CSV, loaded at the start
            and rewritten at the end.

    The person index is saved in a ``finally`` clause: person ids are random,
    so if a row fails midway the ids already written into fragments on disk
    must still be persisted, otherwise the next run would mint different
    ids for the same people.
    """
    os.makedirs(out_path, exist_ok=True)

    # Load person index
    name_to_id = load_person_index_csv(index_csv_path)

    try:
        # Process each row
        for _, row in df.iterrows():
            # Fragment A (all variants)
            process_fragment_a_row(
                row, name_to_id,
                id_col="ID",
                title_col="titre",
                director_col="mise en scène",
                works_col="works",
                auteur_col="auteur",
                date_start_col="date_start",
                source_id_col="source",
                out_path=out_path
            )

            # Fragment B
            process_fragment_b_row(
                row,
                id_col="ID",
                title_col="titre",
                folder_path=out_path
            )

            # Fragment C
            process_fragment_c_row(
                row,
                id_col="ID",
                title_col="titre",
                duration_col="durée",
                odeon_id_col="source",
                start_col="date_start",
                end_col="date_end",
                out_dir=out_path
            )
    finally:
        # Save updated person index, even when a row raised.
        save_person_index_csv(name_to_id, index_csv_path)

In [19]:
# Entry point: reload the workbook and run the full A/B/C fragment pipeline.
if __name__ == "__main__":
    # Load data
    file = "clean-csv.xlsx"
    path = os.path.join(IN_FOLDER, file)
    df = pd.read_excel(path)
    
    print(f"Loaded {len(df)} rows from {file}")
    
    # Process all fragments
    # The person index lives next to the input workbook and is rewritten by the run.
    index_csv_path = os.path.join(IN_FOLDER, "person_index.csv")
    process_all_fragments(df, OUT_FOLDER, index_csv_path)
    
    print(f"\nProcessing complete. Output saved to {OUT_FOLDER}")

Loaded 4 rows from clean-csv.xlsx

Processing complete. Output saved to ../data/json


In [None]:
def assemble_final_a(row_id, folder_path):
    """Assemble final_a from fragment_a, fragment_a_2 and fragment_a_3.

    Fragments are read from ``<folder_path>/<row_id>/``. ``fragment_a`` is
    the base; ``influenced_by`` (from A2) and ``timespan`` / ``attributed_by``
    (from A3) are added on top. A null timespan - written by A3 when no year
    could be extracted - is left out instead of being copied as
    ``"timespan": null``.

    Returns:
        dict | None: the merged document, or None (after printing a warning)
        when ``fragment_a.json`` is missing or empty.
    """
    row_dir = os.path.join(folder_path, str(row_id))

    # Load fragments
    fragment_a = load_json(os.path.join(row_dir, "fragment_a.json"))
    fragment_a_2 = load_json(os.path.join(row_dir, "fragment_a_2.json"))
    fragment_a_3 = load_json(os.path.join(row_dir, "fragment_a_3.json"))

    if not fragment_a:
        print(f"Warning: fragment_a.json not found for {row_id}")
        return None

    # Start with fragment_a as base (shallow copy: nested values are shared)
    final_a = fragment_a.copy()

    # Add influenced_by from fragment_a_2
    if fragment_a_2 and "influenced_by" in fragment_a_2:
        final_a["influenced_by"] = fragment_a_2["influenced_by"]

    # Add timespan and attributed_by from fragment_a_3
    if fragment_a_3:
        timespan = fragment_a_3.get("timespan")
        if timespan is not None:
            final_a["timespan"] = timespan
        if "attributed_by" in fragment_a_3:
            final_a["attributed_by"] = fragment_a_3["attributed_by"]

    return final_a

def process_final_a_row(row, id_col, folder_path):
    """Assemble and save ``final_a.json`` for one row, printing the outcome.

    Rows with an empty id are skipped without any output.
    """
    raw_id = row.get(id_col)
    if is_empty_cell(raw_id):
        return

    row_id = str(raw_id).strip()
    assembled = assemble_final_a(row_id, folder_path)

    if not assembled:
        print(f"✗ Failed to create final_a.json for {row_id}")
        return

    save_json(assembled, os.path.join(folder_path, row_id, "final_a.json"))
    print(f"✓ Created final_a.json for {row_id}")

def assemble_all_final_a(df, out_path, id_col="ID"):
    """Run the final_a assembly over every row of ``df`` and report the row count."""
    print("Assembling final_a.json files from fragments...")
    print(f"Output directory: {out_path}\n")

    processed = 0
    for processed, (_, row) in enumerate(df.iterrows(), start=1):
        process_final_a_row(row, id_col=id_col, folder_path=out_path)

    print(f"\n✓ Processed {processed} rows")

Loaded 4 rows from clean-csv.xlsx

Assembling final_a.json files from fragments...
Output directory: ../data/json

✓ Created final_a.json for od_1
✓ Created final_a.json for od_2
✓ Created final_a.json for od_3
✓ Created final_a.json for od_4

✓ Processed 4 rows

✓ Assembly complete. Files saved to ../data/json


In [None]:
# Entry point: reload the workbook and merge the A fragments into final_a.json per row.
# Reads the fragment files from OUT_FOLDER, so the fragment-generation cell must have run first.
if __name__ == "__main__":
    # Load data
    file = "clean-csv.xlsx"
    path = os.path.join(IN_FOLDER, file)
    df = pd.read_excel(path)
    
    print(f"Loaded {len(df)} rows from {file}\n")
    
    # Assemble all final_a files
    assemble_all_final_a(df, OUT_FOLDER, id_col="ID")
    
    print(f"\n✓ Assembly complete. Files saved to {OUT_FOLDER}")