In [29]:
import fitz
import os
from utils import clean_text, make_continuous, capitalize_after_comma, time_to_minutes, split_date_range
import pandas as pd
import re
import json
import uuid
import ast
import dateparser


IN_FOLDER='../data/excel'
OUT_FOLDER='../data/json'

In [7]:
file="clean-csv.xlsx"
path= os.path.join(IN_FOLDER,file)
df=pd.read_excel(path)
df.head()

Unnamed: 0,ID,page_start,page_end,titre,sous-titre,dates,salle,auteur,adaptation,mise en scène,...,surtitrage,production,coproduction,soutien,aide,source,coréalisation,other,date_start,date_end
0,od_1,4,5,Le passé,Les cieux s’ouvrirent et… ici prit fin l’histo...,13 septembre—4 octobre,Odéon paris 6,Léonid Andréïev,['Julien Gosselin'],['Julien Gosselin'],...,,['Si vous pouviez lécher mon coeur'],"['Odéon théâtre de l’europe', 'Le phénix — scè...",['Ministère de la culture'],"['Montévidéo — centre d’art', 'T2g théâtre de ...",programme de saison Odéon 25/26,,,13 September 2025,04 October 2025
1,od_2,7,7,Musée duras,,9—30 novembre,Berthier paris 17,Marguerite Duras,,['Julien Gosselin'],...,['Alice de la Bouillerie'],"['Odéon théâtre de l’europe', 'Conservatoire n...",,,['Jeune théâtre national'],programme de saison Odéon 25/26,,,09 November 2025,30 November 2025
2,od_3,8,8,Honda romance,,14—26 octobre,Odéon paris 6,Vimala Pons,,['Vimala Pons'],...,,"['Tout ça / que ça', 'Comédie de genève']","['Odéon théâtre de l’europe', 'Mc2 : maison de...",['Fondation bnp paribas'],"['""plateforme 2 pôles cirque en normandie — la...",programme de saison Odéon 25/26,['Festival d’Automne'],Festival d’Automne,14 October 2025,26 October 2025
3,od_4,9,9,Pallaksch pallaksch!,Pièces élémentaires,26 novembre—14 février,Petit odéon paris 6,"La Coccinelle\nde D. H. Lawrence, Le Voile de...",,['Marie-José Malis'],...,,['Compagnie la llevantina'],"['Odéon théâtre de l’europe', 'Comédie de genè...",,,programme de saison Odéon 25/26,,,26 November 2025,14 February 2026


### Timespan

In [13]:
def to_iso_utc_midnight(date_str):
    if not isinstance(date_str, str) or not date_str.strip():
        return None

    dt = dateparser.parse(
        date_str,
        languages=["fr", "en"],
        settings={
            "TIMEZONE": "UTC",
            "RETURN_AS_TIMEZONE_AWARE": False
        }
    )

    if not dt:
        return None

    return dt.strftime("%Y-%m-%dT00:00:00Z")


In [26]:
def build_timespan_from_row(row, id_col, start_col, end_col):
    begin = to_iso_utc_midnight(row.get(start_col))
    end = to_iso_utc_midnight(row.get(end_col))

    if begin is None and end is None:
        return None

    row_id = row.get(id_col)
    if row_id is None:
        return None

    return {
        str(row_id): {
            "type": "TimeSpan",
            "begin_of_the_begin": begin,
            "end_of_the_end": end,
        }
    }

def pipeline_timespan(df, id_col, start_col, end_col):
    timespans = {}

    for _, row in df.iterrows():
        ts = build_timespan_from_row(
            row=row,
            id_col=id_col,
            start_col=start_col,
            end_col=end_col,
        )
        if ts:
            timespans.update(ts)

    return timespans


In [None]:
timespan_fragment = pipeline_timespan(df, id_col="ID", start_col="date_start", end_col="date_end")

In [None]:
for row_id, payload in timespan_fragment.items():
   
    row_dir = os.path.join(OUT_FOLDER, str(row_id))
    os.makedirs(row_dir, exist_ok=True)

    # save the payload (or fragment) inside the id folder
    with open(os.path.join(row_dir, "b-timespan.json"), "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)

### Place

In [39]:
def build_place_fragment_from_row(row, place_col):
    label = row.get(place_col)

    if not isinstance(label, str) or not label.strip():
        return None

    return {
        "took_place_at": [
            {
                "id": "https://data.stage.org/auth/odeon",  # fixed or replace later
                "type": "Place",
                "_label": "Odéon Paris",
                "classified_as": [
                    {
                        "id": "http://vocab.getty.edu/page/aat/300121919",
                        "type": "Type",
                        "_label": "performing arts structures",
                    }
                ],
                "part": [
                    {
                        "type": "Place",
                        "_label": label.strip(),
                        "classified_as": [
                            {
                                "id": "http://vocab.getty.edu/page/aat/300449028",
                                "type": "Type",
                                "_label": "performance halls",
                            }
                        ],
                    }
                ],
            }
        ]
    }


In [40]:
def run_place_pipeline(df, id_col, place_col):
    place_fragments_by_id = {}

    for _, row in df.iterrows():
        row_id = row[id_col]
        fragment = build_place_fragment_from_row(row, place_col)

        if fragment:
            place_fragments_by_id[row_id] = fragment

    return place_fragments_by_id

In [41]:
place_fragments = run_place_pipeline(df,id_col="ID", place_col="salle")

In [42]:
place_fragments["od_3"]

{'took_place_at': [{'id': 'https://data.stage.org/auth/odeon',
   'type': 'Place',
   '_label': 'Odéon Paris',
   'classified_as': [{'id': 'http://vocab.getty.edu/page/aat/300121919',
     'type': 'Type',
     '_label': 'performing arts structures'}],
   'part': [{'type': 'Place',
     '_label': 'Odéon paris 6',
     'classified_as': [{'id': 'http://vocab.getty.edu/page/aat/300449028',
       'type': 'Type',
       '_label': 'performance halls'}]}]}]}

In [57]:
for row_id, payload in place_fragments.items():
   
    row_dir = os.path.join(OUT_FOLDER, str(row_id))
    os.makedirs(row_dir, exist_ok=True)

    # save the payload (or fragment) inside the id folder
    with open(os.path.join(row_dir, "b-place.json"), "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)

In [None]:
### Title

In [52]:
def build_title_fragment_from_row(row, title_col, conception_col, id_col):
    title = row.get(title_col)
    conception = row.get(conception_col)
    work_id = row.get(id_col)

    if not isinstance(title, str) or not title.strip():
        return None
    if not isinstance(work_id, str) or not work_id.strip():
        return None

    title = title.strip()
    work_uri = f"https://data.stage.org/works/{work_id.strip()}"

    if isinstance(conception, str):
        try:
            people = ast.literal_eval(conception)
            if isinstance(people, list):
                conception = ", ".join(p.strip() for p in people if isinstance(p, str) and p.strip())
            else:
                conception = ""
        except Exception:
            conception = ""
    else:
        conception = ""

    return {
        "identified_by": [
            {
                "type": "Name",
                "classified_as": [
                    {
                        "id": "http://vocab.getty.edu/aat/300404670",
                        "type": "Type",
                        "_label": "Title",
                    }
                ],
                "content": title,
            }
        ],
        "influenced_by": [
            {
                "id": work_uri,  
                "type": "PropositionalObject",
                "_label": (
                    f"The show {title} as conceived by {conception}"
                ),
            }
        ],
    }


In [53]:
def run_title_pipeline(df, id_col, title_col, conception_col):
    title_fragments_by_id = {}

    for _, row in df.iterrows():
        row_id = row[id_col]
        fragment = build_title_fragment_from_row(
            row,
            title_col=title_col,
            conception_col=conception_col,
            id_col=id_col,
        )
        if fragment:
            title_fragments_by_id[row_id] = fragment

    return title_fragments_by_id

In [54]:
title_fragment=run_title_pipeline(df, id_col="ID", title_col="titre", conception_col="mise en scène")

In [55]:
title_fragment

{'od_1': {'identified_by': [{'type': 'Name',
    'classified_as': [{'id': 'http://vocab.getty.edu/aat/300404670',
      'type': 'Type',
      '_label': 'Title'}],
    'content': 'Le passé'}],
  'influenced_by': [{'id': 'https://data.stage.org/works/od_1',
    'type': 'PropositionalObject',
    '_label': 'The show Le passé as conceived by Julien Gosselin'}]},
 'od_2': {'identified_by': [{'type': 'Name',
    'classified_as': [{'id': 'http://vocab.getty.edu/aat/300404670',
      'type': 'Type',
      '_label': 'Title'}],
    'content': 'Musée duras'}],
  'influenced_by': [{'id': 'https://data.stage.org/works/od_2',
    'type': 'PropositionalObject',
    '_label': 'The show Musée duras as conceived by Julien Gosselin'}]},
 'od_3': {'identified_by': [{'type': 'Name',
    'classified_as': [{'id': 'http://vocab.getty.edu/aat/300404670',
      'type': 'Type',
      '_label': 'Title'}],
    'content': 'Honda romance'}],
  'influenced_by': [{'id': 'https://data.stage.org/works/od_3',
    'type':

In [56]:
for row_id, payload in title_fragment.items():
   
    row_dir = os.path.join(OUT_FOLDER, str(row_id))
    os.makedirs(row_dir, exist_ok=True)

    # save the payload (or fragment) inside the id folder
    with open(os.path.join(row_dir, "b-title.json"), "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)