In [33]:
import fitz
import os
from utils import clean_text, make_continuous, capitalize_after_comma, time_to_minutes, split_date_range
import pandas as pd
import re
import json
import uuid
import ast


IN_FOLDER='../data/excel'
OUT_FOLDER='../data/json'

In [22]:
file="clean-csv.xlsx"
path= os.path.join(IN_FOLDER,file)
df=pd.read_excel(path)
df.head()

Unnamed: 0,ID,page_start,page_end,titre,sous-titre,dates,salle,auteur,adaptation,mise en scène,...,surtitrage,production,coproduction,soutien,aide,source,coréalisation,other,date_start,date_end
0,od_1,4,5,Le passé,Les cieux s’ouvrirent et… ici prit fin l’histo...,13 septembre—4 octobre,Odéon paris 6,Léonid Andréïev,['Julien Gosselin'],['Julien Gosselin'],...,,['Si vous pouviez lécher mon coeur'],"['Odéon théâtre de l’europe', 'Le phénix — scè...",['Ministère de la culture'],"['Montévidéo — centre d’art', 'T2g théâtre de ...",programme de saison Odéon 25/26,,,13 septembre,4 octobre
1,od_2,7,7,Musée duras,,9—30 novembre,Berthier paris 17,Marguerite Duras,,['Julien Gosselin'],...,['Alice de la Bouillerie'],"['Odéon théâtre de l’europe', 'Conservatoire n...",,,['Jeune théâtre national'],programme de saison Odéon 25/26,,,9 novembre,30 novembre
2,od_3,8,8,Honda romance,,14—26 octobre,Odéon paris 6,Vimala Pons,,['Vimala Pons'],...,,"['Tout ça / que ça', 'Comédie de genève']","['Odéon théâtre de l’europe', 'Mc2 : maison de...",['Fondation bnp paribas'],"['""plateforme 2 pôles cirque en normandie — la...",programme de saison Odéon 25/26,['Festival d’Automne'],Festival d’Automne,14 octobre,26 octobre
3,od_4,9,9,Pallaksch pallaksch!,Pièces élémentaires,26 novembre—14 février,Petit odéon paris 6,"La Coccinelle\nde D. H. Lawrence, Le Voile de...",,['Marie-José Malis'],...,,['Compagnie la llevantina'],"['Odéon théâtre de l’europe', 'Comédie de genè...",,,programme de saison Odéon 25/26,,,26 novembre,14 février


In [10]:
len(df.columns)

54

In [23]:
REFERRED_TO_BY_TEMPLATE = {
    "type": "LinguisticObject",
    "_label": "role as appears in doc",
    "classified_as": [
        {
            "id": "http://vocab.getty.edu/page/aat/300435423",
            "type": "Type",
            "_label": "Literal transcription",
        }
    ],
}


def random_person_uri(base="https://data.stage.org/auth/"):
    return base + uuid.uuid4().hex


TECHNIQUE_MAP = {
    "video": "https://id.loc.gov/vocabulary/relators/vdg",
    "music": "http://id.loc.gov/vocabulary/relators/msd",
    "sound-design": "http://id.loc.gov/vocabulary/relators/sds",
    "sound-engineering": "http://id.loc.gov/vocabulary/relators/sde",
    "director":"http://id.loc.gov/vocabulary/relators/drt",
    "set-design" : "http://id.loc.gov/vocabulary/relators/std",
    "technical-director": "http://id.loc.gov/vocabulary/relators/tcd",
    "translation":"http://id.loc.gov/vocabulary/relators/trl",
    "actor": "https://id.loc.gov/vocabulary/relators/act.html",
    "adaptation": "http://id.loc.gov/vocabulary/relators/adp",
    "lighting" : "http://id.loc.gov/vocabulary/relators/lgd",
    "costumes" : "http://id.loc.gov/vocabulary/relators/cst",
    "collaboration" : "http://id.loc.gov/vocabulary/relators/csl",
    "make-up" : "http://id.loc.gov/vocabulary/relators/mka",
    "other" : "http://id.loc.gov/vocabulary/relators/oth"
}

name_to_id = {}


In [24]:
def parse_list_cell(val):
    if not isinstance(val, str):
        return []
    val = val.strip()
    if not (val.startswith("[") and val.endswith("]")):
        return []
    return ast.literal_eval(val)  # turns "['A','B']" into ['A','B']

def build_part_from_row(row, cols, technique_key, person_index):
    technique_id = TECHNIQUE_MAP[technique_key]

    carried_out_by = []

    for col in cols:
        val = row.get(col, None)

        # skip empty/NaN cells
        if not isinstance(val, str) or not val.strip():
            continue

        names = [x.strip() for x in parse_list_cell(val) if isinstance(x, str) and x.strip()]
        names = list(dict.fromkeys(names))


        for name in names:
            person_id = name_to_id.get(name, random_person_uri())

            name_to_id.setdefault(name, person_id)
            person_index[person_id] = name

            carried_out_by.append({
                "id": person_id,
                "type": "Person",
                "_label": name,
                "referred_to_by": [
                    {**REFERRED_TO_BY_TEMPLATE, "content": col}
                ],
            })

    return {
        "technique": [
            {
                "id": technique_id,
                "type": "Type",
                "_label": technique_key,
            }
        ],
        "carried_out_by": carried_out_by,
    }

In [34]:
def build_parts_for_row(row, group_cols_map, person_index):
    parts = []

    for technique_key, cols in group_cols_map.items():
        if technique_key not in TECHNIQUE_MAP:
            raise KeyError(f"Technique '{technique_key}' not found in TECHNIQUE_MAP")

        part = build_part_from_row(row, cols, technique_key, person_index)

        # skip empty parts
        if part["carried_out_by"]:
            parts.append(part)

    return {"part": parts}

def run_pipeline(df, id_col, group_cols_map):
    person_index = {}
    parts_by_id = {}

    for _, row in df.iterrows():
        row_id = row[id_col]
        parts_by_id[row_id] = build_parts_for_row(
            row=row,
            group_cols_map=group_cols_map,
            person_index=person_index,
        )

    return parts_by_id, person_index

In [35]:
group_cols_map = {
    "video": ['vidéo', 'cadre vidéo', 'régie vidéo',
       'collaboration à la vidéo'],
    
    "music": ['musique', 'composition musicale',
       'composition musicale du satellite', 'composition musicale du Choeur',
       'collaboration artistique pour la direction, l’adaptation et l’arrangement musical'],
    
    "sound-design": ['son', 'collaboration au son'],
       
    "director":['mise en scène', 
                'collaboration conception et mise en scène', 
                'assistanat à la mise en scène'],
   
    "set-design" : ['scénographie',
       'collaboration à la scénographie', 'recherche scénographique',
       'regard scénographique', 'espaces', "accessoires"],

    "technical-director": ['régie générale'],
    "translation":["traduction"],
    "actor": ["avec"],
    "adaptation": ["adaptation", "dramaturgie"],
    "lighting" : ["lumière"],
    "costumes" : ['costumes','assistanat aux costumes', "masks"],
    "collaboration" : ["collaboration artistique"],
    "make-up" : ["maquillages"],
    "other" : ['confection du satellite', 'créateur des souffleurs', 'surtitrage']
}


In [36]:
payloads_by_id, person_index = run_pipeline(df, id_col="ID", group_cols_map=group_cols_map)


In [37]:
len(person_index)

72

In [38]:
print(json.dumps(payloads_by_id["od_1"], ensure_ascii=False, indent=2))

{
  "part": [
    {
      "technique": [
        {
          "id": "https://id.loc.gov/vocabulary/relators/vdg",
          "type": "Type",
          "_label": "video"
        }
      ],
      "carried_out_by": [
        {
          "id": "https://data.stage.org/auth/4b9c65b85ab144d0b09d6b30898ca06e",
          "type": "Person",
          "_label": "Pierre Martin Oriol",
          "referred_to_by": [
            {
              "type": "LinguisticObject",
              "_label": "role as appears in doc",
              "classified_as": [
                {
                  "id": "http://vocab.getty.edu/page/aat/300435423",
                  "type": "Type",
                  "_label": "Literal transcription"
                }
              ],
              "content": "vidéo"
            }
          ]
        },
        {
          "id": "https://data.stage.org/auth/982f848064104b68828714da4d9d8ffa",
          "type": "Person",
          "_label": "Jérémie Bernaert",
          "referred_to

In [39]:
for row_id, payload in payloads_by_id.items():
   
    row_dir = os.path.join(OUT_FOLDER, str(row_id))
    os.makedirs(row_dir, exist_ok=True)

    # save the payload (or fragment) inside the id folder
    with open(os.path.join(row_dir, "b-creative.json"), "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)

In [32]:
df_index = pd.DataFrame(
    person_index.items(),
    columns=["person_id", "person_name"]
)

df_index.to_csv("../data/excel/person_index.csv", index=False)