In [23]:
import fitz
import os
from utils import clean_text, make_continuous, capitalize_after_comma, time_to_minutes, split_date_range
import pandas as pd
import re
import json
import uuid
import ast


# Folder the Excel input is read from (relative to the notebook's working directory).
IN_FOLDER = '../data/excel'
# Folder under which the per-row JSON output folders are created.
OUT_FOLDER = '../data/json'

In [10]:
# Workbook to load, looked up inside the configured input folder.
file = "clean-csv.xlsx"
path = os.path.join(IN_FOLDER, file)

# Read the workbook into a DataFrame and preview its first rows.
df = pd.read_excel(path)
df.head()

Unnamed: 0,ID,page_start,page_end,titre,sous-titre,dates,salle,auteur,adaptation,mise en scène,...,surtitrage,production,coproduction,soutien,aide,source,coréalisation,other,date_start,date_end
0,od_1,4,5,Le passé,Les cieux s’ouvrirent et… ici prit fin l’histo...,13 septembre—4 octobre,Odéon paris 6,Léonid Andréïev,['Julien Gosselin'],['Julien Gosselin'],...,,['Si vous pouviez lécher mon coeur'],"['Odéon théâtre de l’europe', 'Le phénix — scè...",['Ministère de la culture'],"['Montévidéo — centre d’art', 'T2g théâtre de ...",programme de saison Odéon 25/26,,,13 septembre,4 octobre
1,od_2,7,7,Musée duras,,9—30 novembre,Berthier paris 17,Marguerite Duras,,['Julien Gosselin'],...,['Alice de la Bouillerie'],"['Odéon théâtre de l’europe', 'Conservatoire n...",,,['Jeune théâtre national'],programme de saison Odéon 25/26,,,9 novembre,30 novembre
2,od_3,8,8,Honda romance,,14—26 octobre,Odéon paris 6,Vimala Pons,,['Vimala Pons'],...,,"['Tout ça / que ça', 'Comédie de genève']","['Odéon théâtre de l’europe', 'Mc2 : maison de...",['Fondation bnp paribas'],"['""plateforme 2 pôles cirque en normandie — la...",programme de saison Odéon 25/26,['Festival d’Automne'],Festival d’Automne,14 octobre,26 octobre
3,od_4,9,9,Pallaksch pallaksch!,Pièces élémentaires,26 novembre—14 février,Petit odéon paris 6,"La Coccinelle\nde D. H. Lawrence, Le Voile de...",,['Marie-José Malis'],...,,['Compagnie la llevantina'],"['Odéon théâtre de l’europe', 'Comédie de genè...",,,programme de saison Odéon 25/26,,,26 novembre,14 février


In [11]:
# Role key -> id.loc.gov relator URI; the URI becomes the "technique" id of a part.
PROD_MAP = dict(
    production="http://id.loc.gov/vocabulary/relators/pro",
    sponsor="http://id.loc.gov/vocabulary/relators/spn",
    funder="http://id.loc.gov/vocabulary/relators/fnd",
)

def random_group_uri(base="https://data.stage.org/auth/"):
    """Return ``base`` followed by the 32-character hex form of a fresh UUID4.

    Every call produces a new random identifier, so the result is not stable
    across calls or notebook runs.
    """
    token = uuid.uuid4().hex
    return base + token

# Base "referred_to_by" entry. build_group_part_from_row spreads it into a new
# dict and adds a "content" key; the copy is shallow, so the nested
# "classified_as" list is shared between all entries built from it.
REFERRED_TO_BY_TEMPLATE = dict(
    type="LinguisticObject",
    _label="role as appears in doc",
    classified_as=[
        dict(
            id="http://vocab.getty.edu/page/aat/300435423",
            type="Type",
            _label="Literal transcription",
        )
    ],
)



# Module-level name -> id mapping. In the cells visible here,
# run_group_pipeline builds its own mapping and does not read this one.
name_to_id = dict()


In [None]:
def parse_list_cell(val):
    """Decode a cell holding a Python list literal such as ``"['A', 'B']"``.

    Returns the evaluated literal when ``val`` is a string that, once stripped,
    starts with ``[`` and ends with ``]``. Anything else (non-strings such as
    NaN/None, or strings without the brackets) yields an empty list.

    Errors raised by ``ast.literal_eval`` on a malformed bracketed string are
    not caught here.
    """
    if isinstance(val, str):
        text = val.strip()
        if text.startswith("[") and text.endswith("]"):
            # literal_eval only accepts literals, so no arbitrary code is run.
            return ast.literal_eval(text)
    return []

def build_group_part_from_row(row, cols, prod_key, group_index, name_to_id):
    """
    Build ONE part dict for an org/group role (production/sponsor/funder).

    Parameters
    ----------
    row : mapping-like
        Anything exposing ``.get(column)``; each requested cell is decoded
        with ``parse_list_cell``.
    cols : iterable of str
        Column names whose cells list the groups holding this role.
    prod_key : str
        Key into ``PROD_MAP``; selects the technique id and is used as the
        technique ``_label``.
    group_index : dict
        Mutated in place: group id -> group name for every group emitted.
    name_to_id : dict
        Mutated in place: group name -> group id. A name already present
        keeps its id, which is how groups are de-duplicated across rows.

    Returns
    -------
    dict
        ``{"technique": [...], "carried_out_by": [...]}``. Each entry of
        ``carried_out_by`` is a ``Group`` whose ``referred_to_by.content`` is
        the column name the group was found in. The list is empty when no
        column yields a name.

    Raises
    ------
    KeyError
        If ``prod_key`` is not a key of ``PROD_MAP``.
    """
    technique_id = PROD_MAP[prod_key]

    carried_out_by = []

    for col in cols:
        # Cells are expected to hold a list stored as a string by Excel.
        raw_names = parse_list_cell(row.get(col))
        stripped = (x.strip() for x in raw_names if isinstance(x, str))
        # Drop blanks and de-duplicate within the cell, keeping first-seen order.
        names = list(dict.fromkeys(n for n in stripped if n))

        for name in names:
            # Mint a URI only for names not seen before. Passing
            # random_group_uri() as the default of .get() would generate and
            # discard a UUID for every already-known name.
            if name not in name_to_id:
                name_to_id[name] = random_group_uri()
            group_id = name_to_id[name]
            group_index[group_id] = name

            carried_out_by.append({
                "id": group_id,
                "type": "Group",
                "_label": name,
                "referred_to_by": [
                    # Shallow copy of the template plus the source column name.
                    {**REFERRED_TO_BY_TEMPLATE, "content": col}
                ],
            })

    return {
        "technique": [
            {
                "id": technique_id,
                "type": "Type",
                "_label": prod_key,   # or a prettier label map if you want
            }
        ],
        "carried_out_by": carried_out_by,
    }



In [36]:
def build_group_fragment_for_row(row, group_cols_map, group_index, group_name_to_id):
    """Assemble the ``{"part": [...]}`` fragment for a single row.

    For each ``role -> columns`` entry of ``group_cols_map`` one part is built
    with ``build_group_part_from_row``; parts whose ``carried_out_by`` list is
    empty are left out. ``group_index`` and ``group_name_to_id`` are passed
    through to the part builder.

    Raises ``KeyError`` when a role is not a key of ``PROD_MAP``. The check
    happens as each role is reached, so earlier roles have already been
    processed by then.
    """
    kept_parts = []

    for role, source_cols in group_cols_map.items():
        if role in PROD_MAP:
            candidate = build_group_part_from_row(
                row, source_cols, role, group_index, group_name_to_id
            )
            # Keep only roles that actually have at least one group.
            if candidate["carried_out_by"]:
                kept_parts.append(candidate)
        else:
            raise KeyError(f"'{role}' not in PROD_MAP")

    # The fragment is always a dict, even with no parts (safe to JSON dump).
    return {"part": kept_parts}

def run_group_pipeline(df, id_col, group_cols_map):
    """Build a group fragment for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table iterated row by row.
    id_col : str
        Column whose value keys the returned fragments; rows sharing a value
        overwrite one another, the last one winning.
    group_cols_map : dict
        Role key -> list of column names, forwarded to
        ``build_group_fragment_for_row``.

    Returns
    -------
    tuple(dict, dict)
        ``(fragments_by_id, group_index)``, where ``group_index`` maps each
        group id to its name. Both lookup tables start empty on every call.
    """
    id_to_name = {}
    name_to_uri = {}

    fragments = {
        record[id_col]: build_group_fragment_for_row(
            row=record,
            group_cols_map=group_cols_map,
            group_index=id_to_name,
            group_name_to_id=name_to_uri,
        )
        for _, record in df.iterrows()
    }

    return fragments, id_to_name



In [30]:
# Role key (must exist in PROD_MAP) -> spreadsheet columns that list the groups
# holding that role.
group_cols_map = dict(
    production=["production", "coproduction"],
    sponsor=["soutien"],
    funder=["aide"],
)


In [37]:
# Build one fragment per row (keyed by the "ID" column) plus the id -> name index.
payloads_by_id, group_index = run_group_pipeline(
    df,
    id_col="ID",
    group_cols_map=group_cols_map,
)


In [38]:
# Number of distinct group ids collected by the pipeline run.
len(group_index)

41

In [39]:
# Show only the first payload as a sanity check: dumping every row floods the
# notebook output and bloats the saved file. Slicing keeps this safe when
# payloads_by_id is empty (an empty object is printed).
preview = dict(list(payloads_by_id.items())[:1])
print(json.dumps(preview, ensure_ascii=False, indent=2))

{
  "od_1": {
    "part": [
      {
        "technique": [
          {
            "id": "http://id.loc.gov/vocabulary/relators/pro",
            "type": "Type",
            "_label": "production"
          }
        ],
        "carried_out_by": [
          {
            "id": "https://data.stage.org/auth/6c44b74329e741fbba44dcec4493eb46",
            "type": "Group",
            "_label": "Si vous pouviez lécher mon coeur",
            "referred_to_by": [
              {
                "type": "LinguisticObject",
                "_label": "role as appears in doc",
                "classified_as": [
                  {
                    "id": "http://vocab.getty.edu/page/aat/300435423",
                    "type": "Type",
                    "_label": "Literal transcription"
                  }
                ],
                "content": "production"
              }
            ]
          },
          {
            "id": "https://data.stage.org/auth/1e6ebd46f73e44afbbbe7fe5d39a70

In [42]:
for row_id, payload in payloads_by_id.items():
    # One output folder per row id, created on demand.
    row_dir = os.path.join(OUT_FOLDER, str(row_id))
    os.makedirs(row_dir, exist_ok=True)

    # The row's fragment is written as b-prod.json inside that folder.
    out_path = os.path.join(row_dir, "b-prod.json")
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, ensure_ascii=False, indent=2)

In [21]:
# Flatten the group id -> group name index into a two-column table.
# The items view is materialised explicitly so the constructor gets a plain list.
df_index = pd.DataFrame(
    list(group_index.items()),
    columns=["group_id", "group_name"],
)

# Write next to the Excel input. IN_FOLDER is reused rather than repeating the
# literal path, so the index follows the configured data folder.
df_index.to_csv(os.path.join(IN_FOLDER, "group_index.csv"), index=False)