In [1]:
import pandas as pd

# Column holding the OpenAlex author identifier, used as the join key.
ID_COLUMN = 'openalex_author_id'

df_clean = pd.read_csv("ArtHistory_Openalex_with_macro_tags_clean.csv")
df_llm = pd.read_csv("ArtHistory_Openalex_with_llm_study_tags.csv")

# Normalise the key on both sides. `astype(str)` turns a missing ID into the
# literal text 'nan'; those placeholders are converted back to real missing
# values so that researchers without an ID are never joined to each other.
for frame in (df_clean, df_llm):
    ids = frame[ID_COLUMN].astype(str).str.strip()
    frame[ID_COLUMN] = ids.mask(ids.str.lower().isin(['', 'nan', 'none']))

# Only rows with a usable, unique ID may take part in the join; otherwise a
# left merge would multiply the rows of df_clean.
llm_by_id = df_llm.dropna(subset=[ID_COLUMN]).drop_duplicates(subset=ID_COLUMN)

df_merged = pd.merge(
    df_clean,
    llm_by_id,
    on=ID_COLUMN,
    how='left',
    suffixes=('', '_from_llm'),
    validate='many_to_one',
)
# The merge must not add or drop researchers.
assert len(df_merged) == len(df_clean), "merge changed the number of researchers"

# Fallback: rows that got no tags through the ID are matched by researcher name.
# Rows of df_llm without a name or without tags are ignored so that an empty
# entry cannot shadow a filled one; for duplicate names the last entry wins.
mask_missing = df_merged['llm_study_tags'].isna()
name_to_tags = (
    df_llm.dropna(subset=['name', 'llm_study_tags'])
    .drop_duplicates(subset='name', keep='last')
    .set_index('name')['llm_study_tags']
    .to_dict()
)
df_merged.loc[mask_missing, 'llm_study_tags'] = df_merged.loc[mask_missing, 'name'].map(name_to_tags)
df_merged.to_csv("ArtHistory_Final_Merged.csv", index=False)

print(f"Fusion termin√©e ! Le fichier final contient {len(df_merged)} chercheurs.")
print("Fichier sauvegard√© sous : ArtHistory_Final_Merged.csv")

Fusion termin√©e ! Le fichier final contient 120 chercheurs.
Fichier sauvegard√© sous : ArtHistory_Final_Merged.csv


In [2]:
import pandas as pd
import requests
import time
import ast 

# Contact address sent to the OpenAlex API in the User-Agent header.
EMAIL_CONTACT = "nina.vivierbarte@psl.eu"

df = pd.read_csv("ArtHistory_Final_Merged.csv")

# Create the two output columns (as empty text) unless the file already has them.
for new_column in ('french_institution_names', 'french_collab_topics'):
    if new_column not in df.columns:
        df[new_column] = ""

def get_collab_details(author_id, session=None):
    """Collect French institutions and related topics from an author's works.

    Every work returned by the OpenAlex `/works` endpoint for the author is
    inspected (all result pages are followed with cursor paging). A work is
    treated as French-linked when at least one institution of any of its
    authorships has country code 'FR'; the names of those institutions and
    the topics of those works are collected.

    Args:
        author_id: OpenAlex author ID, either bare or as a full URL (only the
            part after the last '/' is used).
        session: optional object exposing a requests-compatible `get` method
            (for example a `requests.Session`); defaults to the `requests`
            module.

    Returns:
        A tuple `(institutions, topics)` of "; "-joined, alphabetically sorted
        strings (possibly empty), or `(None, None)` when the API answers with
        a non-200 status or the request/decoding fails.
    """
    clean_id = str(author_id).split('/')[-1]
    url = "https://api.openalex.org/works"
    headers = {'User-Agent': f'mailto:{EMAIL_CONTACT}'}
    http = session if session is not None else requests

    insts = set()
    topics = set()
    cursor = '*'  # initial cursor value for OpenAlex cursor paging

    try:
        while cursor:
            r = http.get(
                url,
                params={'filter': f'author.id:{clean_id}', 'per-page': 200, 'cursor': cursor},
                headers=headers,
                timeout=30,  # never hang the whole enrichment loop on one request
            )
            if r.status_code != 200:
                return None, None

            data = r.json()
            results = data.get('results') or []

            for work in results:
                is_french = False
                for authorship in work.get('authorships') or []:
                    for institution in authorship.get('institutions') or []:
                        if institution.get('country_code') == 'FR':
                            is_french = True
                            # A missing name must not break the final join.
                            if institution.get('display_name'):
                                insts.add(institution['display_name'])
                if is_french:
                    for t in work.get('topics') or []:
                        if t.get('display_name'):
                            topics.add(t['display_name'])

            # Stop when a page is empty or no further cursor is provided.
            cursor = (data.get('meta') or {}).get('next_cursor') if results else None
    except Exception as exc:
        # Best effort: report the failure and let the caller carry on.
        print(f"Erreur API : {exc}")
        return None, None

    # Sorted so that the output does not depend on set iteration order.
    return "; ".join(sorted(insts)), "; ".join(sorted(topics))

print("D√©but de l'enrichissement cibl√©...")

# Values of `openalex_france_collab` (lower-cased text) meaning "no collaboration".
NO_COLLAB_VALUES = ('no', 'nan', '[]')
count = 0

for index, row in df.iterrows():
    # Skip researchers without a detected collaboration or without an ID.
    collab_status = str(row['openalex_france_collab']).lower()
    if collab_status in NO_COLLAB_VALUES:
        continue
    if pd.isna(row['openalex_author_id']):
        continue

    print(f"--> Traitement de {row['name']} (Collab d√©tect√©e !)")

    institutions, topics = get_collab_details(row['openalex_author_id'])

    # Only non-empty results are written back into the table.
    if institutions:
        df.at[index, 'french_institution_names'] = institutions
        print(f" Institutions trouv√©es : {institutions}")
    if topics:
        df.at[index, 'french_collab_topics'] = topics

    count += 1
    # Short pause between consecutive API calls.
    time.sleep(0.1)

df.to_csv("ArtHistory_Final_With_Institutions.csv", index=False)
print(f"\nFini ! {count} chercheurs enrichis avec les d√©tails institutionnels.")

D√©but de l'enrichissement cibl√©...
--> Traitement de Mike Pope (Collab d√©tect√©e !)
--> Traitement de Marlene Hansen Esplin (Collab d√©tect√©e !)
--> Traitement de Jeffrey F. Hamburger (Collab d√©tect√©e !)
--> Traitement de Alina Payne (Collab d√©tect√©e !)
--> Traitement de Sarah Lewis (Collab d√©tect√©e !)
--> Traitement de Christina Maranci (Collab d√©tect√©e !)
--> Traitement de Usha Iyer (Collab d√©tect√©e !)
--> Traitement de Ziliang Liu (Collab d√©tect√©e !)
--> Traitement de Amy A. McKenna (Collab d√©tect√©e !)
 Institutions trouv√©es : IFP √ânergies nouvelles
--> Traitement de Kern Samuel (Collab d√©tect√©e !)
--> Traitement de Xiaotian Yin (Collab d√©tect√©e !)
 Institutions trouv√©es : Institut national de recherche en informatique et en automatique; Centre Inria de l'Universit√© de Lorraine; Laboratoire Lorrain de Recherche en Informatique et ses Applications
--> Traitement de Ben Benedict (Collab d√©tect√©e !)
 Institutions trouv√©es : Fondation pour l‚Äôinnovation en 

In [3]:
import pandas as pd

df = pd.read_csv("ArtHistory_Final_With_Institutions.csv")

# Columns kept for the visualisation, grouped by role: researcher identity,
# tag columns, then the France-related columns.
keep_cols = (
    ['name', 'university', 'openalex_author_id', 'openalex_works_count']
    + ['macro_tags', 'llm_study_tags']
    + ['openalex_france_collab', 'french_institution_names', 'french_collab_topics', 'link_france']
)

# Work on an independent copy restricted to those columns.
df_clean = df.loc[:, keep_cols].copy()

# Lower-case keywords that mark a study tag as France-related.
french_keywords = ['france', 'french', 'paris', 'normandy', 'provence', 'loire', 'versailles', 'louvre', 'bordeaux', 'lyon']

def extract_french_focus(tags_str):
    """Return the France-related tags of a comma-separated tag string.

    A tag is kept when one of its words *starts with* a keyword from
    `french_keywords` (case-insensitive), so "Parisian salons" matches
    'paris' while "Comparison of media" does not. Plain substring matching
    would flag the latter because "comparison" contains "paris".

    Args:
        tags_str: comma-separated tags, or a missing value.

    Returns:
        The matching tags joined with ", " in their original order, or ""
        when the input is missing or nothing matches.
    """
    if pd.isna(tags_str):
        return ""

    prefixes = tuple(french_keywords)
    found = []
    for tag in (t.strip() for t in str(tags_str).split(',')):
        # Split the tag into words on every non-alphanumeric character.
        words = ''.join(ch if ch.isalnum() else ' ' for ch in tag.lower()).split()
        if any(word.startswith(prefixes) for word in words):
            found.append(tag)
    return ", ".join(found)

# Derive, for every researcher, the subset of LLM study tags that relate to France.
df_clean['french_study_focus'] = df_clean['llm_study_tags'].apply(extract_french_focus)


def get_type(row):
    """Classify a researcher's connection to France.

    The row counts as a "study" link when `link_france` is 'yes'
    (case-insensitive) or when `french_study_focus` is longer than two
    characters. It counts as a "collab" link when the text form of
    `openalex_france_collab` is anything other than 'no', 'nan' or '[]'.

    Returns:
        "both", "collab", "study" or "none".
    """
    declared_link = str(row['link_france']).lower() == 'yes'
    is_study = declared_link or len(row['french_study_focus']) > 2

    collab_value = str(row['openalex_france_collab']).lower()
    is_collab = collab_value not in ('no', 'nan', '[]')

    # Lookup keyed by (is_study, is_collab).
    labels = {
        (True, True): "both",
        (False, True): "collab",
        (True, False): "study",
        (False, False): "none",
    }
    return labels[(is_study, is_collab)]

# Label every researcher with a France connection type (get_type is applied row by row).
df_clean['france_connection_type'] = df_clean.apply(get_type, axis=1)

# Write the trimmed and enriched table to disk, without the index column.
df_clean.to_csv("ArtHistory_Viz_Ready.csv", index=False)
print("Fichier nettoy√© et enrichi : ArtHistory_Viz_Ready.csv")

Fichier nettoy√© et enrichi : ArtHistory_Viz_Ready.csv


In [4]:
import pandas as pd

df = pd.read_csv("ArtHistory_Viz_Ready.csv")

# Researchers whose detected French collaboration is to be discarded
# (matched on the exact value of the `name` column).
fake_collabs = [
    "Sarah Lewis",
    "Usha Iyer",
    "Ziliang Liu",
    "Amy A. McKenna",
    "Mike Pope",
    "Kern Samuel",
    "Jeffrey F. Hamburger",
    "Xiaotian Yin",
    "Ben Benedict",
    "David Doris",
    "Lihong Liu",
    "Bryan K. Miller",
]

print(f"Nettoyage de {len(fake_collabs)} homonymes scientifiques...")

def clean_homonyms(row):
    """Reset the collaboration fields of a row listed in `fake_collabs`.

    When the row's `name` is in `fake_collabs`, `openalex_france_collab` is
    set to 'no' and both `french_institution_names` and
    `french_collab_topics` are set to None. Other rows are returned untouched.
    """
    if row['name'] not in fake_collabs:
        return row

    row['openalex_france_collab'] = 'no'
    for column in ('french_institution_names', 'french_collab_topics'):
        row[column] = None
    return row

# Apply the homonym clean-up row by row; the result is stored in a new frame.
df_clean = df.apply(clean_homonyms, axis=1)

def recalculate_type(row):
    """Recompute a researcher's France connection type.

    The row is a "study" link when `link_france` is 'yes' (case-insensitive)
    or when `french_study_focus` holds more than two characters of text. A
    missing focus is treated as empty: `str(NaN)` is the three-character
    text 'nan', which would otherwise pass the length test and mark every
    researcher without a focus as "study".

    The row is a "collab" link unless the text form of
    `openalex_france_collab` is empty or one of 'no', 'nan', '[]', 'none'.

    Returns:
        "both", "collab", "study" or "none".
    """
    focus = row['french_study_focus']
    focus_text = "" if pd.isna(focus) else str(focus).strip()

    is_study = (str(row['link_france']).lower() == 'yes') or (len(focus_text) > 2)
    is_collab = str(row['openalex_france_collab']).strip().lower() not in ['', 'no', 'nan', '[]', 'none']

    if is_study and is_collab: return "both"
    if is_collab: return "collab"
    if is_study: return "study"
    return "none"

# Refresh the connection type now that the homonyms have been reset.
df_clean['france_connection_type'] = df_clean.apply(recalculate_type, axis=1)

# Rows that still carry a collaboration ("collab" or "both").
is_true_collab = df_clean['france_connection_type'].isin(['collab', 'both'])
true_collabs = df_clean.loc[is_true_collab]

print(f"\nNombre final de vrais collaborateurs : {len(true_collabs)}")
print(true_collabs[['name', 'university', 'french_institution_names']])

df_clean.to_csv("ArtHistory_Viz_Final_Cleaned.csv", index=False)

Nettoyage de 12 homonymes scientifiques...

Nombre final de vrais collaborateurs : 4
                      name university  french_institution_names
26   Marlene Hansen Esplin        BYU                       NaN
50             Alina Payne    Harvard                       NaN
59       Christina Maranci    Harvard                       NaN
104            Tina Bawden   Michigan                       NaN


In [5]:
import pandas as pd
import requests
import time


# Contact address sent to the OpenAlex API in the User-Agent header.
EMAIL = "nina.vivierbarte@psl.eu"

# Researchers to investigate, as {"name": ..., "id": ...} records
# built from (name, OpenAlex author ID) pairs.
targets = [
    {"name": researcher, "id": openalex_id}
    for researcher, openalex_id in (
        ("Marlene Hansen Esplin", "A5049988626"),
        ("Alina Payne", "A5038293506"),
        ("Christina Maranci", "A5064151424"),
        ("Tina Bawden", "A5005450456"),
    )
]

def deep_search_author(author_name, author_id):
    """Print every work of an author that shows a link with France.

    All works returned by the OpenAlex `/works` endpoint for the author are
    scanned (result pages are followed with cursor paging). A work is printed
    as a collaboration when one of its authorships lists an institution with
    country code 'FR'; otherwise it is printed as a study when its title
    contains 'france', 'french', 'paris' or 'louvre' (case-insensitive).

    Args:
        author_name: name shown in the report header.
        author_id: OpenAlex author ID, bare or prefixed with
            "https://openalex.org/".

    Returns:
        None. Results and errors are printed; the function always pauses
        0.5 s before returning.
    """
    print(f"\nüîç Analyse approfondie pour : {author_name} ({author_id})")

    clean_id = str(author_id).replace("https://openalex.org/", "")

    url = "https://api.openalex.org/works"
    headers = {'User-Agent': f'mailto:{EMAIL}'}
    title_keywords = ('france', 'french', 'paris', 'louvre')

    try:
        found_something = False
        cursor = '*'  # initial cursor value for OpenAlex cursor paging

        while cursor:
            r = requests.get(
                url,
                params={'filter': f'author.id:{clean_id}', 'per-page': 200, 'cursor': cursor},
                headers=headers,
                timeout=30,
            )
            # Report HTTP errors instead of reading an error payload as "no results".
            r.raise_for_status()
            data = r.json()
            works = data.get('results') or []

            for work in works:
                # The API may return an explicit null title.
                title = work.get('title') or 'Sans titre'
                pub_year = work.get('publication_year')

                # Fall back to the institution ID when the name is missing, so
                # one incomplete record cannot abort the whole analysis.
                french_institutions = sorted({
                    inst.get('display_name') or inst.get('id') or '?'
                    for authorship in work.get('authorships') or []
                    for inst in authorship.get('institutions') or []
                    if inst.get('country_code') == 'FR'
                })

                has_french_title = any(kw in str(title).lower() for kw in title_keywords)

                if french_institutions:
                    print(f"   [COLLAB ] {pub_year} - \"{title}\"")
                    print(f"      -> Avec : {', '.join(french_institutions)}")
                    found_something = True
                elif has_french_title:
                    print(f"   [ETUDE ] {pub_year} - \"{title}\"")
                    found_something = True

            # Stop when a page is empty or no further cursor is provided.
            cursor = (data.get('meta') or {}).get('next_cursor') if works else None

        if not found_something:
            print("   -> Aucune trace explicite trouv√©e dans les m√©tadonn√©es.")

    except Exception as e:
        print(f"Erreur API : {e}")

    # Pause between two authors.
    time.sleep(0.5)

# Run the detailed search for every entry of `targets`, in list order.
print("D√©marrage de l'enqu√™te...")
for t in targets:
    deep_search_author(t['name'], t['id'])

D√©marrage de l'enqu√™te...

üîç Analyse approfondie pour : Marlene Hansen Esplin (A5049988626)
   -> Aucune trace explicite trouv√©e dans les m√©tadonn√©es.

üîç Analyse approfondie pour : Alina Payne (A5038293506)
   [COLLAB ] 2009 - "Compositio and the Materiality of Architecture in the Italian Renaissance"
      -> Avec : √âcole Pratique des Hautes √âtudes

üîç Analyse approfondie pour : Christina Maranci (A5064151424)
   [COLLAB ] 2022 - "Cultural Interactions in Medieval Georgia, Michele Bacci, Thomas Kaffenberger, Manuela¬†Studer-Karlen (√©d.)"
      -> Avec : Universit√© de Poitiers

üîç Analyse approfondie pour : Tina Bawden (A5005450456)
   -> Aucune trace explicite trouv√©e dans les m√©tadonn√©es.


In [7]:
import pandas as pd
import json
import math


df = pd.read_csv("ArtHistory_Viz_Final_Cleaned.csv")

# Manual overrides keyed by the exact value of the `name` column and consumed
# by apply_updates():
#   - "status": 'yes' marks a collaboration; any other value clears the
#     row's collaboration fields.
#   - "institution": written to `french_institution_names` when status is 'yes'.
#   - "details": written to `french_collab_topics` when status is 'yes' and the
#     text is not already present there.
sherlock_updates = {
    "Marlene Hansen Esplin": {
        "status": "yes",
        "institution": "Universit√© Bordeaux Montaigne",
        "details": "Article: 'Reviews of Books' (2020)"
    },
    "Christina Maranci": {
        "status": "yes",
        "institution": "Universit√© de Poitiers",
        "details": "Livre: 'Cultural Interactions in Medieval Georgia' (2022)"
    },
    "Alina Payne": {
        "status": "yes",
        "institution": "Ecole Pratique des Hautes Etudes, Universit√© Paris Sciences et Lettres",
        "details": "Article: 'Compositio and the Materiality of Architecture in the Italian Renaissance' (2009)"
    },
    "Tina Bawden": { "status": "no", "institution": None, "details": None }
}

def apply_updates(row):
    """Apply the manual override from `sherlock_updates` to one row.

    Rows whose `name` has no entry are returned unchanged. For a 'yes'
    entry the row is flagged as a collaboration, its institution is
    replaced, and the topics are replaced by the entry's details unless
    that text already occurs in the current topics. For any other status
    the three collaboration fields are reset ('no', None, None).
    """
    info = sherlock_updates.get(row['name'])
    if info is None:
        return row

    if info['status'] != 'yes':
        row['openalex_france_collab'] = 'no'
        row['french_institution_names'] = None
        row['french_collab_topics'] = None
        return row

    row['openalex_france_collab'] = 'yes'
    row['french_institution_names'] = info['institution']

    # Text form of the current topics with every 'nan' occurrence removed.
    existing_topics = str(row['french_collab_topics']).replace('nan', '')
    if info['details'] not in existing_topics:
        row['french_collab_topics'] = info['details']
    return row

# Apply the manual overrides row by row; the result is stored in a new frame.
df_final = df.apply(apply_updates, axis=1)

def get_final_type(row):
    """Compute the final France connection type of a researcher.

    Study link: `link_france` is 'yes' (case-insensitive), or
    `french_study_focus` is present, longer than three characters and
    contains one of the study keywords below.
    Collaboration link: `openalex_france_collab` is 'yes' or 'true'
    (case-insensitive), or `french_institution_names` is present and longer
    than two characters.

    Returns:
        "both", "collab", "study" or "none".
    """
    study_keywords = ('france', 'french', 'paris', 'bordeaux', 'poitiers', 'louvre', 'chartres')

    focus = row['french_study_focus']
    focus_text = str(focus)
    mentions_france = (
        pd.notna(focus)
        and len(focus_text) > 3
        and any(keyword in focus_text.lower() for keyword in study_keywords)
    )
    is_study = str(row['link_france']).lower() == 'yes' or mentions_france

    institutions = row['french_institution_names']
    has_institution = pd.notna(institutions) and len(str(institutions)) > 2
    is_collab = str(row['openalex_france_collab']).lower() in ('yes', 'true') or has_institution

    # Verdict
    if is_collab:
        return "both" if is_study else "collab"
    return "study" if is_study else "none"

# Recompute the connection type from the updated rows (get_final_type is applied row by row).
df_final['france_connection_type'] = df_final.apply(get_final_type, axis=1)

# Write the final table to disk, without the index column.
df_final.to_csv("ArtHistory_Final_Graph_Ready.csv", index=False)
print("CSV Final g√©n√©r√© : ArtHistory_Final_Graph_Ready.csv")


# Graph under construction: node records, link records, the set of node IDs
# already emitted, and the number of researchers attached to each topic.
nodes = []
links = []
existing_nodes = set()
topic_counts = {}

def add_node(id, group, type_node, attributes=None):
    """Append a node to `nodes` unless a node with this ID already exists.

    Args:
        id: unique node identifier (also recorded in `existing_nodes`).
        group: value stored under the node's "group" key.
        type_node: value stored under the node's "type" key.
        attributes: optional mapping of extra keys merged into the node;
            keys equal to "id", "group" or "type" override the base values.

    The default for `attributes` is None rather than a shared `{}` so that no
    mutable object is reused between calls.
    """
    if id in existing_nodes:
        return

    node = {"id": id, "group": group, "type": type_node}
    if attributes:
        node.update(attributes)
    nodes.append(node)
    existing_nodes.add(id)

print("G√©n√©ration du JSON...")


def _text_or_default(value, default=''):
    """Return `value` as text, or `default` when it is missing.

    None, NaN and the placeholder texts '', 'nan' and 'none' (any case) count
    as missing. Unlike `str(value).replace('nan', '')`, this never alters
    real text that merely contains the letters "nan" and never yields the
    literal text "None".
    """
    if pd.isna(value):
        return default
    text = str(value)
    return default if text.strip().lower() in ('', 'nan', 'none') else text


for idx, row in df_final.iterrows():
    researcher_name = row['name']

    # A missing or non-numeric works count falls back to 1.
    try:
        works_count = max(int(row.get('openalex_works_count', 1)), 0)
    except (TypeError, ValueError, OverflowError):
        works_count = 1

    # A missing university becomes 'Unknown' instead of a NaN, which would be
    # written to the file as the invalid JSON token `NaN`.
    university = _text_or_default(row.get('university'), 'Unknown')

    researcher_attrs = {
        "university": university,
        "france_type": row['france_connection_type'],
        # Node size grows logarithmically with the number of works.
        "radius": 5 + math.log(works_count + 1) * 2,

        "institutions": _text_or_default(row.get('french_institution_names')),
        "study_focus": _text_or_default(row.get('french_study_focus'))
    }

    add_node(researcher_name, university, "researcher", researcher_attrs)

    # One topic node and one researcher -> topic link per macro tag.
    tags = [t.strip() for t in _text_or_default(row['macro_tags']).split(',')]
    for tag in tags:
        if not tag or tag.lower() == 'nan': continue

        topic_counts[tag] = topic_counts.get(tag, 0) + 1

        add_node(tag, "Topic", "topic", {"france_type": "none"})
        links.append({"source": researcher_name, "target": tag, "value": 1})

# Topic node size grows logarithmically with the number of attached researchers.
for node in nodes:
    if node['type'] == 'topic':
        count = topic_counts.get(node['id'], 1)
        node['radius'] = 3 + math.log(count + 1) * 3
network_data = {"nodes": nodes, "links": links}

# allow_nan=False makes the export fail loudly rather than emit invalid JSON.
with open('network_data_final.json', 'w', encoding='utf-8') as f:
    json.dump(network_data, f, allow_nan=False)

print(f"JSON Final g√©n√©r√© : network_data_final.json ({len(nodes)} noeuds, {len(links)} liens)")
print("\n--- Bilan des Types ---")
print(df_final['france_connection_type'].value_counts())

CSV Final g√©n√©r√© : ArtHistory_Final_Graph_Ready.csv
G√©n√©ration du JSON...
JSON Final g√©n√©r√© : network_data_final.json (300 noeuds, 1303 liens)

--- Bilan des Types ---
france_connection_type
none      65
study     52
both       2
collab     1
Name: count, dtype: int64
