# RPPS — Annuaire Santé (HCP Registry)

**What:** National registry of all healthcare professionals (HCPs) — physicians, pharmacists, nurses, etc. It serves as the entity backbone for pharma sales force effectiveness (SFE) work.

| Property | Value |
|----------|-------|
| Source | data.gouv.fr (ANS managed) |
| Format | Pipe-delimited TXT (`\|` separator) |
| Encoding | UTF-8 |
| Main file | `ps-libreacces-personne-activite.txt` (~761 MB) |

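> **Note:** The main file is ~761 MB, so for exploration we load only the first 500,000 rows via `nrows`. This is a head-of-file slice, not a random sample, so the distributions shown below may not be representative of the full registry.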

In [None]:
import httpx
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Local directory that caches the raw RPPS extracts. It is created up front
# (including missing parents) so the download cells can write into it.
RAW_DIR = Path("../../data/raw/rpps")
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Download URLs keyed by local file name. Both files are fetched by this
# notebook: the savoir-faire file (49 MB) and the main personne-activite file
# (~761 MB). Only the first rows of the main file are loaded for exploration.
FILES = {
    "ps-libreacces-savoirfaire.txt": "https://static.data.gouv.fr/resources/annuaire-sante-extractions-des-donnees-en-libre-acces-des-professionnels-intervenant-dans-le-systeme-de-sante-rpps/20260217-085133/ps-libreacces-savoirfaire.txt",
    "ps-libreacces-personne-activite.txt": "https://static.data.gouv.fr/resources/annuaire-sante-extractions-des-donnees-en-libre-acces-des-professionnels-intervenant-dans-le-systeme-de-sante-rpps/20260217-084840/ps-libreacces-personne-activite.txt",
}

In [None]:
# Fetch the savoir-faire extract (small, 49 MB) unless a copy is already cached
# in RAW_DIR. The whole response is held in memory before being written out.
sf_file = RAW_DIR / "ps-libreacces-savoirfaire.txt"
if sf_file.exists():
    print(f"Already exists: {sf_file.stat().st_size/1e6:.0f} MB")
else:
    print("Downloading savoir-faire file (49 MB)...")
    resp = httpx.get(
        FILES["ps-libreacces-savoirfaire.txt"],
        follow_redirects=True,
        timeout=120,
    )
    # Fail loudly on HTTP errors instead of caching an error page.
    resp.raise_for_status()
    payload = resp.content
    sf_file.write_bytes(payload)
    print(f"Done: {len(payload)/1e6:.0f} MB")

In [None]:
# Download the main file (761 MB — this takes a few minutes).
#
# The body is streamed to a temporary "<name>.part" file and renamed to its
# final name only after the transfer completed without raising. Writing
# directly to the final path would leave a truncated file behind on an
# interrupted download, and the `exists()` check below would then treat that
# truncated file as a valid cached copy on every later run.
main_file = RAW_DIR / "ps-libreacces-personne-activite.txt"
if not main_file.exists():
    print("Downloading main HCP file (761 MB)... this takes a few minutes")
    part_file = main_file.with_name(main_file.name + ".part")
    with httpx.stream("GET", FILES["ps-libreacces-personne-activite.txt"],
                      follow_redirects=True, timeout=600) as resp:
        resp.raise_for_status()
        # A missing content-length header yields 0, which disables progress output.
        total = int(resp.headers.get("content-length", 0))
        downloaded = 0
        last_pct = -1
        with open(part_file, "wb") as f:
            for chunk in resp.iter_bytes(65536):
                f.write(chunk)
                downloaded += len(chunk)
                if total:
                    # Refresh the progress line only when the whole-number
                    # percentage changes, instead of once per 64 KB chunk,
                    # so the notebook output is not flooded with updates.
                    pct = 100 * downloaded // total
                    if pct != last_pct:
                        last_pct = pct
                        print(f"\r  {downloaded/1e6:.0f}/{total/1e6:.0f} MB ({pct}%)", end="")
    # Reached only if the stream finished without an exception: publish the file.
    part_file.replace(main_file)
    print("\nDone!")
else:
    print(f"Already exists: {main_file.stat().st_size/1e6:.0f} MB")

## Load and explore the main HCP file

In [None]:
# Read the first 500K rows of the main file to explore its structure.
#
# Every column is loaded as text (dtype=str). The registry is made of
# identifiers and codes rather than quantities; letting pandas infer numeric
# dtypes would drop leading zeros (e.g. "01" -> 1) and turn nullable code
# columns into floats, which corrupts later prefix-based grouping such as the
# two-character department prefix. Missing fields are still read as NaN.
hcp = pd.read_csv(
    main_file,
    sep="|",
    encoding="utf-8",
    nrows=500_000,
    dtype=str,
    low_memory=False,
)
print(f"Loaded {len(hcp):,} rows (sample)")
print(f"\nColumns ({len(hcp.columns)}):\n{list(hcp.columns)}")
# Last expression of the cell: displays the first three rows in the notebook.
hcp.head(3)

In [None]:
# Show the dtype pandas assigned to each column of the loaded sample.
hcp.dtypes

In [None]:
# How many unique HCPs?
#
# Candidate identifier columns are found by substring match on the header.
# That match can also catch metadata columns whose name merely contains the
# keyword (for instance an identifier *type* code), so the statistics below
# use the candidate with the most distinct values rather than whichever
# happens to come first in the file.
rpps_col = [c for c in hcp.columns if "rpps" in c.lower() or "identifiant" in c.lower()]
print(f"Potential RPPS columns: {rpps_col}")
if rpps_col:
    distinct_per_col = hcp[rpps_col].nunique()
    id_col = distinct_per_col.idxmax()
    n_unique = int(distinct_per_col[id_col])
    print(f"Identifier column used: {id_col}")
    print(f"Unique RPPS numbers: {n_unique:,}")
    print(f"Total rows: {len(hcp):,}")
    if n_unique:
        print(f"→ Average rows per HCP: {len(hcp) / n_unique:.1f}")
        print("  (multiple rows = multiple activities/practice locations)")
    else:
        # An entirely null identifier column would otherwise divide by zero.
        print("→ Average rows per HCP: n/a (identifier column has no values)")

In [None]:
# Profession distribution.
# Prefer a label ("libel...") profession column; if none exists, fall back to
# any column whose name mentions "profession".
prof_col = (
    [c for c in hcp.columns if "profession" in c.lower() and "libel" in c.lower()]
    or [c for c in hcp.columns if "profession" in c.lower()]
)
print(f"Profession columns: {prof_col}")

if prof_col:
    col = prof_col[0]
    print("\nProfession distribution:\n")
    profession_counts = hcp[col].value_counts()
    print(profession_counts.to_string())

    # Ten most frequent professions, sorted ascending so the largest bar is on top.
    top_ten = profession_counts.head(10).sort_values()
    fig, ax = plt.subplots(figsize=(10, 6))
    top_ten.plot.barh(ax=ax, color="steelblue")
    ax.set(
        title="Top 10 Professions in RPPS",
        xlabel="Number of activity records",
    )
    plt.tight_layout()
    plt.show()

In [None]:
# Specialty / savoir-faire distribution (for doctors).
#
# Column discovery is by substring match on the header: label ("libel...")
# savoir-faire columns first, then any savoir-faire / specialty column. The
# fallback accepts both the unaccented and the accented spelling, because a
# plain "specialite" can never match a header written "Spécialité".
sf_col = [c for c in hcp.columns if "savoir" in c.lower() and "libel" in c.lower()]
if not sf_col:
    sf_col = [
        c for c in hcp.columns
        if any(key in c.lower() for key in ("savoir", "specialite", "spécialité"))
    ]
print(f"Specialty columns: {sf_col}")

if sf_col:
    # NOTE(review): the extract may carry a savoir-faire *type* label next to
    # the savoir-faire label itself — confirm against the list printed above.
    # Prefer a column that is not a "type" column; otherwise keep the first match.
    col = next((c for c in sf_col if "type" not in c.lower()), sf_col[0])
    print(f"Using column: {col}")
    top_specialties = hcp[col].dropna().value_counts().head(20)
    print("\nTop 20 specialties:\n")
    print(top_specialties.to_string())

    if top_specialties.empty:
        # Nothing to draw when the column is entirely null in the sample.
        print("(no non-null values to plot)")
    else:
        fig, ax = plt.subplots(figsize=(12, 8))
        top_specialties.sort_values().plot.barh(ax=ax, color="darkorange")
        ax.set_title("Top 20 HCP Specialties")
        ax.set_xlabel("Number of records")
        plt.tight_layout()
        plt.show()

In [None]:
# Exercise mode distribution (Libéral, Salarié, etc.).
#
# Matching columns are ordered so that label ("libel...") columns come first,
# in line with how the profession and specialty cells prefer label columns.
# This way the printed distribution shows readable labels rather than raw
# codes whenever a label column exists. The sort is stable, so the original
# column order is otherwise preserved.
mode_col = sorted(
    (c for c in hcp.columns if "mode" in c.lower() and "exercice" in c.lower()),
    key=lambda c: "libel" not in c.lower(),
)
if mode_col:
    print("Exercise mode distribution:\n")
    print(hcp[mode_col[0]].value_counts().to_string())

In [None]:
# Geographic distribution by department.
#
# Department columns are located by header substring, accepting both the
# unaccented and the accented spelling (a plain "departement" can never match
# a header written "Département"). If none is found, fall back to a commune
# code column, whose first two characters are used as the department prefix.
dept_col = [
    c for c in hcp.columns
    if any(key in c.lower() for key in ("departement", "département", "dept"))
]
if not dept_col:
    dept_col = [c for c in hcp.columns if "code" in c.lower() and "commune" in c.lower()]
print(f"Geographic columns: {dept_col}")

if dept_col:
    # The two-character prefix only makes sense on a code column, so prefer one
    # over a label column when both matched.
    col = next((c for c in dept_col if "code" in c.lower()), dept_col[0])
    # Drop nulls before casting: astype(str) would turn NaN into "nan", which
    # would then be counted as a bogus department "na".
    # NOTE(review): if this column was parsed as a number, leading zeros are
    # already lost at load time and the prefix is unreliable — confirm the dtype.
    top_depts = hcp[col].dropna().astype(str).str[:2].value_counts().head(15)

    if top_depts.empty:
        # Nothing to draw when the column is entirely null in the sample.
        print("(no non-null values to plot)")
    else:
        fig, ax = plt.subplots(figsize=(10, 6))
        top_depts.sort_values().plot.barh(ax=ax, color="seagreen")
        ax.set_title("Top 15 Departments by HCP Activity Records")
        ax.set_xlabel("Number of records")
        plt.tight_layout()
        plt.show()

In [None]:
# Null rates: percentage of missing values per column, highest first.
missing_per_column = hcp.isnull().sum()
null_pct = missing_per_column.div(len(hcp)).mul(100).sort_values(ascending=False)
print("Top null rates:\n")
# Only the 15 columns with the highest share of missing values are shown.
worst_columns = null_pct.head(15)
print(worst_columns.to_string())