# FINESS — Healthcare Establishments

**What:** National registry of all healthcare, social, and medico-social establishments (hospitals, clinics, labs, pharmacies, EHPAD, etc.)

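**Why it matters:** FINESS enables institutional mapping — where establishments are located, what type they are, and what activity they carry out. It links to RPPS (the national registry of healthcare professionals) via practitioner affiliation.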

| Property | Value |
|----------|-------|
| Source | data.gouv.fr |
| Format | CSV, semicolon-delimited |
| Encoding | UTF-8 |
| Geolocated file | ~48 MB |
| Standard file | ~36 MB |

> **Important:** FINESS distinguishes EJ (Entité Juridique = legal entity) from ET (Établissement = physical site). One EJ can have multiple ETs.

In [None]:
import httpx
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Local cache directory for the raw FINESS extracts; created on first run.
RAW_DIR = Path("../..") / "data" / "raw" / "finess"
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Both extracts are published under the same data.gouv.fr resource folder;
# only the timestamped sub-path differs between them.
_FINESS_RESOURCES = (
    "https://static.data.gouv.fr/resources/"
    "finess-extraction-du-fichier-des-etablissements"
)

# Maps local file name -> download URL.
FILES = {
    local_name: f"{_FINESS_RESOURCES}/{remote_path}"
    for local_name, remote_path in (
        ("finess_geoloc.csv", "20260108-153415/etalab-cs1100507-stock-20260107-0342.csv"),
        ("finess_standard.csv", "20260108-153611/etalab-cs1100502-stock-20260107-0343.csv"),
    )
}

In [None]:
# Fetch every FINESS extract listed in FILES into RAW_DIR.
# Files already present locally are reported and skipped.
for fname, url in FILES.items():
    dest = RAW_DIR / fname
    if dest.exists():
        print(f"  [skip] {fname} ({dest.stat().st_size/1e6:.0f} MB)")
        continue
    print(f"  Downloading {fname}...", end=" ")
    # Stream into a sibling ".part" file and rename only after the transfer
    # finished. Writing straight to `dest` would leave a truncated file behind
    # on interruption, and the exists() check above would then skip it forever.
    part = dest.with_name(dest.name + ".part")
    try:
        with httpx.stream("GET", url, follow_redirects=True, timeout=120) as resp:
            resp.raise_for_status()
            with part.open("wb") as fh:
                for chunk in resp.iter_bytes():
                    fh.write(chunk)
        part.replace(dest)
    finally:
        # Only still present if the download failed part-way; clean it up.
        if part.exists():
            part.unlink()
    print(f"{dest.stat().st_size/1e6:.0f} MB")

print("Done!")

In [None]:
# Load the geolocated file (richer — has lat/lon)
geoloc_path = RAW_DIR / "finess_geoloc.csv"

# Read just the header first so identifier-like columns can be pinned to str.
# With dtype inference, an all-digit code column is parsed as an integer and
# loses its leading zeros (e.g. codes starting with departments 01-09).
# The name filters mirror the ones used further down in this notebook
# ("finess" for identifiers, "dep" for departments).
geoloc_header = pd.read_csv(geoloc_path, sep=";", encoding="utf-8", nrows=0)
id_dtypes = {
    c: str
    for c in geoloc_header.columns
    if "finess" in c.lower() or "dep" in c.lower()
}

df = pd.read_csv(
    geoloc_path,
    sep=";",
    encoding="utf-8",
    dtype=id_dtypes,
    low_memory=False,
)
print(f"Loaded {len(df):,} establishments")
print(f"\nColumns ({len(df.columns)}):\n{list(df.columns)}")
df.head()

In [None]:
# Show the dtype pandas assigned to each column of the loaded frame.
df.dtypes

In [None]:
# FINESS identifiers: every column whose name mentions "finess"
# (case-insensitive; this also covers names containing "nofiness"),
# followed by the number of distinct non-null values in each.
finess_cols = [c for c in df.columns if "finess" in c.lower()]
print(f"FINESS columns: {finess_cols}")
for col in finess_cols:
    n_distinct = df[col].nunique()
    print(f"  {col}: {n_distinct:,} unique values")

In [None]:
# Establishment categories
# Prefer columns that look like category *labels* ("categ" + "lib" in the
# name); fall back to any "categ" column when no labelled one exists.
categ_cols = [c for c in df.columns if "categ" in c.lower()]
cat_col = [c for c in categ_cols if "lib" in c.lower()] or categ_cols
print(f"Category columns: {cat_col}")

if cat_col:
    col = cat_col[-1]  # the last match is the one used for the chart
    top_cats = df[col].value_counts().iloc[:20]
    print("\nTop 20 establishment categories:\n")
    print(top_cats.to_string())

    # Horizontal bars, sorted ascending so the largest category ends up on top.
    fig, ax = plt.subplots(figsize=(12, 8))
    top_cats.sort_values(ascending=True).plot.barh(ax=ax, color="steelblue")
    ax.set(
        title="Top 20 Establishment Categories",
        xlabel="Number of establishments",
    )
    fig.tight_layout()
    plt.show()

In [None]:
# Geographic distribution by department
# Candidate columns are found by name (any column containing "dep");
# the first match is used.
dept_col = [c for c in df.columns if 'dep' in c.lower()]
print(f"Department columns: {dept_col}")

if dept_col:
    col = dept_col[0]
    # Drop missing values *before* the string cast: astype(str) turns NaN into
    # the literal "nan", which value_counts() would then rank as a department.
    top_depts = df[col].dropna().astype(str).value_counts().head(20)

    if top_depts.empty:
        # Nothing to draw (column entirely missing); plotting would raise.
        print(f"No department values found in {col!r}")
    else:
        fig, ax = plt.subplots(figsize=(10, 6))
        top_depts.sort_values().plot.barh(ax=ax, color="darkorange")
        ax.set_title("Top 20 Departments by Number of Establishments")
        ax.set_xlabel("Number of establishments")
        plt.tight_layout()
        plt.show()

In [None]:
# Geolocalization coverage
# Coordinate columns are guessed from substrings of the column name.
# NOTE(review): these substring tests are loose and may match unrelated
# columns — check the printed lists before trusting the numbers below.
lat_col = [c for c in df.columns if any(tok in c.lower() for tok in ("lat", "coordonnee"))]
lon_col = [c for c in df.columns if any(tok in c.lower() for tok in ("lon", "lng"))]
print(f"Latitude columns: {lat_col}")
print(f"Longitude columns: {lon_col}")

if lat_col:
    # Coverage is measured on the first latitude-like column only.
    n_rows = len(df)
    has_coords = df[lat_col[0]].notna().sum()
    print(f"\nEstablishments with GPS coordinates: {has_coords:,} / {n_rows:,} ({100*has_coords/n_rows:.1f}%)")

In [None]:
# Plot establishments on a scatter map (if coordinates available)
if lat_col and lon_col:
    lc, lnc = lat_col[0], lon_col[0]
    # Keep rows with both values present, coerce each column to numbers
    # (unparseable entries become NaN), then drop whatever failed to convert.
    geo_df = (
        df[[lc, lnc]]
        .dropna()
        .apply(pd.to_numeric, errors="coerce")
        .dropna()
    )
    # Filter to mainland France (rough bounding box; assumes the values are
    # decimal degrees — TODO confirm against the file's coordinate system).
    lat_in_box = (geo_df[lc] > 41) & (geo_df[lc] < 52)
    lon_in_box = (geo_df[lnc] > -6) & (geo_df[lnc] < 10)
    geo_df = geo_df[lat_in_box & lon_in_box]

    fig, ax = plt.subplots(figsize=(8, 10))
    ax.scatter(geo_df[lnc], geo_df[lc], s=0.1, alpha=0.3, color="steelblue")
    ax.set(
        title=f"Healthcare Establishments in Metropolitan France ({len(geo_df):,} points)",
        xlabel="Longitude",
        ylabel="Latitude",
    )
    ax.set_aspect(1.3)
    fig.tight_layout()
    plt.show()

In [None]:
# Open/closed establishments
# A closure-date column is any column whose name contains both "date" and "ferm".
date_cols = [c for c in df.columns if all(tok in c.lower() for tok in ("date", "ferm"))]
print(f"Closure date columns: {date_cols}")

if date_cols:
    # A row counts as closed when the first closure-date column is non-null.
    n_rows = len(df)
    closed = df[date_cols[0]].notna().sum()
    print(f"\nClosed establishments: {closed:,} / {n_rows:,} ({100*closed/n_rows:.1f}%)")

In [None]:
# Null rates
# Percentage of missing values per column, highest first; show the top 15.
null_counts = df.isna().sum()
null_pct = null_counts.div(len(df)).mul(100).sort_values(ascending=False)
print("Top null rates:\n")
print(null_pct.iloc[:15].to_string())