# Transparence Santé — Pharma-to-HCP Payments

**What:** All declared financial links between pharma companies and healthcare actors: conventions, remunerations, benefits.

**Why it matters:** Competitive intelligence — who is paying whom, in which therapeutic areas, at what scale.

| Property | Value |
|----------|-------|
| Source | EurosForDocs (cleaned version of Transparence Santé) |
| URL | https://www.eurosfordocs.fr/download/ts_declaration.csv |
| Format | CSV, comma-separated |
| Encoding | UTF-8 |
| Size | ~500 MB+ |

> **Note:** We use the EurosForDocs cleaned version as recommended. It handles deduplication, RPPS matching, and data quality corrections. The file is large, so for exploration we load only its first 500,000 rows (not a random sample).

In [None]:
import httpx
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Local directory for the raw download. The path is relative, so it resolves
# against the current working directory; it is created if missing.
RAW_DIR = Path("../../data/raw/transparence_sante")
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Local target file for the CSV, and the remote location it is fetched from.
DEST = RAW_DIR / "ts_declaration.csv"
URL = "https://www.eurosfordocs.fr/download/ts_declaration.csv"

In [None]:
# Download (this is ~500MB+ — takes a few minutes)
#
# The response is streamed into a temporary "<name>.part" file that is renamed
# to DEST only once the whole body has been written. An interrupted or failed
# transfer therefore never leaves a truncated CSV at DEST, which the
# `DEST.exists()` check below would otherwise accept as a finished download.
if DEST.exists():
    print(f"Already downloaded: {DEST.stat().st_size / 1e6:.0f} MB")
else:
    print(f"Downloading from {URL}...")
    print("This file is large (~500MB+), be patient...")
    partial = DEST.with_name(DEST.name + ".part")
    try:
        with httpx.stream("GET", URL, follow_redirects=True, timeout=600) as resp:
            # Fail early on HTTP errors instead of saving an error page as CSV.
            resp.raise_for_status()
            # A missing Content-Length header is treated as "size unknown" (0).
            total = int(resp.headers.get("content-length", 0))
            downloaded = 0
            with open(partial, "wb") as f:
                for chunk in resp.iter_bytes(chunk_size=65536):
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total:
                        print(f"\r  {downloaded/1e6:.0f} / {total/1e6:.0f} MB ({100*downloaded/total:.0f}%)", end="")
                    else:
                        # Size unknown: still show how much has arrived.
                        print(f"\r  {downloaded/1e6:.0f} MB", end="")
        # Publish the file only after the stream finished without error.
        partial.replace(DEST)
    finally:
        # The partial file still exists only if the download did not complete
        # (error or manual interrupt); remove it so no stale data lingers.
        if partial.exists():
            partial.unlink()
    print(f"\nDone! {DEST.stat().st_size / 1e6:.0f} MB")

In [None]:
# Exploration only: load the first 500K rows instead of the whole file
# (the complete file holds millions of rows).
df = pd.read_csv(
    DEST,
    nrows=500_000,
    encoding="utf-8",
    low_memory=False,
)
print(f"Loaded {len(df):,} rows (sample)")
print(f"Columns ({len(df.columns)}): {list(df.columns)}")
df.head()

In [None]:
# Show the dtype pandas assigned to each column of the loaded sample.
df.dtypes

In [None]:
# Percentage of missing values per column, most incomplete columns first
null_counts = df.isnull().sum()
null_pct = (null_counts / len(df) * 100).sort_values(ascending=False)
print("Null % per column:\n")
print(null_pct.to_string())

In [None]:
# Row count per payment category (skipped when the column is absent)
if "categorie" in df.columns:
    category_counts = df["categorie"].value_counts()
    print("Payment categories:\n")
    print(category_counts.to_string())

In [None]:
# Find amount-like columns by name; the exact column name may vary in the
# EurosForDocs format, so match on French and English keywords.
amount_cols = [
    c
    for c in df.columns
    if any(keyword in c.lower() for keyword in ("montant", "amount"))
]
print(f"Amount columns found: {amount_cols}")

if amount_cols:
    # Use the first match as the working amount column.
    amt_col = amount_cols[0]
    # Values that cannot be parsed as numbers become NaN.
    df[amt_col] = pd.to_numeric(df[amt_col], errors="coerce")
    print(f"\n{amt_col} stats:")
    print(df[amt_col].describe())

In [None]:
# Rank companies by summed amount within the sample and chart the top 20
enterprise_col = [
    c
    for c in df.columns
    if any(keyword in c.lower() for keyword in ("entreprise", "lab", "company"))
]
print(f"Enterprise columns: {enterprise_col}")

if enterprise_col and amount_cols:
    # First matching column is the grouping key; amt_col is set by the
    # amount-column cell, which runs whenever amount_cols is non-empty.
    ent_col = enterprise_col[0]
    totals_by_company = df.groupby(ent_col)[amt_col].sum()
    top_payers = totals_by_company.sort_values(ascending=False).head(20)

    fig, ax = plt.subplots(figsize=(12, 8))
    # Re-sorted ascending before plotting the horizontal bars.
    ascending_top = top_payers.sort_values()
    ascending_top.plot.barh(ax=ax, color="crimson")
    ax.set_title("Top 20 Labs by Total Payment Amount (sample)")
    ax.set_xlabel("Total amount (€)")
    plt.tight_layout()
    plt.show()

In [None]:
# RPPS coverage: how many sampled rows carry an RPPS number, and how many
# distinct numbers appear
rpps_col = [
    c
    for c in df.columns
    if "rpps" in c.lower()
]
print(f"RPPS columns: {rpps_col}")

if rpps_col:
    # First matching column is used as the RPPS column.
    rpps_c = rpps_col[0]
    rpps_present = df[rpps_c].notna()
    has_rpps = rpps_present.sum()
    print(f"\nRows with RPPS: {has_rpps:,} / {len(df):,} ({100*has_rpps/len(df):.1f}%)")
    print(f"Unique RPPS: {df[rpps_c].nunique():,}")

In [None]:
# Pie chart of the share of rows per payment category
if "categorie" in df.columns:
    fig, ax = plt.subplots(figsize=(8, 8))
    category_counts = df["categorie"].value_counts()
    category_counts.plot.pie(
        ax=ax,
        autopct="%1.1f%%",
        startangle=90,
    )
    ax.set_title("Payment Categories")
    # Blank out the y-axis label.
    ax.set_ylabel("")
    plt.tight_layout()
    plt.show()