# BDPM — Base de Données Publique des Médicaments

**What:** Official reference database for all drugs authorized in France — AMM status, compositions, prices, reimbursement rates, generic groups.

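**Why it matters:** BDPM is the drug reference dataset for this project: it decodes the CIP/CIS codes found in Open Medic and maps each drug to its active substances, its lab (marketing-authorization holder), and its therapeutic positioning (SMR/ASMR opinions).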

| Property | Value |
|----------|-------|
| Source | base-donnees-publique.medicaments.gouv.fr |
| Format | Tab-separated TXT (no headers) |
| Encoding | UTF-8 |
| Size | ~4 MB total (8 small files) |

In [None]:
import httpx
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Local cache directory for the raw BDPM exports (path is relative to the
# notebook's working directory). Created on first run.
RAW_DIR = Path("../../data/raw/bdpm")
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Download endpoint; the file name is appended as the `fichier` query value.
BASE_URL = "https://base-donnees-publique.medicaments.gouv.fr/telechargement.php?fichier="
# File name -> short description (the description is only used in the
# progress output below).
FILES = {
    "CIS_bdpm.txt": "Drug specialties (main table)",
    "CIS_CIP_bdpm.txt": "Presentations (CIP13, prices, reimbursement)",
    "CIS_COMPO_bdpm.txt": "Compositions (active substances)",
    "CIS_GENER_bdpm.txt": "Generic groups",
    "CIS_HAS_SMR_bdpm.txt": "SMR opinions (medical benefit)",
    "CIS_HAS_ASMR_bdpm.txt": "ASMR opinions (improvement of medical benefit)",
    "CIS_CPD_bdpm.txt": "Prescribing/dispensing conditions",
    "HAS_LiensPageCT_bdpm.txt": "HAS transparency commission links",
}

# Fetch every file that is not already cached. A single client is shared by
# all requests so the connection to the server can be reused.
with httpx.Client(follow_redirects=True, timeout=60) as client:
    for fname, desc in FILES.items():
        dest = RAW_DIR / fname
        if dest.exists():
            # Already downloaded on a previous run; delete the file to refresh.
            print(f"  [skip] {fname}")
            continue
        print(f"  Downloading {fname} ({desc})...", end=" ")
        resp = client.get(f"{BASE_URL}{fname}")
        resp.raise_for_status()
        # Write to a sibling ".part" file and rename it into place, so an
        # interrupted write never leaves a truncated file that the
        # `dest.exists()` check above would treat as a valid cache entry.
        partial = dest.with_name(f"{dest.name}.part")
        partial.write_bytes(resp.content)
        partial.replace(dest)
        print(f"{len(resp.content) / 1024:.0f} KB")

print("\nAll BDPM files ready.")

## Load the 3 main tables

In [None]:
# --- CIS_bdpm: Drug specialties ---
# The file is read without a header row, so the column names are supplied
# here, in file order.
specialites_columns = [
    "code_cis",
    "denomination",
    "forme_pharma",
    "voie_admin",
    "statut_amm",
    "type_procedure",
    "etat_commercialisation",
    "date_amm",
    "statut_bdm",
    "numero_autorisation_euro",
    "titulaire",
    "surveillance_renforcee",
]
specialites = pd.read_csv(
    RAW_DIR / "CIS_bdpm.txt",
    sep="\t",
    header=None,
    names=specialites_columns,
    encoding="utf-8",
)
print(f"Drug specialties: {len(specialites):,} rows")
# Last expression of the cell: shows the first rows as the cell output.
specialites.head()

In [None]:
# --- CIS_CIP_bdpm: Presentations (CIP13, prices) ---
# One row per presentation; linked to a specialty through `code_cis`.
# The file is read without a header row, so the column names are supplied
# here, in file order.
presentations_columns = [
    "code_cis",
    "code_cip7",
    "libelle_presentation",
    "statut_admin",
    "etat_commercialisation",
    "date_declaration",
    "code_cip13",
    "agrement_collectivites",
    "taux_remboursement",
    "prix_sans_honoraire",
    "prix_avec_honoraire",
    "honoraire_dispensation",
    "indications_remboursement",
]
presentations = pd.read_csv(
    RAW_DIR / "CIS_CIP_bdpm.txt",
    sep="\t",
    header=None,
    names=presentations_columns,
    encoding="utf-8",
)
print(f"Presentations: {len(presentations):,} rows")
# Last expression of the cell: shows the first rows as the cell output.
presentations.head()

In [None]:
# --- CIS_COMPO_bdpm: Compositions ---
# Substance rows keyed by `code_cis`. The file is read without a header row,
# so the column names are supplied here, in file order.
compositions_columns = [
    "code_cis",
    "designation_element",
    "code_substance",
    "denomination_substance",
    "dosage",
    "ref_dosage",
    "nature_composant",
    "numero_liaison_sa_ft",
]
compositions = pd.read_csv(
    RAW_DIR / "CIS_COMPO_bdpm.txt",
    sep="\t",
    header=None,
    names=compositions_columns,
    encoding="utf-8",
)
print(f"Compositions: {len(compositions):,} rows")
# Last expression of the cell: shows the first rows as the cell output.
compositions.head()

## Explore: Top labs, substances, forms

In [None]:
# Three side-by-side horizontal bar charts: labs, pharmaceutical forms,
# active substances.
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
ax_labs, ax_forms, ax_substances = axes

# In each panel the top-N counts are re-sorted ascending before plotting so
# that the largest bar ends up at the top of the horizontal chart.

# Top 15 labs (holder names are stripped of surrounding whitespace first)
lab_counts = specialites["titulaire"].str.strip().value_counts()
lab_counts.head(15).sort_values().plot.barh(ax=ax_labs, color="steelblue")
ax_labs.set_title("Top 15 Labs (by # of drug specialties)")

# Top 10 pharma forms
form_counts = specialites["forme_pharma"].value_counts()
form_counts.head(10).sort_values().plot.barh(ax=ax_forms, color="darkorange")
ax_forms.set_title("Top 10 Pharmaceutical Forms")

# Top 15 active substances (names stripped of surrounding whitespace first)
substance_counts = compositions["denomination_substance"].str.strip().value_counts()
substance_counts.head(15).sort_values().plot.barh(ax=ax_substances, color="seagreen")
ax_substances.set_title("Top 15 Active Substances")

plt.tight_layout()
plt.show()

In [None]:
# Reimbursement rate distribution.
# Missing rates are kept as their own bucket (dropna=False) rather than
# being silently dropped from the tally.
print("Reimbursement rates:\n")
taux_counts = presentations["taux_remboursement"].value_counts(dropna=False)
print(taux_counts.to_string())

In [None]:
# AMM status breakdown: number of specialties per `statut_amm` value
# (missing values are excluded by value_counts' default).
print("AMM statuses:\n")
statut_counts = specialites["statut_amm"].value_counts()
print(statut_counts.to_string())

In [None]:
# Generics: load the generic-group table and summarise it.
# The file is read without a header row. The sixth column has no meaningful
# name here ("col6"); presumably an empty trailing field — TODO confirm.
generiques_columns = [
    "id_groupe",
    "libelle_groupe",
    "code_cis",
    "type_generique",
    "numero_tri",
    "col6",
]
generiques = pd.read_csv(
    RAW_DIR / "CIS_GENER_bdpm.txt",
    sep="\t",
    header=None,
    names=generiques_columns,
    encoding="utf-8",
)

# Distinct groups and distinct specialties that appear in at least one group.
nb_groupes = generiques["id_groupe"].nunique()
nb_specialites = generiques["code_cis"].nunique()
print(f"Generic groups: {nb_groupes:,} groups")
print(f"Drugs in generic groups: {nb_specialites:,}")

# Row count per `type_generique` value.
print("\nType distribution:")
type_counts = generiques["type_generique"].value_counts()
print(type_counts.to_string())

## Quick DuckDB query demo

In [None]:
import duckdb

# In-memory DuckDB database; nothing is persisted to disk.
con = duckdb.connect(":memory:")
# Make the pandas DataFrames queryable from SQL under these names.
con.register("specialites", specialites)
con.register("compositions", compositions)
con.register("presentations", presentations)

# Top 10 labs by number of distinct drugs, with the number of distinct
# substances appearing in their compositions.
# - The inner join keeps only specialties that have at least one
#   composition row.
# - Both the lab name and the substance name are trimmed, matching the
#   `.str.strip()` applied in the pandas charts, so that padded variants of
#   the same name are not counted as different values.
con.sql("""
    SELECT
        TRIM(s.titulaire) AS lab,
        COUNT(DISTINCT s.code_cis) AS nb_drugs,
        COUNT(DISTINCT TRIM(c.denomination_substance)) AS nb_unique_substances
    FROM specialites s
    JOIN compositions c ON s.code_cis = c.code_cis
    GROUP BY TRIM(s.titulaire)
    ORDER BY nb_drugs DESC
    LIMIT 10
""").show()