# INSEE COG — Code Officiel Géographique

**What:** The official geographic reference for all French communes, departments, and regions. It is the geographic glue that links all the other datasets together.

| Property | Value |
|----------|-------|
| Source | INSEE (downloaded directly from insee.fr) |
| Format | CSV, comma-separated |
| Encoding | UTF-8 |
| Vintage | 2025 (January 1st) |
| Size | ~2 MB total |

In [None]:
import httpx
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Local landing folder for the raw COG extracts; created up front so the
# download cell can write into it.
RAW_DIR = Path("../../data/raw/insee_cog")
RAW_DIR.mkdir(parents=True, exist_ok=True)

# The three 2025 files are published under the same INSEE folder, so each
# download URL is derived from its file name.
_COG_BASE_URL = "https://www.insee.fr/fr/statistiques/fichier/8377162"
FILES = {
    fname: f"{_COG_BASE_URL}/{fname}"
    for fname in (
        "v_commune_2025.csv",
        "v_departement_2025.csv",
        "v_region_2025.csv",
    )
}

In [None]:
# Fetch each COG file once; anything already present in RAW_DIR is skipped.
for fname, url in FILES.items():
    dest = RAW_DIR / fname
    if dest.exists():
        print(f"  [skip] {fname}")
        continue
    # flush so the progress text is visible while the request is in flight
    print(f"  Downloading {fname}...", end=" ", flush=True)
    resp = httpx.get(url, follow_redirects=True, timeout=60)
    resp.raise_for_status()
    # Write to a sibling temp file and rename it into place: if the write is
    # interrupted, no truncated `dest` is left behind to be mistaken for a
    # finished download (and skipped) on the next run.
    tmp = dest.with_name(dest.name + ".part")
    tmp.write_bytes(resp.content)
    tmp.replace(dest)
    print(f"{len(resp.content)/1024:.0f} KB")

print("Done!")

## Communes

In [None]:
# Load the commune-level file, then report its row count and column names.
# NOTE(review): columns are read with pandas' default type inference, so
# code columns that look numeric may not stay as text — confirm whether
# leading zeros matter downstream.
communes_path = RAW_DIR / "v_commune_2025.csv"
communes = pd.read_csv(communes_path, encoding="utf-8")
print(
    f"Communes: {len(communes):,} rows",
    f"\nColumns: {list(communes.columns)}",
    sep="\n",
)
communes.head()

In [None]:
# Commune types
# Row count per TYPECOM value, printed only when the file has that column.
if 'TYPECOM' in communes.columns:
    typecom_counts = communes['TYPECOM'].value_counts()
    print("Commune types:", typecom_counts.to_string(), sep="\n")
    print("\nCOM = commune, ARM = arrondissement municipal, COMD = commune déléguée, COMA = commune associée")

In [None]:
# Filter to actual communes (COM type)
# Without a TYPECOM column there is nothing to filter on, so the full table
# is used as-is.
if 'TYPECOM' not in communes.columns:
    actual_communes = communes
else:
    is_com = communes['TYPECOM'].eq('COM')
    actual_communes = communes.loc[is_com]
    print(f"Actual communes (COM): {len(actual_communes):,}")

In [None]:
# Communes per department
# Find the department-code column among the accepted spellings, keeping the
# file's own column order.
accepted_names = ['DEP', 'dep', 'code_departement']
dep_col = communes.columns[communes.columns.isin(accepted_names)].tolist()
print(f"Department column: {dep_col}")

if dep_col:
    col = dep_col[0]
    # One bar per department code, ordered by code.
    communes_per_dept = actual_communes[col].value_counts().sort_index()

    fig, ax = plt.subplots(figsize=(14, 5))
    communes_per_dept.plot(kind="bar", ax=ax, color="steelblue", width=0.8)
    ax.set(
        title="Number of Communes per Department",
        xlabel="Department code",
        ylabel="Number of communes",
    )
    # Small rotated labels so the many department codes stay legible.
    plt.xticks(rotation=90, fontsize=6)
    fig.tight_layout()
    plt.show()

## Departments

In [None]:
# Load the department-level file, then report its row count and column names.
# NOTE(review): columns are read with pandas' default type inference —
# confirm that code columns keep the form the joins downstream expect.
departements_path = RAW_DIR / "v_departement_2025.csv"
departements = pd.read_csv(departements_path, encoding="utf-8")
print(
    f"Departments: {len(departements)} rows",
    f"Columns: {list(departements.columns)}",
    sep="\n",
)
departements.head(10)

## Regions

In [None]:
# Load the region-level file, then report its row count and column names.
# NOTE(review): columns are read with pandas' default type inference —
# confirm that code columns keep the form the joins downstream expect.
regions_path = RAW_DIR / "v_region_2025.csv"
regions = pd.read_csv(regions_path, encoding="utf-8")
print(
    f"Regions: {len(regions)} rows",
    f"Columns: {list(regions.columns)}",
    sep="\n",
)
regions

## Build the geographic hierarchy

This is what `dim_geography` will look like in our star schema:

In [None]:
import duckdb

# In-memory DuckDB session; each DataFrame is registered under the name the
# SQL below refers to.
con = duckdb.connect(":memory:")
frames_by_view = {
    "communes": actual_communes,
    "departements": departements,
    "regions": regions,
}
for view_name, frame in frames_by_view.items():
    con.register(view_name, frame)

# Show the column names to build the join
for view_name, frame in frames_by_view.items():
    print(f"{view_name.capitalize()} columns:", list(frame.columns))

In [None]:
# Build the full hierarchy: commune → department → region
#
# INSEE region codes are two-character identifiers (e.g. "01", "84"). The
# CSVs are loaded with pandas' default type inference, which can turn REG
# into a number (leading zero lost; float if the column has blanks). The
# query therefore:
#   * emits code_region as zero-padded text, matching code_departement, and
#   * compares the region join keys as integers, so the join works whether
#     REG arrived as text or as a number on either side.
geo = con.sql("""
    SELECT
        c.COM AS code_commune_insee,
        c.LIBELLE AS nom_commune,
        c.TYPECOM AS type_commune,
        c.DEP AS code_departement,
        d.LIBELLE AS nom_departement,
        lpad(CAST(CAST(c.REG AS INTEGER) AS VARCHAR), 2, '0') AS code_region,
        r.LIBELLE AS nom_region
    FROM communes c
    LEFT JOIN departements d ON c.DEP = d.DEP
    LEFT JOIN regions r ON CAST(c.REG AS INTEGER) = CAST(r.REG AS INTEGER)
    ORDER BY c.COM
""").df()

print(f"Geographic hierarchy: {len(geo):,} communes")
geo.head(10)

In [None]:
# Communes per region
# Count rows per region name; the stored series is ordered largest first.
region_sizes = geo.groupby('nom_region').size()
communes_per_region = region_sizes.sort_values(ascending=False)

# The chart is drawn from the same counts re-sorted in ascending order.
fig, ax = plt.subplots(figsize=(10, 6))
ascending_counts = communes_per_region.sort_values()
ascending_counts.plot(kind="barh", ax=ax, color="darkorange")
ax.set(title="Number of Communes per Region", xlabel="Number of communes")
fig.tight_layout()
plt.show()

In [None]:
# Summary stats
# Headline figures for the joined table: row count plus the number of
# distinct department and region codes.
summary = {
    "Total communes": f"{len(geo):,}",
    "Departments": geo['code_departement'].nunique(),
    "Regions": geo['code_region'].nunique(),
}
for label, value in summary.items():
    print(f"{label}: {value}")
print("\nThis is the 'geographic glue' — every other dataset joins here via code_commune_insee.")