In [3]:
import pandas as pd
import numpy as np
import re
import unicodedata
import json
import time
import requests
from pathlib import Path


ModuleNotFoundError: No module named 'pandas'

# Analyse du fichier

1Ô∏è‚É£ Chargement CSV robuste (encodage + s√©parateur)

In [None]:
def load_csv_safely(path: Path) -> pd.DataFrame:
    encodings = ["utf-8", "utf-8-sig", "latin1"]
    seps = [",", ";", "\t"]

    for enc in encodings:
        for sep in seps:
            try:
                df = pd.read_csv(
                    path,
                    encoding=enc,
                    sep=sep,
                    engine="python",
                    on_bad_lines="skip",
                )
                print(f"‚úÖ CSV charg√© | enc={enc} | sep='{sep}'")
                return df
            except Exception:
                pass

    raise RuntimeError("‚ùå Impossible de lire le CSV")


In [None]:
df = load_csv_safely(r"C:\Users\abapst\nb_python\copublication_italie\ITALIE\plotly2\Copublis_Internationales_Inria_nov_2025_complet.csv")
print(df.shape)

‚úÖ CSV charg√© | enc=latin1 | sep=','
(32084, 1)


2Ô∏è‚É£ Normalisation des noms de colonnes (camelCase ASCII)

Adapter le nom des colonnes en camelCase sans caract√®res sp√©ciaux.

In [None]:
def normalize_column(col: str) -> str:
    col = unicodedata.normalize("NFKD", col)
    col = col.encode("ascii", "ignore").decode("utf-8")
    col = re.sub(r"[^\w\s]", "", col)
    parts = col.strip().split()
    return parts[0].lower() + "".join(p.capitalize() for p in parts[1:])


In [None]:
df.columns = [normalize_column(c) for c in df.columns]
print(df.columns.tolist())


['centreequipeauteursFrauteursCopubliantsorganismeCopubliantadressevillepaysidAurehaluenonUeanneehaliddomainesdomainesConsolidesmotsclesresumelatitudelongitudegeonameid']


3Ô∏è‚É£ Nettoyage des villes et organismes

In [None]:
def clean_city(city):
    if pd.isna(city):
        return None

    city = city.strip()

    if city.upper() in {"ANL", "??", "???", "N/A"}:
        return None

    city = re.sub(r"\d+", "", city)
    city = re.sub(r"[^\w\s-]", "", city)

    city = unicodedata.normalize("NFKD", city)
    city = city.encode("ascii", "ignore").decode("utf-8")

    city = re.sub(r"\s+", " ", city).strip()

    return city if len(city) > 1 else None


In [None]:
if "Ville" in df.columns:
    df["villeClean"] = df["Ville"].apply(clean_city)


Nettoyage des organismes copubliants

In [None]:
def clean_org(org):
    if pd.isna(org):
        return None
    org = re.sub(r"[\[\]\?]", "", org)
    return re.sub(r"\s+", " ", org).strip()


In [None]:
if "Organisme_copubliant" in df.columns:
    df["organismeCopubliantClean"] = df["Organisme_copubliant"].apply(clean_org)


4Ô∏è‚É£ Mise √† jour des domaines

In [None]:
if "Domaine(s)" in df.columns:
    df["Domaine(s)"] = (
        df["Domaine(s)"]
        .astype(str)
        .str.replace(r"[^\w\s;/]", "", regex=True)
        .str.strip()
    )


5Ô∏è‚É£ Validation g√©ographique AVANT g√©ocodage

In [None]:
def geo_diagnostics(df):
    print("\nüó∫Ô∏è DIAGNOSTICS G√âO")

    if not {"Latitude", "Longitude"} <= set(df.columns):
        print("‚ö†Ô∏è Pas de colonnes latitude/longitude")
        return

    invalid = df[
        (df["Latitude"].abs() > 90) | (df["Longitude"].abs() > 180)
    ]
    print("Coordonn√©es invalides :", len(invalid))

    missing = df[
        df["villeClean"].notna()
        & (df["Latitude"].isna() | df["Longitude"].isna())
    ]
    print("Villes sans coordonn√©es :", len(missing))


In [None]:
geo_diagnostics(df)



üó∫Ô∏è DIAGNOSTICS G√âO
‚ö†Ô∏è Pas de colonnes latitude/longitude


6Ô∏è‚É£ G√©ocodage Nominatim avec cache

In [None]:
def load_cache():
    if CACHE_PATH.exists():
        return json.loads(CACHE_PATH.read_text(encoding="utf-8"))
    return {}

def save_cache(cache):
    CACHE_PATH.write_text(
        json.dumps(cache, indent=2, ensure_ascii=False),
        encoding="utf-8"
    )


In [None]:
def nominatim(city, country=None):
    url = "https://nominatim.openstreetmap.org/search"
    params = {"city": city, "format": "json", "limit": 1}
    if country:
        params["country"] = country

    headers = {"User-Agent": "INRIA-Geocoder/1.0"}
    time.sleep(1.1)

    r = requests.get(url, params=params, headers=headers, timeout=20)
    if r.status_code != 200 or not r.json():
        return None

    res = r.json()[0]
    return float(res["lat"]), float(res["lon"]), float(res.get("importance", 0.5))


In [None]:
cache = load_cache()

lat_geo, lon_geo, source, conf, status = [], [], [], [], []

for _, row in df.iterrows():
    city = row.get("villeClean")
    country = row.get("Pays")

    if pd.notna(row.get("Latitude")) and pd.notna(row.get("Longitude")):
        lat_geo.append(row["Latitude"])
        lon_geo.append(row["Longitude"])
        source.append("existing")
        conf.append(1.0)
        status.append("ok")
        continue

    if not city:
        lat_geo.append(None)
        lon_geo.append(None)
        source.append(None)
        conf.append(0.0)
        status.append("missing")
        continue

    key = f"{city}|{country}"

    if key in cache:
        lat, lon, c = cache[key]
        lat_geo.append(lat)
        lon_geo.append(lon)
        source.append("cache")
        conf.append(c)
        status.append("ok")
        continue

    res = nominatim(city, country)
    if res:
        lat, lon, c = res
        cache[key] = (lat, lon, c)
        lat_geo.append(lat)
        lon_geo.append(lon)
        source.append("nominatim")
        conf.append(c)
        status.append("ok")
    else:
        lat_geo.append(None)
        lon_geo.append(None)
        source.append("nominatim")
        conf.append(0.0)
        status.append("ambiguous")

save_cache(cache)

df["latitudeGeo"] = lat_geo
df["longitudeGeo"] = lon_geo
df["geoSource"] = source
df["geoConfidence"] = conf
df["geoStatus"] = status


NameError: name 'CACHE_PATH' is not defined

7Ô∏è‚É£ Pr√©-agr√©gations Dash

In [None]:
agg_pays = df.groupby("Pays").size().reset_index(name="nbPublications")
agg_annee = df.groupby("Ann√©e").size().reset_index(name="nbPublications")
agg_ue = df.groupby("UE/Non_UE").size().reset_index(name="nbPublications")
agg_ville = (
    df[df["geoStatus"] == "ok"]
    .groupby(["villeClean", "latitudeGeo", "longitudeGeo"])
    .size()
    .reset_index(name="nbPublications")
)


8Ô∏è‚É£ Tests rapides (qualit√© minimale)

In [None]:
assert df["villeClean"].isna().mean() < 0.3, "‚ö†Ô∏è Trop de villes invalides"
assert (df["geoStatus"] == "ok").mean() > 0.6, "‚ö†Ô∏è G√©ocodage insuffisant"


9Ô∏è‚É£ Export final

In [None]:
df.to_csv(f"{OUTPUT_PREFIX}.csv", sep=";", encoding="utf-8", index=False)
df.to_parquet(f"{OUTPUT_PREFIX}.parquet", index=False)

agg_pays.to_csv("agg_pays.csv", index=False)
agg_annee.to_csv("agg_annee.csv", index=False)
agg_ville.to_parquet("agg_ville.parquet", index=False)

print("‚úÖ PIPELINE TERMIN√â AVEC SUCC√àS")


# üîú 2. Cr√©ation du rapport PDF

Pour le rapport PDF automatique, voici la fonction qui g√©n√®re une documentation sur le pipeline, en tenant compte des √©tapes pr√©c√©dentes.

In [None]:
# Continue PDF generation code

from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.pagesizes import A4

def generate_pdf_report(df):
    output_pdf = "pipeline_report.pdf"
    doc = SimpleDocTemplate(output_pdf, pagesize=A4)
    story = []
    styles = getSampleStyleSheet()

    def add_title(text):
        story.append(Paragraph(f"<b><font size=16>{text}</font></b>", styles["Title"]))
        story.append(Spacer(1, 12))

    def add_section(title, body):
        story.append(Paragraph(f"<b>{title}</b>", styles["Heading2"]))
        story.append(Spacer(1, 6))
        for line in body.split("\n"):
            story.append(Paragraph(line, styles["Normal"]))
        story.append(Spacer(1, 12))

    add_title("Rapport du Pipeline de Nettoyage et G√©ocodage")

    add_section("R√©sum√©", f"""
    Nombre total de lignes : {len(df)}
    Colonnes disponibles : {len(df.columns)}
    Taux de g√©ocodage OK : {(df['geoStatus']=='ok').mean():.2%}
    """)

    add_section("G√©ocodage", f"""
    Sources :
    - Existing : {(df['geoSource']=='existing').mean():.2%}
    - Cache : {(df['geoSource']=='cache').mean():.2%}
    - Nominatim : {(df['geoSource']=='nominatim').mean():.2%}

    Cas ambigus : {(df['geoStatus']=='ambiguous').sum()}
    """)

    add_section("Qualit√© des donn√©es", f"""
    Villes nettoy√©es manquantes : {df['villeClean'].isna().sum()}
    Organismes nettoy√©s manquants : {df['organismeCopubliantClean'].isna().sum()}
    """)

    doc.build(story)
    print(f"üìÑ Rapport PDF g√©n√©r√© : {output_pdf}")




#  DASH APP (branch√©e DIRECTEMENT sur mes donn√©es)

pipeline/
‚îú‚îÄ‚îÄ pipeline_notebook.ipynb   ‚Üê ex√©cution principale
‚îú‚îÄ‚îÄ dash_app.py               ‚Üê dashboard Plotly Dash
‚îú‚îÄ‚îÄ report.py                 ‚Üê g√©n√©ration PDF
‚îú‚îÄ‚îÄ tests/
‚îÇ   ‚îú‚îÄ‚îÄ test_villes.py
‚îÇ   ‚îú‚îÄ‚îÄ test_organismes.py
‚îÇ   ‚îî‚îÄ‚îÄ test_geo.py
‚îî‚îÄ‚îÄ geocode_cache.json


In [None]:
import dash
from dash import dcc, html, Input, Output
import plotly.express as px
import pandas as pd

df = pd.read_parquet("Copublications_INRIA_clean.parquet")

app = dash.Dash(__name__)

app.layout = html.Div([
    html.H1("Copublications internationales INRIA"),

    dcc.Dropdown(
        id="ue_filter",
        options=[{"label": v, "value": v} for v in df["UE/Non_UE"].dropna().unique()],
        multi=True,
        placeholder="UE / Non UE"
    ),

    dcc.Dropdown(
        id="year_filter",
        options=[{"label": int(y), "value": int(y)} for y in sorted(df["Ann√©e"].dropna().unique())],
        multi=True,
        placeholder="Ann√©e"
    ),

    dcc.Graph(id="map"),
    dcc.Graph(id="by_country")
])


In [None]:
@app.callback(
    [Output("map", "figure"),
     Output("by_country", "figure")],
    [Input("ue_filter", "value"),
     Input("year_filter", "value")]
)
def update_graphs(ue, years):
    dff = df.copy()

    if ue:
        dff = dff[dff["UE/Non_UE"].isin(ue)]
    if years:
        dff = dff[dff["Ann√©e"].isin(years)]

    map_fig = px.scatter_geo(
        dff[dff["geoStatus"] == "ok"],
        lat="latitudeGeo",
        lon="longitudeGeo",
        size=dff.groupby("Ville").transform("size"),
        hover_name="Ville",
        title="Carte des copublications"
    )

    country_fig = px.bar(
        dff.groupby("Pays").size().reset_index(name="n"),
        x="Pays",
        y="n",
        title="Publications par pays"
    )

    return map_fig, country_fig


RAPPORT PDF AUTOMATIQUE (STATISTIQUES R√âELLES)

In [None]:
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.pagesizes import A4
import pandas as pd

df = pd.read_parquet("Copublications_INRIA_clean.parquet")

doc = SimpleDocTemplate("rapport_pipeline_INRIA.pdf", pagesize=A4)
styles = getSampleStyleSheet()
story = []

def add(title, text):
    story.append(Paragraph(f"<b>{title}</b>", styles["Heading2"]))
    story.append(Spacer(1, 6))
    story.append(Paragraph(text, styles["Normal"]))
    story.append(Spacer(1, 12))

story.append(Paragraph("<b>Rapport de traitement des copublications</b>", styles["Title"]))

add("Vue d'ensemble", f"""
Nombre total de lignes : {len(df)}<br/>
Ann√©es couvertes : {int(df['Ann√©e'].min())} ‚Äì {int(df['Ann√©e'].max())}<br/>
Taux de g√©ocodage valide : {(df['geoStatus']=='ok').mean():.2%}
""")

add("G√©ocodage", f"""
Sources utilis√©es :<br/>
- Coordonn√©es existantes : {(df['geoSource']=='existing').mean():.2%}<br/>
- Cache : {(df['geoSource']=='cache').mean():.2%}<br/>
- Nominatim : {(df['geoSource']=='nominatim').mean():.2%}<br/>
Cas ambigus : {(df['geoStatus']=='ambiguous').sum()}
""")

add("Qualit√© des donn√©es", f"""
Villes non exploitables : {df['villeClean'].isna().sum()}<br/>
Organismes nettoy√©s manquants : {df['organismeCopubliantClean'].isna().sum()}
""")

doc.build(story)
print("üìÑ Rapport PDF g√©n√©r√© : rapport_pipeline_INRIA.pdf")


TESTS UNITAIRES (pytest)

In [None]:
# tests/test_villes.py
from pipeline_notebook import clean_city

def test_city_cleaning():
    assert clean_city(" Paris ") == "Paris"
    assert clean_city("???") is None
    assert clean_city("ANL") is None



In [None]:
# tests/test_organismes.py
from pipeline_notebook import clean_org

def test_org_cleaning():
    assert clean_org("INRIA [Paris]?") == "INRIA Paris"


In [None]:
# tests/test_geo.py
def test_coordinates_range():
    from pipeline_notebook import geo_diagnostics
    assert -90 <= 48.8 <= 90
    assert -180 <= 2.3 <= 180