# Validation Etapes 1 et 2 (Hérault)

Notebook pedagogique minimal pour verifier:
- Etape 1: environnement (versions + imports critiques)
- Etape 2: configuration, recuperation mockee, cleaning, schema

Ce notebook est 100% offline par defaut (pas d'appel reseau reel).


In [1]:
from pathlib import Path
import os
import sys

def find_project_root(start: Path) -> Path:
    """Return the nearest directory at or above ``start`` that holds the project.

    A directory qualifies when it contains both a ``config.yaml`` entry and a
    ``src`` entry. ``start`` itself is examined first, then each of its
    ancestors from the closest to the farthest.

    Raises:
        RuntimeError: If neither ``start`` nor any ancestor qualifies.
    """
    required_entries = ('config.yaml', 'src')
    for folder in (start, *start.parents):
        if all((folder / entry).exists() for entry in required_entries):
            return folder
    raise RuntimeError('Impossible de trouver la racine du projet.')

# Anchor the session on the repository root so that relative paths such as
# 'config.yaml' and imports from the 'src' package work from any start dir.
PROJECT_ROOT = find_project_root(Path.cwd())
os.chdir(PROJECT_ROOT)

# Put the root first on the import path, but never add it twice on re-runs.
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.insert(0, str(PROJECT_ROOT))

print('PROJECT_ROOT =', PROJECT_ROOT)


PROJECT_ROOT = /Users/steph/Code/Python/Jupyter/OCR_projet09


## 1) Validation environnement (Etape 1)

Cette cellule verifie versions et imports demandes dans la mission.


In [2]:
import importlib.metadata as md
import platform

def version(pkg: str) -> str:
    """Return the installed version string of the distribution ``pkg``.

    When the metadata lookup raises ``PackageNotFoundError`` the literal
    ``'not installed'`` is returned instead of propagating the error.
    """
    try:
        installed = md.version(pkg)
    except md.PackageNotFoundError:
        installed = 'not installed'
    return installed

# Report the interpreter version, then the version of each required package.
print('python      :', platform.python_version())
for heading, dist_name in (
    ('langchain   :', 'langchain'),
    ('faiss-cpu   :', 'faiss-cpu'),
    ('mistralai   :', 'mistralai'),
    ('pandas      :', 'pandas'),
    ('requests    :', 'requests'),
):
    print(heading, version(dist_name))

# Each import statement doubles as its own display label.
import_statements = (
    'import faiss',
    'from langchain.vectorstores import FAISS',
    'from langchain.embeddings import HuggingFaceEmbeddings',
    'from mistral import MistralClient',
)
checks = [(statement, statement) for statement in import_statements]

# Run every statement in a throwaway namespace and collect all failures so
# the report is complete before the cell aborts.
failures = []
for label, stmt in checks:
    try:
        exec(stmt, {})
    except Exception as exc:
        print('[KO] ', label, '->', exc)
        failures.append((label, str(exc)))
    else:
        print('[OK] ', label)

if failures:
    raise AssertionError(f'Imports en echec: {failures}')

print('Etape 1 validee.')


python      : 3.11.3
langchain   : 0.0.353
faiss-cpu   : 1.13.2
mistralai   : 0.4.2
pandas      : 2.2.2
requests    : 2.32.3
[OK]  import faiss
[OK]  from langchain.vectorstores import FAISS
[OK]  from langchain.embeddings import HuggingFaceEmbeddings
[OK]  from mistral import MistralClient
Etape 1 validee.


## 2) Charger la config (Etape 2)

On verifie la configuration geographique et temporelle.


In [3]:
from datetime import date, timedelta
import yaml

# Parse the project configuration and keep a handle on its OpenAgenda section.
config = yaml.safe_load(Path('config.yaml').read_text(encoding='utf-8'))
oa = config['openagenda']

# A missing or empty window bound falls back to a date relative to today:
# 365 days back for the start, 90 days ahead for the end.
today = date.today()
window = oa['time_window']
start_date = window.get('start_date') or str(today - timedelta(days=365))
end_date = window.get('end_date') or str(today + timedelta(days=90))

location = oa['location']
print('Zone cible dept :', location.get('department'))
print('Ville pivot     :', location.get('city'))
print('Coordonnees     :', location['latitude'], location['longitude'])
print('Rayon (km)      :', location['radius_km'])
print('Fenetre debut   :', start_date)
print('Fenetre fin     :', end_date)
print('Langue          :', oa['request']['language'])
print('Pagination size :', oa['pagination']['page_size'])

# The notebook targets department '34' with French content.
assert location.get('department') == '34'
assert oa['request']['language'] == 'fr'
print('Configuration validee.')


Zone cible dept : 34
Ville pivot     : Montpellier
Coordonnees     : 43.6119 3.8772
Rayon (km)      : 55
Fenetre debut   : 2025-02-10
Fenetre fin     : 2026-05-11
Langue          : fr
Pagination size : 100
Configuration validee.


## 3) Client OpenAgenda sans reseau (pagination mockee)

Cette cellule valide la logique de pagination/requete sans HTTP reel.


In [4]:
from src.openagenda.client import OpenAgendaConfig, fetch_events

class FakeResponse:
    """Minimal in-memory stand-in for an HTTP response object.

    Exposes ``status_code``, ``text`` (the ``str()`` of the payload) and a
    ``json()`` method that hands back the payload it was built with.
    """

    def __init__(self, payload, status_code=200):
        self.status_code = status_code
        self.text = str(payload)
        self._payload = payload

    def json(self):
        """Return the stored payload object unchanged."""
        return self._payload

class FakeSession:
    """Offline session double that replays a fixed queue of payloads.

    Every ``get`` call increments ``call_count`` and consumes the next queued
    payload. Once the queue is empty, an ``{'events': []}`` payload is served.
    """

    def __init__(self, responses):
        self.call_count = 0
        # Copy the input so the caller's sequence is never consumed.
        self.responses = list(responses)

    def get(self, url, params=None, timeout=None):
        """Return the next payload wrapped in a FakeResponse; arguments are ignored."""
        self.call_count += 1
        payload = self.responses.pop(0) if self.responses else {'events': []}
        return FakeResponse(payload)

    def close(self):
        """Do nothing; there is no real connection to release."""
        return None

# Three queued pages: two with a single event each, then an empty one.
fake_session = FakeSession([
    {'events': [{'uid': 'evt-1', 'title': {'fr': 'Evenement 1'}}]},
    {'events': [{'uid': 'evt-2', 'title': {'fr': 'Evenement 2'}}]},
    {'events': []},
])

base_kwargs = {
    'base_url': 'https://api.openagenda.com/v2/events',
    'api_key': 'demo-key',
    'city': 'Montpellier',
    'page_size': 1,
    'max_pages': 5,
    'max_events': 10,
    'start_date': '2025-01-01',
    'end_date': '2026-12-31',
}

# Pass the department as a constructor field when the config declares one;
# otherwise attach it through the 'extra_params' attribute.
fields = getattr(OpenAgendaConfig, '__dataclass_fields__', {})
if 'department' not in fields:
    cfg = OpenAgendaConfig(**base_kwargs)
    extra = dict(getattr(cfg, 'extra_params', {}) or {})
    extra['department'] = '34'
    cfg.extra_params = extra
else:
    cfg = OpenAgendaConfig(department='34', **base_kwargs)

raw_events = fetch_events(cfg, session=fake_session)
print('Evenements recuperes:', len(raw_events))
print('UIDs:', [evt.get('uid') for evt in raw_events])

# Both events must come back, and the session must have been called exactly
# three times (one call per queued page).
assert len(raw_events) == 2
assert fake_session.call_count == 3
print('Pagination mockee validee.')


Evenements recuperes: 2
UIDs: ['evt-1', 'evt-2']
Pagination mockee validee.


## 4) Cleaning + schema (Etape 2)

On valide les regles de filtrage, deduplication, schema et document_text.


In [5]:
import pandas as pd
from src.preprocess.cleaning import clean_events
from src.preprocess.schema import EVENT_RECORD_FIELDS

# Hand-written raw events used as the cleaning fixture. The assertions that
# follow the clean_events call expect exactly one surviving record, one
# duplicate, one event outside the period and one with a missing required field.
demo_raw_events = [
    # Fully populated event dated inside the cleaning window.
    {
        'uid': 'evt-valid',
        'title': {'fr': 'Concert test'},
        'description': {'fr': 'Soiree musicale'},
        'firstTiming': {'begin': '2025-06-01T19:00:00Z', 'end': '2025-06-01T21:00:00Z'},
        'location': {
            'name': {'fr': 'Salle A'},
            'address': '10 rue de la Paix',
            'city': 'Montpellier',
            'latitude': 43.6119,
            'longitude': 3.8772,
        },
        'canonicalUrl': 'https://example.org/events/evt-valid',
        'tags': ['musique', 'culture'],
    },
    # Same uid as the first event, with fewer fields.
    {
        'uid': 'evt-valid',  # intentional duplicate
        'title': {'fr': 'Concert test'},
        'description': {'fr': 'Soiree musicale'},
        'firstTiming': {'begin': '2025-06-01T19:00:00Z', 'end': '2025-06-01T21:00:00Z'},
        'location': {'name': {'fr': 'Salle A'}, 'city': 'Montpellier'},
    },
    # Begins in 2023, i.e. before the start date passed to clean_events.
    {
        'uid': 'evt-old',
        'title': {'fr': 'Ancien evenement'},
        'firstTiming': {'begin': '2023-01-01T10:00:00Z'},
        'location': {'city': 'Montpellier'},
    },
    # Empty French title; presumably the field counted as missing_required.
    {
        'uid': 'evt-missing-title',
        'title': {'fr': ''},
        'firstTiming': {'begin': '2025-06-02T10:00:00Z'},
        'location': {'city': 'Montpellier'},
    },
]

# Clean the fixture over a window that contains only the June 2025 events.
processed, stats = clean_events(
    raw_events=demo_raw_events,
    start_date='2025-01-01',
    end_date='2026-01-31',
    language='fr',
    source='openagenda',
)

print('Stats:', stats)
display(pd.DataFrame(processed))

# Counters expected from the four fixture events.
expected_counts = {
    'raw_events': 4,
    'duplicates_removed': 1,
    'outside_period': 1,
    'missing_required': 1,
}
for counter_name, expected_value in expected_counts.items():
    assert stats[counter_name] == expected_value
assert len(processed) == 1

# The surviving record must expose exactly the schema fields, a non-blank
# document text and dict-shaped retrieval metadata.
record = processed[0]
assert set(record.keys()) == set(EVENT_RECORD_FIELDS)
assert record['document_text'].strip()
assert isinstance(record['retrieval_metadata'], dict)
print('Cleaning + schema valides.')


Stats: {'raw_events': 4, 'missing_required': 1, 'outside_period': 1, 'after_period_filter': 2, 'duplicates_removed': 1, 'invalid_records': 0, 'processed_events': 1}


Unnamed: 0,event_id,title,description,start_datetime,end_datetime,city,location_name,address,latitude,longitude,url,tags,source,document_text,retrieval_metadata
0,evt-valid,Concert test,Soiree musicale,2025-06-01T19:00:00Z,2025-06-01T21:00:00Z,Montpellier,Salle A,10 rue de la Paix,43.6119,3.8772,https://example.org/events/evt-valid,"[musique, culture]",openagenda,Titre: Concert test\nDescription: Soiree music...,"{'event_id': 'evt-valid', 'city': 'Montpellier..."


Cleaning + schema valides.


## 5) Validation des sorties fichiers (simulation locale)

On simule l'ecriture raw/processed comme `build_dataset.py`.


In [6]:
import json
import tempfile

# Write both artefacts into a throwaway directory that is removed on exit.
with tempfile.TemporaryDirectory() as tmp_dir:
    tmp = Path(tmp_dir)
    raw_path = tmp / 'events_raw.jsonl'
    processed_path = tmp / 'events_processed.parquet'

    # One JSON document per line, non-ASCII characters kept as-is.
    jsonl_lines = [
        json.dumps(item, ensure_ascii=False) + '\n'
        for item in demo_raw_events
    ]
    raw_path.write_text(''.join(jsonl_lines), encoding='utf-8')

    pd.DataFrame(processed).to_parquet(processed_path, index=False)

    assert raw_path.exists()
    assert processed_path.exists()

    # Read the parquet file back to confirm it is loadable.
    check_df = pd.read_parquet(processed_path)
    print('raw_path      =', raw_path)
    print('processed_path=', processed_path)
    print('rows parquet  =', len(check_df))

print('Validation des sorties fichiers OK.')


raw_path      = /var/folders/by/r6dmty813rxgqdnr7hvtrl480000gn/T/tmplaf5zwia/events_raw.jsonl
processed_path= /var/folders/by/r6dmty813rxgqdnr7hvtrl480000gn/T/tmplaf5zwia/events_processed.parquet
rows parquet  = 1
Validation des sorties fichiers OK.


## 6) Validation rapide scripts/tests du repo

Cellule optionnelle: execute `scripts/check_env.py` puis `pytest -q`.


In [7]:
import subprocess

import sys

# Run both tools with the interpreter that executes this notebook. A bare
# 'python3' or 'pytest' is resolved through PATH and may belong to another
# environment, in which case the checks would validate the wrong installation.
commands = [
    [sys.executable, 'scripts/check_env.py'],
    [sys.executable, '-m', 'pytest', '-q'],
]

for cmd in commands:
    print('\n$ ' + ' '.join(cmd))
    # check=False: the return code is inspected below so that stdout and
    # stderr can be shown before the cell is aborted.
    completed = subprocess.run(cmd, text=True, capture_output=True, check=False)
    print(completed.stdout)
    if completed.returncode != 0:
        print(completed.stderr)
        raise RuntimeError(f"Commande en echec: {' '.join(cmd)}")

print('Notebook: toutes les validations Etape 1 + Etape 2 sont OK.')



$ python3 scripts/check_env.py
=== Environment versions ===
python: 3.11.3
langchain: 0.0.353
faiss-cpu: 1.13.2
mistralai: 0.4.2
pandas: 2.2.2
requests: 2.32.3

=== Import checks ===
[OK] import faiss
[OK] from langchain.vectorstores import FAISS
[OK] from langchain.embeddings import HuggingFaceEmbeddings
[OK] from mistral import MistralClient

[SUCCESS] Environment smoke test passed.


$ pytest -q
[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                                                                    [100%][0m
[32m[32m[1m5 passed[0m[32m in 0.02s[0m[0m

Notebook: toutes les validations Etape 1 + Etape 2 sont OK.
