In [1]:
!uv pip install -q bertopic spacy polars datasets hf_xet

In [2]:
!uv pip install -q https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl

In [3]:
import time

import numpy as np
import pandas as pd
import polars as pl
import spacy
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech
from hdbscan import HDBSCAN
from scipy.cluster import hierarchy as sch
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from umap import UMAP


  axis.set_ylabel('$\lambda$ value')
  """Perform robust single linkage clustering from a vector array
  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the small French spaCy pipeline installed from the wheel above; `preprocess` uses it.
nlp = spacy.load("fr_core_news_sm")  

In [5]:
def preprocess(docs):
    """Lemmatise each document and drop stop words and punctuation.

    Args:
        docs: iterable of raw text documents, fed to the module-level ``nlp``
            pipeline in batches of 20.

    Returns:
        A list with one string per input document: the lemmas of the kept
        tokens joined by single spaces.
    """
    def _is_kept(token):
        # Keep a token only if it is neither a stop word nor punctuation.
        return not (token.is_stop or token.is_punct)

    return [
        ' '.join(token.lemma_ for token in parsed if _is_kept(token))
        for parsed in nlp.pipe(docs, batch_size=20)
    ]

In [6]:
# Hand-written list of lower-case, unaccented French words.
# NOTE(review): presumably extra domain stop words for the agreements corpus, but neither
# list is referenced in the visible cells of this notebook — confirm where they are used.
# NOTE(review): 'preambule' and 'signataire' each appear twice in this list.
DICTIONNARY =  ['accord','entreprise', 'preambule', 'sommaire',  'code', 'syndical', 'responsable', 'representant', 
                'present', 'ca', 'organisation', 'preambule', 'peut', 'etre', 'contrat','travail', 'ressources','humaines', 'mise',
                'ainsi', 'et', 'ou', 'alors','collaborateur', 'ci', 'apres', 'party', 'signataire', 'tout', 'etat', 'cause', 'societe', 
                'notamment','article','activite', 'cette', 'donc', 'si', 'sous', 'disposition', 'convention', 'collective', 'dans', 'a', 'cadre',
                'signataire', 'partie', 'parties', 'entre', 'doit', 'mme', 'mr', 'madame', 'monsieur'
               ]

# Truncated word forms; presumably the stemmed counterpart of the list above (TODO confirm).
DICTIONNARY_STEM = ['part', 'signatair', 'organis', 'syndical', 
                    'dont', 'sieg', 'social', 'conseil', 'prud', 'homm', 
                   'vi', 'professionnel', 'disposit', 'legal', 'conventionnel']

In [7]:
import re

def normalize(text):
    """Return ``text`` lower-cased with leading and trailing whitespace removed."""
    return text.lower().strip()


def split_text_by_sentences(text, flagged_sentences):
    """Split ``text`` at the first occurrence of each flagged sentence.

    Matching is case-insensitive and ignores whitespace around each flagged
    sentence. Every match becomes the start of a new chunk; the text before
    the first match (if any) is the first chunk.

    Args:
        text: the full document as a string.
        flagged_sentences: iterable of strings (e.g. section titles) marking
            where the text should be cut. ``None`` is treated as "no titles".
            Entries that are not strings, or that are empty once normalized,
            are ignored.

    Returns:
        A list of stripped, non-empty chunks in document order. When no
        flagged sentence is found, the list contains the original ``text``
        unchanged as its single element.
    """
    if flagged_sentences is None:
        return [text]

    # Lower-case the document once rather than once per flagged sentence.
    # NOTE: positions found in the lower-cased text are used to slice the
    # original; this assumes lower-casing does not change the string length.
    lowered_text = text.lower()

    cut_points = set()
    for sentence in flagged_sentences:
        if not isinstance(sentence, str):
            continue
        needle = normalize(sentence)
        # An empty needle would "match" at offset 0 and create an empty chunk.
        if not needle:
            continue
        # Search with the normalized sentence so that surrounding whitespace
        # in a title cannot prevent a match in the document.
        position = lowered_text.find(needle)
        if position != -1:
            cut_points.add(position)

    # No position found: return the full text
    if not cut_points:
        return [text]

    # Using a set keeps offset 0 from being added twice when a title starts the text.
    boundaries = sorted(cut_points | {0, len(text)})
    chunks = (text[start:end].strip() for start, end in zip(boundaries, boundaries[1:]))
    return [chunk for chunk in chunks if chunk]



In [8]:
def split_text_with_titles(text, summary_titles):
    """Map each summary title to the chunk of ``text`` that begins with it.

    The text is cut with ``split_text_by_sentences``. For every title, the
    first chunk whose opening characters (title length plus 30) contain the
    normalized title is kept; titles without such a chunk are left out.

    Returns:
        dict mapping title -> stripped chunk.
    """
    pieces = split_text_by_sentences(text, summary_titles)
    sections = {}
    for heading in summary_titles:
        # First chunk whose head contains the heading, or None when absent.
        matching_piece = next(
            (
                piece
                for piece in pieces
                if normalize(heading) in normalize(piece[:len(heading) + 30])
            ),
            None,
        )
        if matching_piece is not None:
            sections[heading] = matching_piece.strip()
    return sections


In [9]:
# Read both parquet samples, index the agreements by "numdossier_new", and
# inner-join them with the TOC table on the index. The "extracted_summary"
# column is exposed as "summary".
sommaire_hs = pd.read_parquet("data/echantillon_1000_hs_accords_TOC.parquet")
df_hs = (
    pd.read_parquet("data/echantillon_1000_hs_accords.parquet")
    .set_index("numdossier_new")
    .merge(sommaire_hs, how="inner", left_index=True, right_index=True)
    .rename(columns={"extracted_summary": "summary"})
)

In [10]:
# For every row, cut the "accorddocx" text at the titles listed in "summary"
# and store the resulting {title: chunk} dict in a new "section_dict" column.
df_hs["section_dict"] = df_hs.apply(
    lambda row: split_text_with_titles(row["accorddocx"], row["summary"]),
    axis=1
)

In [11]:
def get_all_chunks(section_dict):
    """Return every section text of ``section_dict``, stripped, in dict order."""
    return [section_text.strip() for section_text in section_dict.values()]

In [12]:
def get_valid_chunks_filtered(section_dict, skip_titles=("préambule", "annexe"), seuil_sim=0.85):
    """Select the sections worth keeping, then filter them by title similarity.

    Steps:
      1. Drop the first section whose title contains "préambule" and every
         section listed before it.
      2. Drop every remaining section whose title contains one of
         ``skip_titles``.
      3. Pass the survivors to ``filtre_chunks_par_titre`` and return its result.

    Args:
        section_dict: mapping title -> chunk text, in document order.
        skip_titles: keywords; a title containing any of them (after
            ``normalize``) is discarded. An immutable tuple is used as the
            default so the default can never be mutated between calls.
        seuil_sim: threshold forwarded as ``seuil`` to ``filtre_chunks_par_titre``.

    Returns:
        Whatever ``filtre_chunks_par_titre`` returns.

    NOTE(review): ``filtre_chunks_par_titre`` and ``phrases_non_metier`` are not
    defined in the visible cells of this notebook; calling this function
    raises NameError unless they are defined elsewhere.
    NOTE(review): matching is accent-sensitive because ``normalize`` only
    lower-cases and strips, so an unaccented title such as "PREAMBULE" does
    not match "préambule".
    """
    skip_titles_norm = [normalize(t) for t in skip_titles]

    # Remove the preamble and everything that precedes it
    titles = list(section_dict.keys())
    preamble_idx = next((i for i, t in enumerate(titles) if "préambule" in normalize(t)), -1)
    if preamble_idx != -1:
        titles = titles[preamble_idx + 1:]

    # Keep only the titles that contain none of the skip keywords
    valid_titles = [
        t for t in titles if all(skip_kw not in normalize(t) for skip_kw in skip_titles_norm)
    ]
    candidate_dict = {t: section_dict[t] for t in valid_titles}

    # Filter by title similarity
    return filtre_chunks_par_titre(candidate_dict, phrases_non_metier, seuil=seuil_sim)


# Sans filtrer les chunks

In [13]:
# Unfiltered variant: keep every chunk of each agreement as a list of strings.
df_hs["lda_documents"] = df_hs["section_dict"].apply(get_all_chunks)

In [None]:
# Flatten the per-agreement chunk lists into one corpus, then lemmatise it.
all_chunks_hs = []
for doc_chunks in df_hs["lda_documents"]:
    all_chunks_hs.extend(doc_chunks)
all_docs_cleaned = preprocess(all_chunks_hs)


In [None]:
# Embeddings --> this step is very slow without a GPU.
# No device is forced: sentence-transformers then uses a GPU when one is
# available and falls back to CPU, instead of crashing on CUDA-less machines.
# NOTE(review): "all-MiniLM-L6-v2" is documented as an English model while the
# corpus is French — consider a multilingual model (confirm with the team).
start = time.time()
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(all_docs_cleaned, show_progress_bar=True)
print(f"[1] Embedding en {time.time() - start:.2f}s")

In [None]:
# PCA --> faster alternative to UMAP for dimensionality reduction.
# random_state is fixed so that a randomized SVD solver (which scikit-learn may
# select automatically on large inputs) gives the same projection on every run.
start = time.time()
pca_model = PCA(n_components=5, random_state=42)
pca_embeddings = pca_model.fit_transform(embeddings)
print(f"[PCA] en {time.time() - start:.2f}s")

In [None]:
#Réduction UMAP --> trop long besoin de trouver une version avec gpu 
#start = time.time()
#umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
#umap_embeddings = umap_model.fit_transform(embeddings)
#print(f"[2] UMAP en {time.time() - start:.2f}s")

In [None]:
# Clustering of the PCA-reduced embeddings
start = time.time()
hdbscan_model =  HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
clusters = hdbscan_model.fit_predict(pca_embeddings)
print(f"[3] HDBSCAN en {time.time() - start:.2f}s")
# HDBSCAN assigns the label -1 to noise points; it must not be counted as a cluster.
n_clusters = np.unique(clusters[clusters != -1]).size
print(f"[3] Nombre de clusters trouvés : {n_clusters}")

In [None]:
# Fit BERTopic, reusing the objects built in the previous cells:
#   - embedding_model: the SentenceTransformer instance,
#   - pca_model: passed in the `umap_model` slot as the dimensionality-reduction step,
#   - hdbscan_model: the clustering model.
# The precomputed `embeddings` are handed to fit_transform alongside the cleaned documents.
# NOTE(review): with an explicit embedding_model, `language` presumably has no effect on
# the embeddings — confirm against the BERTopic documentation.
start = time.time()
topic_model = BERTopic(
    language="french",
    embedding_model=embedding_model,
    umap_model=pca_model,
    hdbscan_model=hdbscan_model,
    verbose=True
)
topics, probs = topic_model.fit_transform(all_docs_cleaned, embeddings=embeddings)
print(f"[4] BERTopic final en {time.time() - start:.2f}s")



In [None]:
# Display BERTopic's topic visualisation for the fitted model.
topic_model.visualize_topics()

In [None]:
# Display BERTopic's bar-chart visualisation for the fitted model.
topic_model.visualize_barchart()

In [None]:
#from random import sample
#sample_docs = sample(all_docs_cleaned, 200)

In [None]:
#topic_model_no_filter = BERTopic(language="french")
#topics_no_filter, probs_no_filter = topic_model_no_filter.fit_transform(sample_docs)

In [None]:
#from bertopic import BERTopic
#topic_model_no_filter = BERTopic(language="french")
#topic_no_filter, probs_no_filter = topic_model_no_filter.fit_transform(all_docs_cleaned)

In [None]:
#topic_model.visualize_topics()

In [None]:
#topic_model.visualize_barchart()

# En filtrant les chunks

In [None]:
#df_hs["lda_documents"] = df_hs["section_dict"].apply(get_valid_chunks_filtered)

In [None]:
#filtered_chunks_hs = [chunk for doc_chunks in df_hs["lda_documents"] for chunk in doc_chunks]
#filtered_docs_cleaned = preprocess(filtered_chunks_hs)

In [None]:
#from bertopic import BERTopic
#topic_model_filter = BERTopic(language="french")
#topics_filter, probs_filter = topic_model_filter.fit_transform(filtered_docs_cleaned)

In [None]:
#topic_model.visualize_topics()

In [None]:
#topic_model.visualize_barchart()

# BERTopic (KeyBERTInspired)

In [None]:
#representation_model = KeyBERTInspired()

#topic_model = BERTopic(representation_model=representation_model,language="french")
#topics, probs = topic_model.fit_transform(docs_cleaned)

In [None]:
#topic_model.get_topic_info()
#topic_model.visualize_barchart()

In [None]:
#topic_model.visualize_barchart()

# BERTopic (MMR)

In [None]:
#representation_model = MaximalMarginalRelevance(diversity=0.3)

#topic_model = BERTopic(representation_model=representation_model,language="french")
#topics, probs = topic_model.fit_transform(docs_cleaned)

In [None]:
#topic_model.get_topic_info()

 # Hierarchical topics 

In [None]:
#hierarchical_topics = topic_model.hierarchical_topics(docs_cleaned)
#hierarchical_topics

In [None]:
#linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)
#hierarchical_topics = topic_model.hierarchical_topics(docs_cleaned, linkage_function=linkage_function)

In [None]:
#topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)