In [7]:
from pypdf import PdfReader
import re
import os
from collections import defaultdict
import json
import networkx as nx
import plotly.graph_objects as go
from llama_index.llms.ollama import Ollama
from tqdm.auto import tqdm
import random

In [None]:
# Load the raw text of the code: reuse the cached extraction when it exists,
# otherwise extract it from the PDF and write the cache for later runs.
if os.path.exists("./code_assurances_raw.txt"):
    with open("code_assurances_raw.txt", "r", encoding="utf-8") as cached_file:
        text = cached_file.read()
else:
    pdf = PdfReader("./LEGITEXT000006073984.pdf")
    # Pages for which extract_text() returns nothing are skipped; every kept
    # page is terminated by a newline.
    page_texts = (page.extract_text() for page in pdf.pages)
    text = "".join(page_text + "\n" for page_text in page_texts if page_text)

    with open("code_assurances_raw.txt", "w", encoding="utf-8") as cached_file:
        cached_file.write(text)

In [57]:

# Length of the raw text before anything is removed.
init_len = len(text)

# Remove every occurrence of the export banner (presumably a header repeated
# on each PDF page — confirm against the source document), then trim the
# leading and trailing whitespace of the whole text.
sub_pattern = r'Code des assurances - Dernière modification le 15 août 2025 - Document généré le 14 août 2025'
banner_re = re.compile(sub_pattern)
text = banner_re.sub('', text).strip()

In [58]:
# Regular expressions for the structural headings of the code and for article
# identifiers. Each heading pattern captures the whole heading in group 1.
part_pattern = r'(Partie législative|Partie réglementaire - Arrêtés|Partie réglementaire)\n'
book_pattern = r"(^Livre [IVXLCDM]+.*$)"
title_pattern = r"(^Titre [IVXLCDM]+.*$)"
chapter_pattern = r"(^Chapitre [IVXLCDM]+.*$)"
section_pattern = r"(^Section (?:[IVXLCDM]+|[0-9]+)+.*$)"
sous_section_pattern = r"(^Sous-section\s+(?:[IVXLCDM]+|[0-9]+).*$)"

# Article identifier: "Article", a letter, an optional "*", a number and any
# number of "-n" suffixes.
article_pattern = r'(Article\s+[A-Z]\*?\d+(?:-\d+)*)'

# Hierarchy levels from outermost to innermost, each paired with its pattern.
# `level_keys` and `patterns` are derived from this single table so the two
# lists always stay aligned index by index.
_LEVELS = (
    ("partie", part_pattern),
    ("livre", book_pattern),
    ("titre", title_pattern),
    ("chapitre", chapter_pattern),
    ("section", section_pattern),
    ("sous_section", sous_section_pattern),
)
patterns = [level_regex for _, level_regex in _LEVELS]
level_keys = [level_name for level_name, _ in _LEVELS]

In [None]:
# Containers filled by the article-processing step.
def _tree_factory(depth):
    """Return a factory for a defaultdict nested `depth` more levels, ending in plain dicts."""
    if depth == 0:
        return dict
    return lambda: defaultdict(_tree_factory(depth - 1))


articles_list = []
# Five levels of defaultdict; the values at the innermost level are plain dicts.
hierarchy_tree = defaultdict(_tree_factory(4))
curr_hierarchy = dict.fromkeys(level_keys, "")
reference_graph = defaultdict(list)
all_articles = set()

# re.split with a capturing group interleaves the pieces:
#   [text before 1st id, id 1, text after id 1, id 2, text after id 2, ...]
articles_splits = re.split(article_pattern, text, flags=re.M)
articles_id = articles_splits[1::2]        # the captured identifiers
articles_content = articles_splits[2::2]   # the text following each identifier
preceding_texts = articles_splits[0::2]    # the text before each identifier (holds the hierarchy headings)

print("Number of articles:", len(articles_id))

Number of articles: 2356


In [None]:
def extract_keywords(content):
    """Return the first five lowercase word tokens of `content` that are not stop words.

    Tokens are runs of word characters taken from the lowercased text, in
    order of appearance. No frequency ranking is applied and repeated words
    are kept, so the result is "the first five", not "the most common five".
    """
    common_words = {'le', 'la', 'les', 'de', 'du', 'des', 'et', 'en', 'pour', 'par'}
    keywords = []
    for match in re.finditer(r'\w+', content.lower()):
        token = match.group()
        if token in common_words:
            continue
        keywords.append(token)
        if len(keywords) == 5:
            break
    return keywords

In [None]:
def generate_summary(content):
    """Return a naive summary: the first 100 characters followed by "...".

    Content of 100 characters or fewer is returned unchanged.
    """
    if len(content) <= 100:
        return content
    return content[:100] + "..."

In [None]:
# Process articles: resolve each article's position in the hierarchy, collect
# the articles it cites, then write every record plus the hierarchy tree to JSON.

# A citation in running text looks like "L. 421-1" or "R.421-4-1". Any number
# of "-n" suffixes is accepted, consistent with article_pattern.
reference_pattern = r"[A-Z]\.\s*\d{3}-\d+(?:-\d+)*"


def _to_article_id(citation):
    """Convert a citation ("L. 421-1" or "L.421-1") to the "Article L421-1" form."""
    # The dot is escaped and the whitespace is optional so that citations
    # written with or without a space normalise to the same identifier.
    return "Article " + re.sub(r"\.\s*", "", citation)


# Empty the containers created by the initialisation cell so that re-running
# this cell rebuilds the results instead of appending duplicates.
articles_list.clear()
reference_graph.clear()
hierarchy_tree.clear()
all_articles.clear()

prev_hierarchy = {lvl: "" for lvl in level_keys}  # Track previous levels

for i, raw_article_id in enumerate(articles_id):
    # article_pattern allows any whitespace run after "Article"; collapse it
    # so identifiers always read "Article X" and match normalised citations.
    article_id = " ".join(raw_article_id.split())
    preceding_text = preceding_texts[i] if i < len(preceding_texts) else ""

    # Start from the hierarchy of the previous article
    curr_hierarchy = prev_hierarchy.copy()

    # Detect hierarchy changes from biggest → smallest
    for idx, pattern in enumerate(patterns):
        matches = re.findall(pattern, preceding_text, flags=re.MULTILINE)
        if matches:
            # The last heading of this level before the article is the one in force
            new_val = re.sub(r"\n", " ", matches[-1].strip())

            # If hierarchy level changed, reset all lower levels
            if curr_hierarchy[level_keys[idx]] != new_val:
                curr_hierarchy[level_keys[idx]] = new_val
                for lower_idx in range(idx + 1, len(level_keys)):
                    curr_hierarchy[level_keys[lower_idx]] = ""

    # Process article content
    # NOTE(review): articles_content[i] is the same split piece that serves as
    # preceding_texts[i + 1], so any headings introducing the next article are
    # still part of this article's content.
    content = articles_content[i].strip()
    content = re.sub(r"\n", " ", content)
    references = [_to_article_id(ref) for ref in re.findall(reference_pattern, content)]
    for cited_id in references:
        reference_graph[cited_id].append(article_id)

    # Build article record
    article = {
        "article_id": article_id,
        "content": content,
        "hierarchy": curr_hierarchy.copy(),
        "references": references,
        "referenced_by": [],
        "summary": generate_summary(content),
        "keywords": extract_keywords(content),
        "page_number": None
    }
    articles_list.append(article)
    all_articles.add(article_id)

    # Build hierarchy tree
    node = hierarchy_tree
    for lvl in level_keys[:-1]:  # The innermost level (sous_section) is not a tree level
        if curr_hierarchy[lvl]:
            node = node[curr_hierarchy[lvl]]
    node.setdefault("articles", []).append(article_id)

    # Update prev_hierarchy for next article
    prev_hierarchy = curr_hierarchy.copy()

# Populate referenced_by field (only possible once every article has been scanned)
for article in articles_list:
    article["referenced_by"] = list(reference_graph.get(article["article_id"], []))

# Create final output
output = {
    "articles": articles_list,
    "hierarchy_tree": dict(hierarchy_tree)
}

# Save to JSON
with open("code_assurances.json", "w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=2)

print("Data saved to code_assurances.json")

Data saved to code_assurances.json


In [None]:
# Read back the JSON file written by the processing step and keep its list of
# article records.
with open("code_assurances.json", "r", encoding="utf-8") as json_file:
    data = json.loads(json_file.read())

articles = data["articles"]

def normalize_id(article_id: str) -> str:
    """Reduce an article identifier to its compact form, e.g. "Article L. 421-1" -> "L421-1".

    Every "Article " prefix occurrence is dropped first, then all spaces,
    then all dots.
    """
    compact = article_id
    # Order matters: "Article " must be removed before the bare spaces are.
    for unwanted in ("Article ", " ", "."):
        compact = compact.replace(unwanted, "")
    return compact

# Build the citation graph (one node per article, one edge per citation),
# lay it out, and export it as an interactive Plotly HTML page.

# Create directed graph
G = nx.DiGraph()

# Add nodes
for article in articles:
    article_id = normalize_id(article["article_id"])
    G.add_node(
        article_id,
        livre=article["hierarchy"]["livre"],
        titre=article["hierarchy"]["titre"],
        summary=article["summary"]
    )

# Add edges from the citing article to the cited one; citations of
# identifiers that are not nodes of the graph are ignored.
for article in articles:
    article_id = normalize_id(article["article_id"])
    for ref in article["references"]:
        ref_id = normalize_id(ref)
        if ref_id in G.nodes:
            G.add_edge(article_id, ref_id)


# Compute degree (citations received + citations made)
for node in G.nodes:
    G.nodes[node]["degree"] = G.in_degree(node) + G.out_degree(node)

# Compute layout. The spring layout starts from random positions, so the seed
# is fixed to make the exported figure identical from one run to the next.
LAYOUT_SEED = 42
pos = nx.spring_layout(G, k=0.5, iterations=50, seed=LAYOUT_SEED)

# Edge coordinates: each edge contributes its two endpoints followed by None,
# which makes Plotly draw separate segments within a single trace.
edge_x = []
edge_y = []
for edge in G.edges:
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color="#888"),
    hoverinfo="none",
    mode="lines"
)

# Node coordinates
node_x = [pos[node][0] for node in G.nodes]
node_y = [pos[node][1] for node in G.nodes]
node_degree = [G.nodes[node]["degree"] for node in G.nodes]

# Hover text shown for each node
node_text = [
    f"Article: {node}<br>Livre: {G.nodes[node]['livre']}<br>Titre: {G.nodes[node]['titre']}<br>Summary: {G.nodes[node]['summary']}<br>Degree: {G.nodes[node]['degree']}"
    for node in G.nodes
]

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode="markers",
    hoverinfo="text",
    text=node_text,
    marker=dict(
        showscale=True,
        colorscale="Viridis",
        color=node_degree,  # Color by degree
        size=[max(10, d*2) for d in node_degree],  # Minimum marker size of 10
        colorbar=dict(
            thickness=15,
            title="Node Degree",
            xanchor="left",
        ),
        line_width=2
    )
)

# Create figure
fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title="Reference Graph of Code des Assurances",
                    title_x=0.5,
                    showlegend=False,
                    hovermode="closest",
                    margin=dict(b=20, l=5, r=5, t=40),
                    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
                ))

# Save to a standalone HTML file (the figure is not rendered inline)
fig.write_html("reference_graph.html")
print("Graph saved to reference_graph.html")


Graph saved to reference_graph.html


## Generate single-article summaries using an LLM

In [4]:
# Read back the JSON file written by the processing step and keep its list of
# article records.
with open("code_assurances.json", "r", encoding="utf-8") as json_file:
    data = json.loads(json_file.read())

articles = data["articles"]

In [5]:
llm = Ollama(model="llama3.2:1b", temperature=0.1)


def generate_summary(content):
    """Ask the LLM for a short French summary of one article.

    The article text is appended to a fixed instruction prompt, sent to
    `llm.complete`, and the response is returned as a stripped string.

    Note: this definition replaces the truncation-based `generate_summary`
    defined earlier in the notebook.
    """
    instructions = (
        "Provide a concise summary of the following French insurance code article in about 100 characters in french. "
        "Focus on key legal obligations and procedures, using precise legal terminology and only give the summary as answer:\n\n"
    )
    prompt = f"{instructions}{content}"
    return str(llm.complete(prompt)).strip()

In [9]:
# Field names of one article record (iterating a dict yields its keys).
list(articles[0])

['article_id',
 'content',
 'hierarchy',
 'references',
 'referenced_by',
 'summary',
 'keywords',
 'page_number']

In [11]:
# Summarise a few sampled articles with the LLM and print them next to the
# original text so both can be compared by eye.
SAMPLE_SIZE = 3
SAMPLE_SEED = 42

# A dedicated, seeded generator makes the selection reproducible without
# touching the global `random` state; the size is capped so the cell also
# works when fewer than SAMPLE_SIZE articles are available.
sampler = random.Random(SAMPLE_SEED)
for article in sampler.sample(articles, min(SAMPLE_SIZE, len(articles))):
    content = article["content"]
    article_id = article["article_id"]
    summary = generate_summary(content)
    print(article_id)
    print("Content: ", content)
    print("Summary: ", summary)
    print(f"Content length: {len(content)} | Summary length: {len(summary)}")
    print("-" * 50)

Article R421-4
Content:  Lorsqu'un contrat d'assurance a été souscrit pour garantir les conséquences pécuniaires de la responsabilité civile de l'auteur de dommages résultant d'atteintes aux personnes nés d'un accident mentionné au I de l'article L. 421-1, le fonds de garantie ne peut être appelé à payer l'indemnité allouée à la victime ou à ses ayants droit qu'en cas de suspension du contrat ou de non-assurance ou d'assurance partielle, opposables à la victime ou à ses ayants droit.   Pour les dommages causés aux personnes à la suite d'un accident mentionné au II de l'article L. 421-1, le fonds de garantie ne peut être appelé à payer l'indemnité allouée à la victime ou à ses ayants droit qu'en cas de nullité ou de suspension du contrat ou de la garantie de non-assurance ou d'assurance partielle, opposables à la victime ou à ses ayants droit.   Dans le cas où, par suite de l'insuffisance du montant de la garantie stipulée au contrat, une part de l'indemnité due à la victime ou à ses ay