# Link taxpayers with clustering

**Pre-treament or not ?**

**Embedding models**
* Word2Vec
* all-mpnet-base-v2 (Abhishek et al. 2024) : https://huggingface.co/sentence-transformers/all-mpnet-base-v2

**Clustering methods**
* k-means
* DBSCAN
* Agglomerative Hierarchical Clustering

**References**
* https://huggingface.co/blog/getting-started-with-embeddings
* https://cloud.google.com/blog/topics/developers-practitioners/meet-ais-multitool-vector-embeddings?hl=en

Code from https://www.sbert.net/examples/applications/semantic-search/README.html

In [20]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
print(os.environ["CUDA_VISIBLE_DEVICES"])

1


In [21]:
import sys
from pathlib import Path

BASE = Path(os.path.dirname(os.path.realpath("__file__"))).resolve() # If not on GColab, BASE will be the directory of this notebook
DATASETS = Path('/home/STual/DAN-cadastre/data').resolve()
OUT_BASE = Path('/home/STual/DAN-cadastre/outputs/clustering').resolve()

print(sys.path)
print(BASE)
print(DATASETS)
print(OUT_BASE)

['/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/home/STual/DAN-cadastre/.venv_dan/lib/python3.10/site-packages']
/home/STual/DAN-cadastre/scripts/Clustering
/home/STual/DAN-cadastre/data
/home/STual/DAN-cadastre/outputs/clustering


## Load data

In [22]:
import glob

DATASET = DATASETS / "Taxpayers"
files = glob.glob(str(DATASET) + '/*.json')
print(files)

['/home/STual/DAN-cadastre/data/Taxpayers/taxpayers_all.json', '/home/STual/DAN-cadastre/data/Taxpayers/taxpayers_0_100.json', '/home/STual/DAN-cadastre/data/Taxpayers/taxpayers_400_500.json', '/home/STual/DAN-cadastre/data/Taxpayers/taxpayers_100_200.json', '/home/STual/DAN-cadastre/data/Taxpayers/taxpayers_200_300.json', '/home/STual/DAN-cadastre/data/Taxpayers/taxpayers_500_600.json', '/home/STual/DAN-cadastre/data/Taxpayers/taxpayers_300_400.json', '/home/STual/DAN-cadastre/data/Taxpayers/taxpayers_600_700.json']


In [23]:
import json
# Loop through the JSON files
data = []
for file in files:
    with open(file, 'r') as f:
            data += json.load(f)

In [24]:
#with open('taxpayers_backup.json', 'w',encoding='utf-8') as f:
#    json.dump(data, f, ensure_ascii=False, indent=4)

## 1. Data preparation

In [25]:
from str_normalization import remove_accents, normalize_text

transcriptions_structured = []

for d in data:
    uuid = d["element_uuid"]
    counter = 1
    try:
        for taxpayer_json in d["entities_json"]["taxpayers"]:
            taxpayer_desc = []
            if len(taxpayer_json['name']) > 0:
                name = remove_accents(normalize_text(taxpayer_json['name']))
                taxpayer_desc.append(name)
            else:
                taxpayer_desc.append('DONT TREAT NAME')
            if len(taxpayer_json['firstnames']) > 0:
                f_n = remove_accents(normalize_text(taxpayer_json['firstnames']))
                f_n_s = f_n.split(' ')
                #firstnames = sorted(f_n_s)
                firstnames_ = " ".join(f_n_s)
                taxpayer_desc.append(firstnames_)
            else:
                taxpayer_desc.append('')
            lst = [uuid,counter] + taxpayer_desc
            transcriptions_structured.append(lst)
            counter += 1
    except:
        print(taxpayer_json)

In [26]:
name_ = [e[2] for e in transcriptions_structured]
firstnames_ = [e[3] for e in transcriptions_structured]

## 2. Compute similarities for each property

In [27]:
len(name_)

2844

In [39]:
from embedding_similarity import compute_similarity_matrix
import time
import datetime

start = time.time()
name_sim = compute_similarity_matrix(name_,0.85,top_k='ALL') #Pairs with a cosine similarity lower than 0.85 are not considered
end = time.time()
runtime = end-start
print(f"Run-time is equal to {str(datetime.timedelta(seconds=runtime))}")

Run-time is equal to 0:01:07.895302


In [40]:
start = time.time()
firstnames_sim = compute_similarity_matrix(firstnames_,0.70,top_k='ALL') #Pairs with a cosine similarity lower than 0.70 are not considered
end = time.time()
runtime = end-start
print(f"Run-time is equal to {str(datetime.timedelta(seconds=runtime))}")

Run-time is equal to 0:01:09.531571


## 3. Intersection of similar pairs according several properties 

In [41]:
from embedding_similarity import find_common_similarities

assert len(name_sim) == len(firstnames_sim)
test = find_common_similarities(name_sim, firstnames_sim)

In [42]:
from embedding_similarity import export_groups

groups = export_groups(test)

clusters = []
for i in range(len(groups)):
    #print(f"\n################# {i} #################")
    #print(groups[i])
    c = []
    for elem in groups[i]:
        #print(transcriptions_structured[elem])
        c.append(transcriptions_structured[elem])
    clusters.append(c)

# Sort clusters based on the first row's last name (row[2])
sorted_clusters = sorted(clusters, key=lambda cluster: cluster[0][2].lower())

## 4. Statistics and inconsistencies raising

### Intra-cluster evaluation

In [None]:
import numpy as np
import itertools
from sentence_transformers import SentenceTransformer, util
import Levenshtein

model = SentenceTransformer('all-MiniLM-L6-v2')

def compute_statistics_by_prop(cluster, prop_value_ix):
    """Compute similarity statistics (mean, median, std) for embeddings cosine similarity and edit distance similarity."""
    
    names = [row[prop_value_ix] for row in cluster]

    if len(names) < 2:
        return None  # No statistics if there's only one name

    # Compute embeddings
    embeddings = model.encode(names, convert_to_tensor=True)
    
    # Compute pairwise similarities
    embedding_similarities = []
    edit_distances = []
    
    for name1, name2 in itertools.combinations(names, 2):
        # Cosine similarity for embeddings
        sim = util.pytorch_cos_sim(model.encode(name1, convert_to_tensor=True), 
                                   model.encode(name2, convert_to_tensor=True))
        embedding_similarities.append(sim.item())

        # Edit distance normalized (Levenshtein distance / max_length)
        edit_distance = Levenshtein.distance(name1, name2) / max(len(name1), len(name2))
        edit_distances.append(1 - edit_distance)  # Convert to similarity

    # Compute statistics
    stats = {
        "Cosinus Similarity Mean (emb)": np.mean(embedding_similarities),
        "Cosinus Similarity Median (emb)": np.median(embedding_similarities),
        "Cosinus Similarity Std (emb)": np.std(embedding_similarities),
        "Levenshtein Similarity Mean (str)": np.mean(edit_distances),
        "Levenshtein Similarity Median (str)": np.median(edit_distances),
        "Levenshtein Similarity Std (str)": np.std(edit_distances),
    }

    return stats

# Process each cluster
all_stats = []
for i, cluster in enumerate(sorted_clusters):
    stats_name = compute_statistics_by_prop(cluster,2)
    stats_firstnames = compute_statistics_by_prop(cluster,3)
    if stats:
        stats["cluster_id"] = i

In [68]:
for stats in all_stats:
    if stats["Cosinus Similarity Mean (emb)"] < 0.95 and stats["Levenshtein Similarity Mean (str)"] < 0.95:
        print(f"Cluster {stats['cluster_id']} Statistics:")
        for key, value in stats.items():
            print(f"  {key}: {value:.4f}")
        print(sorted_clusters[stats['cluster_id']])
        print("\n---\n")

Cluster 17 Statistics:
  Cosinus Similarity Mean (emb): 0.9394
  Cosinus Similarity Median (emb): 0.9092
  Cosinus Similarity Std (emb): 0.0428
  Levenshtein Similarity Mean (str): 0.9048
  Levenshtein Similarity Median (str): 0.8571
  Levenshtein Similarity Std (str): 0.0673
  cluster_id: 17.0000
[['21a8e30c-2ff8-44eb-a921-5fa3d4e95fea', 1, 'ancelet', 'nicolas jh'], ['8fab4098-b861-4399-a2c8-6f4a4f2bd4f0', 1, 'ancelot', 'nicolas jh'], ['69e3322e-1ed8-494e-a67c-efaad14a8da3', 1, 'ancelet', 'nicolas'], ['8a8e62da-6386-405d-bb13-8403563b61b7', 1, 'ancelot', 'nicolas joseph']]

---

Cluster 21 Statistics:
  Cosinus Similarity Mean (emb): 0.9092
  Cosinus Similarity Median (emb): 0.9092
  Cosinus Similarity Std (emb): 0.0000
  Levenshtein Similarity Mean (str): 0.8571
  Levenshtein Similarity Median (str): 0.8571
  Levenshtein Similarity Std (str): 0.0000
  cluster_id: 21.0000
[['ec04a09f-1610-4d56-ac48-9e13c6db13d8', 1, 'ancelet', 'nicolas dit ancelon'], ['1a95c73d-2991-4aa1-97a1-0825275a

### Inter-cluster evaluation

In [77]:
import numpy as np
import itertools
import plotly.express as px
import plotly.graph_objects as go
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

# Store visualization data
all_embeddings = []
all_labels = []
all_clusters = []
representative_points = []

for i, cluster in enumerate(sorted_clusters):
    # Extract names
    names = [row[2] + ' ' + row[3] for row in cluster]

        # Compute embeddings
    embeddings = model.encode(names)

    if len(cluster) == 1:
        # If only one element, keep it as is
        all_embeddings.append(embeddings[0])
        all_labels.append(names[0])
        all_clusters.append(i)
    else:
        # Compute centroid of cluster
        centroid = np.mean(embeddings, axis=0)

        # Find closest element to centroid
        distances = [np.linalg.norm(embedding - centroid) for embedding in embeddings]
        most_representative_idx = np.argmin(distances)

        all_embeddings.append(embeddings[most_representative_idx])
        all_labels.append(names[most_representative_idx])
        all_clusters.append(i)

# Reduce dimensions using PCA
pca = PCA(n_components=2)
embeddings_2D = pca.fit_transform(all_embeddings)

# Create scatter plot
fig = px.scatter(
    x=embeddings_2D[:, 0], y=embeddings_2D[:, 1], 
    hover_name=all_labels,  # Show labels only on hover
    color=[str(c) for c in all_clusters],  # Cluster coloring
    title="Clustered Names in 2D Space (via PCA)",
    labels={"x": "PCA Component 1", "y": "PCA Component 2"},
)

# Show plot
fig.write_html("clusters.html")

## 4. Analysis

In [63]:
import plotly.graph_objects as go
import torch

# Convert tensors to CPU and numpy
data2 = [[row[0], row[1].cpu().numpy(), row[2].cpu().numpy()] for row in name_sim[:500]]

# Create plot
fig = go.Figure()

for row in data2:
    x_val = row[0]  # First number in each list
    y_vals = row[2]  # Second list (numbers for y-axis)

    fig.add_trace(go.Scatter(x=[x_val] * len(y_vals), y=y_vals, mode='markers', name=f'Group {x_val}'))

# Update layout
fig.update_layout(title="Plotly Scatter Plot",
                  xaxis_title="Group Number",
                  yaxis_title="Values",
                  showlegend=True)

# Show figure
fig.write_html("scatter_plot.html")

In [65]:
# Convert tensors to CPU and numpy
data2 = [[row[0], row[1].cpu().numpy(), row[2].cpu().numpy()] for row in name_sim[:500]]

# Create figure
fig = go.Figure()

for row in data2:
    x_val = row[0]  # First number in each list (Group)
    y_vals = row[2]  # Second list (numbers for y-axis)

    fig.add_trace(go.Box(y=y_vals, name=f'Group {x_val}', boxpoints="outliers", boxmean=True))  # Box plot for each group

# Update layout
fig.update_layout(title="Box Plot of Values per Group",
                  xaxis_title="Group Number",
                  yaxis_title="Values",
                  showlegend=True)

# Save figure as HTML
fig.write_html("box_plot2.html")

print("HTML file 'box_plot.html' has been created.")

HTML file 'box_plot.html' has been created.


In [66]:
import numpy as np

# Convert tensors to CPU and numpy
data2 = [[row[0], row[1].cpu().numpy(), row[2].cpu().numpy()] for row in name_sim[:500]]

# Create figure
fig = go.Figure()

for row in data2:
    x_val = row[0]  # First number in each list (Group)
    y_vals = row[2]  # Second list (numbers for y-axis)

    # Compute deciles (10th, 20th, ..., 90th percentiles)
    percentiles = np.percentile(y_vals, [10, 20, 30, 40, 50, 60, 70, 80, 90])

    # Outlier values (values greater than 90th percentile)
    outliers = y_vals[y_vals > percentiles[8]]

    # Create box plot with custom percentiles (deciles)
    fig.add_trace(go.Box(
        y=y_vals,
        x=[x_val] * len(y_vals),  # Use x-values for positioning each box plot vertically
        name=f'Group {x_val}',
        boxmean=True,  # Show mean
        whiskerwidth=1,  # Make whiskers thicker
        boxpoints='all',  # Show all points, including outliers
        jitter=0.3,  # Add jitter to show outliers clearly
        #whiskerlength=0.5,  # Whisker length for clarity
        lowerfence=[percentiles[0]],  # 10th percentile as a list
        upperfence=[percentiles[8]],  # 90th percentile (D9) as a list
        q1=[percentiles[1]],  # 20th percentile
        median=[percentiles[4]],  # 50th percentile (median)
        q3=[percentiles[7]],  # 80th percentile
        #outliercolor='red',  # Set the outlier color
    ))

    # Plot outliers manually above D9
    for outlier in outliers:
        fig.add_trace(go.Scatter(
            x=[x_val],
            y=[outlier],
            mode='markers',
            marker=dict(color='red', size=10, symbol='circle'),
            name=f'Outlier Group {x_val}',
            showlegend=False
        ))

# Update layout for vertical orientation
fig.update_layout(
    title="Box Plot with Deciles and Outliers",
    xaxis_title="Group Number",
    yaxis_title="Values",
    showlegend=True,
    boxmode='group',  # Group the boxes by x values
    height=600  # Adjust the height for better visualization
)

# Save figure as HTML
fig.write_html("box_plot_deciles_outliers.html")

print("HTML file 'box_plot_deciles_outliers.html' has been created.")

HTML file 'box_plot_deciles_outliers.html' has been created.


In [31]:
import pandas as pd
import plotly.express as px
# Convert tensors to CPU and numpy for plotting
data2 = [[row[0], row[2].cpu().numpy()] for row in name_sim[:500]]

# Flatten data for a single DataFrame
x_vals = []
y_vals = []

for row in data2:
    x_vals.extend([row[0]] * len(row[1]))  # Repeat the group number (row[0]) for each y value
    y_vals.extend(row[1])  # Add all the y values (row[2])

# Create a DataFrame
df = pd.DataFrame({
    'Group': x_vals,
    'Value': y_vals
})

# Create a marginal plot
fig = px.scatter(df, x='Group', y='Value', marginal_y='histogram', 
                 title="Scatter Plot with Marginal Histogram on Y values",
                 labels={'Group': 'Group Number', 'Value': 'Y Values'})

# Save figure as HTML
fig.write_html("marginal_plot_y_values500.html")

print("HTML file 'marginal_plot_y_values.html' has been created.")

HTML file 'marginal_plot_y_values.html' has been created.


In [67]:
import pandas as pd
import plotly.express as px
# Convert tensors to CPU and numpy for plotting
data2 = [[row[0], row[2].cpu().numpy()] for row in name_sim[:500]]

# Flatten data for a single DataFrame
x_vals = []
y_vals = []

for row in data2:
    x_vals.extend([row[0]] * len(row[1]))  # Repeat the group number (row[0]) for each y value
    y_vals.extend(row[1])  # Add all the y values (row[2])

# Create a DataFrame
df = pd.DataFrame({
    'Group': x_vals,
    'Value': y_vals
})

# Compute deciles (10th, 20th, ..., 90th percentiles)
deciles = np.percentile(y_vals, np.arange(10, 100, 10))

# Compute centiles (91st, 92nd, ..., 100th percentiles)
centiles = np.percentile(y_vals, np.arange(91, 101, 1))

# Create a marginal plot with scatter and histogram
fig = px.scatter(df, x='Group', y='Value', marginal_y='histogram', 
                 title="Scatter Plot with Marginal Histogram on Y values",
                 labels={'Group': 'Group Number', 'Value': 'Y Values'})

# Add decile lines to scatter plot
for decile in deciles:
    fig.add_shape(
        type="line",
        x0=df['Group'].min(), x1=df['Group'].max(),
        y0=decile, y1=decile,
        line=dict(color="blue", dash="dash", width=2),
    )

# Add centile lines to scatter plot
for centile in centiles:
    fig.add_shape(
        type="line",
        x0=df['Group'].min(), x1=df['Group'].max(),
        y0=centile, y1=centile,
        line=dict(color="green", dash="dashdot", width=2),
    )

# Add decile lines to marginal histogram (horizontal lines)
for decile in deciles:
    fig.add_shape(
        type="line",
        x0=0, x1=1,
        y0=decile, y1=decile,
        yref="y2",  # Reference to the marginal y-axis
        line=dict(color="blue", dash="dash", width=2),
    )

# Add centile lines to marginal histogram (horizontal lines)
for centile in centiles:
    fig.add_shape(
        type="line",
        x0=0, x1=1,
        y0=centile, y1=centile,
        yref="y2",  # Reference to the marginal y-axis
        line=dict(color="green", dash="dashdot", width=2),
    )

# Save figure as HTML
fig.write_html("marginal_plot_with_deciles_and_centiles.html")

print("HTML file 'marginal_plot_with_deciles_and_centiles.html' has been created.")

HTML file 'marginal_plot_with_deciles_and_centiles.html' has been created.
