In [None]:
%pip install -U Levenshtein numpy scikit-learn matplotlib

In [None]:
import Levenshtein
import numpy as np
from sklearn.cluster import AgglomerativeClustering, SpectralClustering, DBSCAN, AffinityPropagation, KMeans

In [None]:
# List of words
# Input vocabulary for the clustering experiment: lowercase strings, some of
# which contain embedded spaces (e.g. "buch trucker", "schue macher").
# NOTE(review): these look like spelling variants of a handful of occupational
# terms (presumably historical German) — confirm against the data source.
words = [
    "bapiermacher", "bapirmachers", "bapismachers", "bappermachers", "bappiomachers",
    "bappirmacher", "bappirmachers", "bappismacher", "cappirmacher", "capyrmacher",
    "papirmacher", "papitmacher", "barethemacher", "barethmacher", "baretlimacher",
    "baretlinmacherin", "barettleinmacher", "barettlinmacherin", "bartlimacher", "byrettlimacher",
    "paretlimacherin", "paretthmachers", "partlimacher",
    "bechtrucker", "bechtruckher", "buch trucker", "buchbrucker", "buchdrucke",
    "buchdrucker", "buchdruckher", "buchtruckerher", "buchtrucken",
    "buchtrucker", "buchtruckers", "buchtruckher", "buchtruckhers",
    "buchtruker", "buchtucker", "büchtrucker", "büchtruckher",
    "schuch machers", "schuchma chers", "schuchmacher", "schuchmacheren", "schuchmachers",
    "schue macher", "schuema cher", "schuemache", "schuemacher", "schuemachern",
    "schuemachers", "schuengcher", "schuenmacher", "schuhemachers", "schuhma cher",
    "schuhmacher", "schuhmacherbede", "schuhmacheren", "schuhmachern", "schuhmachers",
    "schuhmercher", "schumachen", "schumacher", "schumachere", "schumacherin",
    "schumachern", "schumachers", "schumacker", "schuo macher", "schuomacher",
    "schuomachers", "sischmacher", "thischmacher", "thischmachers", "tischmacher",
    "tischmachers", "tschmacher", "tyschmacher", "vischmacher", "vischmachers"
]


In [None]:

# Compute Levenshtein distance matrix
def levenshtein_distance_matrix(words):
    """Build the symmetric matrix of pairwise Levenshtein distances.

    Args:
        words: Sequence of strings to compare.

    Returns:
        A float NumPy array of shape ``(n, n)`` with ``n = len(words)``.
        Entry ``[i, j]`` holds ``Levenshtein.distance(words[i], words[j])``;
        the diagonal stays at zero.
    """
    size = len(words)
    result = np.zeros((size, size))
    # Each unordered pair is evaluated once and mirrored across the diagonal.
    for row, left in enumerate(words):
        for col in range(row + 1, size):
            result[row, col] = result[col, row] = Levenshtein.distance(left, words[col])
    return result

# Pairwise distances for the whole vocabulary; computed once here and read by
# the MDS projections and the clustering cells further down.
distance_matrix = levenshtein_distance_matrix(words)

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import MDS

# 2D Visualization using MDS
# Embed the precomputed pairwise distances into two dimensions (seeded for reproducibility).
mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
points_2d = mds.fit_transform(distance_matrix)

# One marker per word, with the word itself printed slightly offset from the marker.
fig, ax = plt.subplots(figsize=(40, 24))
for (x, y), word in zip(points_2d, words):
    ax.scatter(x, y)
    ax.text(x + 0.1, y + 0.1, word, fontsize=14)

ax.set_title("2D Visualization of Word Clusters")
plt.show()


In [None]:
# Select clustering algorithm
# NOTE(review): "KMeans" is listed among the options below — confirm that the
# cell which builds the estimator actually has a branch for it.
CLUSTERING_ALGO = "Affinity"  # Options: "Agglomerative", "Spectral", "DBSCAN", "Affinity", "KMeans"
# Requested cluster count, passed to the estimators that take a fixed number of
# clusters (e.g. Agglomerative, Spectral). The clustering cell reassigns this
# name to the number of clusters actually found, so re-run this cell before
# re-running the clustering cell with a different algorithm.
num_clusters = 5  # Used for some algorithms

In [None]:
# Build the chosen estimator together with the matrix it must be fitted on.
# Distance-based estimators are fitted on `distance_matrix`; Spectral and
# Affinity expect similarities, so distances are converted first.
# NOTE: "KMeans" is imported and listed in the configuration comment but has no
# branch here, so selecting it ends in the ValueError below.
if CLUSTERING_ALGO == "Agglomerative":
    # Current scikit-learn names this keyword `metric`; the former `affinity`
    # keyword is no longer accepted by the release `%pip install -U` pulls in.
    clustering = AgglomerativeClustering(n_clusters=num_clusters, metric='precomputed', linkage='complete')
    fit_input = distance_matrix
elif CLUSTERING_ALGO == "Spectral":
    similarity_matrix = np.exp(-distance_matrix / np.std(distance_matrix))  # Convert distance to similarity
    # Seeded like the other stochastic steps so that a re-run gives the same labels.
    clustering = SpectralClustering(n_clusters=num_clusters, affinity='precomputed', random_state=42)
    fit_input = similarity_matrix
elif CLUSTERING_ALGO == "DBSCAN":
    clustering = DBSCAN(metric='precomputed', eps=3, min_samples=2)
    fit_input = distance_matrix
elif CLUSTERING_ALGO == "Affinity":
    similarity_matrix = np.exp(-distance_matrix / np.std(distance_matrix))  # Convert distance to similarity
    clustering = AffinityPropagation(affinity='precomputed', random_state=42)
    fit_input = similarity_matrix
else:
    raise ValueError("Invalid clustering algorithm chosen.")

labels = clustering.fit_predict(fit_input)

# Update num_clusters based on the clusters actually found. Only non-negative
# ids are counted: DBSCAN marks noise samples with -1, and counting that label
# as a cluster would shift every downstream `range(num_clusters)` loop by one.
num_clusters = int(labels.max()) + 1 if len(labels) else 0

In [None]:
# Find central terms
# For every cluster id, pick the member whose mean Levenshtein distance to the
# members of that cluster is smallest. `central_terms[i]` belongs to cluster i.
central_terms = []

for i in range(num_clusters):
    # Get indices of words in current cluster
    cluster_indices = np.where(labels == i)[0]

    # Degenerate clusters are settled before any averaging, so no mean is ever
    # taken over an empty slice (which would emit a RuntimeWarning and yield NaN).
    if len(cluster_indices) == 0:
        central_terms.append("")
        continue
    if len(cluster_indices) == 1:
        central_terms.append(words[cluster_indices[0]])
        continue

    # Get distance submatrix for current cluster
    cluster_distances = distance_matrix[cluster_indices][:, cluster_indices]

    # Calculate average distance for each word to all other words in cluster.
    # The zero self-distance is included for every row alike, so it does not
    # change which row is the minimum.
    avg_distances = np.mean(cluster_distances, axis=1)

    # Find word with minimum average distance
    central_index = cluster_indices[np.argmin(avg_distances)]
    central_terms.append(words[central_index])

In [None]:
# Print cluster results
# Words are grouped by cluster id; any label outside 0..num_clusters-1 (for
# example the -1 that DBSCAN assigns to noise) is collected separately instead
# of raising a KeyError.
clusters = {i: [] for i in range(num_clusters)}
unassigned_words = []

for word, label in zip(words, labels):
    cluster_id = int(label)
    if cluster_id in clusters:
        clusters[cluster_id].append(word)
    else:
        unassigned_words.append(word)

for i in range(num_clusters):
    print(f"Cluster {i + 1}: {clusters[i]}")
    print(f"Most central term: {central_terms[i]}\n")

if unassigned_words:
    print(f"Unassigned (noise): {unassigned_words}")

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import MDS

# 2D Visualization using MDS
# Embed the precomputed pairwise distances into two dimensions (seeded for reproducibility).
mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
points_2d = mds.fit_transform(distance_matrix)

fig, ax = plt.subplots(figsize=(40, 24))
colors = ['red', 'blue', 'green', 'purple', 'orange', 'brown', 'pink', 'gray', 'olive', 'cyan', 'black', 'darkblue']

# One scatter call per cluster gives exactly one legend entry per cluster,
# without querying the existing legend handles for every single point.
for cluster_label in sorted(set(labels)):
    cluster_id = int(cluster_label)
    members = np.where(labels == cluster_label)[0]
    if cluster_id < 0:
        # Negative labels (e.g. DBSCAN noise) get their own colour and name
        # rather than wrapping around to the last palette entry.
        color, legend_name = 'lightgray', "Noise"
    else:
        # Cycle through the palette so more clusters than colours cannot raise IndexError.
        color = colors[cluster_id % len(colors)]
        legend_name = f"Cluster {cluster_id + 1}"
    ax.scatter(points_2d[members, 0], points_2d[members, 1], color=color, label=legend_name)

# Print every word slightly offset from its marker.
for i, word in enumerate(words):
    ax.text(points_2d[i, 0] + 0.1, points_2d[i, 1] + 0.1, word, fontsize=18)

ax.set_title("2D Visualization of Word Clusters")
ax.legend()
plt.show()


In [None]:
%pip install -U plotly

In [None]:
import plotly.express as px
import numpy as np
from sklearn.manifold import MDS

# 3D Visualization using MDS
# Embed the precomputed pairwise distances into three dimensions (seeded for reproducibility).
mds = MDS(n_components=3, dissimilarity='precomputed', random_state=42)
points_3d = mds.fit_transform(distance_matrix)

# Create DataFrame for plotting
import pandas as pd
df = pd.DataFrame(points_3d, columns=['x', 'y', 'z'])
df['word'] = words
# Plotly Express maps numeric columns to a continuous colour scale and then
# ignores `color_discrete_sequence`; storing the cluster ids as strings makes
# them categorical so the discrete palette below is actually applied.
df['cluster'] = [str(int(label)) for label in labels]

# Legend order follows the numeric cluster id instead of first appearance in the data.
cluster_order = [str(int(label)) for label in sorted(set(labels))]

# Define a list of distinct colors
distinct_colors = ['red', 'blue', 'green', 'purple', 'orange', 'brown', 'pink', 'gray', 'olive', 'cyan', 'black', 'darkblue']

# Create an interactive 3D scatter plot
fig = px.scatter_3d(df, x='x', y='y', z='z', color='cluster', text='word',
                    title="3D Visualization of Word Clusters",
                    labels={'cluster': 'Cluster'},
                    category_orders={'cluster': cluster_order},
                    color_discrete_sequence=distinct_colors)

# Save the plot as an HTML file
fig.write_html("3d_word_clusters_distinct_colors.html")