### Import necessary libraries

In [1]:
# !pip uninstall slenps
# !pip install slenps

In [2]:
import numpy as np
import pandas as pd
from pprint import pprint
import os
import shutil

### Obtain and process data

In [3]:
from slenps.eclusters import (
    EmbeddingModelRegistry,
    load_embedding_model,
    embed_and_save,
    get_data_from_paths,
    sample,
    reduce_dimension,
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load documents
with open("sample_documents.txt", "r") as file:
    documents = np.array([line.strip() for line in file.readlines()])

In [5]:
# See all possible models name
# Note that you need to format the embedding/document
# according to the algorithm's specificrequires
print(EmbeddingModelRegistry.REGISTRY.keys())

dict_keys(['BaseEmbeddingModel', 'TfidfEM', 'Word2VecEM', 'Doc2VecEM', 'SbertEM'])


In [6]:
# embedding model
embedding_model = load_embedding_model(model_name="Word2VecEM")
# embedding_model = load_embedding_model(
#     model_name='all-MiniLM-L6-v2', mode='huggingface'
# )
print(f"Embedding model used: {embedding_model}")

INFO:gensim.utils:Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=100, alpha=0.025>', 'datetime': '2024-07-09T17:15:19.606125', 'gensim': '4.3.2', 'python': '3.11.7 | packaged by Anaconda, Inc. | (main, Dec 15 2023, 18:05:47) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}
INFO:slenps.eclusters.embedding_models:Word2Vec model initialized with 100 dimensions


Embedding model used: <slenps.eclusters.embedding_models.Word2VecEM object at 0x000002DD269881D0>


In [7]:
# embed documents
embeddings = embedding_model.encode(documents)
print(f"Embedding shape: {embeddings.shape}\nDocuments shape: {documents.shape}")

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:collected 74 word types from a corpus of 4121 raw words and 106 sentences
INFO:gensim.models.word2vec:Creating a fresh vocabulary
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 74 unique words (100.00% of original 74, drops 0)', 'datetime': '2024-07-09T17:15:19.636716', 'gensim': '4.3.2', 'python': '3.11.7 | packaged by Anaconda, Inc. | (main, Dec 15 2023, 18:05:47) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'prepare_vocab'}
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 4121 word corpus (100.00% of original 4121, drops 0)', 'datetime': '2024-07-09T17:15:19.636716', 'gensim': '4.3.2', 'python': '3.11.7 | packaged by Anaconda, Inc. | (main, Dec 15 2023, 18:05:47) [MSC v.1916 64 bit (AMD64)]', 'plat

Embedding shape: (106, 100)
Documents shape: (106,)


#### helper functions for preprocessing

In [8]:
# embed the document and save the embedding-document pair as a pickle file
embedding_filename = "test_save_embedding.pickle"
if os.path.exists(embedding_filename):
    os.remove(embedding_filename)
embed_and_save(embedding_model, documents, "test_save_embedding.pickle")

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:collected 74 word types from a corpus of 4121 raw words and 106 sentences
INFO:gensim.models.word2vec:Creating a fresh vocabulary
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 74 unique words (100.00% of original 74, drops 0)', 'datetime': '2024-07-09T17:15:19.840571', 'gensim': '4.3.2', 'python': '3.11.7 | packaged by Anaconda, Inc. | (main, Dec 15 2023, 18:05:47) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'prepare_vocab'}
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 4121 word corpus (100.00% of original 4121, drops 0)', 'datetime': '2024-07-09T17:15:19.840571', 'gensim': '4.3.2', 'python': '3.11.7 | packaged by Anaconda, Inc. | (main, Dec 15 2023, 18:05:47) [MSC v.1916 64 bit (AMD64)]', 'plat

In [9]:
if os.path.exists("test"):
    shutil.rmtree("test")
os.mkdir("test")
shutil.copy(embedding_filename, "test/file1.pickle")
shutil.copy(embedding_filename, "test/file2.pickle")

'test/file2.pickle'

In [10]:
# Retrieve and concatenate embedding-document pairs via
# the pickle files specified in the list of paths
temp_embedding, temp_documents = get_data_from_paths(
    [os.path.join("test", path) for path in os.listdir("test")]
)

In [11]:
os.remove(embedding_filename)
shutil.rmtree("test")

In [12]:
sampled_embedding, sampled_documents = sample(temp_embedding, temp_documents)
sampled_embedding.shape, sampled_documents.shape

((106, 100), (106,))

In [13]:
reduced_embedding = reduce_dimension(sampled_embedding, n_dim=5)
reduced_embedding.shape, sampled_embedding.shape, sampled_documents.shape

((106, 5), (106, 100), (106,))

### Cluster embeddings


In [14]:
from slenps.eclusters import (
    get_clustering_model_dict,
    load_clustering_model,
    cluster,
    find_best_algorithm,
    sample_random_documents,
    sample_centroids_documents,
)

In [15]:
clustering_model_dict = get_clustering_model_dict()
pprint(clustering_model_dict)

{'affinity_propagation': AffinityPropagation(),
 'agglomerative_clustering': AgglomerativeClustering(),
 'birch': Birch(threshold=0.2),
 'kmeans': KMeans(),
 'mean_shift': MeanShift(),
 'spectral_clustering': SpectralClustering()}


In [16]:
# Select a clustering model and number of clusters
model_name = "kmeans"
num_cluster = 3

In [17]:
# create a clustering model
clustering_model = load_clustering_model(model_name).set_params(n_clusters=num_cluster)
clustering_model

In [18]:
# fit the model and retrieve labels and metrics
labels, metrics = cluster(
    embeddings,
    clustering_model,
    metrics=["dbs", "silhouette", "calinski"],
    return_model=False,
)
print(f"Clustering metrics: {metrics}")

Clustering metrics: {'dbs': 0.3354528928709557, 'silhouette': 0.6926789, 'calinski': 896.8286713967469}


In [19]:
n_samples = 10
for document, label in zip(documents[:n_samples], labels[:n_samples]):
    print(f"{document} --> Label {label}")

The most important factor to consider when choosing a job is the pay.â€™ Do you agree? Why, or why not? --> Label 0
the following parallelogram.Can you find its area? [Take the area of 1 grid to be 1 u n i t squared] Explain your method. (You may download the image and annotate on it) --> Label 0
promoting public ecucation Describe how this strategy helps manage tropical forest in a sustainable manner. ' --> Label 0
youth to youth how are you feeling --> Label 0
The score is 'deuce'.  Both players then win a point each. The score now is ______________. --> Label 0
movement at knee joint what class level --> Label 0
part time teacher vs full time teacher --> Label 0
literature poem neighbours singapore analysis --> Label 0
chromebook switch tabs --> Label 0
cool fortnite birthday cakerobloxbirthday party --> Label 0


### Find the best clustering algorithm and num_cluster

In [20]:
# define a list of clustering models to evaluate
# see avaliable model via
# Add your own model that implemented .fit_predict() to a new_model_dict if needed
print(get_clustering_model_dict().keys())
model_names = ["kmeans", "agglomerative_clustering", "spectral_clustering"]
model_names

dict_keys(['kmeans', 'affinity_propagation', 'mean_shift', 'spectral_clustering', 'agglomerative_clustering', 'birch'])


['kmeans', 'agglomerative_clustering', 'spectral_clustering']

In [21]:
results = find_best_algorithm(
    embeddings,
    model_names=model_names,
    metrics=["dbs", "silhouette"],
    test_metric="dbs",
    min_cluster_num=2,
    max_cluster_num=10,
    result_filepath="sample_result_metric.csv",
    print_topk=True,
)

                                                                                                                       

Results saved
[{'cluster_num': 9,
  'dbs': 2.7848662256301617,
  'model_name': 'spectral_clustering',
  'silhouette': 0.18767078},
 {'cluster_num': 8,
  'dbs': 2.702902430858376,
  'model_name': 'spectral_clustering',
  'silhouette': 0.21068594},
 {'cluster_num': 10,
  'dbs': 1.9686950452838516,
  'model_name': 'spectral_clustering',
  'silhouette': 0.10196404}]




In [22]:
# print all the clustering results
pd.DataFrame(results)

Unnamed: 0,model_name,cluster_num,dbs,silhouette
0,spectral_clustering,9,2.784866,0.187671
1,spectral_clustering,8,2.702902,0.210686
2,spectral_clustering,10,1.968695,0.101964
3,spectral_clustering,6,1.912282,0.327641
4,spectral_clustering,7,1.668664,0.221815
5,spectral_clustering,5,1.067864,0.354461
6,kmeans,9,0.689118,0.381502
7,spectral_clustering,4,0.665684,0.273986
8,kmeans,10,0.61516,0.465553
9,agglomerative_clustering,9,0.587831,0.468574


In [23]:
os.remove("sample_result_metric.csv")

#### obtain corresponding model

In [24]:
# Select a clustering model and number of clusters
model_name, cluster_num = results[0]["model_name"], results[0]["cluster_num"]
model_name, cluster_num

('spectral_clustering', 9)

In [25]:
# create a clustering model
clustering_model = load_clustering_model(model_name).set_params(n_clusters=cluster_num)
clustering_model

In [26]:
# fit the model and retrieve labels and metrics
labels, metrics, best_model = cluster(
    embeddings,
    clustering_model,
    metrics=["dbs", "silhouette", "calinski"],
    return_model=True,
)
print(f"Clustering metrics: {metrics}")

Clustering metrics: {'dbs': 2.8852606910488454, 'silhouette': 0.20316514, 'calinski': 328.7318599626432}


In [27]:
n_samples = 5
random_documents_per_label = sample_random_documents(
    embeddings, documents, cluster_num, labels, n_samples=n_samples
)
random_documents_per_label

Unnamed: 0,cluster_id,document
0,0,470x2
1,0,æ³¢æ¾œ
2,0,è½¦è‰ºèŽ²
3,1,vollyball design
4,1,rankdle
5,1,capebara
6,1,corkboard
7,1,japanese democratic government 1912
8,2,effectss of trash volunteer work
9,2,The score is 'deuce'. Both players then win a...


In [28]:
centroid_documents = sample_centroids_documents(
    embeddings, documents, cluster_num, labels, n_samples=n_samples
)
centroid_documents

Unnamed: 0,cluster_id,document,distance_to_centroid
0,0,è½¦è‰ºèŽ²,0.061028
1,0,470x2,0.030464
2,0,æ³¢æ¾œ,0.044797
3,1,http://pennstatehershey.adam.com,0.02805
4,1,risky meaning],0.109218
5,1,skibidi toilet 15,0.111353
6,1,capebara,0.020644
7,1,amx 10 rc,0.015151
8,2,how have social media influencers helped us,0.024585
9,2,effectss of trash volunteer work,0.017082
