### Import necessary libraries

In [50]:
!pip uninstall slenps
!pip install slenps

^C
^C


In [51]:
import numpy as np
import pandas as pd
from pprint import pprint
import os
import shutil

### Obtain and process data

In [52]:
from slenps.eclusters import (
    EmbeddingModelRegistry,
    load_embedding_model,
    embed_and_save,
    get_data_from_paths,
    sample,
    reduce_dimension,
)

In [53]:
# Load documents
with open("sample_documents.txt", "r") as file:
    documents = np.array([line.strip() for line in file.readlines()])

In [54]:
# See all possible models name
# Note that you need to format the embedding/document
# according to the algorithm's specificrequires
print(EmbeddingModelRegistry.REGISTRY.keys())

dict_keys(['BaseEmbeddingModel', 'TfidfEM', 'Word2VecEM', 'Doc2VecEM', 'SbertEM'])


In [57]:
# embedding model
embedding_model = load_embedding_model(model_name="Word2VecEM")
# embedding_model = load_embedding_model(
#     model_name='all-MiniLM-L6-v2', mode='huggingface'
# )
print(f"Embedding model used: {embedding_model}")

Embedding model used: <slenps.eclusters.embedding_models.Word2VecEM object at 0x00000226F5A7A750>


In [58]:
# embed documents
embeddings = embedding_model.encode(documents)
print(f"Embedding shape: {embeddings.shape}\nDocuments shape: {documents.shape}")



Embedding shape: (106, 100)
Documents shape: (106,)


#### helper functions for preprocessing

In [59]:
# embed the document and save the embedding-document pair as a pickle file
embedding_filename = "test_save_embedding.pickle"
if os.path.exists(embedding_filename):
    os.remove(embedding_filename)
embed_and_save(embedding_model, documents, "test_save_embedding.pickle")



In [60]:
if os.path.exists("test"):
    shutil.rmtree("test")
os.mkdir("test")
shutil.copy(embedding_filename, "test/file1.pickle")
shutil.copy(embedding_filename, "test/file2.pickle")

'test/file2.pickle'

In [61]:
# Retrieve and concatenate embedding-document pairs via
# the pickle files specified in the list of paths
temp_embedding, temp_documents = get_data_from_paths(
    [os.path.join("test", path) for path in os.listdir("test")]
)

In [62]:
os.remove(embedding_filename)
shutil.rmtree("test")

In [63]:
sampled_embedding, sampled_documents = sample(temp_embedding, temp_documents)
sampled_embedding.shape, sampled_documents.shape

((106, 100), (106,))

In [64]:
reduced_embedding = reduce_dimension(sampled_embedding, n_dim=5)
reduced_embedding.shape, sampled_embedding.shape, sampled_documents.shape

((106, 5), (106, 100), (106,))

### Cluster embeddings


In [65]:
from slenps.eclusters import (
    get_clustering_model_dict,
    load_clustering_model,
    cluster,
    find_best_algorithm,
    sample_random_documents,
    sample_centroids_documents,
)

In [66]:
clustering_model_dict = get_clustering_model_dict()
pprint(clustering_model_dict)

{'affinity_propagation': AffinityPropagation(),
 'agglomerative_clustering': AgglomerativeClustering(),
 'birch': Birch(threshold=0.2),
 'kmeans': KMeans(),
 'mean_shift': MeanShift(),
 'spectral_clustering': SpectralClustering()}


In [67]:
# Select a clustering model and number of clusters
model_name = "kmeans"
num_cluster = 3

In [68]:
# create a clustering model
clustering_model = load_clustering_model(model_name).set_params(n_clusters=num_cluster)
clustering_model

In [69]:
# fit the model and retrieve labels and metrics
labels, metrics = cluster(
    embeddings,
    clustering_model,
    metrics=["dbs", "silhouette", "calinski"],
    return_model=False,
)
print(f"Clustering metrics: {metrics}")

Clustering metrics: {'dbs': 0.3354528928709557, 'silhouette': 0.6926789, 'calinski': 896.8286713967469}


In [70]:
n_samples = 10
for document, label in zip(documents[:n_samples], labels[:n_samples]):
    print(f"{document} --> Label {label}")

The most important factor to consider when choosing a job is the pay.â€™ Do you agree? Why, or why not? --> Label 0
the following parallelogram.Can you find its area? [Take the area of 1 grid to be 1 u n i t squared] Explain your method. (You may download the image and annotate on it) --> Label 0
promoting public ecucation Describe how this strategy helps manage tropical forest in a sustainable manner. ' --> Label 0
youth to youth how are you feeling --> Label 0
The score is 'deuce'.  Both players then win a point each. The score now is ______________. --> Label 0
movement at knee joint what class level --> Label 0
part time teacher vs full time teacher --> Label 0
literature poem neighbours singapore analysis --> Label 0
chromebook switch tabs --> Label 0
cool fortnite birthday cakerobloxbirthday party --> Label 0


### Find the best clustering algorithm and num_cluster

In [71]:
# define a list of clustering models to evaluate
# see avaliable model via
# Add your own model that implemented .fit_predict() to a new_model_dict if needed
print(get_clustering_model_dict().keys())
model_names = ["kmeans", "agglomerative_clustering", "spectral_clustering"]
model_names

dict_keys(['kmeans', 'affinity_propagation', 'mean_shift', 'spectral_clustering', 'agglomerative_clustering', 'birch'])


['kmeans', 'agglomerative_clustering', 'spectral_clustering']

In [72]:
results = find_best_algorithm(
    embeddings,
    model_names=model_names,
    metrics=["dbs", "silhouette"],
    test_metric="dbs",
    min_cluster_num=2,
    max_cluster_num=10,
    result_filepath="sample_result_metric.csv",
    print_topk=True,
)

100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 13.35it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 38.62it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:06<00:00,  1.34it/s]

save results
[{'cluster_num': 8,
  'dbs': 2.541923979584553,
  'model_name': 'spectral_clustering',
  'silhouette': 0.27459568},
 {'cluster_num': 9,
  'dbs': 2.216026623404584,
  'model_name': 'spectral_clustering',
  'silhouette': 0.0747698},
 {'cluster_num': 10,
  'dbs': 2.137415419522453,
  'model_name': 'spectral_clustering',
  'silhouette': 0.1573877}]





In [73]:
# print all the clustering results
pd.DataFrame(results)

Unnamed: 0,model_name,cluster_num,dbs,silhouette
0,spectral_clustering,8,2.541924,0.274596
1,spectral_clustering,9,2.216027,0.07477
2,spectral_clustering,10,2.137415,0.157388
3,spectral_clustering,6,1.912282,0.327641
4,spectral_clustering,7,1.868956,0.22706
5,spectral_clustering,5,1.067864,0.354461
6,spectral_clustering,4,0.665684,0.273986
7,kmeans,9,0.612639,0.442419
8,kmeans,10,0.599998,0.4687
9,agglomerative_clustering,9,0.587831,0.468574


In [74]:
os.remove("sample_result_metric.csv")

#### obtain corresponding model

In [86]:
# Select a clustering model and number of clusters
model_name, cluster_num = results[0]["model_name"], results[0]["cluster_num"]
model_name, cluster_num

('spectral_clustering', 8)

In [97]:
# create a clustering model
clustering_model = load_clustering_model(model_name).set_params(n_clusters=cluster_num)
clustering_model

In [98]:
# fit the model and retrieve labels and metrics
labels, metrics, best_model = cluster(
    embeddings,
    clustering_model,
    metrics=["dbs", "silhouette", "calinski"],
    return_model=True,
)
print(f"Clustering metrics: {metrics}")

Clustering metrics: {'dbs': 2.7175640611196927, 'silhouette': 0.20956203, 'calinski': 439.20953690039636}


In [126]:
n_samples = 5
random_documents_per_label = sample_random_documents(
    embeddings, documents, cluster_num, labels, n_samples=n_samples
)
random_documents_per_label

Unnamed: 0,cluster_id,document
0,0,http://pennstatehershey.adam.com
1,0,xinminsecondary
2,0,natqiqix
3,0,skibidi toilet 15
4,0,blind rivet
5,1,american vs chinese rap battle
6,1,the internet makes learning easier essay
7,1,two question you have after reading emily of e...
8,1,christian i
9,1,how adrien agreste got all the miraculous


In [127]:
centroid_documents = sample_centroids_documents(
    embeddings, documents, cluster_num, labels, n_samples=n_samples
)
centroid_documents

Unnamed: 0,cluster_id,document,distance_to_centroid
0,0,http://pennstatehershey.adam.com,0.02805
1,0,risky meaning],0.109218
2,0,skibidi toilet 15,0.111353
3,0,capebara,0.020644
4,0,amx 10 rc,0.015151
5,1,how adrien agreste got all the miraculous,0.017812
6,1,how does critical thinking promotes outdoor ac...,0.021327
7,1,why does people having demantia finding it har...,0.050651
8,1,future forecast essay,0.01413
9,1,things to do in school to not be bored,0.040242
