<a href="https://colab.research.google.com/github/raymondwcs/learning_bert/blob/master/Cluster_Sentence_Embeddings_(kMeans%2C_DBSCAN%2C_sentence_transformers).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --quiet transformers
!pip install -U sentence-transformers



In [2]:
# from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

# CHECKPOINT = 'bert-base-chinese'
CHECKPOINT = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'

MAX_LENGTH = 80

# corpus = [
#   "Vodafone Wins ₹20,000 Crore Tax Arbitration Case Against Government",
#   "Voda Idea shares jump nearly 15% as Vodafone wins retro tax case in Hague",
#   "Gold prices today fall for 4th time in 5 days, down ₹6500 from last month high",
#   "Silver futures slip 0.36% to Rs 59,415 per kg, down over 12% this week",
#   "Amazon unveils drone that films inside your home. What could go wrong?",
#   "IPHONE 12 MINI PERFORMANCE MAY DISAPPOINT DUE TO THE APPLE B14 CHIP",
#   "Delhi Capitals vs Chennai Super Kings: Prithvi Shaw shines as DC beat CSK to post second consecutive win in IPL",
#   "French Open 2020: Rafael Nadal handed tough draw in bid for record-equaling 20th Grand Slam"
# ]

corpus = [
  '這個服務生很不親切',         
  '这个服务生很不亲切', 
  '黄昏時滂沱大雨',  
  '放工時下大雨',
  '這個週末陽光普照!',
  '天朗氣清的星期天。',
  '現時恒指已跌穿重要支持位26500點來看。',
  '港股繼續尋底的機會是頗高的。'         
]

# tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
# model = AutoModel.from_pretrained(CHECKPOINT)

# tokens = tokenizer(text=corpus, max_length=MAX_LENGTH, add_special_tokens=True, padding='max_length', truncation=True, return_tensors='pt')
# output = model(**tokens)

corpus_embeddings = []
# for i in range(len(output.pooler_output)):
#   corpus_embeddings.append(output.pooler_output[i].detach().numpy())

embedder = SentenceTransformer(CHECKPOINT)
corpus_embeddings = embedder.encode(corpus)

Embedding Similarity

In [3]:
similarities = cosine_similarity(corpus_embeddings)
similarities_sorted = similarities.argsort()
id_1 = []
id_2 = []
score = []
for index,array in enumerate(similarities_sorted):
    id_1.append(index)
    id_2.append(array[-2])
    score.append(similarities[index][array[-2]])
index_df = pd.DataFrame({'id_1' : id_1,
                          'id_2' : id_2,
                          'score' : score})

print(index_df)

   id_1  id_2     score
0     0     1  0.980593
1     1     0  0.980593
2     2     3  0.867589
3     3     2  0.867589
4     4     5  0.719055
5     5     4  0.719055
6     6     2  0.211332
7     7     4  0.296934


K-Means Clustering (distance)

In [4]:
from sklearn.cluster import KMeans

num_clusters = 4
# Define kmeans model
clustering_model = KMeans(n_clusters=num_clusters)
# Fit the embedding with kmeans clustering.
clustering_model.fit(corpus_embeddings)
# Get the cluster id assigned to each news headline.
cluster_assignment = clustering_model.labels_

print(cluster_assignment)

[0 0 1 1 3 3 1 2]


In [5]:
clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus[sentence_id])
for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")

Cluster  1
['這個服務生很不親切', '这个服务生很不亲切']

Cluster  2
['黄昏時滂沱大雨', '放工時下大雨', '現時恒指已跌穿重要支持位26500點來看。']

Cluster  3
['港股繼續尋底的機會是頗高的。']

Cluster  4
['這個週末陽光普照!', '天朗氣清的星期天。']



K-Means Clustering (similarity)

In [6]:
import nltk
from nltk.cluster.kmeans import KMeansClusterer
num_clusters = 4
data = corpus_embeddings
kclusterer = KMeansClusterer(num_clusters, distance=nltk.cluster.util.cosine_distance, repeats=25)
assigned_clusters = kclusterer.cluster(data, assign_clusters=True)

print(assigned_clusters)
print()

clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus[sentence_id])
for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")

[0, 0, 3, 3, 2, 2, 1, 2]

Cluster  1
['這個服務生很不親切', '这个服务生很不亲切']

Cluster  2
['黄昏時滂沱大雨', '放工時下大雨', '現時恒指已跌穿重要支持位26500點來看。']

Cluster  3
['港股繼續尋底的機會是頗高的。']

Cluster  4
['這個週末陽光普照!', '天朗氣清的星期天。']



DBSCAN (distance)

In [7]:
from sklearn.cluster import DBSCAN

cluster = DBSCAN(eps=2.25,min_samples=1).fit(corpus_embeddings)
cluster.labels_

array([0, 0, 1, 1, 2, 2, 3, 4])