<a href="https://colab.research.google.com/github/poojaneuusa/SBERT-Sentence-Transformers/blob/main/Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install sentence_transformers
!pip install requests
# K-Mean Clustering
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import requests

#download the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

#Get the data from github
response = requests.get('https://raw.githubusercontent.com/poojaneuusa/ML-Dataset/refs/heads/main/sbert-corpus.txt')

#Split the string by newline characters
corpus = response.text.split('\n') # Changed to '\n'
print(f'{len(corpus)=}')

if len(corpus) < 5: #Check to make sure there are enough samples
  print('Not enough data, adding test data')
  corpus.extend(["This is another example sentence.", "And here is one more.", "This is to ensure we have enough data."])
  print(f'Added test data. {len(corpus)=}')

#create embeddings
corpus_embeddings = model.encode(corpus)

#make sure the shape is more than 1
print(f'{corpus_embeddings.shape=}')

num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters, n_init='auto') #add n_init='auto' to suppress a warning
clustering_model.fit(corpus_embeddings) #fit now works
cluster_assignment = clustering_model.labels_

#verify labels are set
print(f'{cluster_assignment=}')

# Continue with the rest of your code
clustered_sentences = [[] for i in range(num_clusters)]

for sentence_id, cluster_id in enumerate(cluster_assignment):
  clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in enumerate(clustered_sentences):
  print("Cluster ", i+1)
  print(cluster)
  print()

len(corpus)=12
corpus_embeddings.shape=(12, 384)
cluster_assignment=array([1, 1, 1, 2, 2, 1, 1, 0, 0, 3, 3, 4], dtype=int32)
Cluster  1
['A monkey is playing drums.', 'Someone in a gorilla costume is playing a set of drums.']

Cluster  2
['A man is eating food.', 'A man is eating a piece of bread.', 'A man is eating pasta.', 'A man is riding a horse.', 'A man is riding a white horse on an enclosed ground.']

Cluster  3
['The girl is carrying a baby.', 'The baby is carried by the woman']

Cluster  4
['A cheetah is running behind its prey.', 'A cheetah chases prey on across a field.']

Cluster  5
['']



In [14]:
# Agglomerative Clustering
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import requests

# Load the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Fetch corpus
response = requests.get('https://raw.githubusercontent.com/poojaneuusa/ML-Dataset/refs/heads/main/sbert-corpus.txt')
corpus = response.text.split('\n')

# Print corpus
print("Corpus:", corpus)

# Encode the corpus
corpus_embeddings = model.encode(corpus)

# Normalize the embeddings
corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

# Perform Agglomerative Clustering
clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

# Debugging prints
print("Cluster Assignments:", cluster_assignment)

# Fix the IndexError
num_clusters = max(cluster_assignment) + 1  # Use maximum cluster ID + 1
clustered_sentences = [[] for _ in range(num_clusters)]

for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus[sentence_id])

# Print clusters
for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i + 1)
    print(cluster)
    print()


Corpus: ['A man is eating food.', 'A man is eating a piece of bread.', 'A man is eating pasta.', 'The girl is carrying a baby.', 'The baby is carried by the woman', 'A man is riding a horse.', 'A man is riding a white horse on an enclosed ground.', 'A monkey is playing drums.', 'Someone in a gorilla costume is playing a set of drums.', 'A cheetah is running behind its prey.', 'A cheetah chases prey on across a field.', '']
Cluster Assignments: [0 0 0 3 3 1 1 2 2 4 4 5]
Cluster  1
['A man is eating food.', 'A man is eating a piece of bread.', 'A man is eating pasta.']

Cluster  2
['A man is riding a horse.', 'A man is riding a white horse on an enclosed ground.']

Cluster  3
['A monkey is playing drums.', 'Someone in a gorilla costume is playing a set of drums.']

Cluster  4
['The girl is carrying a baby.', 'The baby is carried by the woman']

Cluster  5
['A cheetah is running behind its prey.', 'A cheetah chases prey on across a field.']

Cluster  6
['']



In [18]:
# Fast Clustering
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import time

model = SentenceTransformer('all-MiniLM-L6-v2')

url = 'http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'}

response = requests.get(url, headers=headers)
if response.status_code == 200:
    with open('quora_duplicate_questions.tsv', 'wb') as file:
        file.write(response.content)
    df = pd.read_csv('quora_duplicate_questions.tsv', sep='\t')
    print(df.shape)
else:
    print(f"Failed to fetch data: {response.status_code}")

df.head()

sentences = df['question1'].tolist()[:1000]
len(sentences)

corpus_embeddings = model.encode(sentences, batch_size=64, show_progress_bar=True)

clusters = util.community_detection(corpus_embeddings, min_community_size=5, threshold=0.5)

for i,cluster in enumerate(clusters):
  print("\nCluster {}, #{} Questions".format(i+1, len(cluster)))
  for id in cluster[0:3]:
    print("\t", sentences[id])
  print("\t", "...")





(404290, 6)


Batches:   0%|          | 0/16 [00:00<?, ?it/s]


Cluster 1, #10 Questions
	 Which are the best Hollywood thriller movies?
	 What are the most underrated and overrated movies you've seen?
	 What are the best films that take place in one room?
	 ...

Cluster 2, #9 Questions
	 What are your views on Modi governments decision to demonetize 500 and 1000 rupee notes? How will this affect economy?
	 What's your opinion about the decision on removal of 500 and 1000 rupees currency notes?
	 How will Indian GDP be affected from banning 500 and 1000 rupees notes?
	 ...

Cluster 3, #8 Questions
	 What is best way to make money online?
	 How can I make money through the Internet?
	 What are the easy ways to earn money online?
	 ...

Cluster 4, #7 Questions
	 What are the most important things for living a good life?
	 What is most important in life - money or values?
	 What is the best lesson in life?
	 ...

Cluster 5, #6 Questions
	 What is our stance against Pakistan?
	 What is the reason Pakistan supports terrorism?
	 If there will be a war b