<a href="https://colab.research.google.com/github/sau-coder/NLP/blob/master/Sentence_transform.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer,util

In [None]:
# Instantiate a SentenceTransformer from the 'all-MiniLM-L6-v2' checkpoint name;
# the following cells call `model.encode(...)` on it.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Two sample sentences, encoded together in a single `encode` call.
sentences = ["This is an example sentence", "Each sentence is converted"]
embedding = model.encode(sentences)

In [None]:
# Print every sentence followed by its embedding vector.
# The loop variable is deliberately NOT named `embedding`: reusing that name would
# overwrite the embeddings array with its last row and make this cell fail on re-run.
for sentence, vector in zip(sentences, embedding):
  print(sentence)
  print(vector)
  print("")

In [None]:
# Encode two sentences separately, then score the pair with util.cos_sim.
emb1 = model.encode("I am eating apple")
emb2 = model.encode("I like dancing")

cos_sim = util.cos_sim(emb1, emb2)

In [None]:
# Display the similarity value computed in the previous cell.
cos_sim

In [None]:
# Small corpus for the pairwise-similarity demo.
# NOTE: this rebinds `sentences` and `cos_sim`, which earlier cells also assign.
sentences = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'The girl is carrying a baby.',
          'A man is riding a horse.',
          'A woman is playing violin.',
          'Two men pushed carts through the woods.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'Someone in a gorilla costume is playing a set of drums.'
          ]

# Encode the whole list in one call.
embeddings = model.encode(sentences)

# Compare the embeddings against themselves: every sentence versus every sentence.
cos_sim = util.cos_sim(embeddings, embeddings)

# Last expression of the cell, so the result is displayed.
cos_sim

In [None]:
# Collect (score, row, col) for every unordered pair of distinct sentences,
# i.e. the entries of `cos_sim` strictly above the diagonal.
all_sentence_combinations = [
  (cos_sim[row][col], row, col)
  for row in range(len(cos_sim) - 1)
  for col in range(row + 1, len(cos_sim))
]

all_sentence_combinations

In [None]:
# Order the pairs from highest to lowest score (the first tuple element).
all_sentence_combinations = sorted(
  all_sentence_combinations,
  key=lambda combo: combo[0],
  reverse=True,
)

In [None]:
# Display the pair list again after it has been sorted by descending score.
all_sentence_combinations

In [None]:
# Print the first five entries of the pair list: both sentences and their score.
# The score stored in each tuple is used instead of re-indexing the global `cos_sim`,
# so the printed value always belongs to the listed pair even if `cos_sim` is rebound.
for score, i, j in all_sentence_combinations[0:5]:
  print("{} \t {} \t {:.4f}".format(sentences[i], sentences[j], score))

A man is eating food. 	 A man is eating a piece of bread. 	 0.7553
A man is riding a horse. 	 A man is riding a white horse on an enclosed ground. 	 0.7369
A monkey is playing drums. 	 Someone in a gorilla costume is playing a set of drums. 	 0.6433
A woman is playing violin. 	 Someone in a gorilla costume is playing a set of drums. 	 0.2564
A man is eating food. 	 A man is riding a horse. 	 0.2474


Semantic Search

In [None]:
from sentence_transformers import SentenceTransformer, util
# Switch to the 'clips/mfaq' checkpoint for the semantic-search example.
# NOTE: this rebinds `model`, replacing the 'all-MiniLM-L6-v2' instance created earlier.
model = SentenceTransformer('clips/mfaq')

In [None]:
question = "How many models can I host on HuggingFace?"
answer_1 = "All plans come with unlimited private models and datasets."
answer_2 = "AutoNLP is an automatic way to train and deploy state-of-the-art NLP models, seamlessly integrated with the Hugging Face ecosystem."
answer_3 = "Based on how much training data and model variants are created, we send you a compute cost and payment link - as low as $10 per job."

# The clips/mfaq model expects role markers on its inputs: questions are prefixed
# with "<Q>" and candidate answers with "<A>". Encoding raw text without them
# feeds the model inputs it was not trained on and skews the ranking scores.
query_embedding = model.encode("<Q>" + question)
corpus_embeddings = model.encode(["<A>" + answer for answer in (answer_1, answer_2, answer_3)])

# semantic_search ranks the corpus entries against the query; each hit carries
# a `corpus_id` (position in the answer list) and a `score`.
print(util.semantic_search(query_embedding, corpus_embeddings))

[[{'corpus_id': 0, 'score': 0.5507140159606934}, {'corpus_id': 2, 'score': 0.49931803345680237}, {'corpus_id': 1, 'score': 0.45967164635658264}]]


Clustering

In [None]:
from sklearn.cluster import KMeans
import numpy as np

# Separate SentenceTransformer instance for the clustering demo, created from the
# 'all-MiniLM-L6-v2' checkpoint name (`model` was rebound to 'clips/mfaq' above).
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Sentences to be clustered.
corpus = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'Horse is eating grass.',
          'A man is eating pasta.',
          'A Woman is eating Biryani.',
          'The girl is carrying a baby.',
          'The baby is carried by the woman',
          'A man is riding a horse.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'Someone in a gorilla costume is playing a set of drums.',
          'A cheetah is running behind its prey.',
          'A cheetah chases prey on across a field.',
          'The cheetah is chasing a man who is riding the horse.',
          'man and women with their baby are watching cheetah in zoo'
          ]

# Encode the whole corpus in one call.
# NOTE: rebinds `corpus_embeddings`, which the semantic-search cell also assigns.
corpus_embeddings = embedder.encode(corpus)

In [None]:
# Scale every row to unit L2 length: compute one norm per row (kept as a column
# so it broadcasts), then divide the embeddings by it.
row_norms = np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
corpus_embedding = corpus_embeddings / row_norms

In [None]:
# Peek at the first normalised row.
corpus_embedding[0]

In [None]:
# Seed KMeans so the centroid initialisation, and therefore the cluster labels,
# are the same on every Restart & Run All.
RANDOM_SEED = 42
clustering_model = KMeans(n_clusters = 4, random_state = RANDOM_SEED)
clustering_model.fit(corpus_embedding)

# One integer label per row of `corpus_embedding`, in input order.
cluster_assignment = clustering_model.labels_
cluster_assignment



array([2, 2, 1, 2, 2, 0, 0, 1, 1, 3, 3, 1, 1, 1, 0], dtype=int32)

In [None]:
# Group the corpus sentences by the cluster label assigned to each of them.
# Buckets appear in the order their label is first encountered.
clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    # setdefault creates the bucket on first sight of a label, then returns it.
    clustered_sentences.setdefault(cluster_id, []).append(corpus[sentence_id])
clustered_sentences

{2: ['A man is eating food.',
  'A man is eating a piece of bread.',
  'A man is eating pasta.',
  'A Woman is eating Biryani.'],
 1: ['Horse is eating grass.',
  'A man is riding a horse.',
  'A man is riding a white horse on an enclosed ground.',
  'A cheetah is running behind its prey.',
  'A cheetah chases prey on across a field.',
  'The cheetah is chasing a man who is riding the horse.'],
 0: ['The girl is carrying a baby.',
  'The baby is carried by the woman',
  'man and women with their baby are watching cheetah in zoo'],
 3: ['A monkey is playing drums.',
  'Someone in a gorilla costume is playing a set of drums.']}