In [50]:
# Sentences: 

a = "She found his complete dullness interesting."
b = "He learned the important lesson that a picnic at the beach on a windy day is a bad idea."
c = "When nobody is around, the trees gossip about the people who have walked under them."

In [5]:
# Jaccard

def jaccard(x: str, y: str) -> int:
  x = set(x.split())
  y = set(y.split())

  shared = x.intersection(y)
  union = x.union(y)
  return round(len(shared) / len(union), 2)

In [6]:
jaccard(c, b)

0.07

In [7]:
import numpy as np

In [19]:
# TF-IDF
docs = [a, b, c]

def tfidf(word: str, sentence: str) -> int:
  tf = sentence.count(word) / len(sentence)
  idf = np.log10(len(docs) / sum([1 for doc in docs if word in doc]))
  return round(tf*idf, 4)

In [23]:
tfidf('the', b)

0.004

In [33]:
v = a+" "+ b +" "+ c
vocab = set(v.replace('.', '').split())

In [34]:
vocab

{'He',
 'She',
 'When',
 'a',
 'about',
 'around,',
 'at',
 'bad',
 'beach',
 'complete',
 'day',
 'dullness',
 'found',
 'gossip',
 'have',
 'his',
 'idea',
 'important',
 'interesting',
 'is',
 'learned',
 'lesson',
 'nobody',
 'on',
 'people',
 'picnic',
 'that',
 'the',
 'them',
 'trees',
 'under',
 'walked',
 'who',
 'windy'}

In [35]:
vec_a = []
vec_b = []
vec_c = []

for word in vocab:
  vec_a.append(tfidf(word, a))
  vec_b.append(tfidf(word, b))
  vec_c.append(tfidf(word, c))

In [39]:
arrays = [np.array(vec_a) ,np.array(vec_b) ,np.array(vec_c)]
tf_idf_feature = np.stack(arrays, axis=0)

In [43]:
tf_idf_feature

array([[0.    , 0.    , 0.    , 0.0108, 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    , 0.0108, 0.    , 0.    ,
        0.0108, 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.0108, 0.    , 0.0108, 0.    , 0.    , 0.0108,
        0.    , 0.    ],
       [0.    , 0.    , 0.    , 0.    , 0.    , 0.022 , 0.0108, 0.0054,
        0.    , 0.0054, 0.    , 0.0054, 0.0054, 0.    , 0.    , 0.0054,
        0.    , 0.0054, 0.0054, 0.0054, 0.    , 0.    , 0.    , 0.004 ,
        0.0054, 0.0054, 0.    , 0.    , 0.    , 0.    , 0.0054, 0.    ,
        0.    , 0.0108],
       [0.0057, 0.0057, 0.0057, 0.    , 0.0057, 0.0084, 0.    , 0.    ,
        0.0057, 0.    , 0.0057, 0.    , 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.0057, 0.0057, 0.0057, 0.0063,
        0.    , 0.    , 0.    , 0.0057, 0.    , 0.0057, 0.    , 0.    ,
        0.0057, 0.    ]])

In [45]:
#BM25

docs = [a, b, c]
avgdl = sum(len(sentence) for sentence in docs) / len(docs)
N = len(docs)

def bm25(word, sentence, k=1.2, b=0.75):
  freq = sentence.count(word)
  tf = (freq * (k + 1 )) / (freq + k * (1 - b + b * len(sentence) / avgdl))
  N_q = sum([1 for doc in docs if word in doc])
  idf = np.log(((N - N_q + 0.5) / (N_q + 0.5)) + 1)
  return round(tf*idf, 4)

In [46]:
bm25('is', a)

0.0

In [49]:
#S-BERT

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [52]:
sentences = [a, b, c]
sentence_embeddings = model.encode(sentences)

In [55]:
sentence_embeddings.shape

(3, 384)

In [56]:
from sklearn.metrics.pairwise import cosine_similarity

In [57]:
scores = np.zeros((sentence_embeddings.shape[0], sentence_embeddings.shape[0]))
for i in range(sentence_embeddings.shape[0]):
  scores[i, :] = cosine_similarity(
     [sentence_embeddings[i]], 
     sentence_embeddings 
  )[0]

In [58]:
scores

array([[1.        , 0.22377595, 0.23841505],
       [0.22377595, 1.        , 0.23110545],
       [0.23841506, 0.23110542, 1.        ]])