In [1]:
documents = [
    "This is a list which containing sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings.",
    "Understanding document structure aids in keyword extraction.",
    "Efficient keyword extraction enhances search accuracy.",
    "Semantic similarity improves document retrieval performance.",
    "Machine learning algorithms can optimize keyword extraction methods."
]

In [3]:
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
from sentence_transformers import SentenceTransformer
model_name = 'sentence-transformers/paraphrase-xlm-r-multilingual-v1'
model = SentenceTransformer(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [5]:
document_embedding = model.encode(documents)
document_embedding

array([[ 0.10894685,  0.0781207 ,  0.11626569, ..., -0.09545091,
         0.04451441,  0.37490645],
       [ 0.36544183,  0.41045642,  0.02045556, ...,  0.19753931,
         0.22888155,  0.11158741],
       [ 0.28732777,  0.15202941,  0.24311861, ...,  0.1656929 ,
         0.31329834,  0.10595316],
       ...,
       [ 0.3439215 ,  0.23954621,  0.16326731, ...,  0.10939934,
         0.22899213,  0.2131515 ],
       [ 0.16869698,  0.37198663,  0.2555241 , ..., -0.06428406,
         0.20327897,  0.15427303],
       [ 0.3371376 ,  0.12179305, -0.12275384, ...,  0.1774162 ,
        -0.00627331,  0.20187023]], shape=(8, 768), dtype=float32)

In [6]:
print(document_embedding.shape)

(8, 768)


In [7]:
for i,embeddings in enumerate(document_embedding):
    print(f'Document {i+1} {embeddings}')

Document 1 [ 0.10894685  0.0781207   0.11626569 -0.31912503  0.4689022   0.43514383
  0.01453738  0.44238767  0.297164   -0.18982707  0.07389054 -0.27864906
  0.21338163 -0.1207701   0.17891699 -0.00789878  0.04754862 -0.18204565
  0.3422711  -0.06994258 -0.14288741  0.5714125  -0.11153244 -0.17895408
  0.01523129  0.2610571  -0.20555831  0.05203114 -0.02810769  0.2387324
  0.0120698   0.04404927  0.02242311 -0.13895182 -0.7410038   0.25601006
  0.08149689  0.18820496 -0.4123769   0.11368618  0.28121173  0.05860886
 -0.1731878   0.33549133  0.21803683 -0.05090715 -0.05457789 -0.8738479
 -0.24082275  0.32006976  0.44761655  0.06347825  0.5357486   0.16607258
 -0.33197     0.33393636  0.2861592  -0.5419566  -0.2713242   0.24881154
 -0.23919384 -0.4692628   0.13836576  0.3784289  -0.01304429  0.019906
  0.32365087  0.45857537  0.07600261  0.25299588 -0.42938945  0.10051926
 -0.33042598 -0.6987648   0.01035903  0.05666573  0.14731245 -0.47082353
  0.08063988  0.33870456 -0.27278143  0.0451

In [8]:
query = "Natural language processing techniques enhance keyword extraction efficiency."

In [9]:
query_embedding = model.encode(query)
query_embedding

array([ 3.42821062e-01,  3.26170474e-01,  7.73253338e-03,  1.60758775e-02,
        2.38517039e-02, -2.52880841e-01,  8.15468729e-02,  1.37177363e-01,
        3.33291799e-01,  3.65743861e-02, -9.28223506e-02, -3.43725681e-01,
        7.12015927e-02,  9.48833823e-01, -9.41290110e-02, -7.96413839e-01,
       -3.20592105e-01, -2.67564505e-01,  2.08664209e-01,  7.99598843e-02,
        7.93330837e-03, -5.22297807e-02, -1.00512728e-01, -2.04229757e-01,
        2.19562620e-01,  7.67739266e-02,  1.25881523e-01,  2.90843785e-01,
       -4.99191105e-01,  2.27709621e-01, -1.94742575e-01,  7.21394643e-02,
       -4.56513371e-03,  1.21945575e-01, -1.46225691e-01, -9.64745134e-02,
        1.02258235e-01, -2.04421118e-01, -3.95657718e-01,  7.32251257e-02,
        1.10104166e-01,  4.84611154e-01,  1.04519255e-01,  3.86131734e-01,
       -1.68697253e-01, -2.11251125e-01, -6.38036430e-01, -4.32641476e-01,
        2.74459153e-01,  9.64053422e-02, -1.30053520e-01,  5.76568581e-02,
        8.07963088e-02,  

In [10]:
query_embedding.shape

(768,)

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity  

In [13]:
similarities = cosine_similarity(np.array([query_embedding]), document_embedding)
similarities

array([[0.16948146, 0.4580227 , 0.56756943, 0.44123292, 0.6316117 ,
        0.75214124, 0.550352  , 0.74481654]], dtype=float32)

In [15]:
most_similar_index= np.argmax(similarities)
most_similar_index

np.int64(5)

In [16]:
documents[most_similar_index]

'Efficient keyword extraction enhances search accuracy.'

In [17]:
similarity_score = similarities[0][most_similar_index]
similarity_score

np.float32(0.75214124)

In [18]:
sorted_index = np.argsort(similarities[0])[::-1]
sorted_index

array([5, 7, 4, 2, 6, 1, 3, 0])

In [None]:
ranked_documents = [(documents[i], similarities[0][i]) for i in sorted_index] ### This is the ranked result not re-ranked
ranked_documents

[('Efficient keyword extraction enhances search accuracy.',
  np.float32(0.75214124)),
 ('Machine learning algorithms can optimize keyword extraction methods.',
  np.float32(0.74481654)),
 ('Understanding document structure aids in keyword extraction.',
  np.float32(0.6316117)),
 ('Document analysis involves extracting keywords.', np.float32(0.56756943)),
 ('Semantic similarity improves document retrieval performance.',
  np.float32(0.550352)),
 ('Keywords are important for keyword-based search.', np.float32(0.4580227)),
 ('Keyword-based search relies on sparse embeddings.', np.float32(0.44123292)),
 ('This is a list which containing sample documents.', np.float32(0.16948146))]

In [21]:
print("Ranked Documents:")
for rank, (document, similarity) in enumerate(ranked_documents, start=1):
    print(f"Rank {rank}: Document - '{document}', Similarity Score - {similarity}")

Ranked Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Similarity Score - 0.7521412372589111
Rank 2: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Similarity Score - 0.7448165416717529
Rank 3: Document - 'Understanding document structure aids in keyword extraction.', Similarity Score - 0.631611704826355
Rank 4: Document - 'Document analysis involves extracting keywords.', Similarity Score - 0.5675694346427917
Rank 5: Document - 'Semantic similarity improves document retrieval performance.', Similarity Score - 0.5503519773483276
Rank 6: Document - 'Keywords are important for keyword-based search.', Similarity Score - 0.45802271366119385
Rank 7: Document - 'Keyword-based search relies on sparse embeddings.', Similarity Score - 0.44123291969299316
Rank 8: Document - 'This is a list which containing sample documents.', Similarity Score - 0.16948145627975464


In [22]:
print("Top 4 Documents:")
for rank, (document, similarity) in enumerate(ranked_documents[:4], start=1):
    print(f"Rank {rank}: Document - '{document}', Similarity Score - {similarity}")

Top 4 Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Similarity Score - 0.7521412372589111
Rank 2: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Similarity Score - 0.7448165416717529
Rank 3: Document - 'Understanding document structure aids in keyword extraction.', Similarity Score - 0.631611704826355
Rank 4: Document - 'Document analysis involves extracting keywords.', Similarity Score - 0.5675694346427917


In [23]:
from rank_bm25 import  BM25Okapi
top_4_documents = [doc[0] for doc in ranked_documents[:4]]
top_4_documents

['Efficient keyword extraction enhances search accuracy.',
 'Machine learning algorithms can optimize keyword extraction methods.',
 'Understanding document structure aids in keyword extraction.',
 'Document analysis involves extracting keywords.']

In [26]:
tokenized_top_4_documents = [doc.split() for doc in top_4_documents] 
tokenized_top_4_documents

[['Efficient', 'keyword', 'extraction', 'enhances', 'search', 'accuracy.'],
 ['Machine',
  'learning',
  'algorithms',
  'can',
  'optimize',
  'keyword',
  'extraction',
  'methods.'],
 ['Understanding',
  'document',
  'structure',
  'aids',
  'in',
  'keyword',
  'extraction.'],
 ['Document', 'analysis', 'involves', 'extracting', 'keywords.']]

In [27]:
tokenized_query = query.split()
tokenized_query

['Natural',
 'language',
 'processing',
 'techniques',
 'enhance',
 'keyword',
 'extraction',
 'efficiency.']

In [28]:
bm25 = BM25Okapi(corpus=tokenized_top_4_documents)

In [29]:
bm25_scores = bm25.get_scores(query=tokenized_query)
bm25_scores

array([0.1907998 , 0.16686672, 0.17803252, 0.        ])

In [30]:
re_ranked_sorted_indices = np.argsort(bm25_scores)[::-1]
re_ranked_sorted_indices

array([0, 2, 1, 3])

In [32]:
reranked_documents = [(top_4_documents[i], bm25_scores[i]) for i in re_ranked_sorted_indices]

In [33]:
print("Rerank of top 4 Documents:")
for rank, (document, similarity) in enumerate(reranked_documents, start=1):
    print(f"Rank {rank}: Document - '{document}', Similarity Score - {similarity}")

Rerank of top 4 Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Similarity Score - 0.19079979534096053
Rank 2: Document - 'Understanding document structure aids in keyword extraction.', Similarity Score - 0.1780325227902643
Rank 3: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Similarity Score - 0.1668667199671815
Rank 4: Document - 'Document analysis involves extracting keywords.', Similarity Score - 0.0


In [34]:
from sentence_transformers import CrossEncoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [35]:
top_4_documents

['Efficient keyword extraction enhances search accuracy.',
 'Machine learning algorithms can optimize keyword extraction methods.',
 'Understanding document structure aids in keyword extraction.',
 'Document analysis involves extracting keywords.']

In [36]:
query

'Natural language processing techniques enhance keyword extraction efficiency.'

In [37]:
pairs = []
for doc in top_4_documents:
    pairs.append([query, doc])

In [38]:
pairs

[['Natural language processing techniques enhance keyword extraction efficiency.',
  'Efficient keyword extraction enhances search accuracy.'],
 ['Natural language processing techniques enhance keyword extraction efficiency.',
  'Machine learning algorithms can optimize keyword extraction methods.'],
 ['Natural language processing techniques enhance keyword extraction efficiency.',
  'Understanding document structure aids in keyword extraction.'],
 ['Natural language processing techniques enhance keyword extraction efficiency.',
  'Document analysis involves extracting keywords.']]

In [39]:
scores = cross_encoder.predict(pairs)
scores

array([ 3.1378732,  0.8421646, -2.9193015, -2.8781915], dtype=float32)

In [40]:
scored_docs = zip(scores, top_4_documents)
reranked_document_cross_encoder = sorted(scored_docs, reverse=True)
reranked_document_cross_encoder

[(np.float32(3.1378732),
  'Efficient keyword extraction enhances search accuracy.'),
 (np.float32(0.8421646),
  'Machine learning algorithms can optimize keyword extraction methods.'),
 (np.float32(-2.8781915), 'Document analysis involves extracting keywords.'),
 (np.float32(-2.9193015),
  'Understanding document structure aids in keyword extraction.')]