In [1]:
! pip install sentence_transformers



In [2]:

documents = [
    "This is a list which containing sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings.",
    "Understanding document structure aids in keyword extraction.",
    "Efficient keyword extraction enhances search accuracy.",
    "Semantic similarity improves document retrieval performance.",
    "Machine learning algorithms can optimize keyword extraction methods."
]

In [3]:

from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:


# Load pre-trained Sentence Transformer model
model_name = 'sentence-transformers/paraphrase-xlm-r-multilingual-v1'

In [5]:


model = SentenceTransformer(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [6]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [7]:
documents

['This is a list which containing sample documents.',
 'Keywords are important for keyword-based search.',
 'Document analysis involves extracting keywords.',
 'Keyword-based search relies on sparse embeddings.',
 'Understanding document structure aids in keyword extraction.',
 'Efficient keyword extraction enhances search accuracy.',
 'Semantic similarity improves document retrieval performance.',
 'Machine learning algorithms can optimize keyword extraction methods.']

In [8]:
len(documents)

8

In [9]:
document_embeddings =model.encode(documents)
document_embeddings

array([[ 0.10894684,  0.07812065,  0.11626569, ..., -0.09545085,
         0.04451444,  0.37490645],
       [ 0.36544177,  0.41045642,  0.02045545, ...,  0.19753921,
         0.22888167,  0.1115875 ],
       [ 0.28732774,  0.15202942,  0.24311854, ...,  0.16569284,
         0.3132983 ,  0.10595309],
       ...,
       [ 0.34392148,  0.23954616,  0.16326736, ...,  0.10939949,
         0.22899213,  0.2131516 ],
       [ 0.16869688,  0.37198666,  0.25552407, ..., -0.06428422,
         0.20327894,  0.15427306],
       [ 0.3371375 ,  0.12179313, -0.12275392, ...,  0.17741631,
        -0.00627342,  0.2018703 ]], dtype=float32)

In [10]:


len(document_embeddings[0])

768

In [11]:

query = "Natural language processing techniques enhance keyword extraction efficiency."

In [14]:

query_embedding = model.encode(query)
len(query_embedding)

768

In [15]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [16]:

similarities = cosine_similarity(np.array([query_embedding]), document_embeddings)

In [17]:
similarities

array([[0.16948152, 0.45802277, 0.5675694 , 0.44123298, 0.6316117 ,
        0.7521413 , 0.550352  , 0.74481654]], dtype=float32)

In [19]:
most_similar_index = np.argmax(similarities)
most_similar_index

5

In [20]:
most_similar_document = documents[most_similar_index]
most_similar_document

'Efficient keyword extraction enhances search accuracy.'

In [22]:
similarity_score = similarities[0][most_similar_index]
similarity_score

0.7521413

In [23]:

sorted_indices = np.argsort(similarities[0])[::-1]
sorted_indices

array([5, 7, 4, 2, 6, 1, 3, 0], dtype=int64)

In [24]:


ranked_documents = [(documents[i], similarities[0][i]) for i in sorted_indices]
ranked_documents

[('Efficient keyword extraction enhances search accuracy.', 0.7521413),
 ('Machine learning algorithms can optimize keyword extraction methods.',
  0.74481654),
 ('Understanding document structure aids in keyword extraction.', 0.6316117),
 ('Document analysis involves extracting keywords.', 0.5675694),
 ('Semantic similarity improves document retrieval performance.', 0.550352),
 ('Keywords are important for keyword-based search.', 0.45802277),
 ('Keyword-based search relies on sparse embeddings.', 0.44123298),
 ('This is a list which containing sample documents.', 0.16948152)]

In [26]:
query

'Natural language processing techniques enhance keyword extraction efficiency.'

In [25]:

print("Ranked Documents:")
for rank, (document, similarity) in enumerate(ranked_documents, start=1):
    print(f"Rank {rank}: Document - '{document}', Similarity Score - {similarity}")

Ranked Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Similarity Score - 0.7521412968635559
Rank 2: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Similarity Score - 0.7448165416717529
Rank 3: Document - 'Understanding document structure aids in keyword extraction.', Similarity Score - 0.631611704826355
Rank 4: Document - 'Document analysis involves extracting keywords.', Similarity Score - 0.567569375038147
Rank 5: Document - 'Semantic similarity improves document retrieval performance.', Similarity Score - 0.5503519773483276
Rank 6: Document - 'Keywords are important for keyword-based search.', Similarity Score - 0.4580227732658386
Rank 7: Document - 'Keyword-based search relies on sparse embeddings.', Similarity Score - 0.44123297929763794
Rank 8: Document - 'This is a list which containing sample documents.', Similarity Score - 0.16948151588439941


In [30]:

print("Top 4 Documents:")
for rank, (document, similarity) in enumerate(ranked_documents[:4], start=1):
    print(f"Rank {rank}: Document - '{document}', Similarity Score - {similarity}")

Top 4 Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Similarity Score - 0.7521412968635559
Rank 2: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Similarity Score - 0.7448165416717529
Rank 3: Document - 'Understanding document structure aids in keyword extraction.', Similarity Score - 0.631611704826355
Rank 4: Document - 'Document analysis involves extracting keywords.', Similarity Score - 0.567569375038147


In [28]:

! pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [29]:
from rank_bm25 import BM25Okapi

In [32]:
top_4_documents = [doc[0] for doc in ranked_documents[:4]]
top_4_documents

['Efficient keyword extraction enhances search accuracy.',
 'Machine learning algorithms can optimize keyword extraction methods.',
 'Understanding document structure aids in keyword extraction.',
 'Document analysis involves extracting keywords.']

In [34]:
tokenized_top_4_documents = [doc.split() for doc in top_4_documents]
tokenized_top_4_documents

[['Efficient', 'keyword', 'extraction', 'enhances', 'search', 'accuracy.'],
 ['Machine',
  'learning',
  'algorithms',
  'can',
  'optimize',
  'keyword',
  'extraction',
  'methods.'],
 ['Understanding',
  'document',
  'structure',
  'aids',
  'in',
  'keyword',
  'extraction.'],
 ['Document', 'analysis', 'involves', 'extracting', 'keywords.']]

In [35]:
tokenized_query = query.split()
tokenized_query

['Natural',
 'language',
 'processing',
 'techniques',
 'enhance',
 'keyword',
 'extraction',
 'efficiency.']

In [37]:

bm25=BM25Okapi(tokenized_top_4_documents)
bm25

<rank_bm25.BM25Okapi at 0x2d9bf837890>

In [38]:


bm25_scores = bm25.get_scores(tokenized_query)
bm25_scores

array([0.1907998 , 0.16686672, 0.17803252, 0.        ])

In [44]:

sorted_indices2 = np.argsort(bm25_scores)[::-1]
sorted_indices2

array([0, 2, 1, 3], dtype=int64)

In [45]:

reranked_documents = [(top_4_documents[i], bm25_scores[i]) for i in sorted_indices2]
reranked_documents

[('Efficient keyword extraction enhances search accuracy.',
  0.19079979534096053),
 ('Understanding document structure aids in keyword extraction.',
  0.1780325227902643),
 ('Machine learning algorithms can optimize keyword extraction methods.',
  0.1668667199671815),
 ('Document analysis involves extracting keywords.', 0.0)]

In [42]:


print("Rerank of top 4 Documents:")
for rank, (document, similarity) in enumerate(reranked_documents, start=1):
    print(f"Rank {rank}: Document - '{document}', Similarity Score - {similarity}")

Rerank of top 4 Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Similarity Score - 0.19079979534096053
Rank 2: Document - 'Understanding document structure aids in keyword extraction.', Similarity Score - 0.1780325227902643
Rank 3: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Similarity Score - 0.1668667199671815
Rank 4: Document - 'Document analysis involves extracting keywords.', Similarity Score - 0.0


In [46]:

ranked_documents[:4]

[('Efficient keyword extraction enhances search accuracy.', 0.7521413),
 ('Machine learning algorithms can optimize keyword extraction methods.',
  0.74481654),
 ('Understanding document structure aids in keyword extraction.', 0.6316117),
 ('Document analysis involves extracting keywords.', 0.5675694)]

## Cross Encoder

In [47]:


from sentence_transformers import CrossEncoder

In [48]:
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [49]:

top_4_documents

['Efficient keyword extraction enhances search accuracy.',
 'Machine learning algorithms can optimize keyword extraction methods.',
 'Understanding document structure aids in keyword extraction.',
 'Document analysis involves extracting keywords.']

In [50]:
query

'Natural language processing techniques enhance keyword extraction efficiency.'

In [51]:

pairs = []
for doc in top_4_documents:
    pairs.append([query, doc])

pairs

[['Natural language processing techniques enhance keyword extraction efficiency.',
  'Efficient keyword extraction enhances search accuracy.'],
 ['Natural language processing techniques enhance keyword extraction efficiency.',
  'Machine learning algorithms can optimize keyword extraction methods.'],
 ['Natural language processing techniques enhance keyword extraction efficiency.',
  'Understanding document structure aids in keyword extraction.'],
 ['Natural language processing techniques enhance keyword extraction efficiency.',
  'Document analysis involves extracting keywords.']]

In [52]:

scores = cross_encoder.predict(pairs)
scores

array([ 3.137872 ,  0.8421651, -2.919301 , -2.8781886], dtype=float32)

In [53]:

scored_docs = zip(scores, top_4_documents)
scored_docs

<zip at 0x2d9bf924400>

In [55]:
reranked_document_cross_encoder = sorted(scored_docs, reverse=True)
reranked_document_cross_encoder

[(3.137872, 'Efficient keyword extraction enhances search accuracy.'),
 (0.8421651,
  'Machine learning algorithms can optimize keyword extraction methods.'),
 (-2.8781886, 'Document analysis involves extracting keywords.'),
 (-2.919301, 'Understanding document structure aids in keyword extraction.')]

## BM_25