In [1]:
!pip install sentence_transformers cohere rank_bm25 -q

# Sentence Transformer

In [2]:
documents = [
    "This is a list which containing sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings.",
    "Understanding document structure aids in keyword extraction.",
    "Efficient keyword extraction enhances search accuracy.",
    "Semantic similarity improves document retrieval performance.",
    "Machine learning algorithms can optimize keyword extraction methods."
]

In [3]:
from sentence_transformers import SentenceTransformer

# Load pre-trained Sentence Transformer model
model_name = 'sentence-transformers/paraphrase-xlm-r-multilingual-v1'

model = SentenceTransformer(model_name)

In [4]:
document_embeddings = model.encode(documents)

In [5]:
len(document_embeddings[0])

768

In [6]:
document_embeddings[0]

array([ 0.10894689,  0.07812073,  0.11626551, -0.31912497,  0.4689025 ,
        0.43514422,  0.01453746,  0.44238743,  0.29716405, -0.18982708,
        0.07389075, -0.2786492 ,  0.21338147, -0.12077028,  0.17891683,
       -0.00789893,  0.04754868, -0.1820457 ,  0.3422713 , -0.06994253,
       -0.14288716,  0.5714125 , -0.1115325 , -0.17895411,  0.01523139,
        0.26105717, -0.20555817,  0.05203109, -0.02810766,  0.23873205,
        0.01206989,  0.04404955,  0.02242325, -0.13895158, -0.7410038 ,
        0.25601026,  0.08149662,  0.1882047 , -0.41237688,  0.11368611,
        0.28121182,  0.05860891, -0.17318787,  0.33549157,  0.21803695,
       -0.05090727, -0.05457785, -0.87384796, -0.24082269,  0.32007015,
        0.44761685,  0.06347837,  0.5357485 ,  0.16607259, -0.33197004,
        0.33393645,  0.28615928, -0.5419567 , -0.27132422,  0.24881186,
       -0.23919372, -0.46926293,  0.13836573,  0.3784288 , -0.01304426,
        0.01990607,  0.32365105,  0.4585755 ,  0.07600269,  0.25

In [7]:
query = "Natural language processing techniques enhance keyword extraction efficiency."

In [8]:
query_embedding = model.encode(query)

In [9]:
len(query_embedding)

768

In [10]:
print("Query embedding:", query_embedding)

Query embedding: [ 3.42821240e-01  3.26170504e-01  7.73252500e-03  1.60758570e-02
  2.38518696e-02 -2.52880484e-01  8.15469846e-02  1.37177378e-01
  3.33291799e-01  3.65743376e-02 -9.28227156e-02 -3.43726009e-01
  7.12014735e-02  9.48833764e-01 -9.41291302e-02 -7.96414316e-01
 -3.20592374e-01 -2.67564595e-01  2.08664402e-01  7.99599960e-02
  7.93344434e-03 -5.22296093e-02 -1.00512579e-01 -2.04229817e-01
  2.19562709e-01  7.67738372e-02  1.25881582e-01  2.90844023e-01
 -4.99190927e-01  2.27709368e-01 -1.94742531e-01  7.21396133e-02
 -4.56500147e-03  1.21945709e-01 -1.46225870e-01 -9.64743793e-02
  1.02258243e-01 -2.04421282e-01 -3.95658016e-01  7.32253566e-02
  1.10103890e-01  4.84611511e-01  1.04519367e-01  3.86131942e-01
 -1.68697596e-01 -2.11251274e-01 -6.38036370e-01 -4.32641715e-01
  2.74459064e-01  9.64056700e-02 -1.30053535e-01  5.76568209e-02
  8.07963684e-02  6.48148209e-02 -2.14203843e-03  2.29773849e-01
 -6.66051283e-02  3.07675242e-01  2.11779490e-01 -1.57844380e-01
  9.7886

## Ranking

In [11]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
similarities = cosine_similarity(np.array([query_embedding]), document_embeddings)

In [13]:
similarities

array([[0.16948138, 0.45802265, 0.5675692 , 0.44123262, 0.6316117 ,
        0.75214124, 0.550352  , 0.74481654]], dtype=float32)

In [14]:
most_similar_index = np.argmax(similarities)

In [15]:
most_similar_index

np.int64(5)

In [16]:
most_similar_document = documents[most_similar_index]

In [17]:
most_similar_document

'Efficient keyword extraction enhances search accuracy.'

In [18]:
similarity_score = similarities[0][most_similar_index]

In [19]:
similarity_score

np.float32(0.75214124)

In [20]:
sorted_indices = np.argsort(similarities[0])[::-1]

In [21]:
sorted_indices

array([5, 7, 4, 2, 6, 1, 3, 0])

In [22]:
ranked_documents = [(documents[i], similarities[0][i]) for i in sorted_indices]

In [23]:
print("Ranked Documents:")
for rank, (document, similarity) in enumerate(ranked_documents, start=1):
    print(f"Rank {rank}: Document - '{document}', Similarity Score - {similarity}")

Ranked Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Similarity Score - 0.7521412372589111
Rank 2: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Similarity Score - 0.7448165416717529
Rank 3: Document - 'Understanding document structure aids in keyword extraction.', Similarity Score - 0.631611704826355
Rank 4: Document - 'Document analysis involves extracting keywords.', Similarity Score - 0.5675691962242126
Rank 5: Document - 'Semantic similarity improves document retrieval performance.', Similarity Score - 0.5503519773483276
Rank 6: Document - 'Keywords are important for keyword-based search.', Similarity Score - 0.4580226540565491
Rank 7: Document - 'Keyword-based search relies on sparse embeddings.', Similarity Score - 0.4412326216697693
Rank 8: Document - 'This is a list which containing sample documents.', Similarity Score - 0.16948138177394867


In [24]:
print("Top 4 Documents:")
for rank, (document, similarity) in enumerate(ranked_documents[:4], start=1):
    print(f"Rank {rank}: Document - '{document}', Similarity Score - {similarity}")

Top 4 Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Similarity Score - 0.7521412372589111
Rank 2: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Similarity Score - 0.7448165416717529
Rank 3: Document - 'Understanding document structure aids in keyword extraction.', Similarity Score - 0.631611704826355
Rank 4: Document - 'Document analysis involves extracting keywords.', Similarity Score - 0.5675691962242126


# BM_25

In [25]:
from rank_bm25 import BM25Okapi

top_4_documents = [doc[0] for doc in ranked_documents[:4]]
top_4_documents

['Efficient keyword extraction enhances search accuracy.',
 'Machine learning algorithms can optimize keyword extraction methods.',
 'Understanding document structure aids in keyword extraction.',
 'Document analysis involves extracting keywords.']

In [26]:
tokenized_top_4_documents = [doc.split() for doc in top_4_documents]

In [27]:
tokenized_top_4_documents

[['Efficient', 'keyword', 'extraction', 'enhances', 'search', 'accuracy.'],
 ['Machine',
  'learning',
  'algorithms',
  'can',
  'optimize',
  'keyword',
  'extraction',
  'methods.'],
 ['Understanding',
  'document',
  'structure',
  'aids',
  'in',
  'keyword',
  'extraction.'],
 ['Document', 'analysis', 'involves', 'extracting', 'keywords.']]

In [28]:
tokenized_query = query.split()

In [29]:
tokenized_query

['Natural',
 'language',
 'processing',
 'techniques',
 'enhance',
 'keyword',
 'extraction',
 'efficiency.']

In [30]:
bm25=BM25Okapi(tokenized_top_4_documents)

In [31]:
bm25

<rank_bm25.BM25Okapi at 0x7cbd69cf1fd0>

In [32]:
bm25_scores = bm25.get_scores(tokenized_query)

In [33]:
bm25_scores

array([0.1907998 , 0.16686672, 0.17803252, 0.        ])

In [34]:
sorted_indices2 = np.argsort(bm25_scores)[::-1]

In [35]:
sorted_indices2

array([0, 2, 1, 3])

In [36]:
reranked_documents_bm25 = [(top_4_documents[i], bm25_scores[i]) for i in sorted_indices2]

In [37]:
print(*reranked_documents_bm25,sep="\n")

('Efficient keyword extraction enhances search accuracy.', np.float64(0.19079979534096053))
('Understanding document structure aids in keyword extraction.', np.float64(0.1780325227902643))
('Machine learning algorithms can optimize keyword extraction methods.', np.float64(0.1668667199671815))
('Document analysis involves extracting keywords.', np.float64(0.0))


# Cross-Encoder

In [38]:
from sentence_transformers import CrossEncoder

In [39]:
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [40]:
pairs = []
for doc in top_4_documents:
    pairs.append([query, doc])

In [41]:
pairs

[['Natural language processing techniques enhance keyword extraction efficiency.',
  'Efficient keyword extraction enhances search accuracy.'],
 ['Natural language processing techniques enhance keyword extraction efficiency.',
  'Machine learning algorithms can optimize keyword extraction methods.'],
 ['Natural language processing techniques enhance keyword extraction efficiency.',
  'Understanding document structure aids in keyword extraction.'],
 ['Natural language processing techniques enhance keyword extraction efficiency.',
  'Document analysis involves extracting keywords.']]

In [42]:
scores = cross_encoder.predict(pairs)
scores

array([ 3.1378722,  0.842167 , -2.919299 , -2.8781896], dtype=float32)

In [43]:
scored_docs = zip(top_4_documents,  scores)

scored_docs

<zip at 0x7cbd69d87080>

In [44]:
reranked_document_cross_encoder = sorted(scored_docs, key=lambda x: x[1], reverse=True)

In [45]:
reranked_document_cross_encoder

[('Efficient keyword extraction enhances search accuracy.',
  np.float32(3.1378722)),
 ('Machine learning algorithms can optimize keyword extraction methods.',
  np.float32(0.842167)),
 ('Document analysis involves extracting keywords.', np.float32(-2.8781896)),
 ('Understanding document structure aids in keyword extraction.',
  np.float32(-2.919299))]

In [46]:
from IPython.display import display, Markdown

def display_reranked_docs(reranked_documents, method_name="Ranking", top_k=4):
    md_text = f"## ðŸ“‘ Rerank of Top Documents ({method_name})\n"
    for rank, (document, similarity) in enumerate(reranked_documents[:top_k], start=1):
        md_text += f"- **Rank {rank}**  \n"
        md_text += f"  ðŸ“„ *Document:* `{document}`  \n"
        md_text += f"  ðŸ”¢ *Similarity Score:* `{similarity:.4f}`\n\n"
    display(Markdown(md_text))


In [47]:
display_reranked_docs(reranked_documents_bm25, method_name="BM25")
display_reranked_docs(reranked_document_cross_encoder, method_name="Cross Encoder")

## ðŸ“‘ Rerank of Top Documents (BM25)
- **Rank 1**  
  ðŸ“„ *Document:* `Efficient keyword extraction enhances search accuracy.`  
  ðŸ”¢ *Similarity Score:* `0.1908`

- **Rank 2**  
  ðŸ“„ *Document:* `Understanding document structure aids in keyword extraction.`  
  ðŸ”¢ *Similarity Score:* `0.1780`

- **Rank 3**  
  ðŸ“„ *Document:* `Machine learning algorithms can optimize keyword extraction methods.`  
  ðŸ”¢ *Similarity Score:* `0.1669`

- **Rank 4**  
  ðŸ“„ *Document:* `Document analysis involves extracting keywords.`  
  ðŸ”¢ *Similarity Score:* `0.0000`



## ðŸ“‘ Rerank of Top Documents (Cross Encoder)
- **Rank 1**  
  ðŸ“„ *Document:* `Efficient keyword extraction enhances search accuracy.`  
  ðŸ”¢ *Similarity Score:* `3.1379`

- **Rank 2**  
  ðŸ“„ *Document:* `Machine learning algorithms can optimize keyword extraction methods.`  
  ðŸ”¢ *Similarity Score:* `0.8422`

- **Rank 3**  
  ðŸ“„ *Document:* `Document analysis involves extracting keywords.`  
  ðŸ”¢ *Similarity Score:* `-2.8782`

- **Rank 4**  
  ðŸ“„ *Document:* `Understanding document structure aids in keyword extraction.`  
  ðŸ”¢ *Similarity Score:* `-2.9193`



# Cohere




```python
    import cohere

    co = cohere.Client("nbDqU1hTVxWmXGbLYI6OnYhp4Cx40MZ5hOmO5oKX")

    response = co.rerank(
      model="rerank-english-v3.0",
      query="Natural language processing techniques enhance keyword extraction efficiency.",
      documents=top_4_documents,
      return_documents=True
  )

    response.results[0].document.text

    response.results[0].relevance_score
    
```

