In [17]:
pip install rank_bm25 sentence-transformers faiss-cpu


Note: you may need to restart the kernel to use updated packages.


In [19]:
import pandas as pd, ast
# Load the metadata table. "Para_list" holds the string repr of a Python list,
# so parse it back into a real list (missing cells become an empty list).
df_meta = pd.read_csv("xmlAndHTML_data.csv")
df_meta["Para_list"] = df_meta["Para_list"].apply(
    lambda x: ast.literal_eval(x) if pd.notna(x) else []
)

# One row per paragraph. `explode` turns an empty list into a single NaN row,
# which `astype(str)` would otherwise index as the literal paragraph "nan".
# Drop those rows from `df_long` itself (and renumber) so that `paragraphs[i]`
# still corresponds to `df_long.iloc[i]`.
df_long = df_meta.explode("Para_list", ignore_index=True)
df_long = df_long[df_long["Para_list"].notna()].reset_index(drop=True)
paragraphs = df_long["Para_list"].astype(str).tolist()

In [20]:
from rank_bm25 import BM25Okapi

# Build the sparse (BM25) index: each entry of `paragraphs` (the full list of
# paragraph texts, ≈ 110 k items) is tokenised on whitespace.
tokenized_corpus = list(map(str.split, paragraphs))
bm25 = BM25Okapi(tokenized_corpus)


In [21]:
from sentence_transformers import SentenceTransformer
import faiss, numpy as np

# Contriever is an open-source dense retriever from Meta.
densifier = SentenceTransformer("facebook/contriever")

# Embed every paragraph. Embeddings are L2-normalised, so the inner-product
# index built below ranks by cosine similarity.
contriever_embs = densifier.encode(
    paragraphs, convert_to_numpy=True, normalize_embeddings=True
)
# FAISS only accepts C-contiguous float32 matrices; make that explicit here,
# mirroring the float32 cast applied to the query embedding at search time.
contriever_embs = np.ascontiguousarray(contriever_embs, dtype="float32")

# Exact (brute-force) inner-product index over all paragraph embeddings.
index_dense = faiss.IndexFlatIP(contriever_embs.shape[1])
index_dense.add(contriever_embs)


No sentence-transformers model found with name C:\Users\hp/.cache\torch\sentence_transformers\facebook_contriever. Creating a new one with MEAN pooling.


In [22]:
from sentence_transformers import CrossEncoder

# Cross-encoder used for the final re-ranking stage (an MS-MARCO MiniLM checkpoint).
RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
reranker = CrossEncoder(RERANKER_MODEL)

def hybrid_retrieve(query, top_bm25=100, top_dense=100, top_final=10):
    """Retrieve paragraphs for `query` with BM25 + dense search, then re-rank.

    Candidates from the dense index and from BM25 are merged (dense hits
    first, duplicates removed) and scored against the query by the
    cross-encoder. Relies on the notebook-level objects `bm25`, `densifier`,
    `index_dense`, `reranker` and `paragraphs`.

    Args:
        query: Query text.
        top_bm25: Number of BM25 candidates to collect; a value <= 0 disables
            the BM25 stage.
        top_dense: Number of dense candidates to collect; a value <= 0
            disables the dense stage.
        top_final: Number of results to return after re-ranking.

    Returns:
        A list of `(score, paragraph)` tuples sorted by descending
        cross-encoder score; empty when there are no candidates or
        `top_final` <= 0.
    """
    candidate_ids = []

    # 1) Dense hits (listed first, so they come first in the union order).
    if top_dense > 0:
        q_emb = np.asarray(
            densifier.encode([query], normalize_embeddings=True, convert_to_numpy=True),
            dtype="float32",
        )
        _, dense_ids = index_dense.search(q_emb, top_dense)
        # FAISS pads the result with -1 when the index holds fewer than
        # `top_dense` vectors; indexing `paragraphs[-1]` would silently return
        # the last paragraph, so those placeholders are skipped.
        candidate_ids.extend(int(i) for i in dense_ids[0] if i >= 0)

    # 2) BM25 hits. The guard matters: with top_bm25 == 0 the slice `[-0:]`
    #    would select the *entire* corpus rather than nothing.
    if top_bm25 > 0:
        bm25_scores = bm25.get_scores(query.split())
        bm25_ids = np.argsort(bm25_scores)[-top_bm25:][::-1]
        candidate_ids.extend(int(i) for i in bm25_ids)

    # 3) Union of IDs, de-duplicated in first-seen order (dict keys keep
    #    insertion order; avoids the quadratic list-membership scan).
    union_ids = list(dict.fromkeys(candidate_ids))
    if not union_ids or top_final <= 0:
        return []

    # 4) Re-rank every candidate with the cross-encoder.
    pairs = [(query, paragraphs[i]) for i in union_ids]
    rerank_scores = reranker.predict(pairs)
    ranked = sorted(zip(union_ids, rerank_scores), key=lambda x: x[1], reverse=True)[:top_final]

    # 5) Return (score, paragraph)
    return [(float(score), paragraphs[idx]) for idx, score in ranked]


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [23]:
# Demo: fetch the five best-ranked paragraphs for one question and print each
# score followed by the first 200 characters of the paragraph.
demo_query = "How does crack self-healing vary with annealing temperature and time?"
demo_hits = hybrid_retrieve(demo_query, top_bm25=200, top_dense=200, top_final=5)
for score, para in demo_hits:
    print(f"[{score:.3f}] {para[:200]}…\n")


[5.290] How does crack self-healing vary with annealing Taand ta?…

[4.774] The paper is organized in a way that seeks answers to the following questions:1.How does crack self-healing vary with annealing Taand ta?2.What is the primary mechanism of crack self-healing?3.How do …

[4.108] The results of tensile tests, including fractography, at RT and -150 °C clearly show that high temperature, short time anneals self-heal the pre-existing MCs in NFA-1. Self-healing begins in the 111-c…

[4.101] Here, we investigate a newly discovered mechanism for crack self-healing in NFA-1 under high temperature annealing. This mechanism is associated with Ti-carbooxinitride (TiCON) precipitate growth and …

[3.555] Based on the mechanical property trends, as confirmed by the microstructural characterization and fractographic studies, we conclude that crack self-healing starts at 1100 °C, and is essentially compl…

