In [5]:
import pandas as pd, ast

# 1) Load the raw metadata table
df_meta = pd.read_csv("xmlAndHTML_data.csv")


def _parse_para_list(cell):
    """Turn one serialized "Para_list" cell back into a Python list.

    Missing (NaN) or whitespace-only cells become an empty list; every other
    cell is parsed with ast.literal_eval.
    """
    if pd.isna(cell) or not cell.strip():
        return []
    return ast.literal_eval(cell)


# 2) Deserialize the "Para_list" column
df_meta["Para_list"] = df_meta["Para_list"].apply(_parse_para_list)

# 3) One row per paragraph, with the paragraph column renamed to "text".
#    A document whose list is empty still yields one row, with text = NaN.
# 4) Naming the fresh 0..n-1 index "row_id" before reset_index() turns it
#    into the leading "row_id" column.
df_long = (
    df_meta[["DOI", "Title", "Para_list"]]
    .explode("Para_list", ignore_index=True)
    .rename(columns={"Para_list": "text"})
    .rename_axis("row_id")
    .reset_index()
)

# Quick sanity check
print(df_long.columns)   # should include 'row_id' and 'text'
print(df_long.head(2))


Index(['row_id', 'DOI', 'Title', 'text'], dtype='object')
   row_id                     DOI  \
0       0  10.1006/jcis.1996.4536   
1       1  10.1006/jssc.1999.8570   

                                               Title text  
0  Characterization of the Interface between a Ro...  NaN  
1  Relationships between Structure and Physical P...  NaN  


In [6]:
# Flat list of every paragraph, in row_id order. Rows that came from an empty
# Para_list carry NaN in "text", so this list can contain NaN entries.
all_texts = df_long["text"].to_list()


In [9]:
!pip install opensearch-py


Collecting opensearch-py
  Downloading opensearch_py-2.8.0-py3-none-any.whl.metadata (6.9 kB)
Collecting Events (from opensearch-py)
  Downloading Events-0.5-py3-none-any.whl.metadata (3.9 kB)
Downloading opensearch_py-2.8.0-py3-none-any.whl (353 kB)
Downloading Events-0.5-py3-none-any.whl (6.8 kB)
Installing collected packages: Events, opensearch-py

   -------------------- ------------------- 1/2 [opensearch-py]
   -------------------- ------------------- 1/2 [opensearch-py]
   -------------------- ------------------- 1/2 [opensearch-py]
   -------------------- ------------------- 1/2 [opensearch-py]
   -------------------- ------------------- 1/2 [opensearch-py]
   -------------------- ------------------- 1/2 [opensearch-py]
   ---------------------------------------- 2/2 [opensearch-py]

Successfully installed Events-0.5 opensearch-py-2.8.0


In [10]:
from sentence_transformers import SentenceTransformer, InputExample
import faiss, numpy as np, pandas as pd, ast

# 1. Load FAISS index + MiniLM texts (384-dim)
index      = faiss.read_index("paragraph_index.faiss")
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects; only load
# .npy files that were produced by a trusted run of this project.
paragraphs = np.load("paragraph_texts.npy", allow_pickle=True).tolist()

# 2. Load the *same* encoder you used originally.
#    It gets its own name so that rebinding the generic `model` later in the
#    notebook cannot silently change which encoder retrieve_mini() uses.
mini_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
model = mini_model  # backward-compatible alias for existing references to `model`

def retrieve_mini(query, top_k=5):
    """Return up to `top_k` paragraphs retrieved from the FAISS index for `query`.

    Parameters
    ----------
    query : str
        Free-text query; encoded with the MiniLM encoder (L2-normalized).
    top_k : int, default 5
        Maximum number of hits to return. Values <= 0 yield an empty list.

    Returns
    -------
    list[tuple[float, object]]
        (score, paragraph) pairs in the order FAISS returned them. The score
        is whatever metric the index was built with (presumably inner product
        on normalized vectors, i.e. cosine similarity -- TODO confirm).
        FAISS pads missing hits with id -1; those slots are dropped.
    """
    if top_k <= 0:
        return []
    q_emb = mini_model.encode(
        [query], normalize_embeddings=True, convert_to_numpy=True
    ).astype("float32")
    scores, ids = index.search(q_emb, top_k)
    return [
        (float(score), paragraphs[idx])
        for score, idx in zip(scores[0], ids[0])
        if idx != -1
    ]

# 3. Rebuild df_long (one row per paragraph, keyed by row_id)
df_meta = pd.read_csv("xmlAndHTML_data.csv")

# Missing or blank cells become []; everything else is parsed as a Python literal.
df_meta["Para_list"] = df_meta["Para_list"].apply(
    lambda cell: [] if pd.isna(cell) or not cell.strip() else ast.literal_eval(cell)
)

# Explode to one paragraph per row, then expose the fresh 0..n-1 index as "row_id".
df_long = df_meta[["DOI", "Title", "Para_list"]].explode("Para_list", ignore_index=True)
df_long = df_long.rename(columns={"Para_list": "text"})
df_long = df_long.reset_index().rename(columns={"index": "row_id"})

In [11]:
# 1) Anchor queries used to mine training pairs: 60 in total,
#    10 per topic group (the groups are marked by the comments below).
queries = [
    # Thermal properties
    "How does heat accumulation affect microstructure formation?",
    "What is the thermal conductivity of polymer composites?",
    "How does annealing temperature influence grain growth?",
    "What role does temperature gradient play in phase transformation?",
    "How is heat capacity measured in metal alloys?",
    "How does cooling rate affect dendrite arm spacing?",
    "What determines thermal diffusivity in ceramics?",
    "How do porosity and thermal conductivity correlate?",
    "What is the effect of heat treatment on hardness?",
    "How does thermal expansion mismatch cause stress?",
    # Mechanical properties
    "What is the Young’s modulus of perovskite materials?",
    "How is tensile strength affected by CNT reinforcement?",
    "What influences fracture toughness in NiTi alloys?",
    "How does creep behavior vary with temperature?",
    "What is the hardness of nanostructured metallic glass?",
    "How is fatigue life measured in aluminum composites?",
    "What factors control yield strength in steels?",
    "How does microstructure affect compressive strength?",
    "What is the relationship between grain size and strength?",
    "How does surface roughness influence wear resistance?",
    # Electrical properties
    "How does CNT film thickness affect composite resistivity?",
    "What is the dielectric constant of polymer electrolytes?",
    "How does impurity concentration influence conductivity?",
    "What is the breakdown voltage of ceramic dielectrics?",
    "How is electrical resistivity measured in thin films?",
    "What determines carrier mobility in semiconductors?",
    "How does doping level affect band gap energy?",
    "What is the electrical conductivity of graphene?",
    "How does temperature affect semiconductor conductivity?",
    "What is the contact resistance of metal–semiconductor junctions?",
    # Electronics properties
    "How is capacitance measured in nanostructured capacitors?",
    "What influences leakage current in dielectrics?",
    "How does film thickness affect transistor on-off ratio?",
    "What is the switching speed of organic LEDs?",
    "How does channel length affect MOSFET performance?",
    "What determines the cutoff frequency of RF amplifiers?",
    "How is inductance measured in micro-coils?",
    "What is the quality factor of microwave resonators?",
    "How does doping affect transistor threshold voltage?",
    "What influences power dissipation in ICs?",
    # Magnetic & optical
    "How is magnetic permeability measured in ferrites?",
    "What determines saturation magnetization in alloys?",
    "How does temperature affect Curie temperature?",
    "What is the refractive index of optical glasses?",
    "How does surface plasmon resonance depend on particle size?",
    "What is the photoconductivity of perovskite solar cells?",
    "How does layer thickness affect optical absorption?",
    "What influences luminescence lifetime in phosphors?",
    "How is thermal quenching measured in quantum dots?",
    "What determines the band gap in quantum wells?",
    # General materials & process
    "How does sintering temperature affect density?",
    "What is the effect of pressure on phase stability?",
    "How is diffusion coefficient measured in solids?",
    "What determines oxidation rate at elevated temperatures?",
    "How does grain boundary chemistry affect corrosion?",
    "What is the thermal shock resistance of composites?",
    "How is surface energy measured in thin films?",
    "What influences adhesion strength of coatings?",
    "How does laser power affect ablation rate?",
    "What determines etch rate in plasma processing?"
]


In [12]:
# 5. Generate examples: pair every anchor query with each paragraph that
#    MiniLM retrieval returns for it (top 5). The retrieval score is not used.
examples = [
    InputExample(texts=[anchor, passage])
    for anchor in queries
    for _score, passage in retrieve_mini(anchor, top_k=5)
]

print(f"Generated {len(examples)} pairs with MiniLM retrieval.")

Generated 300 pairs with MiniLM retrieval.


In [None]:
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses, evaluation, datasets
import random

# Tunables for this cell
SEED           = 42    # fixes the train/dev split and the batch order
TRAIN_FRACTION = 0.9   # share of *queries* that go to training
BATCH_SIZE     = 16
NUM_EPOCHS     = 3
WARMUP_RATIO   = 0.1   # warm up over the first 10% of optimizer steps

random.seed(SEED)

# 1) Split by query, not by pair, so no dev query is ever seen during training.
#    `examples` itself is left untouched (no in-place shuffle), which keeps
#    this cell safe to re-run.
all_queries = sorted({ex.texts[0] for ex in examples})
random.shuffle(all_queries)
n_train_queries = int(TRAIN_FRACTION * len(all_queries))
train_queries   = set(all_queries[:n_train_queries])

train_examples = [ex for ex in examples if ex.texts[0] in train_queries]
dev_examples   = [ex for ex in examples if ex.texts[0] not in train_queries]
assert train_examples and dev_examples, "need enough distinct queries for a train/dev split"

# MultipleNegativesRankingLoss treats every other pair in the batch as a
# negative. Each query has several positives, so a plain shuffled DataLoader
# would regularly put two pairs of the same query in one batch and train on
# false negatives. NoDuplicatesDataLoader keeps all texts in a batch distinct.
# The batch size is capped at the number of training queries so that a full
# batch of distinct queries can always be formed.
train_batch_size = min(BATCH_SIZE, len(train_queries))
train_dataloader = datasets.NoDuplicatesDataLoader(train_examples, batch_size=train_batch_size)
# Not used by fit() below; kept so later cells that reference it still work.
dev_dataloader   = DataLoader(dev_examples, shuffle=False, batch_size=BATCH_SIZE)

# 2) Build the evaluator's inputs.
#    EmbeddingSimilarityEvaluator reports a correlation between cosine
#    similarity and the gold scores; with only 1.0 labels that correlation is
#    undefined (NaN). Each positive (score 1.0) is therefore matched with a
#    pseudo-negative (score 0.0): a paragraph retrieved for a different dev
#    query that is not among this query's own positives.
#    NOTE(review): pseudo-negatives are assumed irrelevant, not verified.
positives_by_query = {}
for ex in dev_examples:
    positives_by_query.setdefault(ex.texts[0], set()).add(ex.texts[1])

sentences1, sentences2, scores = [], [], []
for ex in dev_examples:
    query, positive = ex.texts
    sentences1.append(query)
    sentences2.append(positive)
    scores.append(1.0)

    negative_pool = [
        other.texts[1]
        for other in dev_examples
        if other.texts[0] != query and other.texts[1] not in positives_by_query[query]
    ]
    if negative_pool:
        sentences1.append(query)
        sentences2.append(random.choice(negative_pool))
        scores.append(0.0)

assert len(set(scores)) > 1, "dev set needs both positive and negative pairs"

evaluator = evaluation.EmbeddingSimilarityEvaluator(
    sentences1=sentences1,
    sentences2=sentences2,
    scores=scores
)

# 3) Load your base model
model = SentenceTransformer("microsoft/deberta-v3-large")

# 4) Use MultipleNegativesRankingLoss
train_loss = losses.MultipleNegativesRankingLoss(model)

# 5) Fine-tune!
#    Warm-up is derived from the real number of optimizer steps; a fixed 50
#    would cover almost the whole run on a dataset this small.
total_steps  = len(train_dataloader) * NUM_EPOCHS
warmup_steps = max(1, int(WARMUP_RATIO * total_steps))

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=NUM_EPOCHS,
    warmup_steps=warmup_steps,
    output_path="deberta-v3-materials-finetuned"
)


No sentence-transformers model found with name C:\Users\hp/.cache\torch\sentence_transformers\microsoft_deberta-v3-large. Creating a new one with MEAN pooling.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Epoch:   0%|          | 0/3 [00:00<?, ?it/s]