In [2]:
import numpy as np
paragraphs = np.load("paragraph_texts.npy", allow_pickle=True).tolist()
print(f"Loaded {len(paragraphs):,} paragraphs")


Loaded 110,280 paragraphs


In [3]:
import pandas as pd, ast

df_meta = pd.read_csv("xmlAndHTML_data.csv")
df_meta['Para_list'] = df_meta['Para_list'].apply(
    lambda x: ast.literal_eval(x) if pd.notna(x) and x.strip() else [])
df_long = df_meta.explode('Para_list', ignore_index=True)
paragraphs = df_long['Para_list'].astype(str).tolist()
print(f"Loaded {len(paragraphs):,} paragraphs from CSV")


Loaded 110,280 paragraphs from CSV


In [4]:
MODEL_NAME = "m3rg-iitd/matscibert"
model = SentenceTransformer(MODEL_NAME)

paragraphs = [str(p) for p in paragraphs]  # make sure they’re strings
# … then batch‑encode, build index, etc.


No sentence-transformers model found with name C:\Users\hp/.cache\torch\sentence_transformers\m3rg-iitd_matscibert. Creating a new one with MEAN pooling.
Some weights of BertModel were not initialized from the model checkpoint at C:\Users\hp/.cache\torch\sentence_transformers\m3rg-iitd_matscibert and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# 1) install once
# !pip install sentence-transformers

from sentence_transformers import SentenceTransformer
import numpy as np, faiss, gc
from tqdm.auto import tqdm

MODEL_NAME = "m3rg-iitd/matscibert"   # ← swap here
BATCH_SIZE = 64                       # depends on GPU/CPU RAM

model = SentenceTransformer(MODEL_NAME)

# paragraphs comes from your 110 k df_long
paragraphs = [str(p) for p in paragraphs]          # ensure strings

# ---- re‑encode ----
embs = []
for i in tqdm(range(0, len(paragraphs), BATCH_SIZE)):
    embs.append(
        model.encode(
            paragraphs[i:i+BATCH_SIZE],
            convert_to_numpy=True,
            normalize_embeddings=True,   # cosine
            show_progress_bar=False
        ).astype("float32")
    )
embeddings = np.vstack(embs); del embs; gc.collect()

# ---- rebuild FAISS (dim = 768) ----
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)
faiss.write_index(index, "matscibert_index.faiss")
np.save("matscibert_texts.npy", np.array(paragraphs, dtype=object))
print("Saved index & texts with MatSciBERT.")


No sentence-transformers model found with name C:\Users\hp/.cache\torch\sentence_transformers\m3rg-iitd_matscibert. Creating a new one with MEAN pooling.
Some weights of BertModel were not initialized from the model checkpoint at C:\Users\hp/.cache\torch\sentence_transformers\m3rg-iitd_matscibert and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1724/1724 [4:56:33<00:00, 10.32s/it]  


Saved index & texts with MatSciBERT.


In [None]:
from openai import OpenAI
client = OpenAI()

def retrieve_matsci(query, k=5):
    q = model.encode([query], normalize_embeddings=True, convert_to_numpy=True).astype("float32")
    D, I = index.search(q, k)
    paras = np.load("matscibert_texts.npy", allow_pickle=True)
    return [(float(D[0][j]), paras[I[0][j]]) for j in range(k)]

print("\n".join(p[:200] for _, p in retrieve_matsci(
    "How does oxygen vacancy concentration influence perovskite conductivity?", 3)))


How does heat accumulation affect temperatures and the resulting microstructure formation?
How does crack self-healing vary with annealing Taand ta?
CNT film thickness affects resistivity and conductivity of composite materials


In [7]:
# --- put this in a notebook cell ---
questions = [
    "How does heat accumulation affect temperatures and the resulting microstructure formation?",
    "How does crack self‑healing vary with annealing temperature and time?",
    "How does CNT film thickness affect the resistivity and conductivity of composite materials?"
]

for q in questions:
    hits = retrieve_matsci(q, k=5)          # k = number of paragraphs you want
    print(f"\n🔎  Query: {q}\n")
    for score, para in hits:
        print(f"[{score:.3f}] {para[:250]} …\n")   # print the first 250 chars



🔎  Query: How does heat accumulation affect temperatures and the resulting microstructure formation?

[1.000] How does heat accumulation affect temperatures and the resulting microstructure formation? …

[0.830] What is the effect of reheating and heat accumulation on microsegregation? …

[0.814] How does crack self-healing vary with annealing Taand ta? …

[0.805] Thermal conductivity increases with an increase in temperature for all modified CNTs …

[0.793] What is the primary mechanism of crack self-healing? …


🔎  Query: How does crack self‑healing vary with annealing temperature and time?

[0.866] How does crack self-healing vary with annealing Taand ta? …

[0.865] Model accurately predicts grain size evolution in graphene-reinforced aluminum composites …

[0.854] Cold welding leads to heterogeneous particle size distributions in powder mixtures …

[0.853] Convective heat transfer rate between absorber and water (W) …

[0.851] highly deformed regions near particle–particle interfa

In [35]:
!pip install pint regex tqdm

Collecting pint
  Downloading Pint-0.24.4-py3-none-any.whl.metadata (8.5 kB)
Collecting flexcache>=0.3 (from pint)
  Downloading flexcache-0.3-py3-none-any.whl.metadata (7.0 kB)
Collecting flexparser>=0.4 (from pint)
  Downloading flexparser-0.4-py3-none-any.whl.metadata (18 kB)
Downloading Pint-0.24.4-py3-none-any.whl (302 kB)
Downloading flexcache-0.3-py3-none-any.whl (13 kB)
Downloading flexparser-0.4-py3-none-any.whl (27 kB)
Installing collected packages: flexparser, flexcache, pint

   -------------------------- ------------- 2/3 [pint]
   -------------------------- ------------- 2/3 [pint]
   -------------------------- ------------- 2/3 [pint]
   -------------------------- ------------- 2/3 [pint]
   -------------------------- ------------- 2/3 [pint]
   ---------------------------------------- 3/3 [pint]

Successfully installed flexcache-0.3 flexparser-0.4 pint-0.24.4


In [36]:
import re, pandas as pd, numpy as np
from tqdm.auto import tqdm
from pint import UnitRegistry

ureg = UnitRegistry()
ureg.default_format = "~P"      # nice compact unit printing

# 1) Build a pattern for SI symbols + prefixes
#    (m, mm, µm, kg, kPa, GPa, s, ms, °C, K, A, V, Ω, W, J, mol, cd…)
prefix = r"(?:[fpnumcdkMGT]?)(?:\s*|·|-|×)?"
unit_symbols = (
    "m|g|kg|s|ms|µs|A|K|°C|mol|cd|Pa|kPa|MPa|GPa|N|J|W|V|Ω|F|H|C|T|lx|Hz"
)
pattern = re.compile(
    rf"(?P<value>[+-]?(\d+(\.\d+)?|\.\d+)([eE][+-]?\d+)?)\s*"
    rf"(?P<unit>{prefix}(?:{unit_symbols}))\b"
)

records = []
for i, (para_id, txt) in tqdm(enumerate(df_long["text"].items()), total=len(df_long)):
    for m in pattern.finditer(txt):
        val, unit = m.group("value"), m.group("unit")
        try:
            quantity = (float(val) * ureg(unit)).to_base_units()
            norm_unit = f"{quantity.magnitude:g} {quantity.units}"
        except Exception:
            norm_unit = None
        records.append({
            "row_id": para_id,
            "value": val,
            "unit_raw": unit,
            "unit_normalized": norm_unit,
        })

df_units = pd.DataFrame(records)
print(df_units.head())
print(f"Found {len(df_units):,} number‑unit pairs.")
df_units.to_csv("si_units_extracted.csv", index=False)


Use ureg.formatter.default_format
  ureg.default_format = "~P"      # nice compact unit printing
100%|██████████| 110280/110280 [00:15<00:00, 7192.12it/s]


   row_id value unit_raw unit_normalized
0      88     0        K             0 K
1      89   300        K           300 K
2      92   298        K           298 K
3      98     0        K             0 K
4      98     0        K             0 K
Found 158,096 number‑unit pairs.


In [38]:
!pip install openpyxl


Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl

   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   ---------------------------------------- 2/2 [openpyxl]

Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [39]:
import pandas as pd

# 1) Load your extracted-units CSV (adjust path if needed)
df_units = pd.read_csv("si_units_extracted.csv")

# 2) Convert the ‘value’ column to float for aggregation
df_units["value"] = df_units["value"].astype(float)

# 3) Build a pivot summarizing count, mean, median per unit
pivot = df_units.pivot_table(
    index="unit_raw",
    values="value",
    aggfunc=["count", "mean", "median"]
)
pivot.columns = ["count", "mean", "median"]
pivot = pivot.sort_values("count", ascending=False)

# 4) Save to Excel
pivot.to_excel("units_summary.xlsx", index=True)

print("Pivot saved to units_summary.xlsx")


  df_units = pd.read_csv("si_units_extracted.csv")


Pivot saved to units_summary.xlsx


In [42]:
import pandas as pd
import ast

# 1) Load and explode the metadata table
df_meta = pd.read_csv("xmlAndHTML_data.csv")
df_meta["Para_list"] = df_meta["Para_list"].apply(
    lambda x: ast.literal_eval(x) if pd.notna(x) and x.strip() else []
)
df_long = (
    df_meta[["DOI", "Title", "Para_list"]]
    .explode("Para_list", ignore_index=True)
    .rename(columns={"Para_list": "text"})
)

# now df_long has DOI, Title, text

# 2) Load your SI-unit extractions
df_units = pd.read_csv("si_units_extracted.csv")  # row_id, value, unit_raw, ...

# 3) Ensure row_id lines up
df_long = df_long.reset_index().rename(columns={"index": "row_id"})

# 4) Merge units back to metadata
df_merged = df_units.merge(
    df_long[["row_id", "DOI", "Title"]],
    on="row_id",
    how="left"
)

# 5) For each unit, list the top 5 most frequent paper titles
unit_to_titles = (
    df_merged
      .groupby("unit_raw")["Title"]
      .apply(lambda ts: ts.value_counts().head(5).index.tolist())
      .reset_index(name="top_material_titles")
)

# 6) Inspect and save
print(unit_to_titles.head(10))
unit_to_titles.to_csv("unit_to_materials.csv", index=False)
print("Saved mappings to unit_to_materials.csv")


  unit_raw                                top_material_titles
0       -A  [Nonlocal modeling and analysis of spatiotempo...
1       -C  [Copper melt filtration with carbon-bonded alu...
2       -F  [Enhancing coatings mechanical performance by ...
3       -H  [Functionalized metal oxide particles with ant...
4      -Hz  [Improving the high-temperature oxidation resi...
5       -J  [Sheet thickness dependence of magnetization p...
6       -K  [Oxidation limited thermal boundary conductanc...
7       -N  [Dual-phase high-entropy ultra-high temperatur...
8       -T  [Electronic, magnetic, optical properties, and...
9       -V  [Porous-anodic-alumina-templated Ta-Nb-alloy/o...
Saved mappings to unit_to_materials.csv


  df_units = pd.read_csv("si_units_extracted.csv")  # row_id, value, unit_raw, ...


In [43]:
valid_units = {
    "m", "g", "kg", "s", "A", "K", "mol", "cd",
    "Hz", "N", "Pa", "J", "W", "C", "V", "F", "Ω", "T",
    "°C", "lx"
}


In [44]:
df_filtered = df_merged[df_merged["unit_raw"].isin(valid_units)]


In [45]:
unit_to_titles_clean = (
    df_filtered
      .groupby("unit_raw")["Title"]
      .apply(lambda ts: ts.value_counts().head(5).index.tolist())
      .reset_index(name="top_material_titles")
)

print(unit_to_titles_clean)
unit_to_titles_clean.to_csv("unit_to_materials_clean.csv", index=False)


   unit_raw                                top_material_titles
0         A  [An analysis of microstructural morphology, su...
1         C  [Interface and mechanical/thermal properties o...
2         F  [Repeatability and reproducibility of liquid-p...
3        Hz  [Poly(amide-triazole)s obtained by regioselect...
4         J  [Developing thermoplastic hybrid titanium comp...
5         K  [New insights into thermal processes of metal ...
6         N  [A first-principles study of the effects of at...
7        Pa  [Novel class of nanostructured metallic glass ...
8         T  [1T MoS2 nanosheets with extraordinary sodium ...
9         V  [Findings and perspectives of β-Ti alloys with...
10        W  [Resolving the porosity-unmelted inclusion dil...
11       cd  [Physical characterization of Ag:WO3 cermet fi...
12        g  [Metal Nanoparticle Harvesting by Continuous R...
13       kg  [Thermal conductivity of different materials n...
14        m  [On new solvatomorphs of the metalloligand

In [52]:
import re

def extract_property(text, val, unit):
    # cast to strings
    val_str, unit_str = str(val), str(unit)
    # look for up to 5 words immediately before the value+unit
    # we use a non-capture group and word boundaries
    pattern = (
        r"(?:\b\w+\b[\s,.:;–-]?){0,5}"  # up to 5 words or numbers + optional separator
        + re.escape(val_str)
        + r"\s*"
        + re.escape(unit_str)
    )
    match = re.search(pattern, text)
    if match:
        snippet = match.group(0)
        # return only the words before the number
        # split on the value so we only keep the left side
        return snippet.split(val_str)[0].strip()
    return None


In [53]:
# After merging df_units with df_long …
df_units["prop_phrase"] = df_units.apply(
    lambda r: extract_property(r["text"], r["value"], r["unit_raw"]),
    axis=1
)

# Then group by unit
grouped = df_units.groupby("unit_raw").agg({
    "prop_phrase": lambda ps: pd.Series(ps.dropna()).value_counts().head(3).index.tolist(),
    "Title":       lambda ts: pd.Series(ts).value_counts().head(3).index.tolist()
}).rename(columns={
    "prop_phrase":"top_property_phrases",
    "Title":"top_material_titles"
})

print(grouped)
grouped.to_csv("unit_properties_and_materials.csv")


                                       top_property_phrases  \
unit_raw                                                      
-A               [, 8k-A, fluctuations are stable when8k-A]   
-C                                        [, KIC-2, N00014]   
-F        [, MMC coatings with a Colmonoy, previously ob...   
-H                   [, and single layer graphene on, FZ-T]   
-Hz             [with 30-μs pulses and, were provided by a]   
...                                                     ...   
×K                                                       []   
×T                                                       []   
×V                                                       []   
×g        [and precipitated by centrifugation at, , the ...   
Ω                                               [, and, to]   

                                        top_material_titles  
unit_raw                                                     
-A        [Nonlocal modeling and analysis of spatiotempo

In [54]:
import pandas as pd, re, ast

# 1) Define prefixes and an expanded set of base units
prefixes   = ["", "k", "M", "G", "m", "µ", "n"]  # kilo, Mega, Giga, milli, micro, nano

# Base units covering thermal, mechanical, electrical, electronics, magnetic, etc.
base_units = {
    # Temperature
    "K", "°C",
    # Mechanics (stress, modulus)
    "Pa", "N",
    # Electro-magnetic
    "V", "A", "Ω", "Hz", "F", "S", "H",  # Henry for inductance, Siemens for conductance
    # Derived electrical per-length or per-area
    "S/m", "Ω·m", "F/m", 
    # Energy / power
    "J", "W",
    # Magnetic flux density
    "T",
    # Charge
    "C",
}

# Build whitelist of all prefixed variants
valid_units = {
    f"{p}{u}"
    for p in prefixes
    for u in base_units
}

# 2) Reload the unit-extraction and metadata
df_units = pd.read_csv("si_units_extracted.csv")      # row_id, value, unit_raw
df_meta  = pd.read_csv("xmlAndHTML_data.csv")         # DOI, Title, Para_list

# 3) Explode metadata to get row_id → text, DOI, Title
df_meta["Para_list"] = df_meta["Para_list"].apply(
    lambda x: ast.literal_eval(x) if pd.notna(x) and x.strip() else []
)
df_long = (
    df_meta[["DOI","Title","Para_list"]]
    .explode("Para_list", ignore_index=True)
    .rename(columns={"Para_list":"text"})
).reset_index().rename(columns={"index":"row_id"})

# 4) Merge and filter to only our valid units
df = (
    df_units
      .merge(df_long[["row_id","Title","text"]], on="row_id", how="left")
      .loc[lambda d: d["unit_raw"].isin(valid_units)]
)

# 5) Helper to grab the few words before each occurrence
def extract_prop(text, val, unit):
    val, unit = str(val), str(unit)
    pat = (
        r"(?:\b\w+\b[\s,.:;–-]?){0,5}"
        + re.escape(val)
        + r"\s*"
        + re.escape(unit)
    )
    m = re.search(pat, text)
    if not m:
        return None
    snippet = m.group(0)
    return snippet.split(val)[0].strip()

df["prop_phrase"] = df.apply(
    lambda r: extract_prop(r["text"], r["value"], r["unit_raw"]),
    axis=1
)

# 6) Aggregate top-3 property phrases & material titles per unit
result = df.groupby("unit_raw").agg({
    "prop_phrase": lambda ps: pd.Series(ps.dropna())
                                   .value_counts()
                                   .head(3)
                                   .index
                                   .tolist(),
    "Title":       lambda ts: pd.Series(ts)
                                   .value_counts()
                                   .head(3)
                                   .index
                                   .tolist()
}).rename(columns={
    "prop_phrase":"top_property_phrases",
    "Title":"top_material_titles"
}).reset_index()

# 7) Save and display
result.to_csv("unit_properties_and_materials_electronics.csv", index=False)
print("Done! See unit_properties_and_materials_electronics.csv for the full mapping.")
print(result)


  df_units = pd.read_csv("si_units_extracted.csv")      # row_id, value, unit_raw


Done! See unit_properties_and_materials_electronics.csv for the full mapping.
   unit_raw                               top_property_phrases  \
0         A                                    [, and, Figure]   
1         C                                [, Figure, Figures]   
2         F                                    [, JSM, Figure]   
3        GA                                                 []   
4        GC                                                 []   
..      ...                                                ...   
65       nV                                                 []   
66       nW         [, starting from 30,, to a power level of]   
67       nΩ  [, copper resistivity down to about, showed a ...   
68       °C                                    [, C and, C to]   
69        Ω                                        [, and, to]   

                                  top_material_titles  
0   [An analysis of microstructural morphology, su...  
1   [Interface an

In [55]:
import pandas as pd, re, ast

# 1) Reload & explode metadata
df_meta = pd.read_csv("xmlAndHTML_data.csv")
df_meta["Para_list"] = df_meta["Para_list"].apply(
    lambda x: ast.literal_eval(x) if pd.notna(x) and x.strip() else []
)
df_long = (
    df_meta[["DOI","Title","Para_list"]]
    .explode("Para_list", ignore_index=True)
    .rename(columns={"Para_list":"text"})
).reset_index().rename(columns={"index":"row_id"})

# 2) Load extractions & merge
df_units = pd.read_csv("si_units_extracted.csv")
df = df_units.merge(
    df_long[["row_id","Title","text"]], on="row_id", how="left"
)

# 3) Whitelist (as before)
prefixes   = ["", "k", "M", "G", "m", "µ", "n"]
base_units = {"K","°C","Pa","N","V","A","Ω","Hz","F","S","H","S/m","Ω·m","F/m","J","W","T","C"}
valid_units = {f"{p}{u}" for p in prefixes for u in base_units}
df = df[df["unit_raw"].isin(valid_units)]

# 4) Extract the property phrase (optional, reuse your extract_prop)
def extract_prop(text, val, unit):
    val, unit = str(val), str(unit)
    pat = (r"(?:\b\w+\b[\s,.:;–-]?){0,5}"
           + re.escape(val) + r"\s*" + re.escape(unit))
    m = re.search(pat, text)
    return m.group(0).split(val)[0].strip() if m else None

df["prop_phrase"] = df.apply(
    lambda r: extract_prop(r["text"], r["value"], r["unit_raw"]), axis=1
)

# 5) Aggregate per unit
agg = df.groupby("unit_raw").agg(
    occurrences       = ("unit_raw",     "size"),                # total matches
    paragraph_count   = ("row_id",       lambda s: s.nunique()),  # distinct paragraphs
    top_property_phrases = ("prop_phrase", lambda ps: pd.Series(ps.dropna())
                                              .value_counts()
                                              .head(3)
                                              .index
                                              .tolist()),
    top_material_titles   = ("Title",       lambda ts: pd.Series(ts)
                                              .value_counts()
                                              .head(3)
                                              .index
                                              .tolist())
).reset_index()

# 6) Save & inspect
agg.to_csv("unit_summary_with_counts.csv", index=False)
print(agg)


  df_units = pd.read_csv("si_units_extracted.csv")


   unit_raw  occurrences  paragraph_count  \
0         A         6988             3901   
1         C         3309             1825   
2         F         1031              755   
3        GA            1                1   
4        GC           17               11   
..      ...          ...              ...   
65       nV            1                1   
66       nW           16                9   
67       nΩ           12                9   
68       °C        36458            15214   
69        Ω         1046              560   

                                 top_property_phrases  \
0                                     [, and, Figure]   
1                                 [, Figure, Figures]   
2                                     [, JSM, Figure]   
3                                                  []   
4                                                  []   
..                                                ...   
65                                                 []   
66 