In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Location of the per-article paragraph table, relative to the working directory.
DATA_PATH = Path("article_paragraphs.csv")
# Read the whole CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)


In [4]:
# Print the column names as a plain Python list, then display the first five rows.
print(list(df.columns))
df.head(n=5)


['doi', 'title', 'abstract', 'paragraphs', 'n_paras']


Unnamed: 0,doi,title,abstract,paragraphs,n_paras
0,10.1006/jcis.1996.4536,Characterization of the Interface between a Ro...,"We introduce a technique to characterize,in si...",,0
1,10.1006/jssc.1999.8570,Relationships between Structure and Physical P...,The crystal structure of SmNi1−x Co x O3 perov...,,0
2,10.1006/jssc.1999.8590,Metal–Insulator Transition and Magnetic Proper...,"This paper reports X-ray diffraction patterns,...",,0
3,10.1006/spmi.1999.0720,Tunable supercurrent in superconductor/normal ...,When two superconductors are connected by a we...,,0
4,10.1006/spmi.2000.0877,Interface-charged impurity scattering in semic...,We present the results on the anomalous 2D tra...,,0


In [5]:
# Re-read the article table through the DATA_PATH constant set in the earlier
# load cell, so the file name is defined in exactly one place.
df = pd.read_csv(DATA_PATH)
print(df.columns)
df.head()


Index(['doi', 'title', 'abstract', 'paragraphs', 'n_paras'], dtype='object')


Unnamed: 0,doi,title,abstract,paragraphs,n_paras
0,10.1006/jcis.1996.4536,Characterization of the Interface between a Ro...,"We introduce a technique to characterize,in si...",,0
1,10.1006/jssc.1999.8570,Relationships between Structure and Physical P...,The crystal structure of SmNi1−x Co x O3 perov...,,0
2,10.1006/jssc.1999.8590,Metal–Insulator Transition and Magnetic Proper...,"This paper reports X-ray diffraction patterns,...",,0
3,10.1006/spmi.1999.0720,Tunable supercurrent in superconductor/normal ...,When two superconductors are connected by a we...,,0
4,10.1006/spmi.2000.0877,Interface-charged impurity scattering in semic...,We present the results on the anomalous 2D tra...,,0


In [10]:
import ast
import pandas as pd

# 1 ) Load the file with paragraph lists
df_meta = pd.read_csv('xmlAndHTML_data.csv')

# 2 ) Turn the string representation of a list into an actual list.
#     Missing or blank cells become an empty list.
df_meta['Para_list'] = df_meta['Para_list'].apply(
    lambda x: ast.literal_eval(x) if pd.notna(x) and x.strip() else [])

# 3 ) Explode so each paragraph is its own row
df_long = df_meta.explode('Para_list', ignore_index=True)

# 4 ) Rename for clarity and drop empties.
#     explode() emits NaN for an empty list; NaN passes a `!= ''` comparison,
#     so it has to be dropped explicitly or it would later be turned into the
#     literal string 'nan' and embedded like a real paragraph.
#     NOTE(review): if the CSV contains articles without paragraphs, the total
#     printed below will be lower than in runs made before this filter existed.
df_long = (
    df_long.rename(columns={'Para_list': 'text'})
           .dropna(subset=['text'])
           # Coerce to str before stripping so non-string list items survive as text.
           .assign(text=lambda d: d['text'].astype(str).str.strip())
           .loc[lambda d: d['text'] != '']
           # Keep row position i aligned with paragraphs[i] after filtering.
           .reset_index(drop=True)
)

# 5 ) Materialise the plain-string paragraph list.
#     (Previously this name was read one statement before it was assigned,
#     which raises NameError on a fresh kernel.)
paragraphs = df_long['text'].tolist()
print(f"Total paragraphs: {len(df_long):,}")


Total paragraphs: 110,280


In [12]:
# Coerce each paragraph to a built-in str so every entry is a plain string.
paragraphs = list(map(str, paragraphs))

In [13]:
# 🔑 1. One-time setup: install the embedding library (keep commented out once installed)
# !pip install -q sentence-transformers

from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm.auto import tqdm
import faiss, os, gc

# ===========================================================
# Settings a reader may want to tune
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
BATCH_SIZE = 128  # paragraphs encoded per model call; trades memory for speed
EMB_PATH = "paragraph_embeddings.npy"
INDEX_PATH = "paragraph_index.faiss"
TEXTS_PATH = "paragraph_texts.npy"
# ===========================================================

# 2. Instantiate the encoder (no device is passed, so the library default applies)
model = SentenceTransformer(MODEL_NAME)

# 3. Encode slice by slice; every slice yields one float32 array (FAISS expects float32).
#    Embeddings are L2-normalised so that an inner product equals cosine similarity.
chunk_vectors = [
    model.encode(
        paragraphs[start:start + BATCH_SIZE],
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=False,
    ).astype("float32")
    for start in tqdm(range(0, len(paragraphs), BATCH_SIZE))
]

# 4. Stack the per-slice arrays into one matrix, then release the temporary list
embeddings = np.vstack(chunk_vectors)
del chunk_vectors
gc.collect()
print("Embeddings shape:", embeddings.shape)   # (110280, 384) in the recorded run

# 5. Keep a copy of the raw embedding matrix on disk
np.save(EMB_PATH, embeddings)
print(f"Saved raw embeddings → {EMB_PATH}")

# 6. Flat inner-product index over the normalised vectors, written to disk
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)
faiss.write_index(index, INDEX_PATH)
print(f"FAISS index with {index.ntotal:,} vectors saved → {INDEX_PATH}")

# 7. Store the paragraph texts (as an object array) next to the index for later reloads
np.save(TEXTS_PATH, np.array(paragraphs, dtype=object))
print(f"Paragraph texts saved → {TEXTS_PATH}")


100%|██████████| 862/862 [27:42<00:00,  1.93s/it]


Embeddings shape: (110280, 384)
Saved raw embeddings → paragraph_embeddings.npy
FAISS index with 110,280 vectors saved → paragraph_index.faiss
Paragraph texts saved → paragraph_texts.npy


In [14]:
import numpy as np, faiss
from sentence_transformers import SentenceTransformer

# 1. Restore the encoder, the FAISS index and the paragraph texts from disk.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
index = faiss.read_index("paragraph_index.faiss")
# NOTE(review): allow_pickle=True unpickles the object array; only load a file you wrote yourself.
paragraphs = np.load("paragraph_texts.npy", allow_pickle=True)

def retrieve(query, top_k=5):
    """Return the paragraphs most similar to ``query``.

    The query is encoded with the module-level ``model`` (L2-normalised,
    float32) and searched against the module-level FAISS ``index``; hits are
    looked up in the module-level ``paragraphs`` sequence.

    Args:
        query: Text to search for.
        top_k: Maximum number of hits to return.

    Returns:
        A list of ``(score, paragraph)`` tuples in the order FAISS returns
        them. The list holds fewer than ``top_k`` entries when the index has
        fewer matches, and is empty when ``top_k`` is not positive.
    """
    if top_k <= 0:
        return []
    q_emb = model.encode([query], normalize_embeddings=True, convert_to_numpy=True).astype("float32")
    D, I = index.search(q_emb, top_k)
    # FAISS pads missing results with label -1; indexing paragraphs[-1] would
    # silently return the LAST paragraph as a bogus hit, so skip those slots.
    return [
        (float(score), paragraphs[int(idx)])
        for score, idx in zip(D[0], I[0])
        if idx >= 0
    ]

# Smoke test: fetch the three best hits and print each score with the first 250 characters.
hits = retrieve("How does interface‑charged impurity scattering affect 2‑D transport?", top_k=3)
for score, para in hits:
    print(f"[{score:.3f}] {para[:250]}…\n")


[0.664] The first point we make is that the observed metal–insulator transition occurs at electron densities where the ionized impurity scattering dominates. In particular, the 2D M-I-T is only seen in samples with highμmax, and usually withnc≪nmax. In our o…

[0.627] We now consider as an example[31]a direct consequence of our observation that the 2D M-I-Talwaysoccurs in a regime dominated by random long range charged impurity scattering (and not by the short range interface roughness scattering, which is effecti…

[0.621] In this paper we will focus on the low-temperature transport properties of metals determined by the elastic scattering of electrons from static impurities. In weakly disordered samples, when the wavelength of the electrons is much smaller than their …



In [17]:
!pip install --upgrade openai

