In [6]:
from src.file_handling import file_location

# Resolve the project's data folders through the project-local helper class.
# NOTE(review): the attributes are presumably pathlib.Path objects (they are
# joined with `/` here and walked with .rglob() in later cells) — confirm
# against src.file_handling.file_location.
data = file_location.FolderPathOfASME()
data_path = data.data
md_path = data.asme_jmd / 'markdown'  # folder later searched for *.md files
embeddings_path = data.asme_jmd / 'embeddings'  # folder later searched for *.pt files

In [None]:
# Use the first Markdown file found as a sample document. Fail with a clear
# message when the folder holds no .md files, rather than leaving `test_md`
# undefined (or silently reusing a stale value from an earlier run).
test_md = next(md_path.rglob('*.md'), None)
if test_md is None:
    raise FileNotFoundError(f"No .md files found under {md_path}")

from markdownify import markdownify as md_to_text

# Read the sample document as UTF-8 text.
markdown_content = test_md.read_text(encoding="utf-8")

# NOTE(review): markdownify converts HTML *to* Markdown; it does not strip
# Markdown down to plain text. The `md_to_text` alias is therefore misleading
# and `plain_text` most likely still contains Markdown markup — confirm that
# this is the intended pre-processing before embedding.
plain_text = md_to_text(markdown_content)

# The file stem is used as the document identifier.
doi = test_md.stem

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Run on the GPU when one is available; otherwise fall back to CPU in float32
# (half-precision weights are kept for CUDA only).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

tokenizer = AutoTokenizer.from_pretrained("globuslabs/ScholarBERT-XL")
# add_pooling_layer is a model-construction argument, so it is passed to the
# model rather than the tokenizer. Only last_hidden_state is consumed below.
model = AutoModel.from_pretrained(
    "globuslabs/ScholarBERT-XL", dtype=MODEL_DTYPE, add_pooling_layer=False
).to(DEVICE)
model.eval()

# Kept at module level so cells that read `inputs` continue to work.
# NOTE(review): truncation=True without max_length should cut each text to the
# tokenizer's maximum model length, so a long document is only partly embedded
# — confirm this is acceptable or chunk the text first.
inputs = tokenizer(plain_text, return_tensors="pt", truncation=True, padding=True).to(DEVICE)


def get_embedding(texts):
    """
    Embed one text or a list of texts with the loaded model.

    Args:
        texts: A string or a list of strings to embed.

    Returns:
        A CPU float16 tensor with one row per input text, obtained by averaging
        the last hidden state over the real (non-padding) tokens of each text.
    """
    # Tokenize the argument itself rather than relying on the global `inputs`.
    encoded = tokenizer(texts, return_tensors="pt", truncation=True, padding=True).to(DEVICE)
    with torch.no_grad():
        outputs = model(**encoded)

    # Average in float32 and weight by the attention mask so that padding
    # tokens added for batching do not dilute the mean.
    hidden = outputs.last_hidden_state.float()
    mask = encoded["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.cpu().half()

In [None]:
# Print the path of every Markdown file found under md_path (recursively).
# NOTE(review): `doi` is reassigned on each iteration but never used in this
# cell; after the loop it holds the stem of the last file visited, overwriting
# any `doi` set by an earlier cell — confirm whether that is intended.
for md in md_path.rglob('*.md'):
    doi = md.stem
    print(md)

In [5]:
import torch
from pathlib import Path

def load_embedding(doi: str, embeddings_path: Path) -> "torch.Tensor | None":
    """
    Load the embedding saved for ``doi`` from ``embeddings_path``.

    Args:
        doi: File stem of the embedding; the file read is ``<doi>.pt``.
        embeddings_path: Directory that contains the ``.pt`` file.

    Returns:
        The stored tensor, placed on the CPU, or ``None`` when the file does
        not exist or cannot be loaded. A status line is printed in every case.
    """
    file_path = embeddings_path / f"{doi}.pt"
    if not file_path.exists():
        print(f"⚠️ Embedding file not found: {file_path}")
        return None

    try:
        # map_location="cpu" lets files written from a GPU session load on a
        # machine without CUDA; weights_only=True restricts unpickling to
        # tensor data instead of arbitrary Python objects.
        embedding = torch.load(file_path, map_location="cpu", weights_only=True)
        print(f"✅ Loaded embedding: {file_path}")
        return embedding
    except Exception as e:
        # Best-effort loader: report the problem and let the caller continue.
        print(f"❌ Failed to load embedding for {doi}: {e}")
        return None


In [7]:
# Load and print every saved embedding. rglob() also descends into
# sub-folders, so each file's own parent directory is passed as the lookup
# folder; rebuilding the path from the top-level folder would report files in
# nested folders as missing.
for emb in embeddings_path.rglob('*.pt'):
    print(load_embedding(emb.stem, emb.parent))

✅ Loaded embedding: F:\One Drives\OneDrive - Nanyang Technological University\Academics\fyp\data\asme_jmd\embeddings\doi_10.1115_1.1286084.pt
tensor([[ 0.2386,  0.3782,  0.1548,  ..., -0.0610,  0.5166,  0.4670]],
       dtype=torch.float16)
✅ Loaded embedding: F:\One Drives\OneDrive - Nanyang Technological University\Academics\fyp\data\asme_jmd\embeddings\doi_10.1115_1.1286124.pt
tensor([[ 0.0533,  0.0251, -0.0480,  ...,  0.0274,  0.3984,  0.4343]],
       dtype=torch.float16)
✅ Loaded embedding: F:\One Drives\OneDrive - Nanyang Technological University\Academics\fyp\data\asme_jmd\embeddings\doi_10.1115_1.1286188.pt
tensor([[ 0.2637,  0.2335,  0.0144,  ..., -0.1129,  0.2954,  0.5605]],
       dtype=torch.float16)
✅ Loaded embedding: F:\One Drives\OneDrive - Nanyang Technological University\Academics\fyp\data\asme_jmd\embeddings\doi_10.1115_1.1286189.pt
tensor([[0.2903, 0.1486, 0.0653,  ..., 0.0381, 0.3359, 0.6816]],
       dtype=torch.float16)
✅ Loaded embedding: F:\One Drives\OneDrive