In [16]:
import re
from pathlib import Path
from markdown import markdown
from collections import defaultdict
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer, util, models

from src.file_handling import file_location

# Resolve project folders through the project-local helper.
# NOTE(review): FolderPathOfASME is defined outside this notebook; the attributes
# used here (`data`, `asme_jmd`) are presumably path-like objects - confirm.
data = file_location.FolderPathOfASME()
data_path = data.data
# Location of the markdown files; later cells glob "*.md" from here.
md_path = data.asme_jmd / 'markdown'


# Load the instruction-tuned causal LM and wrap it in a text-generation pipeline.
# device_map="auto" delegates device placement to the library; the captured output
# below reports that some parameters were offloaded to the CPU.
model_id = "microsoft/Phi-4-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
llm = pipeline("text-generation", model=model, tokenizer=tokenizer)

def prompt_keywords(text: str, top_n: int = 20):
    """Ask the LLM pipeline for the top technical keywords of ``text``.

    Args:
        text: Plain text of the document (or chunk) to analyse.
        top_n: Number of keywords/keyphrases requested in the prompt.

    Returns:
        The model's completion as a string (expected to be a numbered list).
        Only the newly generated text is returned, not the prompt.
    """
    prompt = f"""
Extract the top {top_n} technical keywords or keyphrases from the following scientific text.
Focus on domain-specific concepts, methods, equations, and recurring themes in mechanical or systems design.
Avoid generic terms and prioritize phrases that reflect unique contributions, techniques, or models.
Return the output as a numbered list, sorted by relevance.

Text:
{text}
"""
    # return_full_text=False: by default the pipeline echoes the prompt in
    # "generated_text", so numbered lines inside the prompt/paper text would be
    # picked up as keywords by a downstream numbered-list parser.
    # NOTE(review): temperature may be ignored unless sampling is enabled - confirm.
    outputs = llm(
        prompt,
        max_new_tokens=200,
        temperature=0.3,
        return_full_text=False,
    )
    return outputs[0]["generated_text"]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
Device set to use cuda:0


In [None]:
def markdown_to_text(md_content: str) -> str:
    """Convert markdown to single-line plain text.

    The markdown is rendered to HTML, tags are replaced by spaces, HTML
    entities (e.g. ``&amp;``, ``&lt;``) are decoded back to characters, and
    all whitespace runs are collapsed to a single space.

    Args:
        md_content: Raw markdown source.

    Returns:
        The plain-text content with normalised whitespace.
    """
    from html import unescape  # local import keeps the cell self-contained

    rendered = markdown(md_content)
    without_tags = re.sub(r"<[^>]+>", " ", rendered)
    # Decode entities only AFTER stripping tags, so an escaped "&lt;b&gt;" in
    # the text is not mistaken for a real tag and removed.
    decoded = unescape(without_tags)
    return re.sub(r"\s+", " ", decoded).strip()


def parse_keywords(raw_output: str):
    """Extract keywords from a numbered-list LLM response.

    Lines of the form ``"<number>. <keyword>"`` (optionally indented) are
    recognised. Each keyword is lower-cased, stripped of every character that
    is not a word character, whitespace or hyphen, and trimmed. Entries that
    end up empty after cleaning are skipped.

    Args:
        raw_output: Raw text returned by the model.

    Returns:
        List of cleaned keyword strings in the order they appear.
    """
    keywords = []
    for line in raw_output.splitlines():
        # Allow leading whitespace: models frequently indent list items.
        match = re.match(r"\s*\d+\.\s*(.+)", line)
        if not match:
            continue
        kw = re.sub(r"[^\w\s\-]", "", match.group(1).lower())
        # Strip AFTER removing punctuation so no trailing blanks are left behind.
        kw = kw.strip()
        if kw:
            keywords.append(kw)
    return keywords

### Full-document prompt: VRAM usage exceeds 32 GB

In [None]:
# Smoke test: run the full-document keyword prompt on the first `test_size` files.
results = {}
test_size = 10
n = 0
# The loop variable must not be called `md_path`: that would overwrite the
# markdown directory with a single file path and break every later
# `md_path.glob(...)` call in the notebook.
for md_file in md_path.glob("*.md"):
    text = markdown_to_text(md_file.read_text(encoding="utf-8"))
    raw_output = prompt_keywords(text)
    keywords = parse_keywords(raw_output)
    results[md_file.stem] = keywords
    n += 1
    if n == test_size:
        break

### Chunk the markdown and run the LLM on each chunk

In [24]:
def chunk_markdown(text: str, min_length=300):
    """Split markdown into sections at level 1-3 headings.

    A section boundary is a newline followed by one to three ``#`` characters
    and whitespace. Sections are stripped, and only those strictly longer
    than ``min_length`` characters are kept.

    Args:
        text: Markdown source containing newline-separated headings.
        min_length: Sections with this many characters or fewer are dropped.

    Returns:
        List of stripped section strings, in document order.
    """
    kept = []
    for piece in re.split(r"\n#{1,3}\s+", text):
        body = piece.strip()
        if len(body) > min_length:
            kept.append(body)
    return kept

# Build a sentence embedder from the ScholarBERT-XL checkpoint: a transformer
# module followed by mean pooling over its word embeddings.
# NOTE(review): `embedder` is not used in any cell visible here - confirm it is still needed.
word_embedding_model = models.Transformer("globuslabs/ScholarBERT-XL")
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode="mean")
embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Some weights of BertModel were not initialized from the model checkpoint at globuslabs/ScholarBERT-XL and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
def process_markdown(md_path, top_n=10):
    """Extract LLM keywords from one markdown file, section by section.

    The raw markdown is split into sections FIRST and each section is
    converted to plain text afterwards. Converting first collapses all
    newlines and heading markers, so the heading-based split could never
    match and the whole document was sent to the model as a single chunk.

    Args:
        md_path: Path of the markdown file (must provide ``read_text``).
        top_n: Number of keywords requested from the model per section.

    Returns:
        Flat list of keywords from all sections, in document order
        (duplicates across sections are not removed).
    """
    raw_markdown = md_path.read_text(encoding="utf-8")
    all_keywords = []

    for section in chunk_markdown(raw_markdown):
        section_text = markdown_to_text(section)
        raw = prompt_keywords(section_text, top_n=top_n)
        all_keywords.extend(parse_keywords(raw))

    return all_keywords

### Ran for over 10 minutes without finishing

In [26]:
# Run the chunked LLM extraction on the first five markdown files.
test_results = {}
for test_md in list(md_path.glob('*.md'))[:5]:
    # Call process_markdown exactly once per file: every call runs the LLM on
    # each chunk, so a second (discarded) call doubles the runtime.
    test_results[test_md.stem] = process_markdown(test_md)
print(test_results)

KeyboardInterrupt: 

### KeyBERT: 12 s for 10 markdown files
### Works; next step: remove noise from the markdown

In [28]:
from src.file_handling import file_location
from keybert import KeyBERT

# Locate the markdown folder via the project helper.
folder_path = file_location.FolderPathOfASME()
md_path = folder_path.asme_jmd / 'markdown'

# KeyBERT backed by the SPECTER2 base checkpoint.
model_name = "allenai/specter2_base"
kw_model = KeyBERT(model=model_name)

# Extract 1- to 3-gram keyphrases from the raw markdown of the first ten files.
test_mds = list(md_path.glob("*.md"))[:10]
test_results = {
    md.stem: kw_model.extract_keywords(
        md.read_text(encoding="utf-8"),
        keyphrase_ngram_range=(1, 3),
        stop_words="english",
        top_n=30,
    )
    for md in test_mds
}

print(test_results)

No sentence-transformers model found with name allenai/specter2_base. Creating a new one with mean pooling.


{'doi_10.1115_1.1286084': [('workspace boundary point', 0.7851), ('manipulator accessibility boundary', 0.7746), ('points wrist workspaces', 0.7746), ('manipulator workspaces workspaces', 0.7745), ('workspace plane boundary', 0.7741), ('manipulator workspace boundary', 0.7719), ('workspace boundary constraint', 0.7718), ('workspace contains points', 0.7711), ('workspace boundary surface', 0.7707), ('workspace boundary constraints', 0.7686), ('manipulator contains points', 0.7681), ('workspaces different manipulators', 0.7663), ('intersection region wrist', 0.766), ('coordinates point surface', 0.766), ('sections workspace boundary', 0.7654), ('reachable workspace surface', 0.7654), ('region wrist boundary', 0.7648), ('manipulator workspace asme', 0.7646), ('boundary points wrist', 0.7646), ('sphere intersecting wrist', 0.7645), ('point centerline', 0.7644), ('workspace boundary general', 0.7643), ('robot trajectories point', 0.7641), ('dexterity respect trajectory', 0.7641), ('intersec

In [None]:
from src.file_handling import file_location
folder_path = file_location.FolderPathOfASME()
md_path = folder_path.asme_jmd / 'markdown'


from keybert import KeyBERT

# Instantiate the KeyBERT model once; constructing it a second time with the
# same checkpoint only repeats the model load without changing `kw_model`.
model_name = "allenai/specter2_base"
kw_model = KeyBERT(model=model_name)

In [62]:
def remove_html_comments(text):
    """Remove every ``<!-- ... -->`` HTML comment from ``text``.

    Args:
        text: Markdown/HTML source.

    Returns:
        ``text`` with all HTML comments deleted, including comments that
        span several lines.
    """
    # re.DOTALL lets "." cross newlines; without it multi-line comments survive.
    return re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)

def strip_markdown_syntax(text):
    """Delete markdown constructs that carry no prose content.

    Removes, in this order: images, links (including their anchor text),
    fenced code blocks that start at the beginning of a line, inline code
    spans, and runs of two or more markup characters (``# * > _ ~ -``).

    Args:
        text: Markdown source.

    Returns:
        The text with the constructs above removed; whitespace is untouched.
    """
    # Images must go before links: the link pattern also matches the
    # "[alt](url)" part of "![alt](url)" and would leave a stray "!".
    text = re.sub(r"!\[.*?\]\(.*?\)", "", text)
    text = re.sub(r"\[.*?\]\(.*?\)", "", text)
    # Fenced blocks must go before inline code: the inline pattern matches the
    # bare "```" fence markers and would leave the code body in place.
    text = re.sub(r"^```[\s\S]*?```", "", text, flags=re.MULTILINE)
    text = re.sub(r"`{1,3}.*?`{1,3}", "", text)
    text = re.sub(r"[#*>_~\-]{2,}", "", text)
    return text

def normalize_whitespace(text):
    """Collapse whitespace and drop unwanted punctuation.

    Applies three substitutions in order: every whitespace run becomes one
    space, runs of three or more dots become a single dot, and every
    character other than word characters, whitespace and ``. , ; : ( ) -``
    is removed. The result is stripped of leading/trailing whitespace.

    Args:
        text: Input text.

    Returns:
        The cleaned, stripped text.
    """
    substitutions = (
        (r"\s+", " "),
        (r"\.{3,}", "."),
        (r"[^\w\s.,;:()\-]", ""),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def extract_core_sections(text):
    """Keep only the main body of a paper written in markdown.

    The body starts at the first level-2 heading (optionally numbered) whose
    title begins with "abstract" or "introduction", and ends just before the
    first subsequent level-2 heading that begins with a back-matter title
    (references, bibliography, acknowledgments, funding, appendix,
    supplementary, conflict of interest, author contributions). Matching is
    case-insensitive.

    Args:
        text: Markdown source of the paper.

    Returns:
        The slice between the start and end headings. If only one of them is
        found, the text from the start heading onward or up to the end
        heading; if neither is found, ``text`` unchanged.
    """
    heading_flags = re.IGNORECASE | re.MULTILINE
    start_pattern = re.compile(r"^\s*##\s*\d*\s*(abstract|introduction)", heading_flags)
    end_pattern = re.compile(
        r"^\s*##\s*\d*\s*(references?|bibliography|acknowledgments?|funding|appendix|supplementary|conflict of interests?|authors? contributions)",
        heading_flags,
    )

    start = start_pattern.search(text)
    if start:
        # Look for the end marker only AFTER the start heading; a back-matter
        # heading located earlier in the file would otherwise produce an
        # empty slice (end index < start index).
        end = end_pattern.search(text, start.end())
        if end:
            return text[start.start():end.start()]
        return text[start.start():]

    end = end_pattern.search(text)
    if end:
        return text[:end.start()]
    return text

def md_noise_reduction(raw_text):
    """Run the full markdown clean-up pipeline on ``raw_text``.

    Steps, in order: remove HTML comments, keep only the core sections,
    strip markdown syntax, normalise whitespace.

    Args:
        raw_text: Raw markdown source.

    Returns:
        The cleaned text.
    """
    cleanup_steps = (
        remove_html_comments,
        extract_core_sections,
        strip_markdown_syntax,
        normalize_whitespace,
    )
    cleaned = raw_text
    for step in cleanup_steps:
        cleaned = step(cleaned)
    return cleaned

In [63]:
# Preview the cleaned text of the first ten markdown files.
test_mds = list(md_path.glob("*.md"))[:10]
test_results = {}

for md in test_mds:
    doc = md.read_text(encoding="utf-8")
    # One call emits the stem, the cleaned text and a blank separator line.
    print(md.stem, md_noise_reduction(doc), "", sep="\n")

doi_10.1115_1.1286084
1 Introduction The goal of this investigation is to determine the exact point accessibility of robotic manipulators. This information is useful for engineers to optimally posit robot cells and to adequately plan robot trajectories. The point accessibility of a robot to an operating point is defined as the set of all directions from there the point is accessible by the robots end-effector. This set of directions, as the accessibility regions, constructed on the service sphere of the point. The service sphere  1,2  of an operating point is the sphere centered at the point with the radius the length of the robots hand size. wrist design encountered limits of joint rotation. Yang et al.  8  introduced a method of disassembling the manipulator into the regional and wrist structures for determining if a point is accessible from a particular direction. Emiris and Tourassis  17  and Basavaraj and Duffy  18  derived measures which indicate the percentage of directions from

In [10]:
import polars as pl
from pathlib import Path

from src.file_handling.file_location import FolderPathOfASME

# Locate the folder holding the per-batch KeyBERT keyword results.
folder_path = FolderPathOfASME()
keywords_folder_path = folder_path.asme_jmd / 'keywords'
keybert_bert_folder_path = keywords_folder_path / 'keybert_bert'

# Read every parquet file in that folder into its own DataFrame.
dfs: list[pl.DataFrame] = [
    pl.read_parquet(parquet_file)
    for parquet_file in keybert_bert_folder_path.glob('*.parquet')
]

# Stack all frames into one table and display it as the cell output.
dfs_concat = pl.concat(dfs)
dfs_concat

doi,keywords,scores
str,list[str],list[f64]
"""doi_10.1115_1.1286084""","[""joint constraint point"", ""using manipulator kinematic"", … ""freedom manipulators kumar""]","[0.6197, 0.609, … 0.5832]"
"""doi_10.1115_1.1286124""","[""design gears using"", ""realize desired contact"", … ""gear grinding""]","[0.693, 0.6909, … 0.6558]"
"""doi_10.1115_1.1286188""","[""stress results piston"", ""piston model uses"", … ""piston surface""]","[0.6845, 0.6804, … 0.6489]"
"""doi_10.1115_1.1286189""","[""dynamic response gears"", ""model planetary gears"", … ""lumped modeling 10""]","[0.6896, 0.6865, … 0.6509]"
"""doi_10.1115_1.1286236""","[""point contact line"", ""surfaces make contact"", … ""rotation contact point""]","[0.5836, 0.5833, … 0.5556]"
…,…,…
"""doi_10.1115_1.2717233""","[""designs dispensing systems"", ""accurate dispensing performance"", … ""fluid dispensing""]","[0.6494, 0.6485, … 0.5949]"
"""doi_10.1115_1.2717234""","[""parallel manipulator mechanism"", ""prismatic actuator"", … ""isolating fig""]","[0.6038, 0.6004, … 0.5538]"
"""doi_10.1115_1.2717323""","[""figures appear poorly"", ""flipping hard copy"", … ""jmd highest quality""]","[0.6424, 0.6071, … 0.5583]"
"""doi_10.1115_1.2717611""","[""3t1r parallel manipulators"", ""motions 3t1r motions"", … ""parallel parallel manipulator""]","[0.6269, 0.626, … 0.5708]"
