In [None]:
import os

# Directory layout for the pipeline: input PDFs, raw extracted text,
# per-section JSON, and cross-paper comparison output.
folders = ["data/pdfs", "data/extracted_text", "data/structured_sections", "data/comparisons"]

# exist_ok=True keeps this cell safe to re-run: existing directories are left alone.
for folder in folders:
    os.makedirs(folder, exist_ok=True)

print("Folders created")


Folders created


In [None]:
import fitz
import re
import json
from collections import Counter


In [None]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string: the ``get_text()`` output of each page, joined in
        page order with no separator added between pages.

    Raises:
        Whatever ``fitz.open`` or ``page.get_text`` raise; the document is
        closed before the exception propagates.
    """
    doc = fitz.open(pdf_path)
    try:
        # join() avoids re-copying the accumulated string on every page.
        return "".join(page.get_text() for page in doc)
    finally:
        # Always release the document handle, even if a page fails to extract.
        doc.close()


In [None]:
# Smoke-test the extractor on a single paper.
# The path is relative, matching the "data/pdfs" directory used by the rest
# of the notebook, so the cell does not depend on one machine's absolute layout.
pdf_path = "data/pdfs/2512.00419v1.pdf"
text = extract_text_from_pdf(pdf_path)

# Show only the beginning of the document; printing the whole paper floods
# the notebook output.
PREVIEW_CHARS = 1000
print(text[:PREVIEW_CHARS])


Hardware-aware Lightweight Photonic Spiking Neural 
Network for Pattern Classification 
Shuiying Xiang1*, Yahui Zhang1, Shangxuan Shi1, Haowen Zhao1, Dianzhuang Zheng1, 
Xingxing Guo1, Yanan Han1, Ye Tian1, Liyue Zhang2, Yuechun Shi3, & Yue Hao1 
1State Key Laboratory of Integrated Service Networks, State Key Discipline Laboratory of Wide Bandgap Semiconductor Technology, 
Xidian University, Xi'an 710071, China; 
2Key Laboratory of Photonic-Electronic Integration and Communication-Sensing Convergence (Ministry of Education), Southwest 
Jiaotong University, Sichuan, 611756, China; 
3Yongjiang laboratory, No. 1792 Cihai South Road, Ningbo 315202, China. 
*Corresponding author: syxiang@xidian.edu.cn 
Received 19 Nov. 2025; revised XX Month, XXXX; accepted XX Month XXXX; posted XX Month XXXX (Doc. ID XXXXX); published XX Month XXXX 
There exists a significant scale gap between photonic neural network integrated chips and neural networks, which hinders the deployment 
and application of pho

In [None]:
def clean_text(text):
    """Normalise extracted PDF text into a single whitespace-collapsed line.

    Steps, in order:
      1. Remove page markers of the form ``Page <digits>``.
      2. Collapse every run of whitespace (spaces, tabs, newlines) into one space.
      3. Strip leading and trailing whitespace.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned string.
    """
    # Page markers are removed *before* collapsing whitespace; removing them
    # afterwards would leave a double space where each marker used to be.
    # \s+ keeps matching markers whose word and number are split by a newline
    # or by several spaces.
    text = re.sub(r'Page\s+\d+', '', text)
    # \s already covers newlines, so no separate newline pass is needed.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


In [None]:
# Section names searched for, in the order they are expected to appear.
SECTIONS = [
    "abstract",
    "introduction",
    "related work",
    "methodology",
    "methods",
    "experiments",
    "results",
    "conclusion"
]

def detect_sections(text):
    """Split ``text`` into sections keyed by the names in ``SECTIONS``.

    This is a plain substring heuristic, not real heading detection: a section
    starts at the first case-insensitive occurrence of its name anywhere in
    ``text``. It ends at the first occurrence (after that start) of the first
    later-listed section name that can be found, or at the end of ``text``
    when none is found.

    Args:
        text: Document text to split.

    Returns:
        Dict mapping each section name that occurs in ``text`` to the stripped
        slice of ``text`` for that section. Names that never occur are omitted.
    """
    # Search the original string case-insensitively instead of searching
    # text.lower(): lower() can change the string length for some characters
    # (e.g. U+0130), which would shift every offset used to slice `text`.
    # re.ASCII limits case folding to ASCII letters, which is all the section
    # names contain.
    patterns = {
        sec: re.compile(re.escape(sec), re.IGNORECASE | re.ASCII)
        for sec in SECTIONS
    }

    sections = {}

    for i, sec in enumerate(SECTIONS):
        first = patterns[sec].search(text)
        if first is None:
            continue
        start = first.start()

        end = len(text)
        for next_sec in SECTIONS[i+1:]:
            # start + 1 so a name is never matched at the section's own start.
            following = patterns[next_sec].search(text, start + 1)
            if following is not None:
                end = following.start()
                break

        sections[sec] = text[start:end].strip()

    return sections


In [None]:
# Phrases whose presence marks a sentence as a candidate "key finding".
KEYWORDS = [
    "propose", "introduce", "improve",
    "results show", "outperform", "significant"
]

def extract_key_findings(text, limit=5):
    """Return the first sentences of ``text`` that contain a keyword.

    Sentences are obtained by splitting on whitespace that follows ``.``,
    ``!`` or ``?``. A sentence qualifies when its lower-cased form contains
    any entry of ``KEYWORDS`` as a substring.

    Args:
        text: Text to scan.
        limit: Maximum number of sentences to return. ``None`` means no cap;
            zero or a negative value yields an empty list.

    Returns:
        List of matching sentences in document order, at most ``limit`` long.
    """
    if limit is not None and limit <= 0:
        return []

    sentences = re.split(r'(?<=[.!?])\s+', text)
    findings = []

    for s in sentences:
        # Lower-case once per sentence rather than once per keyword.
        lowered = s.lower()
        if any(k in lowered for k in KEYWORDS):
            findings.append(s)
            # Only re-check the cap when the list actually grew.
            if limit is not None and len(findings) >= limit:
                break

    return findings


In [None]:
# Maps each PDF file name to the key-finding sentences extracted from it.
all_findings = {}

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order, and therefore the insertion order of all_findings, reproducible
# across runs and machines.
for pdf in sorted(os.listdir("data/pdfs")):
    if not pdf.endswith(".pdf"):
        continue

    print("Processing:", pdf)
    path = f"data/pdfs/{pdf}"

    raw = extract_text_from_pdf(path)
    clean = clean_text(raw)

    # Save full text
    with open(f"data/extracted_text/{pdf}.json", "w", encoding="utf-8") as f:
        json.dump({"text": clean}, f, indent=4)

    sections = detect_sections(clean)

    # Save sections
    with open(f"data/structured_sections/{pdf}_sections.json", "w", encoding="utf-8") as f:
        json.dump(sections, f, indent=4)

    # Findings are taken from the whole cleaned text, not from one section.
    all_findings[pdf] = extract_key_findings(clean)


Processing: 2512.19182v1.pdf
Processing: 2512.00427v1.pdf
Processing: 2512.00419v1.pdf


In [None]:
# Function words that carry no information about a paper's content; without
# this filter the ranking is dominated by "the", "of", "and", ...
STOPWORDS = {
    "a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "in",
    "is", "it", "its", "of", "on", "or", "our", "that", "the", "their",
    "this", "to", "was", "we", "were", "which", "with",
}

counter = Counter()

for findings in all_findings.values():
    for sentence in findings:
        # Tokenise on word characters (keeping inner hyphens/apostrophes) so
        # that "network." and "network" are counted as the same word.
        tokens = re.findall(r"[^\W_]+(?:[-'][^\W_]+)*", sentence.lower())
        counter.update(
            t for t in tokens
            if len(t) > 1 and t not in STOPWORDS
        )

# Top 15 content words across the key findings of all papers.
common_words = counter.most_common(15)

with open("data/comparisons/comparison.json", "w", encoding="utf-8") as f:
    json.dump(common_words, f, indent=4)

common_words


[('the', 23),
 ('of', 14),
 ('and', 11),
 ('photonic', 10),
 ('neural', 9),
 ('spiking', 8),
 ('a', 8),
 ('to', 8),
 ('architecture', 7),
 ('proposed', 6),
 ('on', 6),
 ('in', 6),
 ('an', 5),
 ('network', 5),
 ('we', 4)]