In [1]:
import json
import os
import re
from typing import Dict, Any, List

In [6]:
# Colab-only step: mount Google Drive so its contents are reachable under
# /content/drive for the copy and read steps that follow.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
!git clone https://github.com/srinisvas/optimized-summarization.git

fatal: destination path 'optimized-summarization' already exists and is not an empty directory.


In [12]:
!cp -r optimized-summarization/ /content/drive/MyDrive/optimized-summarization
INPUT_DIR = '/content/drive/MyDrive/optimized-summarization/optimized-summarization/papers_json/'

In [2]:
# ---------- Regex Definitions ----------
# Matches a bracketed numeric citation marker: one number or number range
# (hyphen or en dash), optionally followed by more comma-separated numbers or
# ranges, with optional whitespace throughout -- e.g. "[3]", "[1-4]",
# "[2, 5–7]". Any bracketed all-numeric token matches, not only citations.
BRACKETED_CITATION_RE = re.compile(
    r"""\[
        \s*
        (?:\d+(?:\s*[-–]\s*\d+)?)
        (?:\s*,\s*\d+(?:\s*[-–]\s*\d+)?)*
        \s*
    \]""",
    re.VERBOSE,
)

# Matches a parenthesised author-year citation such as "(Smith et al., 2020)"
# or "(Lee, 2019a; Kim, 2021)". Each entry is an optional author part (starts
# with a capital; letters, hyphens, dots, apostrophes and spaces) followed by a
# comma, then a year beginning with 19 or 20 and an optional lowercase suffix;
# entries are separated by ";".
# NOTE: the author part is optional, so a bare "(2019)" also matches. "&" is not
# in the author character class, so "(Smith & Jones, 2020)" does not match.
PAREN_YEAR_CITATION_RE = re.compile(
    r"""\(
        \s*
        (?:[A-Z][A-Za-z\-\.' ]+(?:et\ al\.)?,\s*)?
        (?:19|20)\d{2}[a-z]?
        (?:\s*;\s*(?:[A-Z][A-Za-z\-\.' ]+(?:et\ al\.)?,\s*)?(?:19|20)\d{2}[a-z]?)*
        \s*
    \)""",
    re.VERBOSE,
)

# Matches a leading section-numbering token together with the punctuation and
# whitespace that follow it, so that headings such as "II. Related Work",
# "3.1 Methods", "A) Setup" or "(2) Results" reduce to their text.
#
# The token must be followed by a real delimiter. Without that requirement the
# alphabetic alternatives match the first capital letter of ordinary words:
# "Introduction" would be cut to "ntroduction" and "References" to "eferences"
# (which defeats any exact-name check on the cleaned heading), and the numeric
# alternative would cut "3D Vision" to "D Vision".
#
# Trade-off: alphabetic markers written without punctuation ("II Related Work")
# are left untouched, because they cannot be told apart from real words such as
# "A Survey" or "ML Models".
LEADING_NUMBERING_RE = re.compile(
    r"""^\s*
        (?:
            # Roman numeral or single capital letter: accepted only when it is
            # followed by . ) or : and then whitespace or end of string, so
            # that A Survey and U.S. Policy stay intact.
            (?:[IVXLCDM]+|[A-Z])(?=[.):](?:\s|$))
          |
            # Arabic numbering like 1, 1.2.3, (1) or 1) : must be followed by
            # whitespace, a colon, a closing paren, end of string, or a dot
            # that does not start a decimal part (keeps 3D and 1.5x intact).
            \(?\d+(?:\.\d+)*\)?(?=[\s:)]|\.(?!\d)|$)
        )
        [\s:)\.-]*
    """,
    re.VERBOSE,
)

# Strips a list marker such as "a) ", "B. " or "12. " from the start of each
# line. Leading indentation is limited to spaces/tabs: with \s the match could
# begin on a preceding blank line and swallow the paragraph break along with
# the marker.
LINE_LEADING_ENUM_RE = re.compile(r"(?m)^[ \t]*(?:[a-zA-Z]|\d+)[\)\.]\s+")
# A run of spaces and/or tabs (newlines are not included).
MULTISPACE_RE = re.compile(r"[ \t]+")
# Three or more consecutive newlines.
MULTINEWLINE_RE = re.compile(r"\n{3,}")

In [3]:
def clean_heading(h: str) -> str:
    """Return a section heading with its leading numbering removed.

    Args:
        h: Raw heading text; any falsy value yields "".

    Returns:
        "" when the heading is empty or consists solely of two or more
        bracketed numeric citation markers (e.g. "[32][33]"). Otherwise the
        heading with whatever LEADING_NUMBERING_RE matches removed and
        surrounding spaces, colons, dots, hyphens and en/em dashes trimmed.
        If that leaves nothing, the whitespace-stripped original is returned
        instead.
    """
    if not h:
        return ""

    # A "heading" made only of citation markers is an extraction artefact.
    citation_only = re.fullmatch(
        r"(?:\s*\[\s*\d+(?:\s*[-–]\s*\d+)?\s*\]\s*){2,}", h
    )
    if citation_only is not None:
        return ""

    without_numbering = LEADING_NUMBERING_RE.sub("", h)
    trimmed = without_numbering.strip(" :.-\u2013\u2014")
    # Fall back to the original text when the heading was nothing but numbering.
    return trimmed or h.strip()

In [4]:
def clean_text(t: str) -> str:
    """Strip citation markers, list enumerators and redundant whitespace.

    Steps, in order:
      1. Normalise CRLF and bare CR line endings to LF. This has to happen
         before any line-anchored pattern runs, because ``^`` in multiline
         mode only recognises LF as a line break.
      2. Remove matches of BRACKETED_CITATION_RE and PAREN_YEAR_CITATION_RE,
         then any leftover empty ``[]`` pairs.
      3. Remove matches of LINE_LEADING_ENUM_RE (list markers at line starts).
      4. Drop spaces/tabs left dangling in front of sentence punctuation by
         the removals above ("as shown in [3], ..." -> "as shown in, ...").
      5. Collapse runs of spaces/tabs to one space and runs of three or more
         newlines to a single blank line.

    Args:
        t: Raw text; any falsy value yields "".

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    if not t:
        return ""
    t = t.replace("\r\n", "\n").replace("\r", "\n")
    t = BRACKETED_CITATION_RE.sub("", t)
    t = PAREN_YEAR_CITATION_RE.sub("", t)
    t = re.sub(r"\[\s*\]", "", t)
    t = LINE_LEADING_ENUM_RE.sub("", t)
    # Only punctuation that is itself followed by whitespace/end is touched, so
    # tokens such as " .05" keep their leading space.
    t = re.sub(r"[ \t]+([,.;:!?])(?=\s|$)", r"\1", t)
    t = MULTISPACE_RE.sub(" ", t)
    t = MULTINEWLINE_RE.sub("\n\n", t)
    return t.strip()

In [5]:
def simplify_paper(paper: Dict[str, Any]) -> Dict[str, Any]:
    """Reduce a parsed paper to its title plus a flat list of cleaned sections.

    Args:
        paper: Mapping read with ``.get`` for the keys "title", "abstract" and
            "sections"; each section is read with ``.get`` for "heading" and
            "text". A missing key and an explicit ``None`` value are treated
            the same way (as empty).

    Returns:
        ``{"title": <stripped title>, "sections": [...]}`` where every section
        is ``{"title": ..., "content": ...}``:
          * a non-empty abstract comes first, titled "Abstract";
          * sections whose cleaned heading is "references", "reference" or
            "bibliography" (case-insensitive) are skipped;
          * sections with neither heading nor text are dropped;
          * a section with text but no heading is titled "Section".
    """
    # `or ""` rather than a .get default: a key present with a null value would
    # otherwise reach .strip() as None and raise.
    title = (paper.get("title") or "").strip()
    out_sections: List[Dict[str, str]] = []

    # include abstract as section if present
    abstract = (paper.get("abstract") or "").strip()
    if abstract:
        out_sections.append({"title": "Abstract", "content": clean_text(abstract)})

    for sec in paper.get("sections") or []:
        heading = clean_heading(sec.get("heading") or "")
        if heading.lower() in {"references", "reference", "bibliography"}:
            continue
        text = clean_text(sec.get("text") or "")
        if heading.strip() or text.strip():
            out_sections.append({"title": heading or "Section", "content": text})

    return {"title": title, "sections": out_sections}

In [15]:
# Clean every *.json paper found directly under INPUT_DIR, keyed by file name.
cleaned_papers = {}
# os.listdir() returns entries in arbitrary order; sorting keeps the run log and
# the preview below identical from one execution to the next.
for filename in sorted(os.listdir(INPUT_DIR)):
    if not filename.endswith(".json"):
        continue
    full_path = os.path.join(INPUT_DIR, filename)
    try:
        with open(full_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        cleaned_papers[filename] = simplify_paper(data)
        print(f"Cleaned: {filename}")
    except Exception as e:
        # Best effort: report the problematic file and continue with the rest.
        print(f"Error processing {filename}: {e}")

# Preview the first cleaned paper. next() gets a default so that an empty
# directory (or a run where every file failed) is reported instead of raising
# StopIteration.
example_key = next(iter(cleaned_papers), None)
if example_key is None:
    print(f"No papers were cleaned from {INPUT_DIR}")
else:
    print(json.dumps(cleaned_papers[example_key], indent=2)[:1500])

Cleaned: A Bibliometric View of AI Ethics Development.json
Cleaned: A Model for Using Ethical Theory to Specify Epistemic Goals for Explainable AI.json
Cleaned: A Privacy Impact Assessment Tool for Cloud Computing.json
Cleaned: A Privacy Maturity Model for Cloud Storage Services.json
Cleaned: A Privacy-Leakage-Tolerance Based Noise Enhancing Strategy for Privacy Protection in Cloud Computing.json
Cleaned: A novel framework to prevent privacy breach in cloud data storage area service.json
Cleaned: A privacy-preserving mechanism based on local differential privacy in edge computing.json
Cleaned: AI Ethics Impact Assessment based on Requirement Engineering.json
Cleaned: AI Ethics and Data Privacy compliance.json
Cleaned: AI Ethics in Healthcare - A Survey.json
Cleaned: AI Ethics in IoT Involving Society in the Discussion.json
Cleaned: AI Governance A General Perspective.json
Cleaned: AI Governance and Ethics in Public Procurement Bridging the Gap Between Theory and Practice.json
Cleaned: 