In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): none of the cells visible in this part of the notebook read
# from or write to the mount point — confirm it is still needed further down.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import os

repo_path = '/content/optimized-summarization'
if not os.path.exists(repo_path):
    !git clone https://github.com/srinisvas/optimized-summarization.git
else:
    print("Repo already exists, skipping clone.")

# Check files inside
os.listdir(repo_path)


Cloning into 'optimized-summarization'...
remote: Enumerating objects: 276, done.[K
remote: Counting objects: 100% (150/150), done.[K
remote: Compressing objects: 100% (149/149), done.[K
remote: Total 276 (delta 24), reused 111 (delta 1), pack-reused 126 (from 2)[K
Receiving objects: 100% (276/276), 84.25 MiB | 37.89 MiB/s, done.
Resolving deltas: 100% (26/26), done.


['optimized-summarization', '.idea', '.git', 'README.md']

In [4]:
import os

# Source folder holding the per-paper JSON files, and the folder that will
# receive the normalized copies. Both are absolute paths under
# /content/optimized-summarization.
INPUT_DIR = '/content/optimized-summarization/optimized-summarization/papers_json/'
OUTPUT_DIR = '/content/optimized-summarization/normalized_papers/'

# Create the output folder up front; exist_ok keeps a re-run from failing.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Verify files: show at most the first five directory entries of the input folder.
print(f"Input files: {os.listdir(INPUT_DIR)[:5]}")
print(f"Output folder will be: {OUTPUT_DIR}")


Input files: ['Trading-Off Privacy- Utility- and Explainability in Deep Learning-Based Image Data Analysis.json', 'Privacy-Preserved Mobile Sensing through Hybrid Cloud Trust Framework.json', 'AI Ethics in Healthcare - A Survey.json', 'WIP Using Stories from Traditional Culture to Teach Virtue-Based Engineering Ethics.json', 'Exploring Translational Ethical Competency -TEC- in AI Design Conceptual Analysis and Methodological Reflections from a Pilot Study.json']
Output folder will be: /content/optimized-summarization/normalized_papers/


In [7]:
import re
import json
from typing import Dict, Any, List

# ---------- Regex Definitions ----------
# Numeric citation markers: "[3]", "[1, 2]", "[4-7]", "[4–7]" (en dash) and
# variants with spaces around the separators.
BRACKETED_CITATION_RE = re.compile(r'\[\s*\d+(?:\s*[\-,\u2013]\s*\d+)*\s*\]')

# Parenthetical that starts with an uppercase letter and contains four
# consecutive digits, e.g. "(Smith et al., 2020)".
PAREN_YEAR_CITATION_RE = re.compile(r'\([A-Z][^)]*\d{4}[^)]*\)')

# Section numbering at the start of a line. The numbering token must be
# followed by a separator; otherwise the first capital letter of an ordinary
# heading ("Introduction", "References") would itself be taken for a Roman
# numeral / letter label and removed.
#   * "(1)"                         parenthesised number
#   * "1", "1.", "2.3", "2.3.", "4)" digits, then punctuation or whitespace
#     (the (?!\d) keeps "1.5-bit" from being cut in the middle of a number)
#   * "IV.", "B.", "C)"             Roman numeral or single capital letter, only
#     when punctuation follows and is itself followed by whitespace or line end
#     (so "U.S. Policy" and "IV Therapy" are left intact)
LEADING_NUMBERING_RE = re.compile(
    r'^\s*(?:'
    r'\(\d+\)'
    r'|\d+(?:\.\d+)*(?:[.):](?!\d)|(?=\s))'
    r'|(?:[IVXLCDM]+|[A-Z])[.):](?=\s|$)'
    r')[\s:).\-]*',
    re.MULTILINE,
)

# List markers such as "a) ", "B. " or "12. " at the start of a line. Only
# spaces/tabs are allowed before the marker so the blank line separating two
# paragraphs is not swallowed together with the marker.
LINE_LEADING_ENUM_RE = re.compile(r'(?m)^[ \t]*(?:[a-zA-Z]|\d+)[\)\.]\s+')

# Runs of spaces/tabs (newlines are deliberately excluded).
MULTISPACE_RE = re.compile(r'[ \t]+')

# Three or more consecutive newlines.
MULTINEWLINE_RE = re.compile(r'\n{3,}')

# ---------- Cleaning functions ----------
def clean_heading(h: str) -> str:
    """Return a section heading without its leading numbering prefix.

    Whatever LEADING_NUMBERING_RE matches is removed, then spaces, colons,
    periods, hyphens and en/em dashes are trimmed from both ends.

    Args:
        h: Raw heading text; a falsy value (empty string, None) yields "".

    Returns:
        The trimmed heading, or the whitespace-stripped original when the
        trimming would leave nothing behind.
    """
    if not h:
        return ""
    without_prefix = LEADING_NUMBERING_RE.sub("", h)
    trimmed = without_prefix.strip(" :.-\u2013\u2014")
    if trimmed:
        return trimmed
    # Everything was stripped away; keep the original text rather than
    # returning an empty heading.
    return h.strip()

def clean_text(t: str) -> str:
    """Strip citation markers and list numbering from body text and tidy whitespace.

    Steps, in order: normalize line endings to "\\n", remove bracketed numeric
    citations, remove parenthetical year citations, remove empty "[]" pairs,
    remove list markers at line starts, collapse runs of spaces/tabs to one
    space, collapse three or more newlines to a single blank line, and strip
    the ends.

    Args:
        t: Raw section text; a falsy value (empty string, None) yields "".

    Returns:
        The cleaned text.
    """
    if not t:
        return ""
    # Line endings must be normalized before any line-anchored pattern runs:
    # in MULTILINE mode "^" only matches after "\n", so with CR-only line
    # endings the list markers at the start of each line would never be found.
    t = t.replace("\r\n", "\n").replace("\r", "\n")
    t = BRACKETED_CITATION_RE.sub("", t)
    t = PAREN_YEAR_CITATION_RE.sub("", t)
    # Brackets left empty, e.g. when a citation list held no digits to begin with.
    t = re.sub(r"\[\s*\]", "", t)
    t = LINE_LEADING_ENUM_RE.sub("", t)
    t = MULTISPACE_RE.sub(" ", t)
    t = MULTINEWLINE_RE.sub("\n\n", t)
    return t.strip()

def simplify_paper(paper: Dict[str, Any]) -> Dict[str, Any]:
    """Reduce a parsed paper to its title plus a flat list of cleaned sections.

    Args:
        paper: Parsed paper dict. The keys read are "title", "abstract" and
            "sections" (an iterable of dicts carrying "heading" and "text").
            A key that is missing or explicitly None is treated as empty.

    Returns:
        {"title": <str>, "sections": [{"title": <str>, "content": <str>}, ...]}.
        A non-empty abstract becomes the first section, titled "Abstract".
        Sections whose cleaned heading is "references", "reference" or
        "bibliography" (case-insensitive) are dropped, as are sections with
        neither a heading nor text. A section with text but no heading is
        titled "Section".
    """
    # `or ""` / `or []` guard against JSON nulls, for which dict.get's default
    # is not used because the key is present.
    title = (paper.get("title") or "").strip()
    out_sections: List[Dict[str, str]] = []

    # include abstract
    abstract = (paper.get("abstract") or "").strip()
    if abstract:
        out_sections.append({"title": "Abstract", "content": clean_text(abstract)})

    # clean sections
    for sec in paper.get("sections") or []:
        heading = clean_heading(sec.get("heading", ""))
        if heading.lower() in {"references", "reference", "bibliography"}:
            continue
        text = clean_text(sec.get("text", ""))
        if heading or text:
            out_sections.append({"title": heading or "Section", "content": text})

    return {"title": title, "sections": out_sections}


In [8]:
# Normalize every JSON paper found in INPUT_DIR and write the result under the
# same file name in OUTPUT_DIR.
for filename in os.listdir(INPUT_DIR):
    if not filename.endswith(".json"):
        continue  # skip directory entries that are not JSON files

    input_path = os.path.join(INPUT_DIR, filename)
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    cleaned = simplify_paper(data)

    output_path = os.path.join(OUTPUT_DIR, filename)
    with open(output_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the output.
        json.dump(cleaned, f, ensure_ascii=False, indent=2)

    print(f"Processed & saved: {filename}")

# Check one example: re-read the first entry listed in OUTPUT_DIR and show the
# first 1000 characters of its pretty-printed JSON.
example_file = os.listdir(OUTPUT_DIR)[0]
with open(os.path.join(OUTPUT_DIR, example_file), "r", encoding="utf-8") as f:
    print(
        json.dumps(json.load(f), indent=2)[:1000]
    )


Processed & saved: Trading-Off Privacy- Utility- and Explainability in Deep Learning-Based Image Data Analysis.json
Processed & saved: Privacy-Preserved Mobile Sensing through Hybrid Cloud Trust Framework.json
Processed & saved: AI Ethics in Healthcare - A Survey.json
Processed & saved: WIP Using Stories from Traditional Culture to Teach Virtue-Based Engineering Ethics.json
Processed & saved: Exploring Translational Ethical Competency -TEC- in AI Design Conceptual Analysis and Methodological Reflections from a Pilot Study.json
Processed & saved: The Perspective of Dimensional Perpetuity for Artificial Intelligence A Model on Socio-Legal and Political Evolution as a Challenge to Entrepreneurial Ethics.json
Processed & saved: Enhancing Data Privacy in Edge-based Driver AI Monitoring Systems Through Adaptive Differential Privacy.json
Processed & saved: Applying Communication Privacy Management Theory to Youth Privacy Management in AI Contexts.json
Processed & saved: A Privacy-Leakage-Tole