In [1]:
import json
import re
import os
from pprint import pprint
import unicodedata

In [2]:
data_dir = "/mnt/data/sara-salamat/generative-topic-evolution/data/raw"

# list the directories in data_dir and get the name of the directories
directories = os.listdir(data_dir)
data = {}
for directory in directories:
    # load json in the directory with unknown name
    with open(os.path.join(data_dir, directory, f"{directory}_notes_with_decisions.json"), "r") as f:
        data[directory] = json.load(f)



In [4]:
# print and example of the data
pprint(data['neurips2021'][0].keys())

dict_keys(['id', 'original', 'cdate', 'pdate', 'odate', 'mdate', 'tcdate', 'tmdate', 'ddate', 'number', 'content', 'forum', 'referent', 'invitation', 'replyto', 'readers', 'nonreaders', 'signatures', 'writers', 'decision'])


In [5]:
pprint(data['neurips2021'][1].keys())

dict_keys(['id', 'original', 'cdate', 'pdate', 'odate', 'mdate', 'tcdate', 'tmdate', 'ddate', 'number', 'content', 'forum', 'referent', 'invitation', 'replyto', 'readers', 'nonreaders', 'signatures', 'writers', 'decision'])


In [6]:
# count the number of submissions in each conference
# count how many have decisions

for conf_name, conf_data in data.items():
    print(f"Conference {conf_name} has {len(conf_data)} submissions.")
    try:
        not_none_decisions = [submission for submission in conf_data if submission['decision'] is not None]
        print(f"Conference {conf_name} has {len(not_none_decisions)} submissions with decisions.")
        print(f"Conference {conf_name} has {len(not_none_decisions) / len(conf_data) * 100}% submissions with decisions.")
    except:
        print(f"Conference {conf_name} has no decisions.")


Conference neurips2021 has 2768 submissions.
Conference neurips2021 has 0 submissions with decisions.
Conference neurips2021 has 0.0% submissions with decisions.
Conference neurips2022 has 2824 submissions.
Conference neurips2022 has 0 submissions with decisions.
Conference neurips2022 has 0.0% submissions with decisions.
Conference neurips2023 has 3395 submissions.
Conference neurips2023 has no decisions.
Conference neurips2024 has 4236 submissions.
Conference neurips2024 has no decisions.


### Check Later:

- Why data does not have decisions?


In [7]:
def clean_text(text: str) -> str:
    """
    Cleans input text by removing control characters, unusual unicode symbols,
    and invisible or non-ASCII characters, while keeping case, numbers, and punctuation.
    """
    # Normalize Unicode (e.g., decompose accents)
    text = unicodedata.normalize("NFKC", text)

    # Remove non-printable/control characters
    text = ''.join(ch for ch in text if unicodedata.category(ch)[0] != "C")

    # Remove lingering non-ASCII or corrupted characters (e.g., \uXXXX)
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Optional: collapse extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [8]:
# fromat to save cleaned processed data:
# {
#     "conference_name": [
#         {
#             "id": "...",
#             "TL;DR": "...",
#             "title": "...",
#             "abstract": "...",
#             "authors": [],
#             "keywords": [],
#             "venue": "..."
#         }
#     ]
# }

# Lets print  the schema of data for each conference
for conf_name, conf_data in data.items():
    print(f"\nConference {conf_name} has {len(conf_data)} submissions.")
    print("Schema of data:")
    print("Keys:",conf_data[0].keys())
    # print("Keys of content:",conf_data[0]["content"].keys())
    # print("Format of title in content:",conf_data[0]["content"]["title"])
    # print("Format of abstract in content:",conf_data[0]["content"]["abstract"])
    print("-"*100)
    



Conference neurips2021 has 2768 submissions.
Schema of data:
Keys: dict_keys(['id', 'original', 'cdate', 'pdate', 'odate', 'mdate', 'tcdate', 'tmdate', 'ddate', 'number', 'content', 'forum', 'referent', 'invitation', 'replyto', 'readers', 'nonreaders', 'signatures', 'writers', 'decision'])
----------------------------------------------------------------------------------------------------

Conference neurips2022 has 2824 submissions.
Schema of data:
Keys: dict_keys(['id', 'original', 'cdate', 'pdate', 'odate', 'mdate', 'tcdate', 'tmdate', 'ddate', 'number', 'content', 'forum', 'referent', 'invitation', 'replyto', 'readers', 'nonreaders', 'signatures', 'writers', 'decision'])
----------------------------------------------------------------------------------------------------

Conference neurips2023 has 3395 submissions.
Schema of data:
Keys: dict_keys(['id', 'forum', 'content', 'invitations', 'cdate', 'pdate', 'odate', 'mdate', 'signatures', 'writers', 'readers'])
---------------------

In [9]:
def extract_field(content, key):
    val = content.get(key) or content.get(key.lower())
    return val.get("value") if isinstance(val, dict) else val or ""

processed_data = {}

for conf_name, conf_data in data.items():
    cleaned_entries = []
    for paper in conf_data:
        content = paper.get("content", {})
        cleaned = {
            "id": paper.get("id"),
            "TL;DR": clean_text(extract_field(content, "TL;DR")),
            "title": clean_text(extract_field(content, "title")),
            "abstract": clean_text(extract_field(content, "abstract")),
            "authors": content.get("authors", []),
            "keywords": content.get("keywords", []),
            "venue": content.get("venue")
        }
        if cleaned["title"] and cleaned["abstract"]:
            cleaned_entries.append(cleaned)
    processed_data[conf_name] = cleaned_entries
with open("/mnt/data/sara-salamat/generative-topic-evolution/data/processed/cleaned_data_per_conference.json", "w") as f:
    json.dump(processed_data, f, indent=4)

In [36]:
def count_missing_fields_processed(processed_data):
    for conf_name, papers in processed_data.items():
        missing_title = 0
        missing_abstract = 0
        missing_tldr = 0

        for paper in papers:
            if not paper.get("title"):
                missing_title += 1
            if not paper.get("abstract"):
                missing_abstract += 1
            if not paper.get("TL;DR"):
                missing_tldr += 1

        print(f"{conf_name}:")
        print(f"  Missing title: {missing_title}")
        print(f"  Missing abstract: {missing_abstract}")
        print(f"  Missing TL;DR: {missing_tldr}")
        print("-" * 50)

# Run it
count_missing_fields_processed(processed_data)


neurips2021:
  Missing title: 0
  Missing abstract: 0
  Missing TL;DR: 569
--------------------------------------------------
neurips2022:
  Missing title: 0
  Missing abstract: 0
  Missing TL;DR: 719
--------------------------------------------------
neurips2023:
  Missing title: 0
  Missing abstract: 0
  Missing TL;DR: 3395
--------------------------------------------------
neurips2024:
  Missing title: 0
  Missing abstract: 0
  Missing TL;DR: 4236
--------------------------------------------------


In [37]:
from collections import Counter

def count_venues_with_missing(processed_data):
    for conf_name, papers in processed_data.items():
        venue_counter = Counter()
        missing_count = 0

        for paper in papers:
            venue = paper.get("venue")
            if isinstance(venue, str) and venue.strip():
                venue_counter[venue.lower()] += 1
            else:
                missing_count += 1

        print(f"{conf_name} venue categories:")
        for venue, count in sorted(venue_counter.items(), key=lambda x: -x[1]):
            print(f"  {venue}: {count}")
        print(f"  <empty venue>: {missing_count}")
        print("-" * 50)

# Run it
count_venues_with_missing(processed_data)


neurips2021 venue categories:
  neurips 2021 poster: 2286
  neurips 2021 spotlight: 284
  neurips 2021 submitted: 136
  neurips 2021 oral: 60
  <empty venue>: 2
--------------------------------------------------
neurips2022 venue categories:
  neurips 2022 accept: 2671
  neurips 2022 submitted: 153
  <empty venue>: 0
--------------------------------------------------
neurips2023 venue categories:
  <empty venue>: 3395
--------------------------------------------------
neurips2024 venue categories:
  <empty venue>: 4236
--------------------------------------------------
