In [1]:
import zipfile
import xml.etree.ElementTree as ET
import json

# Extract title/abstract records for the selected venues and years from the
# zipped XML dump and write them to a single JSON file.
zip_path = "/content/xml.zip"

target_confs = {"acl", "emnlp", "naacl"}
target_years = {"2020", "2021", "2022", "2023", "2024"}

abstracts = []

# Exact archive member name -> (year, conf). A single dict lookup per member
# replaces re-running the year x conf loops for every entry in the archive.
wanted_members = {
    f"xml/{year}.{conf}.xml": (year, conf)
    for year in target_years
    for conf in target_confs
}

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    # Iterate in archive order so the record order matches the zip layout.
    for filename in zip_ref.namelist():
        if filename not in wanted_members:
            continue
        year, conf = wanted_members[filename]

        # Best effort on purpose: one unreadable or malformed member is
        # reported and skipped so the remaining files are still processed.
        try:
            with zip_ref.open(filename) as file:
                tree = ET.parse(file)
                root = tree.getroot()

                for paper in root.findall(".//paper"):
                    title_el = paper.find("title")
                    abstract_el = paper.find("abstract")

                    # findtext() only returns the text that precedes the first
                    # child element, so a title/abstract containing inline
                    # markup would be truncated (or come back empty). Joining
                    # itertext() keeps the text of all nested elements.
                    # NOTE(review): presumably these files use inline tags
                    # inside <title>/<abstract> - confirm against the schema.
                    title = "" if title_el is None else "".join(title_el.itertext()).strip()
                    abstract = "" if abstract_el is None else "".join(abstract_el.itertext()).strip()

                    # Checked after stripping, so whitespace-only abstracts
                    # are skipped instead of being stored as "".
                    if not abstract:
                        continue

                    abstracts.append({
                        "year": year,
                        "venue": conf.upper(),
                        "title": title or "N/A",
                        "abstract": abstract
                    })
        except Exception as e:
            print(f"Error in {filename}: {e}")

# Save results
with open("acl-emnlp-naacl-2020-2024_abstracts.json", "w", encoding="utf-8") as f:
    json.dump(abstracts, f, indent=2, ensure_ascii=False)

print(f"Extracted {len(abstracts)} abstracts.")


Extracted 11676 abstracts.


In [2]:
import zipfile
import xml.etree.ElementTree as ET
import json

# Extract title/abstract records for the selected venues from the zipped XML
# dump and write them to two JSON files: one for 2020-2023 and one for 2024.
zip_path = "/content/xml.zip"

target_confs = {"acl", "emnlp", "naacl"}
years_2020_2023 = {"2020", "2021", "2022", "2023"}
year_2024 = {"2024"}

abstracts_2020_2023 = []
abstracts_2024 = []


def _full_text(element):
    """Return the stripped text of ``element`` including all nested elements.

    ``Element.findtext`` only yields the text before the first child element,
    which truncates content containing inline markup; joining ``itertext()``
    keeps everything. Returns ``""`` when ``element`` is ``None``.
    """
    if element is None:
        return ""
    return "".join(element.itertext()).strip()


def _records_from_xml(xml_file, year, conf):
    """Parse one XML stream and return a list of abstract records.

    Args:
        xml_file: Readable binary file object containing the XML document.
        year: Year string stored in each record's ``"year"`` field.
        conf: Lower-case conference key; stored upper-cased as ``"venue"``.

    Returns:
        One dict per ``<paper>`` that has a non-empty abstract, with the keys
        ``year``, ``venue``, ``title`` (``"N/A"`` when missing or empty) and
        ``abstract``.

    Raises:
        xml.etree.ElementTree.ParseError: If the document is not well-formed.
    """
    root = ET.parse(xml_file).getroot()
    records = []
    for paper in root.findall(".//paper"):
        abstract = _full_text(paper.find("abstract"))
        # Papers without an abstract (or with a whitespace-only one) are skipped.
        if not abstract:
            continue
        records.append({
            "year": year,
            "venue": conf.upper(),
            "title": _full_text(paper.find("title")) or "N/A",
            "abstract": abstract
        })
    return records


# Exact archive member name -> (year, conf, destination list). One lookup per
# member replaces the two copy-pasted year x conf loops.
destinations = {}
for year_group, bucket in (
    (years_2020_2023, abstracts_2020_2023),
    (year_2024, abstracts_2024),
):
    for year in year_group:
        for conf in target_confs:
            destinations[f"xml/{year}.{conf}.xml"] = (year, conf, bucket)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    # Iterate in archive order so the record order matches the zip layout.
    for filename in zip_ref.namelist():
        target = destinations.get(filename)
        if target is None:
            continue
        year, conf, bucket = target

        # Best effort on purpose: one unreadable or malformed member is
        # reported and skipped so the remaining files are still processed.
        try:
            with zip_ref.open(filename) as file:
                bucket.extend(_records_from_xml(file, year, conf))
        except Exception as e:
            print(f"Error in {filename}: {e}")

with open("acl-emnlp-naacl-2020-2023_abstracts.json", "w", encoding="utf-8") as f:
    json.dump(abstracts_2020_2023, f, indent=2, ensure_ascii=False)

with open("acl-emnlp-naacl-2024_abstracts.json", "w", encoding="utf-8") as f:
    json.dump(abstracts_2024, f, indent=2, ensure_ascii=False)

print(f"Extracted {len(abstracts_2020_2023)} abstracts from 2020–2023.")
print(f"Extracted {len(abstracts_2024)} abstracts from 2024.")


Extracted 8584 abstracts from 2020–2023.
Extracted 3092 abstracts from 2024.
