In [11]:
!pip install requests beautifulsoup4




In [None]:
import requests
from bs4 import BeautifulSoup
import time

EVENT_URL = "https://aclanthology.org/events/acl-2020/"

def get_paper_links():
    response = requests.get(EVENT_URL)
    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all('a', href=True)

    paper_links = []
    for link in links:
        href = link['href']
        if href.startswith("/2020.acl-main.") and href.count('.') == 2:
            full_url = "https://aclanthology.org" + href
            paper_links.append(full_url)
    return sorted(set(paper_links))

def extract_abstract(paper_url):
    try:
        response = requests.get(paper_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        abstract_div = soup.find('div', class_='card-body acl-abstract')
        if abstract_div:
            return abstract_div.text.strip()
    except Exception as e:
        print(f"Error fetching {paper_url}: {e}")
    return None

def main():
    links = get_paper_links()
    print(f"Found {len(links)} ACL 2020 main papers.")

    with open("acl_2020_main_abstracts.txt", "w", encoding="utf-8") as f:
        for i, url in enumerate(links, 1):
            print(f"[{i}/{len(links)}] Fetching abstract from {url}")
            abstract = extract_abstract(url)
            if abstract:
                f.write(f"URL: {url}\n")
                f.write(f"Abstract: {abstract}\n")
                f.write("="*80 + "\n")
            else:
                print(f"No abstract found for: {url}")
            time.sleep(1)

if __name__ == "__main__":
    main()


Found 779 ACL 2020 main papers.
[1/779] Fetching abstract from https://aclanthology.org/2020.acl-main.0/
⚠️ No abstract found for: https://aclanthology.org/2020.acl-main.0/
[2/779] Fetching abstract from https://aclanthology.org/2020.acl-main.1/
[3/779] Fetching abstract from https://aclanthology.org/2020.acl-main.10/
[4/779] Fetching abstract from https://aclanthology.org/2020.acl-main.100/
[5/779] Fetching abstract from https://aclanthology.org/2020.acl-main.101/
[6/779] Fetching abstract from https://aclanthology.org/2020.acl-main.102/
[7/779] Fetching abstract from https://aclanthology.org/2020.acl-main.103/
[8/779] Fetching abstract from https://aclanthology.org/2020.acl-main.104/
[9/779] Fetching abstract from https://aclanthology.org/2020.acl-main.105/
[10/779] Fetching abstract from https://aclanthology.org/2020.acl-main.106/
[11/779] Fetching abstract from https://aclanthology.org/2020.acl-main.107/
[12/779] Fetching abstract from https://aclanthology.org/2020.acl-main.108/
[1

In [None]:
import requests
from bs4 import BeautifulSoup
import time

def get_paper_links(event, year):
    event_url = f"https://aclanthology.org/events/{event}-{year}/"
    response = requests.get(event_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all('a', href=True)

    paper_links = []
    for link in links:
        href = link['href']
        if href.startswith(f"/{year}.{event}-main.") and href.count('.') == 2:
            full_url = "https://aclanthology.org" + href
            paper_links.append(full_url)
    return sorted(set(paper_links))

def extract_abstract(paper_url):
    try:
        response = requests.get(paper_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        abstract_div = soup.find('div', class_='card-body acl-abstract')
        if abstract_div:
            return abstract_div.text.strip()
    except Exception as e:
        print(f"Error fetching {paper_url}: {e}")
    return None

def main():
    events = {'acl': range(2021, 2025), 'emnlp': range(2020, 2025), 'naacl': range(2020, 2025)}
    for event, years in events.items():
        for year in years:
            links = get_paper_links(event, year)
            print(f"Found {len(links)} {event.upper()} {year} main papers.")
            with open(f"{event}_{year}_main_abstracts.txt", "w", encoding="utf-8") as f:
                for i, url in enumerate(links, 1):
                    print(f"[{i}/{len(links)}] Fetching abstract from {url}")
                    abstract = extract_abstract(url)
                    if abstract:
                        f.write(f"URL: {url}\n")
                        f.write(f"Abstract: {abstract}\n")
                        f.write("="*80 + "\n")
                    else:
                        print(f"No abstract found for: {url}")
                    time.sleep(1)

if __name__ == "__main__":
    main()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[719/752] Fetching abstract from https://aclanthology.org/2020.emnlp-main.746/
[720/752] Fetching abstract from https://aclanthology.org/2020.emnlp-main.747/
[721/752] Fetching abstract from https://aclanthology.org/2020.emnlp-main.748/
[722/752] Fetching abstract from https://aclanthology.org/2020.emnlp-main.749/
[723/752] Fetching abstract from https://aclanthology.org/2020.emnlp-main.75/
[724/752] Fetching abstract from https://aclanthology.org/2020.emnlp-main.750/
[725/752] Fetching abstract from https://aclanthology.org/2020.emnlp-main.751/
[726/752] Fetching abstract from https://aclanthology.org/2020.emnlp-main.752/
[727/752] Fetching abstract from https://aclanthology.org/2020.emnlp-main.76/
[728/752] Fetching abstract from https://aclanthology.org/2020.emnlp-main.77/
[729/752] Fetching abstract from https://aclanthology.org/2020.emnlp-main.78/
[730/752] Fetching abstract from https://aclanthology.org/2020.emnlp-m

In [None]:
import requests
from bs4 import BeautifulSoup
import time

VOLUME_URL = "https://aclanthology.org/volumes/2024.naacl-long/"

def get_paper_links():
    response = requests.get(VOLUME_URL)
    if response.status_code != 200:
        print(f"Failed to access {VOLUME_URL}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    paper_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith("/2024.naacl-long.") and href.count('.') == 2:
            full_url = "https://aclanthology.org" + href
            paper_links.append(full_url)

    return sorted(set(paper_links))

def extract_abstract(paper_url):
    try:
        response = requests.get(paper_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        abstract_div = soup.find('div', class_='card-body acl-abstract')
        if abstract_div:
            return abstract_div.text.strip()
    except Exception as e:
        print(f"Error fetching {paper_url}: {e}")
    return None

def main():
    links = get_paper_links()
    print(f"Found {len(links)} NAACL 2024 long papers.")
    with open("naacl_2024_long_abstracts.txt", "w", encoding="utf-8") as f:
        for i, url in enumerate(links, 1):
            print(f"[{i}/{len(links)}] Fetching abstract from {url}")
            abstract = extract_abstract(url)
            if abstract:
                f.write(f"URL: {url}\n")
                f.write(f"Abstract: {abstract}\n")
                f.write("="*80 + "\n")
            else:
                print(f"No abstract found for: {url}")
            time.sleep(1)

if __name__ == "__main__":
    main()


Found 488 NAACL 2024 long papers.
[1/488] Fetching abstract from https://aclanthology.org/2024.naacl-long.0/
⚠️ No abstract found for: https://aclanthology.org/2024.naacl-long.0/
[2/488] Fetching abstract from https://aclanthology.org/2024.naacl-long.1/
[3/488] Fetching abstract from https://aclanthology.org/2024.naacl-long.10/
[4/488] Fetching abstract from https://aclanthology.org/2024.naacl-long.100/
[5/488] Fetching abstract from https://aclanthology.org/2024.naacl-long.101/
[6/488] Fetching abstract from https://aclanthology.org/2024.naacl-long.102/
[7/488] Fetching abstract from https://aclanthology.org/2024.naacl-long.103/
[8/488] Fetching abstract from https://aclanthology.org/2024.naacl-long.104/
[9/488] Fetching abstract from https://aclanthology.org/2024.naacl-long.105/
[10/488] Fetching abstract from https://aclanthology.org/2024.naacl-long.106/
[11/488] Fetching abstract from https://aclanthology.org/2024.naacl-long.107/
[12/488] Fetching abstract from https://aclanthology

In [None]:
import requests
from bs4 import BeautifulSoup
import time

VOLUME_URL = "https://aclanthology.org/volumes/2024.naacl-short/"

def get_paper_links():
    response = requests.get(VOLUME_URL)
    if response.status_code != 200:
        print(f"Failed to access {VOLUME_URL}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    paper_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith("/2024.naacl-short.") and href.count('.') == 2:
            full_url = "https://aclanthology.org" + href
            paper_links.append(full_url)

    return sorted(set(paper_links))

def extract_abstract(paper_url):
    try:
        response = requests.get(paper_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        abstract_div = soup.find('div', class_='card-body acl-abstract')
        if abstract_div:
            return abstract_div.text.strip()
    except Exception as e:
        print(f"Error fetching {paper_url}: {e}")
    return None

def main():
    links = get_paper_links()
    print(f"Found {len(links)} NAACL 2024 short papers.")
    with open("naacl_2024_short_abstracts.txt", "w", encoding="utf-8") as f:
        for i, url in enumerate(links, 1):
            print(f"[{i}/{len(links)}] Fetching abstract from {url}")
            abstract = extract_abstract(url)
            if abstract:
                f.write(f"URL: {url}\n")
                f.write(f"Abstract: {abstract}\n")
                f.write("="*80 + "\n")
            else:
                print(f"No abstract found for: {url}")
            time.sleep(1)

if __name__ == "__main__":
    main()


📄 Found 76 NAACL 2024 short papers.
[1/76] Fetching abstract from https://aclanthology.org/2024.naacl-short.0/
⚠️ No abstract found for: https://aclanthology.org/2024.naacl-short.0/
[2/76] Fetching abstract from https://aclanthology.org/2024.naacl-short.1/
[3/76] Fetching abstract from https://aclanthology.org/2024.naacl-short.10/
[4/76] Fetching abstract from https://aclanthology.org/2024.naacl-short.11/
[5/76] Fetching abstract from https://aclanthology.org/2024.naacl-short.12/
[6/76] Fetching abstract from https://aclanthology.org/2024.naacl-short.13/
[7/76] Fetching abstract from https://aclanthology.org/2024.naacl-short.14/
[8/76] Fetching abstract from https://aclanthology.org/2024.naacl-short.15/
[9/76] Fetching abstract from https://aclanthology.org/2024.naacl-short.16/
[10/76] Fetching abstract from https://aclanthology.org/2024.naacl-short.17/
[11/76] Fetching abstract from https://aclanthology.org/2024.naacl-short.18/
[12/76] Fetching abstract from https://aclanthology.org/2

In [None]:
import requests
from bs4 import BeautifulSoup
import time

VOLUME_URL = "https://aclanthology.org/volumes/2021.acl-long/"

def get_paper_links():
    response = requests.get(VOLUME_URL)
    if response.status_code != 200:
        print(f"Failed to access {VOLUME_URL}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    paper_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith("/2021.acl-long.") and href.count('.') == 2:
            full_url = "https://aclanthology.org" + href
            paper_links.append(full_url)

    return sorted(set(paper_links))

def extract_abstract(paper_url):
    try:
        response = requests.get(paper_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        abstract_div = soup.find('div', class_='card-body acl-abstract')
        if abstract_div:
            return abstract_div.text.strip()
    except Exception as e:
        print(f"Error fetching {paper_url}: {e}")
    return None

def main():
    links = get_paper_links()
    print(f"Found {len(links)} ACL 2021 long papers.")
    with open("acl_2021_long_abstracts.txt", "w", encoding="utf-8") as f:
        for i, url in enumerate(links, 1):
            print(f"[{i}/{len(links)}] Fetching abstract from {url}")
            abstract = extract_abstract(url)
            if abstract:
                f.write(f"URL: {url}\n")
                f.write(f"Abstract: {abstract}\n")
                f.write("="*80 + "\n")
            else:
                print(f"No abstract found for: {url}")
            time.sleep(1)

if __name__ == "__main__":
    main()


📄 Found 572 ACL 2021 long papers.
[1/572] Fetching abstract from https://aclanthology.org/2021.acl-long.0/
⚠️ No abstract found for: https://aclanthology.org/2021.acl-long.0/
[2/572] Fetching abstract from https://aclanthology.org/2021.acl-long.1/
[3/572] Fetching abstract from https://aclanthology.org/2021.acl-long.10/
[4/572] Fetching abstract from https://aclanthology.org/2021.acl-long.100/
[5/572] Fetching abstract from https://aclanthology.org/2021.acl-long.101/
[6/572] Fetching abstract from https://aclanthology.org/2021.acl-long.102/
[7/572] Fetching abstract from https://aclanthology.org/2021.acl-long.103/
[8/572] Fetching abstract from https://aclanthology.org/2021.acl-long.104/
[9/572] Fetching abstract from https://aclanthology.org/2021.acl-long.105/
[10/572] Fetching abstract from https://aclanthology.org/2021.acl-long.106/
[11/572] Fetching abstract from https://aclanthology.org/2021.acl-long.107/
[12/572] Fetching abstract from https://aclanthology.org/2021.acl-long.108/


In [None]:
import requests
from bs4 import BeautifulSoup
import time

VOLUME_URL = "https://aclanthology.org/volumes/2021.acl-short/"

def get_paper_links():
    response = requests.get(VOLUME_URL)
    if response.status_code != 200:
        print(f"Failed to access {VOLUME_URL}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    paper_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith("/2021.acl-short.") and href.count('.') == 2:
            full_url = "https://aclanthology.org" + href
            paper_links.append(full_url)

    return sorted(set(paper_links))

def extract_abstract(paper_url):
    try:
        response = requests.get(paper_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        abstract_div = soup.find('div', class_='card-body acl-abstract')
        if abstract_div:
            return abstract_div.text.strip()
    except Exception as e:
        print(f"Error fetching {paper_url}: {e}")
    return None

def main():
    links = get_paper_links()
    print(f"Found {len(links)} ACL 2021 short papers.")
    with open("acl_2021_short_abstracts.txt", "w", encoding="utf-8") as f:
        for i, url in enumerate(links, 1):
            print(f"[{i}/{len(links)}] Fetching abstract from {url}")
            abstract = extract_abstract(url)
            if abstract:
                f.write(f"URL: {url}\n")
                f.write(f"Abstract: {abstract}\n")
                f.write("="*80 + "\n")
            else:
                print(f"No abstract found for: {url}")
            time.sleep(1)

if __name__ == "__main__":
    main()


📄 Found 140 ACL 2021 short papers.
[1/140] Fetching abstract from https://aclanthology.org/2021.acl-short.0/
⚠️ No abstract found for: https://aclanthology.org/2021.acl-short.0/
[2/140] Fetching abstract from https://aclanthology.org/2021.acl-short.1/
[3/140] Fetching abstract from https://aclanthology.org/2021.acl-short.10/
[4/140] Fetching abstract from https://aclanthology.org/2021.acl-short.100/
[5/140] Fetching abstract from https://aclanthology.org/2021.acl-short.101/
[6/140] Fetching abstract from https://aclanthology.org/2021.acl-short.102/
[7/140] Fetching abstract from https://aclanthology.org/2021.acl-short.103/
[8/140] Fetching abstract from https://aclanthology.org/2021.acl-short.104/
[9/140] Fetching abstract from https://aclanthology.org/2021.acl-short.105/
[10/140] Fetching abstract from https://aclanthology.org/2021.acl-short.106/
[11/140] Fetching abstract from https://aclanthology.org/2021.acl-short.107/
[12/140] Fetching abstract from https://aclanthology.org/2021.a

In [None]:
import requests
from bs4 import BeautifulSoup
import time

VOLUME_URL = "https://aclanthology.org/volumes/2022.acl-long/"

def get_paper_links():
    response = requests.get(VOLUME_URL)
    if response.status_code != 200:
        print(f"Failed to access {VOLUME_URL}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    paper_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith("/2022.acl-long.") and href.count('.') == 2:
            full_url = "https://aclanthology.org" + href
            paper_links.append(full_url)

    return sorted(set(paper_links))

def extract_abstract(paper_url):
    try:
        response = requests.get(paper_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        abstract_div = soup.find('div', class_='card-body acl-abstract')
        if abstract_div:
            return abstract_div.text.strip()
    except Exception as e:
        print(f"Error fetching {paper_url}: {e}")
    return None

def main():
    links = get_paper_links()
    print(f"Found {len(links)} ACL 2022 long papers.")
    with open("acl_2022_long_abstracts.txt", "w", encoding="utf-8") as f:
        for i, url in enumerate(links, 1):
            print(f"[{i}/{len(links)}] Fetching abstract from {url}")
            abstract = extract_abstract(url)
            if abstract:
                f.write(f"URL: {url}\n")
                f.write(f"Abstract: {abstract}\n")
                f.write("="*80 + "\n")
            else:
                print(f"No abstract found for: {url}")
            time.sleep(1)

if __name__ == "__main__":
    main()


📄 Found 604 ACL 2022 long papers.
[1/604] Fetching abstract from https://aclanthology.org/2022.acl-long.0/
⚠️ No abstract found for: https://aclanthology.org/2022.acl-long.0/
[2/604] Fetching abstract from https://aclanthology.org/2022.acl-long.1/
[3/604] Fetching abstract from https://aclanthology.org/2022.acl-long.10/
[4/604] Fetching abstract from https://aclanthology.org/2022.acl-long.100/
[5/604] Fetching abstract from https://aclanthology.org/2022.acl-long.101/
[6/604] Fetching abstract from https://aclanthology.org/2022.acl-long.102/
[7/604] Fetching abstract from https://aclanthology.org/2022.acl-long.103/
[8/604] Fetching abstract from https://aclanthology.org/2022.acl-long.104/
[9/604] Fetching abstract from https://aclanthology.org/2022.acl-long.105/
[10/604] Fetching abstract from https://aclanthology.org/2022.acl-long.106/
[11/604] Fetching abstract from https://aclanthology.org/2022.acl-long.107/
[12/604] Fetching abstract from https://aclanthology.org/2022.acl-long.108/


In [None]:
import requests
from bs4 import BeautifulSoup
import time

VOLUME_URL = "https://aclanthology.org/volumes/2022.acl-short/"

def get_paper_links():
    response = requests.get(VOLUME_URL)
    if response.status_code != 200:
        print(f"Failed to access {VOLUME_URL}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    paper_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Only match short paper pages (e.g., /2022.acl-short.XX/)
        if href.startswith("/2022.acl-short.") and href.count('.') == 2:
            full_url = "https://aclanthology.org" + href
            paper_links.append(full_url)

    return sorted(set(paper_links))

def extract_abstract(paper_url):
    try:
        response = requests.get(paper_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        abstract_div = soup.find('div', class_='card-body acl-abstract')
        if abstract_div:
            return abstract_div.text.strip()
    except Exception as e:
        print(f"Error fetching {paper_url}: {e}")
    return None

def main():
    links = get_paper_links()
    print(f"Found {len(links)} ACL 2022 short papers.")
    with open("acl_2022_short_abstracts.txt", "w", encoding="utf-8") as f:
        for i, url in enumerate(links, 1):
            print(f"[{i}/{len(links)}] Fetching abstract from {url}")
            abstract = extract_abstract(url)
            if abstract:
                f.write(f"URL: {url}\n")
                f.write(f"Abstract: {abstract}\n")
                f.write("="*80 + "\n")
            else:
                print(f"No abstract found for: {url}")
            time.sleep(1)

if __name__ == "__main__":
    main()


📄 Found 98 ACL 2022 short papers.
[1/98] Fetching abstract from https://aclanthology.org/2022.acl-short.0/
⚠️ No abstract found for: https://aclanthology.org/2022.acl-short.0/
[2/98] Fetching abstract from https://aclanthology.org/2022.acl-short.1/
[3/98] Fetching abstract from https://aclanthology.org/2022.acl-short.10/
[4/98] Fetching abstract from https://aclanthology.org/2022.acl-short.11/
[5/98] Fetching abstract from https://aclanthology.org/2022.acl-short.12/
[6/98] Fetching abstract from https://aclanthology.org/2022.acl-short.13/
[7/98] Fetching abstract from https://aclanthology.org/2022.acl-short.14/
[8/98] Fetching abstract from https://aclanthology.org/2022.acl-short.15/
[9/98] Fetching abstract from https://aclanthology.org/2022.acl-short.16/
[10/98] Fetching abstract from https://aclanthology.org/2022.acl-short.17/
[11/98] Fetching abstract from https://aclanthology.org/2022.acl-short.18/
[12/98] Fetching abstract from https://aclanthology.org/2022.acl-short.19/
[13/98] 

In [None]:
import requests
from bs4 import BeautifulSoup
import time

VOLUME_URL = "https://aclanthology.org/volumes/2023.acl-long/"

def get_paper_links():
    response = requests.get(VOLUME_URL)
    if response.status_code != 200:
        print(f"Failed to access {VOLUME_URL}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    paper_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith("/2023.acl-long.") and href.count('.') == 2:
            full_url = "https://aclanthology.org" + href
            paper_links.append(full_url)

    return sorted(set(paper_links))

def extract_abstract(paper_url):
    try:
        response = requests.get(paper_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        abstract_div = soup.find('div', class_='card-body acl-abstract')
        if abstract_div:
            return abstract_div.text.strip()
    except Exception as e:
        print(f"Error fetching {paper_url}: {e}")
    return None

def main():
    links = get_paper_links()
    print(f"Found {len(links)} ACL 2023 long papers.")
    with open("acl_2023_long_abstracts.txt", "w", encoding="utf-8") as f:
        for i, url in enumerate(links, 1):
            print(f"[{i}/{len(links)}] Fetching abstract from {url}")
            abstract = extract_abstract(url)
            if abstract:
                f.write(f"URL: {url}\n")
                f.write(f"Abstract: {abstract}\n")
                f.write("="*80 + "\n")
            else:
                print(f"No abstract found for: {url}")
            time.sleep(1)

if __name__ == "__main__":
    main()


📄 Found 912 ACL 2023 long papers.
[1/912] Fetching abstract from https://aclanthology.org/2023.acl-long.0/
⚠️ No abstract found for: https://aclanthology.org/2023.acl-long.0/
[2/912] Fetching abstract from https://aclanthology.org/2023.acl-long.1/
[3/912] Fetching abstract from https://aclanthology.org/2023.acl-long.10/
[4/912] Fetching abstract from https://aclanthology.org/2023.acl-long.100/
[5/912] Fetching abstract from https://aclanthology.org/2023.acl-long.101/
[6/912] Fetching abstract from https://aclanthology.org/2023.acl-long.102/
[7/912] Fetching abstract from https://aclanthology.org/2023.acl-long.103/
[8/912] Fetching abstract from https://aclanthology.org/2023.acl-long.104/
[9/912] Fetching abstract from https://aclanthology.org/2023.acl-long.105/
[10/912] Fetching abstract from https://aclanthology.org/2023.acl-long.106/
[11/912] Fetching abstract from https://aclanthology.org/2023.acl-long.107/
[12/912] Fetching abstract from https://aclanthology.org/2023.acl-long.108/


In [None]:
import requests
from bs4 import BeautifulSoup
import time

VOLUME_URL = "https://aclanthology.org/volumes/2023.acl-short/"

def get_paper_links():
    response = requests.get(VOLUME_URL)
    if response.status_code != 200:
        print(f"Failed to access {VOLUME_URL}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    paper_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith("/2023.acl-short.") and href.count('.') == 2:
            full_url = "https://aclanthology.org" + href
            paper_links.append(full_url)

    return sorted(set(paper_links))

def extract_abstract(paper_url):
    try:
        response = requests.get(paper_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        abstract_div = soup.find('div', class_='card-body acl-abstract')
        if abstract_div:
            return abstract_div.text.strip()
    except Exception as e:
        print(f"Error fetching {paper_url}: {e}")
    return None

def main():
    links = get_paper_links()
    print(f"Found {len(links)} ACL 2023 short papers.")
    with open("acl_2023_short_abstracts.txt", "w", encoding="utf-8") as f:
        for i, url in enumerate(links, 1):
            print(f"[{i}/{len(links)}] Fetching abstract from {url}")
            abstract = extract_abstract(url)
            if abstract:
                f.write(f"URL: {url}\n")
                f.write(f"Abstract: {abstract}\n")
                f.write("="*80 + "\n")
            else:
                print(f"No abstract found for: {url}")
            time.sleep(1)

if __name__ == "__main__":
    main()


📄 Found 165 ACL 2023 short papers.
[1/165] Fetching abstract from https://aclanthology.org/2023.acl-short.0/
⚠️ No abstract found for: https://aclanthology.org/2023.acl-short.0/
[2/165] Fetching abstract from https://aclanthology.org/2023.acl-short.1/
[3/165] Fetching abstract from https://aclanthology.org/2023.acl-short.10/
[4/165] Fetching abstract from https://aclanthology.org/2023.acl-short.100/
[5/165] Fetching abstract from https://aclanthology.org/2023.acl-short.101/
[6/165] Fetching abstract from https://aclanthology.org/2023.acl-short.102/
[7/165] Fetching abstract from https://aclanthology.org/2023.acl-short.103/
[8/165] Fetching abstract from https://aclanthology.org/2023.acl-short.104/
[9/165] Fetching abstract from https://aclanthology.org/2023.acl-short.105/
[10/165] Fetching abstract from https://aclanthology.org/2023.acl-short.106/
[11/165] Fetching abstract from https://aclanthology.org/2023.acl-short.107/
[12/165] Fetching abstract from https://aclanthology.org/2023.a

In [None]:
import requests
from bs4 import BeautifulSoup
import time

VOLUME_URL = "https://aclanthology.org/volumes/2024.acl-long/"

def get_paper_links():
    response = requests.get(VOLUME_URL)
    if response.status_code != 200:
        print(f"Failed to access {VOLUME_URL}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    paper_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith("/2024.acl-long.") and href.count('.') == 2:
            full_url = "https://aclanthology.org" + href
            paper_links.append(full_url)

    return sorted(set(paper_links))

def extract_abstract(paper_url):
    try:
        response = requests.get(paper_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        abstract_div = soup.find('div', class_='card-body acl-abstract')
        if abstract_div:
            return abstract_div.text.strip()
    except Exception as e:
        print(f"Error fetching {paper_url}: {e}")
    return None

def main():
    links = get_paper_links()
    print(f"Found {len(links)} ACL 2024 long papers.")
    with open("acl_2024_long_abstracts.txt", "w", encoding="utf-8") as f:
        for i, url in enumerate(links, 1):
            print(f"[{i}/{len(links)}] Fetching abstract from {url}")
            abstract = extract_abstract(url)
            if abstract:
                f.write(f"URL: {url}\n")
                f.write(f"Abstract: {abstract}\n")
                f.write("="*80 + "\n")
            else:
                print(f"No abstract found for: {url}")
            time.sleep(1)

if __name__ == "__main__":
    main()


📄 Found 865 ACL 2024 long papers.
[1/865] Fetching abstract from https://aclanthology.org/2024.acl-long.0/
⚠️ No abstract found for: https://aclanthology.org/2024.acl-long.0/
[2/865] Fetching abstract from https://aclanthology.org/2024.acl-long.1/
[3/865] Fetching abstract from https://aclanthology.org/2024.acl-long.10/
[4/865] Fetching abstract from https://aclanthology.org/2024.acl-long.100/
[5/865] Fetching abstract from https://aclanthology.org/2024.acl-long.101/
[6/865] Fetching abstract from https://aclanthology.org/2024.acl-long.102/
[7/865] Fetching abstract from https://aclanthology.org/2024.acl-long.103/
[8/865] Fetching abstract from https://aclanthology.org/2024.acl-long.104/
[9/865] Fetching abstract from https://aclanthology.org/2024.acl-long.105/
[10/865] Fetching abstract from https://aclanthology.org/2024.acl-long.106/
[11/865] Fetching abstract from https://aclanthology.org/2024.acl-long.107/
[12/865] Fetching abstract from https://aclanthology.org/2024.acl-long.108/


In [None]:
import requests
from bs4 import BeautifulSoup
import time

VOLUME_URL = "https://aclanthology.org/volumes/2024.acl-short/"

def get_paper_links():
    response = requests.get(VOLUME_URL)
    if response.status_code != 200:
        print(f"Failed to access {VOLUME_URL}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    paper_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith("/2024.acl-short.") and href.count('.') == 2:
            full_url = "https://aclanthology.org" + href
            paper_links.append(full_url)

    return sorted(set(paper_links))

def extract_abstract(paper_url):
    try:
        response = requests.get(paper_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        abstract_div = soup.find('div', class_='card-body acl-abstract')
        if abstract_div:
            return abstract_div.text.strip()
    except Exception as e:
        print(f"Error fetching {paper_url}: {e}")
    return None

def main():
    links = get_paper_links()
    print(f"Found {len(links)} ACL 2024 short papers.")
    with open("acl_2024_short_abstracts.txt", "w", encoding="utf-8") as f:
        for i, url in enumerate(links, 1):
            print(f"[{i}/{len(links)}] Fetching abstract from {url}")
            abstract = extract_abstract(url)
            if abstract:
                f.write(f"URL: {url}\n")
                f.write(f"Abstract: {abstract}\n")
                f.write("="*80 + "\n")
            else:
                print(f"No abstract found for: {url}")
            time.sleep(1)

if __name__ == "__main__":
    main()


📄 Found 78 ACL 2024 short papers.
[1/78] Fetching abstract from https://aclanthology.org/2024.acl-short.0/
⚠️ No abstract found for: https://aclanthology.org/2024.acl-short.0/
[2/78] Fetching abstract from https://aclanthology.org/2024.acl-short.1/
[3/78] Fetching abstract from https://aclanthology.org/2024.acl-short.10/
[4/78] Fetching abstract from https://aclanthology.org/2024.acl-short.11/
[5/78] Fetching abstract from https://aclanthology.org/2024.acl-short.12/
[6/78] Fetching abstract from https://aclanthology.org/2024.acl-short.13/
[7/78] Fetching abstract from https://aclanthology.org/2024.acl-short.14/
[8/78] Fetching abstract from https://aclanthology.org/2024.acl-short.15/
[9/78] Fetching abstract from https://aclanthology.org/2024.acl-short.16/
[10/78] Fetching abstract from https://aclanthology.org/2024.acl-short.17/
[11/78] Fetching abstract from https://aclanthology.org/2024.acl-short.18/
[12/78] Fetching abstract from https://aclanthology.org/2024.acl-short.19/
[13/78] 