In [None]:
%pip install  -e ../

# RESTART YOUR JUPYTER NOTEBOOK AFTER RUNNING THIS

#%pip show pyarrow


In [1]:
%pip show aad

Name: aad
Version: 1.5.2
Summary: Python package for downloading aclanthology papers based on keywords
Home-page: 
Author: Roxanne El Baff
Author-email: roxanne.elbaff@gmail.com
License: MIT
Location: C:\Users\elba_ro\Documents\projects\github\aclanthology-papers
Editable project location: C:\Users\elba_ro\Documents\projects\github\aclanthology-papers
Requires: bibtexparser, clean-text, pandas, tqdm
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from aad.aad import AADSearch

## Define your Keywords

The keywords are a 2D array, i.e. a list of keyword lists. The search fields (e.g. paper title) should match **at least one keyword of each keyword list**.

Your fields can include paper *titles* and paper *abstracts*. 

For example, if you want to filter papers based on title and abstract:

- keywords =[[ "emotion" ]] --> get all papers that have "emotion" in their titles **OR** in their abstracts
- keywords =[[ "emotion" ], [ "multimodal" ]] --> get all papers that have ("emotion" AND "multimodal") in their titles **OR** in their abstracts
- keywords =[[ "emotion", "sentiment"], [ "multimodal" ]] --> get all papers that have (("emotion" OR "sentiment") AND "multimodal") in their titles **OR** in their abstracts

In [3]:

# Search configurations: each entry names an output folder, the keyword groups
# to match, and the fields to search in.
#
# NOTE: this first list is immediately superseded by the second `configs`
# assignment at the bottom of this cell; it is kept as a saved alternative.
# "fields" is set to ["title", "abstract"] here (the two fields the notebook
# documents) -- NOTE(review): confirm this matches the intended search scope.
configs = [
    {
        "folder": "conf23_arg_style_transfer_gen",
        "keywords": [
            [
                "argument",
                "arguments",
                "argumentation",
                "opinion",
                "stance",
                "debate",
                "argumentative",
            ],
            ["style transfer", "style"],
            [
                "transfer",
                "generation",
            ],
        ],
        "fields": ["title", "abstract"],
    },
    {
        "folder": "conf23_style_transfer_generation",
        "keywords": [
            ["style transfer", "style"],
            [
                "transfer",
                "generation",
            ],
        ],
        "fields": ["title", "abstract"],
    },
    {
        "folder": "conf23_argument_quality_gentrans",
        "keywords": [
            [
                "argument",
                "argumentative",
                "stance",
                "editorial",
                "opinion",
                "opinions",
                "debate",
            ],
            [
                "transfer",
                "generation",
            ],
            [
                "effect",
                "effectiveness",
                "quality",
                "convinc",
                "persuasive",
                "persuasion",
            ],
        ],
        "fields": ["title", "abstract"],
    },
]

# This assignment replaces the list above: only this configuration is active
# after the cell has run.
configs = [
    {
        "folder": "dissertation_arg_qualhuman-title_search",
        "keywords": [
            ['argu', 'opinion', 'editorial', 'persua', 'stance', 'debate', 'conversat', 'discuss', 'view', 'position'],
            ['qualit', 'effect', 'understand', 'convinc', 'social', 'cultur',
             'ideology', 'belief', 'background', 'persua', 'human', 'personal', 'politic'],
        ],
        # Title-only search, as the folder name suggests -- TODO confirm.
        "fields": ["title"],
    }
]

In [5]:


def add_config(config:list, folder_name:str, keywords_lst:list, fields:list=None):
    """Append one search configuration to ``config`` and return the same list.

    Args:
        config: list of configuration dicts; it is extended in place.
        folder_name: value stored under the "folder" key.
        keywords_lst: list of keyword lists, stored under the "keywords" key.
        fields: fields to search in, stored under the "fields" key.
            Defaults to ["title", "abstract"] when omitted or None.

    Returns:
        The ``config`` list that was passed in, with the new item appended.
    """
    # Build a fresh default list on every call: a list literal used as the
    # parameter default is created once and would be shared by every
    # configuration that relies on the default.
    if fields is None:
        fields = ["title", "abstract"]
    item = {
        "folder": folder_name,
        "keywords": keywords_lst,
        "fields": fields
    }
    config.append(item)
    return config

# Build the active configuration list from scratch: a single title-only search.
# add_config returns the list it was given, so the result can be bound directly.
configs = add_config(
    [],
    folder_name="dissertation-rw-title_search",
    keywords_lst=[
        [
            'argu', 'opinion', 'editorial', 'persua', 'stance',
            'debate', 'conversat', 'discuss', 'view', 'position',
        ],
        [
            'qualit', 'effect', 'understand', 'convinc', 'social', 'cultur',
            'ideology', 'belief', 'background', 'persua', 'human', 'personal', 'politic',
        ],
    ],
    fields=["title"],
)
print(configs)


[{'folder': 'dissertation-rw-title_search', 'keywords': [['argu', 'opinion', 'editorial', 'persua', 'stance', 'debate', 'conversat', 'discuss', 'view', 'position'], ['qualit', 'effect', 'understand', 'convinc', 'social', 'cultur', 'ideology', 'belief', 'background', 'persua', 'human', 'personal', 'politic']], 'fields': ['title']}]


In [6]:
configs

[{'folder': 'dissertation-rw-title_search',
  'keywords': [['argu',
    'opinion',
    'editorial',
    'persua',
    'stance',
    'debate',
    'conversat',
    'discuss',
    'view',
    'position'],
   ['qualit',
    'effect',
    'understand',
    'convinc',
    'social',
    'cultur',
    'ideology',
    'belief',
    'background',
    'persua',
    'human',
    'personal',
    'politic']],
  'fields': ['title']}]

In [9]:
def download_papers(configs, overview_only:bool = False):
    """Create one AADSearch per configuration and download its papers.

    Args:
        configs: iterable of dicts with the keys "folder" and "keywords",
            plus an optional "fields" key (defaults to ["title", "abstract"]).
        overview_only: forwarded unchanged to ``AADSearch.download_papers``.

    Returns:
        A list with one ``{"id": <folder>, "searcher": <AADSearch>}`` dict per
        configuration, in input order.
    """
    result_lst = []
    for c in configs:
        print(f'processing {c["folder"]}')
        searcher = AADSearch(
            keywords=c["keywords"],
            force_download=False,
            # Hand-written configs may omit "fields"; fall back to the same
            # default that add_config uses instead of raising KeyError.
            fields=c.get("fields", ["title", "abstract"]),
        )
        searcher.download_papers(
            folder_name=f"../data/{c['folder']}",
            overview_only=overview_only
        )  # This function calls "filter" (in case the papers are not filtered) and then downloads the papers.

        result_lst.append({"id": c['folder'], "searcher": searcher})
        # Report how many papers matched this configuration.
        print(len(searcher.filtered_df))
    return result_lst

In [10]:
download_papers(configs, overview_only=False)


processing dissertation-rw-title_search
The processed keywords are: ['(argu|opinion|editorial|persua|stance|debate|conversat|discuss|view|position)', '(qualit|effect|understand|convinc|social|cultur|ideology|belief|background|persua|human|personal|politic)']


## Define your searcher
You set the keywords and fields (in the ACL Anthology case, we only have title and abstract).

- **keywords**: as defined above, are the keywords used for having a match in your fields.
- **fields**: are the fields you filter on.
- **force_download**: By default, the searcher downloads the ACL Anthology bib file from the website only if it is not already available locally. If this field is set to True, the bib file is re-downloaded. Set it to True if you think the online bib has been updated (probably every few months); otherwise set it to False.

In [None]:
# `keywords` is not defined by any earlier cell, so this cell raised NameError
# on a fresh kernel. Define it explicitly here.
# NOTE(review): these groups mirror the "conf23_arg_style_transfer_gen"
# configuration, presumably what the original run used -- adjust as needed.
keywords = [
    [
        "argument",
        "arguments",
        "argumentation",
        "opinion",
        "stance",
        "debate",
        "argumentative",
    ],
    ["style transfer", "style"],
    ["transfer", "generation"],
]

searcher = AADSearch(
    keywords=keywords, force_download=False, fields=["title", "abstract"]
)

## Download your papers
Downloads the filtered papers into a custom folder. It also downloads a CSV file containing the metadata (title, url, etc.) of the downloaded papers.

- The folder will be created (in case it does not exist).
- The filtered_df shows a dataframe with the filtered papers metadata

If you want to check the filtered results and then download the paper, then call **searcher.filter()** which will return a dataframe with the filtered data.
After that, if you call download_papers, the papers will be downloaded.

In [None]:
# Folder that receives the downloaded papers.
papers_folder = "../data/conf2023_argument_style_transfer"

# download_papers calls "filter" first (in case the papers are not filtered yet)
# and then downloads the papers.
searcher.download_papers(folder_name=papers_folder)

# Preview the metadata of the filtered papers.
searcher.filtered_df.head()

In [None]:
searcher.filtered_df

In [None]:
import bibtexparser
import pandas as pd

# Parse a BibTeX file into a DataFrame (one row per bib entry) and store it
# as a Parquet file.
with open('../data/test.bib', encoding="utf-8") as bibtex_file:
    print("Loading bib...")
    bibtex_database = bibtexparser.load(bibtex_file)
    print("Loading dataframe...")
    _df = pd.DataFrame(bibtex_database.entries)

_df.to_parquet("../data/test.parquet")

# Keep the count as the LAST expression of the cell: a bare expression in the
# middle of a cell is evaluated and discarded, so the length was never shown.
len(_df)

In [None]:
# Read the Parquet file once (the original read it twice and discarded the
# first result, so the row count was never displayed).
_df_parquet = pd.read_parquet("../data/test.parquet")
print(len(_df_parquet))
_df_parquet

## Downloading from a list of urls
This is a static function that takes a list of urls, where each url points to a paper's ACL Anthology page or PDF link.

It takes a list of urls and a folder name, downloads the papers, and saves them under the names assigned by the ACL Anthology.

I used this after checking my filtered_df csv file (papers.csv) and manually filtering the papers I am interested in. I thought it would be quicker to copy-paste the urls from the url column and download the papers into a custom folder.

