# Prepare documents in data/
This notebook prepares the document corpus: for each DOI in a list, it retrieves the title and abstract from the OpenAlex API and writes the DOI and the abstract to a plain-text file.

In [62]:
import requests
import pandas as pd
from tqdm import tqdm
from tqdm.gui import tqdm as tqdm_gui
import time


tqdm.pandas()


In [59]:
from reconstruct_abstract import reconstruct_abstract
def get_openalex_data(doi) -> dict:
    """
    Used to retrieve the DOI, title and abstract of a work from the OpenAlex API.

    Arg: takes a DOI as a string without the resolver.
    Return: A dictionary with the keys 'oa_doi', 'oa_title' and 'oa_abstract'.
        - On success 'oa_doi' is the DOI reported by OpenAlex with the
          'https://doi.org/' resolver removed (or the DOI passed in when
          OpenAlex reports none).
        - When the abstract cannot be reconstructed, 'oa_abstract' is the
          string "None".
        - On a non-200 status code, a failed request or an empty result list,
          'oa_doi' is the DOI passed in and 'oa_title' / 'oa_abstract' are None.

    Every call sleeps briefly before returning, whatever the outcome, to stay
    under the API rate limit.

    Note: oa_abstract is reconstructed from the function reconstruct_abstract(). You will need to install
    https://github.com/poppy-nicolette/Bibliometric_tools/tree/7bcb724c95d9f6a571322076a730736097cf5886/reconstruct_abstract

    Example usage
        doi = "10.1234/example"
        data = get_openalex_data(doi)
        print(data)
    """
    url = f"https://api.openalex.org/works?filter=doi:{doi}&select=doi,title,abstract_inverted_index"
    resolver = 'https://doi.org/'

    # Built up front from the input DOI so that every failure path can return
    # it, even when nothing could be parsed from the response.
    fallback = {'oa_doi': doi,
                'oa_title': None,
                'oa_abstract': None}
    try:
        # A timeout keeps one stalled connection from hanging a whole batch;
        # requests raises a RequestException subclass when it expires.
        result = requests.get(url, timeout=30)

        if result.status_code != 200:
            print(f"Error: Received status code {result.status_code} for DOI {doi}")
            return fallback

        data = result.json()

        # An empty result list raises IndexError here, handled below.
        record = data['results'][0]

        # Remove the resolver as a prefix; str.lstrip() would strip a set of
        # characters instead. Fall back to the input when no DOI is reported.
        raw_doi = record.get('doi')
        if isinstance(raw_doi, str) and raw_doi:
            oa_doi = raw_doi[len(resolver):] if raw_doi.startswith(resolver) else raw_doi
        else:
            oa_doi = doi

        oa_title = record['title']
        oa_abstract_inverted_index = record['abstract_inverted_index']

        # Reconstruct abstract. The helper is external, so any ordinary
        # failure (e.g. no inverted index available) maps to the "None"
        # placeholder string rather than aborting the record.
        try:
            oa_abstract = reconstruct_abstract.reconstruct_abstract(oa_abstract_inverted_index)
        except Exception:
            oa_abstract = "None"

        return {
            'oa_doi': oa_doi,
            'oa_title': oa_title,
            'oa_abstract': oa_abstract,
        }
    except (requests.exceptions.RequestException, IndexError) as e:
        print(f"Request failed for DOI {doi}: {e}")
        return fallback
    finally:
        # Sleep so that you are below the 10 per second limit or 100k per day.
        time.sleep(0.11)

# Write a plain-text document for a DOI from the dictionary returned by get_openalex_data
def prepare_document(x:dict, *, output_dir=None):
    """
    Takes a dictionary of values as input.
    Outputs a plain text file containing the DOI and the abstract.
    Input
        x: dictionary containing oa_doi, oa_title, oa_abstract.
            Only oa_doi and oa_abstract are written; oa_title is not used.
            A missing or None value is written as the text "None".
        output_dir: optional directory to write the file into. It is created
            if it does not exist. Defaults to the current working directory.
    Output
        writes to a text file named after the DOI, with "/" replaced by "_"
        and a ".txt" extension
    Returns
        None
    """
    # Local import so the function does not depend on the file's import cell.
    import os

    # str() makes a key that is present but None behave like a missing key
    # (both give "None") instead of failing on .replace() below.
    oa_doi = str(x.get('oa_doi', "None"))
    oa_abstract = x.get('oa_abstract', "None")

    #create file name ("/" is a path separator, so it cannot stay in the name)
    oa_doi_mod = oa_doi.replace("/", "_")
    file_name = f"{oa_doi_mod}.txt"
    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)
        file_name = os.path.join(output_dir, file_name)

    #write to file
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(f"DOI: {oa_doi}\n")
        file.write(f"Abstract: {oa_abstract}\n")

In [56]:
# Try both functions on a single DOI: fetch its record from OpenAlex and
# write the resulting document file.
doi = "10.3390/su142416618"
prepare_document(get_openalex_data(doi))

In [60]:
# Load the list of DOIs to process and print a summary of the DataFrame
doi_list = pd.read_csv("data/doi_list.csv")
doi_list.info()
# Print the Python type of the value in the first row, first column
print(type(doi_list.iloc[0,0]))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   doi     44 non-null     object
dtypes: object(1)
memory usage: 484.0+ bytes
<class 'str'>


In [61]:
# Read the list and work through it
# Cast the 'doi' column to str so every value passed on below is a string
doi_list['doi'] = doi_list['doi'].astype(str)

# Apply both functions to each DOI: fetch the record, then write its document.
# progress_apply is the progress-bar variant of apply registered by tqdm.pandas().
doi_list['doi'].progress_apply(lambda x: prepare_document(get_openalex_data(x)))
print('done')


  7%|▋         | 3/44 [00:01<00:23,  1.74it/s]

Request failed for DOI 10.18653/v1/D19-1371: list index out of range


 18%|█▊        | 8/44 [00:06<00:36,  1.00s/it]

Request failed for DOI 10.48550/arXiv.2303.17661: list index out of range


 23%|██▎       | 10/44 [00:07<00:28,  1.18it/s]

Request failed for DOI 10.48550/arXiv.2410.04231: list index out of range


 25%|██▌       | 11/44 [00:08<00:26,  1.25it/s]

Request failed for DOI 10.5281/ZENODO.13960973: list index out of range


 34%|███▍      | 15/44 [00:11<00:19,  1.51it/s]

Request failed for DOI 10.48550/arXiv.2404.17663: list index out of range


 36%|███▋      | 16/44 [00:11<00:16,  1.68it/s]

Request failed for DOI 10.48550/arXiv.2409.10633: list index out of range


 39%|███▊      | 17/44 [00:11<00:14,  1.81it/s]

Request failed for DOI 10.48550/arXiv.2401.16359: list index out of range


 48%|████▊     | 21/44 [00:14<00:13,  1.66it/s]

Request failed for DOI 10.48550/arXiv.2404.01985: list index out of range


 52%|█████▏    | 23/44 [00:15<00:10,  1.97it/s]

Request failed for DOI 10.48550/arXiv.2406.15154: list index out of range


 66%|██████▌   | 29/44 [00:19<00:09,  1.66it/s]

Request failed for DOI 10.48550/arXiv.2406.13213: list index out of range


 70%|███████   | 31/44 [00:20<00:07,  1.78it/s]

Request failed for DOI 10.48550/arXiv.2402.01788: list index out of range


 77%|███████▋  | 34/44 [00:22<00:07,  1.40it/s]

Request failed for DOI 10.1590/SciELOPreprints.11205: list index out of range


 84%|████████▍ | 37/44 [00:24<00:04,  1.47it/s]

Request failed for DOI 10.48550/arXiv.2312.10997: list index out of range


 86%|████████▋ | 38/44 [00:25<00:03,  1.64it/s]

Request failed for DOI 10.48550/arXiv.2505.18247: list index out of range


 89%|████████▊ | 39/44 [00:25<00:03,  1.52it/s]

Request failed for DOI 10.48550/arXiv.2502.03627: list index out of range


 93%|█████████▎| 41/44 [00:26<00:01,  1.89it/s]

Request failed for DOI 10.48550/arXiv.2109.05052: list index out of range


 95%|█████████▌| 42/44 [00:27<00:01,  1.60it/s]

Request failed for DOI 10.48550/arXiv.2407.17023: list index out of range


 98%|█████████▊| 43/44 [00:28<00:00,  1.47it/s]

Request failed for DOI 10.48550/arXiv.2402.11782: list index out of range


100%|██████████| 44/44 [00:29<00:00,  1.51it/s]

Request failed for DOI 10.48550/arXiv.2404.13948: list index out of range


100%|██████████| 44/44 [00:29<00:00,  1.47it/s]

done





In [None]:
def write_to_file(x,y,z):
    