# Prepare documents in data/
This notebook prepares the corpus of documents: for each DOI, it retrieves the record from the OpenAlex API and writes a plain-text file containing the DOI and the abstract.

In [18]:
import requests
import pandas as pd
from tqdm import tqdm
from tqdm.gui import tqdm as tqdm_gui
import time

# Register tqdm with pandas so that `progress_apply` (used for the batch run
# over the DOI list) is available and shows a progress bar.
tqdm.pandas()


In [52]:
from reconstruct_abstract import reconstruct_abstract
def get_openalex_data(doi, timeout: float = 30.0) -> dict:
    """
    Retrieve the DOI, title and abstract of one work from the OpenAlex API.

    Args:
        doi: a DOI as a string without the resolver, e.g. "10.1234/example".
        timeout: seconds to wait for the HTTP response before the request is
            treated as failed.

    Return:
        A dictionary with the keys 'oa_doi', 'oa_title' and 'oa_abstract'.
        When the API answers with a non-200 status, the request fails, or the
        result list is empty, 'oa_doi' is the DOI that was passed in and the
        other two values are None. When the abstract cannot be reconstructed,
        'oa_abstract' is the string "None".

    Note: oa_abstract is reconstructed from the function reconstruct_abstract(). You will need to install
    https://github.com/poppy-nicolette/Bibliometric_tools/tree/7bcb724c95d9f6a571322076a730736097cf5886/reconstruct_abstract

    Example usage
        doi = "10.1234/example"
        data = get_openalex_data(doi)
        print(data)
    """
    # Built up front so that every failure path has a defined DOI to report;
    # previously these paths referenced a variable that was never assigned.
    not_found = {'oa_doi': doi,
                 'oa_title': None,
                 'oa_abstract': None}
    resolver_prefix = 'https://doi.org/'
    URL = f"https://api.openalex.org/works?filter=doi:{doi}&select=doi,title,abstract_inverted_index"
    try:
        result = requests.get(URL, timeout=timeout)

        if result.status_code != 200:
            print(f"Error: Received status code {result.status_code} for DOI {doi}")
            return not_found

        # An empty 'results' list raises IndexError, which is handled below.
        record = result.json()['results'][0]

        # Remove the resolver as a *prefix*. str.lstrip() takes a set of
        # characters, not a prefix, so it is the wrong tool here.
        raw_doi = record.get('doi')
        if not isinstance(raw_doi, str):
            oa_doi = doi
        elif raw_doi.startswith(resolver_prefix):
            oa_doi = raw_doi[len(resolver_prefix):]
        else:
            oa_doi = raw_doi

        oa_title = record['title']
        oa_abstract_inverted_index = record['abstract_inverted_index']

        # Reconstruct abstract. This is best effort: any failure yields the
        # placeholder string "None" instead of aborting the whole record.
        try:
            oa_abstract = reconstruct_abstract.reconstruct_abstract(oa_abstract_inverted_index)
        except Exception:
            oa_abstract = "None"

        return {
            'oa_doi': oa_doi,
            'oa_title': oa_title,
            'oa_abstract': oa_abstract,
        }
    except (requests.exceptions.RequestException, IndexError) as e:
        print(f"Request failed for DOI {doi}: {e}")
        return not_found
    finally:
        # Sleep so that you are below the 10 per second limit or 100k per day.
        time.sleep(0.11)

# Write a document for each DOI from the dictionary returned by get_openalex_data()
def prepare_document(x: dict) -> None:
    """
    Write the DOI and abstract of one record to a plain-text file.

    The file is created in the current working directory. Its name is the DOI
    with every "/" replaced by "_", followed by ".txt".

    Input
        dictionary containing oa_doi, oa_title, oa_abstract
        (oa_title is accepted but is not written to the file)
    Output
        writes to a text file
    Returns
        None
    """
    # `or` also covers a key that is present but None or empty; the .get()
    # default alone only applies when the key is missing, and a None DOI
    # would otherwise fail on .replace() below.
    oa_doi = str(x.get('oa_doi') or "None")
    oa_abstract = x.get('oa_abstract', "None")

    # create file name ("/" cannot appear inside a file name)
    oa_doi_mod = oa_doi.replace("/", "_")
    file_name = f"{oa_doi_mod}.txt"

    # write to file
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(f"DOI: {oa_doi}\n")
        file.write(f"Abstract: {oa_abstract}\n")

In [56]:
# Try the pipeline on a single DOI: fetch its OpenAlex record, then write it
# to a text file.
doi = "10.3390/su142416618"
sample_record = get_openalex_data(doi)
prepare_document(sample_record)

In [48]:
# Load the list of DOIs to process and print a summary of the table.
doi_list = pd.read_csv("data/doi_list.csv")
doi_list.info()

# Show the Python type of the first entry.
first_entry = doi_list.iloc[0, 0]
print(type(first_entry))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   doi     32 non-null     object
dtypes: object(1)
memory usage: 388.0+ bytes
<class 'str'>


In [57]:
# Work through the DOI list.
# Cast the column to str so every value handed to the API helper is a string.
doi_list['doi'] = doi_list['doi'].astype(str)

# For each DOI: fetch the OpenAlex record, then write its text file.
# progress_apply displays a progress bar while iterating.
doi_list['doi'].progress_apply(
    lambda doi: prepare_document(get_openalex_data(doi))
)
print('done')


 12%|█▎        | 4/32 [00:01<00:07,  3.60it/s]

Request failed for DOI 10.1175/WCAS-D-23-0143.1: list index out of range


 19%|█▉        | 6/32 [00:01<00:07,  3.32it/s]

Request failed for DOI 10.1177/0092055X19862012: list index out of range


 66%|██████▌   | 21/32 [00:07<00:03,  2.83it/s]

Request failed for DOI 10.5281/zenodo.4011278: list index out of range


 97%|█████████▋| 31/32 [00:10<00:00,  2.73it/s]

Request failed for DOI 10.1175/2009BAMS2625.1: list index out of range


100%|██████████| 32/32 [00:11<00:00,  2.76it/s]

done





In [None]:
def write_to_file(x,y,z):
    