# Prepare documents in data/
This file prepares the corpus of documents consisting of the DOI and the abstract from a source

In [12]:
import requests
import pandas as pd
from tqdm import tqdm
from tqdm.gui import tqdm as tqdm_gui
import time
import os


tqdm.pandas()


In [13]:
from reconstruct_abstract import reconstruct_abstract
def get_openalex_data(doi) -> dict:
        """
        Used to retrieve data from the OpenAlex API.
        Arg: takes a DOI as a string without the resolver.
        Return: A dictionary of values.

        Note: oa_abstract is reconstructed from the function reconstruct_abstract(). You will need to install
        https://github.com/poppy-nicolette/Bibliometric_tools/tree/7bcb724c95d9f6a571322076a730736097cf5886/reconstruct_abstract

        Example usage
            doi = "10.1234/example"
            data = get_openalex_data(doi)
            print(data)
        """
        URL = f"https://api.openalex.org/works?filter=doi:{doi}&select=doi,title,abstract_inverted_index"
        try:
            result = requests.get(URL)

            if result.status_code == 200:
                data = result.json()

                # Parse json data into each element:
                try:
                    oa_doi = data['results'][0]['doi'].lstrip('https://doi.org/')
                except:
                    oa_doi = doi
                oa_title = data['results'][0]['title']
                oa_abstract_inverted_index = data['results'][0]['abstract_inverted_index']
                # Reconstruct abstract
                try:
                    oa_abstract = reconstruct_abstract.reconstruct_abstract(oa_abstract_inverted_index)
                except:
                    oa_abstract = "None"

                return {
                    'oa_doi': oa_doi,
                    'oa_title': oa_title,
                    'oa_abstract': oa_abstract,
                    }
            else:
                print(f"Error: Received status code {result.status_code} for DOI {doi}")
                return {'oa_doi':oa_doi,
                        'oa_title':None,
                        'oa_abstract':None}
        except (requests.exceptions.RequestException,IndexError) as e:
            print(f"Request failed for DOI {doi}: {e}")
            return {'oa_doi':oa_doi,
                    'oa_title':None,
                    'oa_abstract':None}
        finally:
            # Sleep so that you are below the 10 per second limit or 100k per day.
            time.sleep(0.11)

#return document for each doi from dictionary 
def prepare_document(x:dict):
    """
    Takes a dictionary of three values as input. 
    Outputs a plain text file
    Input
        dictionary containing oa_doi, oa_title, oa_abstract
    Output
        writes to a text tile
    Returns
        None
    """
    oa_doi = x.get('oa_doi', "None")
    oa_title = x.get('oa_title', "None")
    oa_abstract = x.get('oa_abstract', "None")

    #create file name
    oa_doi_mod = oa_doi.replace("/","_")
    file_name = f"data/{oa_doi_mod}.txt"

    #write to file
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(f"DOI: {oa_doi}\n")
        file.write(f"Title: {oa_title}\n")
        file.write(f"Abstract: {oa_abstract}\n")

In [14]:
doi = "10.3390/su142416618"
prepare_document(get_openalex_data(doi))

## Call .csv file and run functions

In [15]:
# find you a list of DOIs to process
doi_list = pd.read_csv("data/doi_list.csv")
doi_list.info()
print(type(doi_list.iloc[0,0]))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   doi     98 non-null     object
dtypes: object(1)
memory usage: 916.0+ bytes
<class 'str'>


In [16]:
# read the list and work through the list
doi_list['doi'] = doi_list['doi'].astype(str)#check they are all strings

#apply both functions
doi_list['doi'].progress_apply(lambda x: prepare_document(get_openalex_data(x)))
print('done')


  2%|▏         | 2/98 [00:00<00:20,  4.69it/s]

Request failed for DOI 10.24251/HICSS.2024.068: list index out of range


  5%|▌         | 5/98 [00:02<00:41,  2.24it/s]

Request failed for DOI 10.18653/v1/D19-1371: list index out of range


 10%|█         | 10/98 [00:04<00:36,  2.44it/s]

Request failed for DOI 10.48550/arXiv.2311.04310: list index out of range


 13%|█▎        | 13/98 [00:05<00:30,  2.76it/s]

Request failed for DOI 10.5281/zenodo.14006424: list index out of range


 14%|█▍        | 14/98 [00:05<00:31,  2.64it/s]

Request failed for DOI 10.5281/zenodo.13951886: list index out of range


 15%|█▌        | 15/98 [00:05<00:30,  2.68it/s]

Request failed for DOI 10.48550/arXiv.2303.17661: list index out of range


 18%|█▊        | 18/98 [00:06<00:27,  2.88it/s]

Request failed for DOI 10.48550/arXiv.2410.04231: list index out of range


 19%|█▉        | 19/98 [00:07<00:27,  2.90it/s]

Request failed for DOI 10.48550/arXiv.2405.13000: list index out of range


 20%|██        | 20/98 [00:07<00:26,  2.94it/s]

Request failed for DOI 10.1109/ACCESS.2024.3395449: list index out of range


 22%|██▏       | 22/98 [00:08<00:25,  2.94it/s]

Request failed for DOI 10.48550/arXiv.2402.05131: list index out of range


 26%|██▌       | 25/98 [00:09<00:31,  2.29it/s]

Request failed for DOI 10.1007/s11192-022-04367-w: list index out of range


 27%|██▋       | 26/98 [00:10<00:30,  2.36it/s]

Request failed for DOI 10.1108/JD-10-2022-0234: list index out of range


 28%|██▊       | 27/98 [00:10<00:27,  2.61it/s]

Request failed for DOI 10.48550/arXiv.2404.17663: list index out of range


 29%|██▊       | 28/98 [00:10<00:25,  2.77it/s]

Request failed for DOI 10.48550/arXiv.2409.10633: list index out of range


 30%|██▉       | 29/98 [00:11<00:24,  2.82it/s]

Request failed for DOI 10.48550/arXiv.2401.16359: list index out of range


 34%|███▎      | 33/98 [00:14<00:57,  1.13it/s]

Request failed for DOI 10.48550/arXiv.2404.01985: list index out of range


 36%|███▌      | 35/98 [00:16<00:55,  1.14it/s]

Request failed for DOI 10.48550/arXiv.2406.15154: list index out of range


 42%|████▏     | 41/98 [00:27<01:30,  1.58s/it]

Request failed for DOI 10.48550/arXiv.2406.13213: list index out of range


 44%|████▍     | 43/98 [00:31<01:45,  1.91s/it]

Request failed for DOI 10.48550/arXiv.2402.01788: list index out of range


 51%|█████     | 50/98 [00:37<00:34,  1.39it/s]

Request failed for DOI 10.1590/SciELOPreprints.11205: list index out of range


 54%|█████▍    | 53/98 [00:39<00:23,  1.88it/s]

Request failed for DOI 10.5281/ZENODO.6188748: list index out of range


 58%|█████▊    | 57/98 [00:40<00:16,  2.42it/s]

Request failed for DOI 10.48550/arXiv.2412.17031: list index out of range


 59%|█████▉    | 58/98 [00:41<00:15,  2.62it/s]

Request failed for DOI 10.48550/arXiv.2312.10997: list index out of range


 60%|██████    | 59/98 [00:41<00:16,  2.34it/s]

Request failed for DOI 10.48550/arXiv.2505.13557: list index out of range


 61%|██████    | 60/98 [00:42<00:18,  2.07it/s]

Request failed for DOI 10.48550/arXiv.2505.18247: list index out of range


 63%|██████▎   | 62/98 [00:43<00:16,  2.12it/s]

Request failed for DOI 10.48550/arXiv.2502.15005: list index out of range


 64%|██████▍   | 63/98 [00:43<00:15,  2.29it/s]

Request failed for DOI 10.48550/arXiv.2502.03627: list index out of range


 65%|██████▌   | 64/98 [00:43<00:13,  2.58it/s]

Request failed for DOI 10.48550/arXiv.2307.03172: list index out of range


 66%|██████▋   | 65/98 [00:44<00:11,  2.77it/s]

Request failed for DOI 10.48550/arXiv.1301.3781: list index out of range


 68%|██████▊   | 67/98 [00:44<00:10,  2.90it/s]

Request failed for DOI 10.48550/arXiv.1607.04606: list index out of range


 69%|██████▉   | 68/98 [00:45<00:10,  2.86it/s]

Request failed for DOI 10.48550/arXiv.1607.01759: list index out of range


 71%|███████▏  | 70/98 [00:45<00:09,  2.87it/s]

Request failed for DOI 10.48550/arXiv.2109.05052: list index out of range


 72%|███████▏  | 71/98 [00:46<00:09,  2.95it/s]

Request failed for DOI 10.48550/arXiv.2407.17023: list index out of range


 73%|███████▎  | 72/98 [00:46<00:08,  3.06it/s]

Request failed for DOI 10.48550/arXiv.2402.11782: list index out of range


 74%|███████▍  | 73/98 [00:46<00:08,  3.00it/s]

Request failed for DOI 10.48550/arXiv.2404.13948: list index out of range


 78%|███████▊  | 76/98 [00:47<00:07,  2.96it/s]

Request failed for DOI 10.48550/arXiv.2508.18620: list index out of range


 79%|███████▊  | 77/98 [00:48<00:06,  3.04it/s]

Request failed for DOI 10.1108/JD-11-2017-0154: list index out of range


 80%|███████▉  | 78/98 [00:48<00:06,  2.88it/s]

Request failed for DOI 10.18452/25258: list index out of range


 84%|████████▎ | 82/98 [00:49<00:05,  2.92it/s]

Request failed for DOI 10.48550/arXiv.2311.08147: list index out of range


 85%|████████▍ | 83/98 [00:50<00:05,  2.88it/s]

Request failed for DOI 10.48550/arXiv.1909.06146: list index out of range


 87%|████████▋ | 85/98 [00:51<00:04,  2.83it/s]

Request failed for DOI 10.48550/arXiv.2510.11195: list index out of range


 88%|████████▊ | 86/98 [00:51<00:04,  2.78it/s]

Request failed for DOI 10.48550/arXiv.2506.00789: list index out of range


 89%|████████▉ | 87/98 [00:51<00:03,  2.93it/s]

Request failed for DOI 10.1080/1941126X.2025.2497738: list index out of range


 90%|████████▉ | 88/98 [00:52<00:03,  2.82it/s]

Request failed for DOI 10.48550/ARXIV.2504.08231: list index out of range


 94%|█████████▍| 92/98 [00:53<00:02,  2.86it/s]

Request failed for DOI 10.3233/SW-150175: list index out of range


 95%|█████████▍| 93/98 [00:53<00:01,  2.85it/s]

Request failed for DOI 10.1109/ADL.1998.670425: list index out of range


 96%|█████████▌| 94/98 [00:54<00:01,  2.99it/s]

Request failed for DOI 10.1045/september2016‐meschenmoser: list index out of range


 97%|█████████▋| 95/98 [00:54<00:00,  3.06it/s]

Request failed for DOI 10.1109/NETAPPS63333.2024.10823528: list index out of range


 98%|█████████▊| 96/98 [00:54<00:00,  3.05it/s]

Request failed for DOI 10.1109/ICITR64794.2024.10857771: list index out of range


 99%|█████████▉| 97/98 [00:55<00:00,  3.16it/s]

Request failed for DOI 10.1162/tacl a 00466: list index out of range


100%|██████████| 98/98 [00:55<00:00,  1.76it/s]

Request failed for DOI 10.48550/ARXIV.2205.01833: list index out of range
done





## Create documents with errors -
This section takes the same documents from above and introduces errors or characteristics identified in Part 1.
- [✅] JATS tags typical of Crossref
- [✅] multiple languages with multiple unicode blocks

In [17]:
langs_list = GoogleTranslator().get_supported_languages(as_dict=True)
langs_list

{'afrikaans': 'af',
 'albanian': 'sq',
 'amharic': 'am',
 'arabic': 'ar',
 'armenian': 'hy',
 'assamese': 'as',
 'aymara': 'ay',
 'azerbaijani': 'az',
 'bambara': 'bm',
 'basque': 'eu',
 'belarusian': 'be',
 'bengali': 'bn',
 'bhojpuri': 'bho',
 'bosnian': 'bs',
 'bulgarian': 'bg',
 'catalan': 'ca',
 'cebuano': 'ceb',
 'chichewa': 'ny',
 'chinese (simplified)': 'zh-CN',
 'chinese (traditional)': 'zh-TW',
 'corsican': 'co',
 'croatian': 'hr',
 'czech': 'cs',
 'danish': 'da',
 'dhivehi': 'dv',
 'dogri': 'doi',
 'dutch': 'nl',
 'english': 'en',
 'esperanto': 'eo',
 'estonian': 'et',
 'ewe': 'ee',
 'filipino': 'tl',
 'finnish': 'fi',
 'french': 'fr',
 'frisian': 'fy',
 'galician': 'gl',
 'georgian': 'ka',
 'german': 'de',
 'greek': 'el',
 'guarani': 'gn',
 'gujarati': 'gu',
 'haitian creole': 'ht',
 'hausa': 'ha',
 'hawaiian': 'haw',
 'hebrew': 'iw',
 'hindi': 'hi',
 'hmong': 'hmn',
 'hungarian': 'hu',
 'icelandic': 'is',
 'igbo': 'ig',
 'ilocano': 'ilo',
 'indonesian': 'id',
 'irish': 'ga

In [24]:
from deep_translator import GoogleTranslator
#source: https://github.com/nidhaloff/deep-translator
# open each txt file
# write txt file with error or characteristic. 

# add in JATS tags at beginning and end.
beginning_pattern = "<jats:title>Abstract</jats:title><jats:p>"
end_pattern = "</jats:p>"

def add_jats_pattern(input_dir:str):

    #open file#read each file in input_dir
    for file_name in os.listdir(input_dir):
        if file_name.endswith('.txt'):
            file_path = os.path.join(input_dir,file_name)

            with open(file_path, 'r') as file:
                content = file.readlines()
                #changing the abstract only 
                abstract = content[2]
                abstract = abstract.lstrip("Abstract: ")
                abstract = f"Abstract: {beginning_pattern}{abstract}{end_pattern}"# ADD ERRORS HERE
                content[2] = abstract
            # now overwrite the file entirely with variable content
            with open(file_path,"w") as file:
                file.writelines(content)
                print(f"done with {file_name}")
    print(f"done with everything. \n completed {len(os.listdir(input_dir))} files.")



multilan_end_pattern = ""

def add_multilang_pattern(input_dir:str):
    #instantiate translator
    my_translator = GoogleTranslator(source='en', target='ja') #changed from japanese 'ja' to greek 'el'
    #open file, read in each file in input dir
    for file_name in os.listdir(input_dir):
        if file_name.endswith('.txt'):
            file_path = os.path.join(input_dir,file_name)

            with open(file_path, 'r') as file:
                content = file.readlines()
                #change title
                title = content[1]
                title = title.lstrip("Title: ")
                translated = my_translator.translate(title)
                title = f"Title: {translated} {title}" #adds multilanguage error
                content[1] = title
                # change abstract
                abstract = content[2]
                abstract = abstract.lstrip("Abstract: ")
                translated = my_translator.translate(abstract)
                abstract = f"Abstract: {translated} {abstract}"#adds multlanguage error to abstract
                content[2] = abstract
            # overwrite files entires with variables
            with open(file_path, 'w') as file:
                file.writelines(content)
                print(f"added multilanguage to {file_name}")
    print(f" done adding languages. \n Completed {len(os.listdir(input_dir))} file changes.")





In [25]:
from deep_translator import GoogleTranslator
#Source: https://github.com/nidhaloff/deep-translator
#my_translator = GoogleTranslator(source='en', target='el') # for greek version
my_translator = GoogleTranslator(source='en', target='ja') # for japanese version
translated = my_translator.translate("that's the way, uh-huh, I like it!")
print(translated)


それはそれで、うーん、いいね！


In [26]:
"""
This applies the jats patterns above onto all text files within a directory.
"""
#input_dir = "/Users/poppyriddle/Documents/PhD/Research_proposal/Part_3/part_3_cohere/data_jats"

#add_jats_pattern(input_dir)

"""
This applies the multilingual pattern onto all texts within a directory
"""
input_dir = "/Users/poppyriddle/Documents/PhD/Research_proposal/Part_3/part_3_cohere/data_multi_lang_ja"

add_multilang_pattern(input_dir)

added multilanguage to 10.48550_arXiv.2406.15154.txt
added multilanguage to 10.1109_ICITR64794.2024.10857771.txt
added multilanguage to 10.1177_09610006241239080.txt
added multilanguage to 10.48550_arXiv.2402.01788.txt
added multilanguage to 10.48550_arXiv.2412.17031.txt
added multilanguage to 10.48550_arXiv.2505.13557.txt
added multilanguage to 10.1145_3681780.3697252.txt
added multilanguage to 10.1590_SciELOPreprints.11205.txt
added multilanguage to 10.48550_arXiv.2410.04231.txt
added multilanguage to 10.24251_HICSS.2024.068.txt
added multilanguage to 10.48550_arXiv.2404.17663.txt
added multilanguage to 10.1093_jamia_ocae129.txt
added multilanguage to 10.1007_s11192-023-04923-y.txt
added multilanguage to 10.1045_september2016‐meschenmoser.txt
added multilanguage to 10.48550_arXiv.2401.16359.txt
added multilanguage to 10.1007_s11192-015-1765-5.txt
added multilanguage to 10.1162_qss_a_00022.txt
added multilanguage to 10.48550_arXiv.1607.04606.txt
added multilanguage to 10.48550_arXiv.2