In [46]:
from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders

In [47]:
import pyalex

# Contact email attached to the pyalex configuration.
# NOTE(review): presumably this opts requests into OpenAlex's "polite pool" — confirm against the pyalex docs.
pyalex.config.email = "quentin.glorieux@lkb.upmc.fr"

In [106]:
import re

def get_openalex_id(input_identifier):
    """Resolve an author's short OpenAlex id from an ORCID or a display name.

    Parameters
    ----------
    input_identifier : str
        Either an ORCID iD (bare ``0000-0000-0000-0000`` form or a full
        ``https://orcid.org/...`` URL) or an author display name.

    Returns
    -------
    str or None
        The short id made of ``'A'`` followed by digits (as found in the
        author's ``id`` field), or ``None`` when the name search returns no
        result or the ``id`` field contains no such token.
    """
    authors = Authors()

    # Detect an ORCID by its full 16-character shape instead of the substring
    # "0-": this also accepts an ORCID that is already a URL (previously the
    # "https://orcid.org/" prefix would have been prepended a second time).
    orcid_match = re.search(r'\d{4}-\d{4}-\d{4}-\d{3}[\dXx]', input_identifier)
    if orcid_match:
        orcid_url = f"https://orcid.org/{orcid_match.group(0).upper()}"
        openalex_url = authors[orcid_url]['id']
    else:
        name_result = authors.search_filter(display_name=input_identifier).get()
        if not name_result:
            # No author matched the name: report "not found" the same way as
            # the no-id case below rather than raising IndexError.
            return None
        # Keep the first hit returned by the search.
        openalex_url = name_result[0]['id']

    # The id field is a URL-like string; keep only the 'A<digits>' token.
    match = re.search(r'A(\d+)', openalex_url)
    if match:
        return 'A' + match.group(1)
    return None


## Fetch the author's works from OpenAlex

In [111]:
import pandas as pd

# OpenAlex author id whose works are fetched; change it here to export
# another author's publications.
AUTHOR_ID = "A5024990264"

# Query the works of AUTHOR_ID, restricted to works that have a DOI and whose
# primary location's source has an ISSN. Only the selected fields are
# requested; results are sorted by publication year (descending) and paged
# 100 at a time.
pager = (
    Works()
    .filter(author={"id": AUTHOR_ID})
    .filter(has_doi=True, primary_location={"source": {"has_issn": True}})
    .select(
        [
            "id",
            "doi",
            "title",
            "publication_year",
            "ids",
            "type",
            "type_crossref",
            "open_access",
            "primary_location",
            "authorships",
            "biblio",
            "concepts",
        ]
    )
    .sort(publication_year="desc")
    .paginate(per_page=100)
)

# Flatten all pages into one list of work records. The accumulator is named
# `works` (not `list`) so the builtin `list` is not shadowed for the rest of
# the session, and each page is consumed once instead of re-copying the
# growing list on every iteration.
works = [work for page in pager for work in page]

# df0 holds the fetched table; df is the name the later cells read from
# (it refers to the same DataFrame object, not a copy).
df0 = pd.DataFrame(works)
df = df0

## Export the publication list to JSON

In [121]:
import json


# Define a function to extract author information
def extract_author_info(row):
    """Build the export record for one publication.

    Parameters
    ----------
    row : mapping
        One work, indexable by the keys ``"title"``, ``"doi"``,
        ``"authorships"``, ``"primary_location"``, ``"biblio"``,
        ``"publication_year"`` and ``"open_access"`` (a DataFrame row or a
        plain dict both work).

    Returns
    -------
    dict
        Record with the title, the authors (name and ORCID), a ``link`` made
        of the DOI and a human-readable citation string, the list of ORCIDs,
        open-access information, the year, the journal name, a one-letter
        ``journal_abbreviation`` and the raw ``biblio`` entry.
    """
    # Take the first character that follows "physrev" in the DOI as the
    # journal letter, upper-cased. The letters 'l', 'r' and 'x' yield an empty
    # abbreviation instead.
    # NOTE(review): presumably these stand for Letters / Research / X — confirm.
    journal_abbreviation = ""
    match = re.search(r'physrev(.+?)(\d+)', row["doi"])
    if match:
        series_letter = match.group(1)[0]
        if series_letter not in ('l', 'r', 'x'):
            journal_abbreviation = series_letter.capitalize()

    journal = row["primary_location"]["source"]["display_name"]
    biblio = row["biblio"]

    # Citation string "<journal> <volume> <issue> (<year>).". Volume and issue
    # are skipped when missing, so the literal text "None" no longer ends up
    # in the displayed citation.
    citation_parts = [journal]
    for key in ("volume", "issue"):
        value = biblio[key]
        if value is not None and value != "":
            citation_parts.append(str(value))
    display = " ".join(citation_parts) + " (" + str(row["publication_year"]) + ")."

    authors = []
    orcids = []
    for entry in row["authorships"]:
        author = entry["author"]
        authors.append({"name": author["display_name"], "orcid": author["orcid"]})
        orcids.append(author["orcid"])

    return {
        "title": row["title"],
        "authors": authors,
        "link": {
            "url": row["doi"],
            "display": display,
        },
        "orcid": orcids,
        "doi": row["doi"],
        "is_oa": row["open_access"]["is_oa"],
        "oa_url": row["open_access"]["oa_url"],
        "publication_year": row["publication_year"],
        "journal": journal,
        "journal_abbreviation": journal_abbreviation,
        "biblio": biblio,
    }


# Run extract_author_info on every row (axis=1) and collect the resulting
# records in a plain Python list
author_info_series = df.apply(extract_author_info, axis=1)
author_info_list = author_info_series.tolist()

# Serialise the records as JSON text indented by two spaces
json_string = json.dumps(author_info_list, indent=2)

# Write the JSON text to the data file, replacing any previous content
output_path = "../_data/openalex_list.json"
with open(output_path, "w") as file:
    file.write(json_string)

## Tests: flattening the nested columns

In [None]:
# IDs
# Expand the nested "ids" dicts into one column per identifier and append
# those columns to the works table.
expanded_df = pd.json_normalize(df['ids'])
result_df = pd.concat([df, expanded_df], axis=1)
# errors="ignore": a listed column that was not produced by the expansion
# (e.g. no fetched work carries a pmid) is skipped instead of raising KeyError.
result_df = result_df.drop(['ids', 'pmid', 'pmcid'], axis=1, errors="ignore")

#Location
# NOTE(review): this step starts again from df, so the result_df built in the
# IDs step above is overwritten — confirm that the two steps are meant to be
# independent experiments.
expanded_df = pd.json_normalize(df['primary_location'])
result_df = pd.concat([df, expanded_df], axis=1)
result_df = result_df.drop(['primary_location', 'license', 'version'], axis=1, errors="ignore")
