## Affiliation data retrieval from OpenAIRE

In [61]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd

file_path = "publications.csv" #input file with "doi" column
df = pd.read_csv(file_path)

doi_affiliation_data = []


for doi in df['doi'].dropna().tolist(): 
    api_url = f"https://api.openaire.eu/search/researchProducts?doi={doi}"
    response = requests.get(api_url)
    if response.status_code == 200:
        root = ET.fromstring(response.text) #parse XML response
        results = []

        if root is not None:
            rel_elements = root.findall(".//rel")  #find all <rel> elements
            for rel in rel_elements:
                to_element = rel.find("to")  #find <to>
                if to_element is not None and to_element.get("class") == "hasAuthorInstitution":
                    legalname = rel.find("legalname")
                    legalshortname = rel.find("legalshortname")
                    websiteurl = rel.find("websiteurl")
                    country = rel.find("country")
                    if country is not None and country.get("classname"):
                        countryname = country.get("classname")
                        
                    results.append({
                        "doi": doi,
                        "legalname": legalname.text.strip() if legalname is not None else "No legalname provided",
                        "legalshortname": legalshortname.text.strip() if legalshortname is not None else "No legalshortname provided",
                        "websiteurl": websiteurl.text.strip() if websiteurl is not None else "No websiteurl provided",
                        "country": countryname if countryname is not None else "No country provided"
                    })

        #remove duplicates (keep 1st one)
        unique_results = []
        seen_legalnames = set()
        for result in results:
            legalname = result["legalname"]
            if legalname not in seen_legalnames:
                unique_results.append(result)
                seen_legalnames.add(legalname)

                
        doi_affiliation_data.extend(unique_results)

#save data to a CSV file
affiliation_df = pd.DataFrame(doi_affiliation_data)
affiliation_df.to_csv("affiliations.csv", index=False)


print(affiliation_df.head())



                   doi                                    legalname  \
0  10.1093/bib/bbae122               The University of Texas System   
1  10.1093/bib/bbae122                Agricultural Research Service   
2  10.1093/bib/bbae122           California Institute of Technology   
3  10.1093/bib/bbae122                   Baylor College of Medicine   
4  10.1093/bib/bbae122  Centre for Plant Biotechnology and Genomics   

  legalshortname                             websiteurl        country  
0      UT System               http://www.utsystem.edu/  United States  
1            ARS  http://www.ars.usda.gov/main/main.htm  United States  
2            CIT                http://www.caltech.edu/  United States  
3            BCM                   https://www.bcm.edu/  United States  
4           CBGP             http://www.cbgp.upm.es/en/          Spain  


## Organization data retrieval from ROR

In [62]:
af_file_path = "affiliations.csv"
af = pd.read_csv(af_file_path)

ror_affiliation_data = []

for name in af['legalname'].dropna().tolist(): 
    base_url = "https://api.ror.org/organizations"
    params = {"query": name}
    
    response = requests.get(base_url, params=params)
    if response.status_code == 200:
        data = response.json()
        results = []
    
        for item in data.get("items", []):
            if item.get("name") == name: #pick the one that matches the legal name of the institution
                results.append({
                    "Institution Name": name,
                    "ROR_ID": item.get("id", "N/A"),
                    "Latitude": item.get("addresses", [{}])[0].get("lat", "N/A"),
                    "Longitude": item.get("addresses", [{}])[0].get("lng", "N/A")
                })
        
  
        ror_affiliation_data.extend(results)
    else:
        print(f"Failed to fetch data for {name} (Status Code: {response.status_code})")


ror_df = pd.DataFrame(ror_affiliation_data)
#remove duplicates
ror_df = ror_df.drop_duplicates(subset=["ROR_ID"])
#save data to a CSV file
ror_df.to_csv("affiliation_ror.csv", index=False)


print(ror_df.head())


Failed to fetch data for National Institute of Health (NIH/NICHD) (Status Code: 500)
Failed to fetch data for UNIVERSIDAD DEL PAIS VASCO/ EUSKAL HERRIKO UNIBERTSITATEA (Status Code: 500)
Failed to fetch data for UNIVERSIDAD DEL PAIS VASCO/ EUSKAL HERRIKO UNIBERTSITATEA (Status Code: 500)
Failed to fetch data for Universidad del País Vasco (UPV/EHU) (Status Code: 500)
Failed to fetch data for UNIVERSIDAD DEL PAIS VASCO/ EUSKAL HERRIKO UNIBERTSITATEA (Status Code: 500)
Failed to fetch data for UNIVERSIDAD DEL PAIS VASCO/ EUSKAL HERRIKO UNIBERTSITATEA (Status Code: 500)
                              Institution Name                     ROR_ID  \
0               The University of Texas System  https://ror.org/01gek1696   
1                Agricultural Research Service  https://ror.org/02d2m2044   
2           California Institute of Technology  https://ror.org/05dxps055   
3                   Baylor College of Medicine  https://ror.org/02pttbw34   
4  Centre for Plant Biotechnology and Gen