In [1]:
import pandas as pd
import pickle
import time
from scholarly import scholarly

In [2]:
# Base directory for the notebook's input and output files, relative to the
# notebook's working directory. File paths below are built by plain f-string
# concatenation, so the trailing slash is required.
DATA_FOLDER = '../../data/'

In [7]:
# Load the pickled author records. The records are iterated further down and
# each one is indexed by the 'affiliation' and 'author_id' keys.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load files produced by a trusted source.
with open(f'{DATA_FOLDER}authors.pickle', 'rb') as handle:
    authors = pickle.load(handle)

In [8]:
import spacy

# Load the spaCy "en_core_web_sm" pipeline once at module level; the
# extract_organizations function below reads this global `nlp` object and
# relies on its named-entity output (entities labelled "ORG").
# NOTE(review): the model package must be installed in the kernel environment.
nlp = spacy.load("en_core_web_sm")

def extract_organizations(affiliation):
    """Extract organization names from an author's affiliation text.

    Runs the module-level spaCy pipeline ``nlp`` over ``affiliation`` and
    collects the text of every named entity labelled ``ORG``.

    Args:
        affiliation: Free-text affiliation value of an author record.
            ``None`` and empty or whitespace-only strings are treated as
            "no affiliation" and are not passed to the pipeline.

    Returns:
        list[str]: Texts of the ``ORG`` entities, in the order they appear in
        ``doc.ents``. Duplicates are kept. The list is empty when no such
        entity is found or when the affiliation is missing/blank.
    """
    # Nothing to analyse: skip the pipeline for missing or blank affiliations
    # instead of letting nlp(None) raise.
    if affiliation is None:
        return []
    if isinstance(affiliation, str) and not affiliation.strip():
        return []

    doc = nlp(affiliation)
    # Keep only the entities the NER component labelled as organizations.
    return [ent.text for ent in doc.ents if ent.label_ == "ORG"]

In [9]:
def generate_organizations(authors, verbose=True):
    """Build a table of (author_id, organization) pairs from author records.

    For every author whose affiliation is known, the affiliation text is
    passed to ``extract_organizations`` and one row is produced per
    organization returned.

    Args:
        authors: Iterable of mapping-like author records. Each record must
            provide ``'author_id'``; ``'affiliation'`` is optional.
        verbose: When true (the default), print one progress line per
            processed author. Error messages are printed regardless.

    Returns:
        pandas.DataFrame: Columns ``author_id`` and ``organization``, one row
        per extracted organization. Empty (but with both columns) when
        nothing was extracted.

    Raises:
        KeyError: If a record has no ``'author_id'`` key.
    """
    rows = []

    for author in authors:
        author_id = author['author_id']
        # A record without an 'affiliation' key, or with a None value, is
        # handled like the 'Unknown' sentinel: skipped rather than aborting
        # the whole run with a KeyError.
        affiliation = author.get('affiliation')
        if affiliation is None or affiliation == 'Unknown':
            continue

        try:
            if verbose:
                print(f'Extracting organizations for author with id {author_id}...')
            organizations = extract_organizations(affiliation)
            rows.extend((author_id, org) for org in organizations)
        except Exception as e:
            # Deliberately best-effort: report the failing author and carry on
            # with the remaining records.
            print(f"Error processing author with id {author_id}: {e}")

    return pd.DataFrame(rows, columns=["author_id", "organization"])
        

# Extract and store organizations from the author-affiliation field

In [10]:
# Run the extraction over all loaded author records; this prints a progress
# line for each author whose affiliation is not 'Unknown'.
author_organizations_df = generate_organizations(authors)

Extracting organizations for author with id zkBXb_kAAAAJ...
Extracting organizations for author with id EHvA-IUAAAAJ...
Extracting organizations for author with id 5RoxYhkAAAAJ...
Extracting organizations for author with id ap3FfWEAAAAJ...
Extracting organizations for author with id uxiJL_cAAAAJ...
Extracting organizations for author with id QV_7inYAAAAJ...
Extracting organizations for author with id JNrJo8cAAAAJ...
Extracting organizations for author with id o70NT8IAAAAJ...
Extracting organizations for author with id ZeGca3cAAAAJ...
Extracting organizations for author with id o3DdNZMAAAAJ...
Extracting organizations for author with id Ydo9ResAAAAJ...
Extracting organizations for author with id 4IRe3WYAAAAJ...
Extracting organizations for author with id eKtUwa4AAAAJ...
Extracting organizations for author with id dYi-SW8AAAAJ...
Extracting organizations for author with id -Hz8RP0AAAAJ...
Extracting organizations for author with id t-hrVHwAAAAJ...
Extracting organizations for author with

In [11]:
# Display the extracted (author_id, organization) pairs for inspection.
author_organizations_df

Unnamed: 0,author_id,organization
0,zkBXb_kAAAAJ,Biomedical Informatics
1,zkBXb_kAAAAJ,Shandong University
2,EHvA-IUAAAAJ,Tianjin University
3,EHvA-IUAAAAJ,Tsinghua University
4,EHvA-IUAAAAJ,City University of Hong Kong
...,...,...
111,1wloHDIAAAAJ,City University of Hong Kong
112,Tc_U_9YAAAAJ,Amazon.com
113,jV50Ks8AAAAJ,"Biostatistics, University of Michigan"
114,QVJvfz8AAAAJ,Computer Science and Engineeing


In [19]:
# Persist the author-to-organization pairs as a Parquet file in DATA_FOLDER.
author_organizations_df.to_parquet(f'{DATA_FOLDER}author_organizations.parquet')

In [24]:
# Build a one-column frame of the distinct organization strings found above.
# Values are deduplicated by exact string match only, so differently spelled
# variants of the same organization remain separate rows.
organizations_df = pd.DataFrame(author_organizations_df['organization'].unique(), columns=['organization'])

In [25]:
# Display the distinct organization names for inspection.
organizations_df

Unnamed: 0,organization
0,Biomedical Informatics
1,Shandong University
2,Tianjin University
3,Tsinghua University
4,City University of Hong Kong
...,...
105,Data Scientist
106,Amazon.com
107,"Biostatistics, University of Michigan"
108,Computer Science and Engineeing


In [26]:
# Persist the distinct organization names as a Parquet file in DATA_FOLDER.
organizations_df.to_parquet(f'{DATA_FOLDER}organizations.parquet')