In [62]:
import requests
import pandas as pd
import time
from joblib import Parallel, delayed
from tqdm import tqdm
import pandas as pd

In [63]:
authors_data = pd.read_csv("authors_data1.csv")
author_id_list = authors_data['id'].tolist()
author_id_list = [url.split('/')[-1] for url in author_id_list]
author_id_list

# --------------
# or use your own author ID list here
# --------------

FileNotFoundError: [Errno 2] No such file or directory: 'authors_data1.csv'

In [64]:
df_author = pd.read_csv("test.csv", index_col=0)
df_author = df_author[(df_author["works_count"]>5) & (df_author["works_count"]<5000)]
df_author.dropna(inplace=True)
author_id_list = [item.replace("https://openalex.org/","")for item in df_author["id"]] 
author_id_list = list(set(author_id_list))
len(author_id_list), author_id_list

(913,
 ['A5012739163',
  'A5070072188',
  'A5045356775',
  'A5030255458',
  'A5015080044',
  'A5101890757',
  'A5043228682',
  'A5066733214',
  'A5043162191',
  'A5047584056',
  'A5029821479',
  'A5072263832',
  'A5026909061',
  'A5037923102',
  'A5043896739',
  'A5049693274',
  'A5046908604',
  'A5113461179',
  'A5076193392',
  'A5067625837',
  'A5004836032',
  'A5100756799',
  'A5081756270',
  'A5029768249',
  'A5089178335',
  'A5044856388',
  'A5062199735',
  'A5054806059',
  'A5037938374',
  'A5063367289',
  'A5100376896',
  'A5076145498',
  'A5006566063',
  'A5005775991',
  'A5028392704',
  'A5079825006',
  'A5011857931',
  'A5021905307',
  'A5019373396',
  'A5007166081',
  'A5086453253',
  'A5088460626',
  'A5113325281',
  'A5038897083',
  'A5042418453',
  'A5080791781',
  'A5082353608',
  'A5072281431',
  'A5022573878',
  'A5003636926',
  'A5067816317',
  'A5006314379',
  'A5048877432',
  'A5008373524',
  'A5024378198',
  'A5040651832',
  'A5048428275',
  'A5070221432',
  'A5026

In [65]:
# Define Concepts
# can find all concept ID's at: https://api.openalex.org/concepts
concepts1 = ['C144024400', 'C15744967', 'C162324750', 'C17744445']
concepts2 = ['C33923547', 'C121332964', 'C41008148']
# EMAIL = "s204120@dtu.dk"

# Function to generate OpenAlex API request URL
def URL_filter(author_ids):
    base_url = "https://api.openalex.org/works"
    author_filter = f"author.id:{'|'.join(author_ids)}"
    concepts1_filter = '|'.join(concepts1)
    concepts2_filter = '|'.join(concepts2)
    filters = (
        "?filter="
        f"{author_filter},"
        f"cited_by_count:>10,"
        f"concepts.id:({concepts1_filter}),"
        f"concepts.id:({concepts2_filter})"
    )
    
    return base_url + filters

# Function to fetch works for a given batch of authors
def fetch_author_works(author_ids, max_results=200, cursor="*"):
    all_works = []
    
    while cursor:
        url = URL_filter(author_ids)
        params = {
            'per_page': max_results, 
            'cursor': cursor,  
            # 'mailto': EMAIL,
        }

        response = requests.get(url, params=params)
        if response.status_code == 429:
            time.sleep(1)
            response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.text}")
            continue  # Skip this name and move to the next one
        data = response.json()

        if 'results' in data and data['results']:
            all_works.extend(data['results'])
        cursor = data.get('meta', {}).get('next_cursor', None)

    return all_works

# Function to extract relevant details from works
def extract_work_details(works):
    works_data = []
    for work in works:
        work_info = {
            'id': work.get('id', 'N/A'),
            'publication_year': work.get('publication_year', 'N/A'),
            'cited_by_count': work.get('cited_by_count', 0),
            'author_ids': [author['author'].get('id', 'N/A') for author in work.get('authorships', [])],
            'title': work.get('title', 'N/A'),
            'abstract_inverted_index': work.get('abstract_inverted_index', 'N/A'),
        }
        works_data.append(work_info)
    return works_data


def fetch_and_process_batch(chunk):
    works = fetch_author_works(chunk)
    return extract_work_details(works)

#Batch size = how many authors per API request, n_jobs are parallel threads.
def Request_batching(author_ids, batch_size=25, n_jobs=9):
    chunks = [author_ids[i:i + batch_size] for i in range(0, len(author_ids), batch_size)]
    
    all_works_data = []
    with tqdm(total=len(chunks), desc="Fetching batches") as pbar:
        for i in range(0, len(chunks), n_jobs):  # Process in groups of 10
            batch_chunks = chunks[i:i + n_jobs]  # Take up to 10 batches
            results = Parallel(n_jobs=n_jobs)(
                delayed(fetch_and_process_batch)(chunk) for chunk in batch_chunks
            )
            
            for batch in results:
                all_works_data.extend(batch)
            
            pbar.update(len(batch_chunks))  

    return all_works_data

all_works_data = Request_batching(author_id_list)

df_work = pd.DataFrame(all_works_data)

Fetching batches: 100%|██████████| 37/37 [00:37<00:00,  1.03s/it]


In [66]:
df_work.shape

(12337, 6)

In [67]:
df_work = df_work.drop_duplicates(subset=["id"])
df_work

Unnamed: 0,id,publication_year,cited_by_count,author_ids,title,abstract_inverted_index
0,https://openalex.org/W2110620844,2010,1862,"[https://openalex.org/A5059236838, https://ope...",Link communities reveal multiscale complexity ...,
1,https://openalex.org/W2402700,2021,1324,"[https://openalex.org/A5102962995, https://ope...",Predicting Depression via Social Media,"{'Major': [0], 'depression': [1, 20, 137, 197]..."
2,https://openalex.org/W2916904544,2019,1065,"[https://openalex.org/A5057541630, https://ope...",Guidelines for Human-AI Interaction,"{'Advances': [0], 'in': [1, 20, 36, 40, 49, 10..."
3,https://openalex.org/W4360836968,2023,976,"[https://openalex.org/A5021842384, https://ope...",Sparks of Artificial General Intelligence: Ear...,"{'Artificial': [0], 'intelligence': [1, 99, 19..."
4,https://openalex.org/W2122841972,2005,832,"[https://openalex.org/A5032417423, https://ope...",Personalizing search via automated analysis of...,"{'We': [0, 66, 138], 'formulate': [1], 'and': ..."
...,...,...,...,...,...,...
12330,https://openalex.org/W2124125807,2007,15,"[https://openalex.org/A5035349918, https://ope...",COMMUNITY DYNAMICS IN SOCIAL NETWORKS,"{'We': [0, 72], 'study': [1], 'the': [2, 13, 2..."
12333,https://openalex.org/W1591233664,1999,22,"[https://openalex.org/A5082948032, https://ope...",Building an adaptive multimedia system using t...,
12334,https://openalex.org/W1996496567,1992,17,"[https://openalex.org/A5038976962, https://ope...",Three-dimensional Toom model: Connection to th...,"{'A': [0], 'three-dimensional': [1], 'Toom': [..."
12335,https://openalex.org/W2008318688,2001,14,"[https://openalex.org/A5015642017, https://ope...",Modeling relaxation and jamming in granular media,"{'We': [0], 'introduce': [1], 'a': [2, 20, 66,..."


Extracting them into seperate datasets

In [68]:
IC2S2_papers = df_work[["id","publication_year","cited_by_count", "author_ids"]]
IC2S2_abstracts = df_work[["id","title", "abstract_inverted_index"]]
IC2S2_papers = IC2S2_papers.dropna()
IC2S2_papers

Unnamed: 0,id,publication_year,cited_by_count,author_ids
0,https://openalex.org/W2110620844,2010,1862,"[https://openalex.org/A5059236838, https://ope..."
1,https://openalex.org/W2402700,2021,1324,"[https://openalex.org/A5102962995, https://ope..."
2,https://openalex.org/W2916904544,2019,1065,"[https://openalex.org/A5057541630, https://ope..."
3,https://openalex.org/W4360836968,2023,976,"[https://openalex.org/A5021842384, https://ope..."
4,https://openalex.org/W2122841972,2005,832,"[https://openalex.org/A5032417423, https://ope..."
...,...,...,...,...
12330,https://openalex.org/W2124125807,2007,15,"[https://openalex.org/A5035349918, https://ope..."
12333,https://openalex.org/W1591233664,1999,22,"[https://openalex.org/A5082948032, https://ope..."
12334,https://openalex.org/W1996496567,1992,17,"[https://openalex.org/A5038976962, https://ope..."
12335,https://openalex.org/W2008318688,2001,14,"[https://openalex.org/A5015642017, https://ope..."


In [69]:
IC2S2_abstracts = IC2S2_abstracts.dropna()
IC2S2_abstracts

Unnamed: 0,id,title,abstract_inverted_index
1,https://openalex.org/W2402700,Predicting Depression via Social Media,"{'Major': [0], 'depression': [1, 20, 137, 197]..."
2,https://openalex.org/W2916904544,Guidelines for Human-AI Interaction,"{'Advances': [0], 'in': [1, 20, 36, 40, 49, 10..."
3,https://openalex.org/W4360836968,Sparks of Artificial General Intelligence: Ear...,"{'Artificial': [0], 'intelligence': [1, 99, 19..."
4,https://openalex.org/W2122841972,Personalizing search via automated analysis of...,"{'We': [0, 66, 138], 'formulate': [1], 'and': ..."
5,https://openalex.org/W2036625316,A diary study of task switching and interruptions,"{'We': [0, 47], 'report': [1], 'on': [2], 'a':..."
...,...,...,...
12327,https://openalex.org/W2885796313,"Mandarin-Speaking, Kindergarten-Aged Children ...","{'Purpose': [0], 'The': [1, 31], 'purpose': [2..."
12328,https://openalex.org/W3127579382,Social network structure and composition in fo...,"{'Abstract': [0], 'Social': [1], 'networks': [..."
12330,https://openalex.org/W2124125807,COMMUNITY DYNAMICS IN SOCIAL NETWORKS,"{'We': [0, 72], 'study': [1], 'the': [2, 13, 2..."
12334,https://openalex.org/W1996496567,Three-dimensional Toom model: Connection to th...,"{'A': [0], 'three-dimensional': [1], 'Toom': [..."


In [70]:
IC2S2_papers.to_csv("IC2S2_papers.csv")
IC2S2_abstracts.to_csv("IC2S2_abstracts.csv")

## Co-author

First extract all author_ids and find all uniques (not in previous list)

In [71]:
co_authors = []
for i_list in IC2S2_papers["author_ids"].to_list():
    for id in i_list:
        id = id.replace("https://openalex.org/", "")
        if id not in author_id_list:
            co_authors.append(id)
set(co_authors), len(set(co_authors))

({'A5091040194',
  'A5086960085',
  'A5113599845',
  'A5016645705',
  'A5054422803',
  'A5019877531',
  'A5091290326',
  'A5036669088',
  'A5100341196',
  'A5109000592',
  'A5076260624',
  'A5113818031',
  'A5039681359',
  'A5048425098',
  'A5016246327',
  'A5012881028',
  'A5102104312',
  'A5089902996',
  'A5068059558',
  'A5014792235',
  'A5059188751',
  'A5053378150',
  'A5082775596',
  'A5081618494',
  'A5101784712',
  'A5051355254',
  'A5005627910',
  'A5114072975',
  'A5075493826',
  'A5087629528',
  'A5009379263',
  'A5081241112',
  'A5056574916',
  'A5059256506',
  'A5056169129',
  'A5109859076',
  'A5089241435',
  'A5039777967',
  'A5088968784',
  'A5032434500',
  'A5034142550',
  'A5029317616',
  'A5108524530',
  'A5016354542',
  'A5066556110',
  'A5052026918',
  'A5060005215',
  'A5046155626',
  'A5100727381',
  'A5108651218',
  'A5058724999',
  'A5001775141',
  'A5005779176',
  'A5062910836',
  'A5037340562',
  'A5052308997',
  'A5032489175',
  'A5100434570',
  'A5030951161

co_authors works

In [72]:
all_works_data = Request_batching(co_authors)

df_co_work = pd.DataFrame(all_works_data)

Fetching batches:   1%|          | 9/1598 [00:20<1:01:16,  2.31s/it]


KeyboardInterrupt: 

In [None]:
df_co_work.shape

(850433, 6)

In [None]:
df_co_work.drop_duplicates(subset=["id"])
df_co_work.to_csv("co_author_works.csv")

Then we edit it to get the author information

In [73]:

# Function to generate OpenAlex API request URL
def URL_filter(author_ids):
    base_url = "https://api.openalex.org/authors"
    author_filter = f"id:{'|'.join(author_ids)}"
    # sort = "&sort=relevance_score:desc"
    filters = (
        "?filter="
        f"{author_filter}"
    )
    
    return base_url + filters

# Function to fetch works for a given batch of authors
def fetch_author(author_ids, max_results=200, cursor="*"):
    all_works = []
    
    while cursor:
        url = URL_filter(author_ids)
        params = {
            'per_page': max_results, 
            'cursor': cursor,
        }

        response = requests.get(url, params=params)
        if response.status_code == 429:
            time.sleep(1)
            response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.text}")
            continue  # Skip this name and move to the next one
        data = response.json()

        if 'results' in data and data['results']:
            all_works.extend(data['results'])
        cursor = data.get('meta', {}).get('next_cursor', None)

    return all_works

# Function to extract relevant details from works
def extract_author_details(authors):
    data_list = []
    for person in authors:
        data_info = {
                "id": person.get("id"),
                "display_name": person.get("display_name"),
                "works_api_url": person.get("works_api_url"),
                "works_count": person.get("works_count"),
                "cited_by_count": person.get("cited_by_count"),
                "h_index": person.get("summary_stats", {}).get("h_index", None),
                "country_code": (person.get("affiliations", [{}])[0].get("institution", {}).get("country_code", None) 
                         if person.get("affiliations") else None)
            }
        data_list.append(data_info)
    return data_list


def fetch_and_process_batch(chunk):
    authors = fetch_author(chunk)
    return extract_author_details(authors)

#Batch size = how many authors per API request, n_jobs are parallel threads.
def Request_batching(author_ids, batch_size=25, n_jobs=9):
    chunks = [author_ids[i:i + batch_size] for i in range(0, len(author_ids), batch_size)]
    
    all_author_data = []
    with tqdm(total=len(chunks), desc="Fetching batches") as pbar:
        for i in range(0, len(chunks), n_jobs):  # Process in groups of 10
            batch_chunks = chunks[i:i + n_jobs]  # Take up to 10 batches
            results = Parallel(n_jobs=n_jobs)(
                delayed(fetch_and_process_batch)(chunk) for chunk in batch_chunks
            )
            
            for batch in results:
                all_author_data.extend(batch)
            
            pbar.update(len(batch_chunks))   

    return all_author_data

all_author_data = Request_batching(co_authors)

df_co_author = pd.DataFrame(all_author_data)
# fetch_author_works([co_authors[0],co_authors[1]])


Fetching batches:  19%|█▉        | 306/1598 [01:31<06:27,  3.34it/s]


KeyboardInterrupt: 

In [None]:
df_co_author

Unnamed: 0,id,display_name,works_api_url,works_count,cited_by_count,h_index,country_code
0,https://openalex.org/A5087421071,Gary King,https://api.openalex.org/works?filter=author.i...,2344,74081,101,US
1,https://openalex.org/A5065393757,Michael Wooldridge,https://api.openalex.org/works?filter=author.i...,514,29165,64,GB
2,https://openalex.org/A5075915553,Stephen McAdams,https://api.openalex.org/works?filter=author.i...,458,13082,55,CA
3,https://openalex.org/A5075562557,Peter McBurney,https://api.openalex.org/works?filter=author.i...,349,8244,42,GB
4,https://openalex.org/A5072930140,Jon A. Krosnick,https://api.openalex.org/works?filter=author.i...,334,35727,93,US
...,...,...,...,...,...,...,...
38720,https://openalex.org/A5032062810,Isabel Diersen,https://api.openalex.org/works?filter=author.i...,2,12,1,
38721,https://openalex.org/A5101457009,Soumen Chakrabarti,https://api.openalex.org/works?filter=author.i...,273,12813,52,IN
38722,https://openalex.org/A5068021191,Andrew Tomkins,https://api.openalex.org/works?filter=author.i...,181,23798,54,US
38723,https://openalex.org/A5081744093,Lars Bäckström,https://api.openalex.org/works?filter=author.i...,57,12746,34,IL


In [None]:
df_co_author.drop_duplicates(subset=["id"])

In [None]:
df_co_author.to_csv("df_co_author.csv")