In [1]:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import time
import pandas as pd
import sqlite3

In [2]:
def create_session():
    """Build a ``requests`` session preconfigured for polite, retrying API access.

    The session retries GET requests up to five times (backoff factor 1) when
    the server answers 429, 500, 502, 503 or 504, and identifies the caller
    through ``User-Agent`` and ``From`` headers.

    Returns:
        requests.Session: the configured session.
    """
    # One adapter instance, carrying the retry policy, serves both schemes.
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=5,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET"],
        )
    )

    http = requests.Session()
    for scheme in ("https://", "http://"):
        http.mount(scheme, retrying_adapter)

    # Contact details sent with every request made through this session.
    identity_headers = {
        'User-Agent': 'UTAustinResearch/1.0 (mailto:robert.stein@utexas.edu)',
        'From': 'robert.stein@utexas.edu',
    }
    http.headers.update(identity_headers)

    return http

In [3]:
def clean_id(value):
    """Reduce an ID-bearing URL to its bare identifier.

    Strings are cut down to the text after their last ``/``. If that tail
    begins with ``works?``, everything up to and including the first
    ``works?filter=cites:`` marker is removed as well. Non-string values are
    returned unchanged.
    """
    if isinstance(value, str):
        # rpartition leaves the whole string in `tail` when there is no '/'.
        _, _, tail = value.rpartition('/')
        if tail.startswith('works?'):
            _, marker, remainder = tail.partition('works?filter=cites:')
            if marker:
                tail = remainder
        return tail
    return value

In [4]:
# Define all functions
def fetch_cited_by(url, params=None):
    """Download all pages of a "cited by" listing into a single DataFrame.

    Args:
        url: Listing URL to page through (the seed's ``cited_by_api_url``).
        params: Optional extra query parameters. The mapping is copied, so the
            caller's dict is never modified. ``per_page`` and ``page`` are
            always set by this function.

    Returns:
        pandas.DataFrame: one row per result, flattened with ``_`` as the
        separator, plus the raw ``abstract_inverted_index`` and a ``seed_id``
        column holding the last ``/``-separated segment of ``url``. An empty
        DataFrame is returned when no results were found.

    Raises:
        requests.HTTPError: propagated from ``raise_for_status()``.
    """
    page_size = 100       # items requested per page
    timeout_s = 30        # seconds; without a timeout a stalled request blocks forever
    # NOTE(review): OpenAlex documents page-number paging as limited to the
    # first 10,000 results; confirm against the current API docs.
    paging_limit = 10000

    query = dict(params) if params else {}  # copy: never mutate the caller's dict
    query['per_page'] = page_size
    data_frames = []
    page = 1
    work_id = url.split('/')[-1]  # extract work id from URL

    with create_session() as session:
        while True:
            query['page'] = page

            time.sleep(1)  # stay well inside the API's rate limit

            response = session.get(url, params=query, timeout=timeout_s)
            response.raise_for_status()
            data = response.json()
            page_results = data.get('results') or []
            if page_results:
                # Pull abstracts out before flattening so the inverted index
                # stays a single dict column instead of one column per token.
                abstracts = [r.pop('abstract_inverted_index', None) for r in page_results]
                df_page = pd.json_normalize(page_results, sep='_')
                df_page['abstract_inverted_index'] = abstracts
                # Tag each record with the seed_id
                df_page['seed_id'] = work_id
                data_frames.append(df_page)

            meta = data.get('meta') or {}
            # Honour an explicit page count if the server sends one; otherwise
            # derive it from the total result count and the page size the
            # server reports (falling back to the size we asked for).
            total_pages = meta.get('page_count')
            if total_pages is None:
                effective_size = meta.get('per_page') or page_size
                total_results = meta.get('count') or 0
                if page == 1 and total_results > paging_limit:
                    print(
                        f"Warning: {total_results} results for {work_id}; "
                        f"only the first {paging_limit} are fetched"
                    )
                reachable = min(total_results, paging_limit)
                total_pages = max(1, -(-reachable // effective_size))  # ceiling division

            print(f"Fetched page {page} of {total_pages} for cited_by work {work_id}")
            # An empty page also ends the loop so a wrong count cannot spin forever.
            if page >= total_pages or not page_results:
                break
            page += 1

    return pd.concat(data_frames, ignore_index=True) if data_frames else pd.DataFrame()

def fetch_ref(chunks, params=None):
    """Download the works named in each chunk of referenced-work IDs.

    Args:
        chunks: Iterable of ``(seed_id, chunk)`` pairs, where ``chunk`` is a
            ``|``-joined string of work IDs placed in an ``ids.openalex``
            filter.
        params: Optional extra query parameters. The mapping is copied, so the
            caller's dict is never modified. ``per_page`` and ``page`` are
            always set by this function.

    Returns:
        pandas.DataFrame: one row per result, flattened with ``_`` as the
        separator, plus the raw ``abstract_inverted_index`` and the
        ``seed_id`` of the chunk it came from. An empty DataFrame is returned
        when there were no chunks or no results.

    Raises:
        requests.HTTPError: propagated from ``raise_for_status()``.
    """
    page_size = 50        # items per page for referenced works
    timeout_s = 30        # seconds; without a timeout a stalled request blocks forever
    # NOTE(review): OpenAlex documents page-number paging as limited to the
    # first 10,000 results; confirm against the current API docs.
    paging_limit = 10000

    query = dict(params) if params else {}  # copy: never mutate the caller's dict
    query['per_page'] = page_size
    data_frames = []

    with create_session() as session:
        for seed_id, chunk in chunks:
            # Build the URL using the filter field "ids.openalex"
            url = f"https://api.openalex.org/works?filter=ids.openalex:{chunk}"
            page = 1
            while True:
                time.sleep(1)  # stay well inside the API's rate limit
                query['page'] = page
                response = session.get(url, params=query, timeout=timeout_s)
                response.raise_for_status()
                data = response.json()
                page_results = data.get('results') or []
                if page_results:
                    # Pull abstracts out before flattening so the inverted
                    # index stays a single dict column.
                    abstracts = [r.pop('abstract_inverted_index', None) for r in page_results]
                    df_page = pd.json_normalize(page_results, sep='_')
                    df_page['abstract_inverted_index'] = abstracts
                    df_page['seed_id'] = seed_id
                    data_frames.append(df_page)

                meta = data.get('meta') or {}
                # Honour an explicit page count if the server sends one;
                # otherwise derive it from the total result count and the
                # page size the server reports.
                total_pages = meta.get('page_count')
                if total_pages is None:
                    effective_size = meta.get('per_page') or page_size
                    total_results = meta.get('count') or 0
                    reachable = min(total_results, paging_limit)
                    total_pages = max(1, -(-reachable // effective_size))  # ceiling division

                print(f"Fetched page {page} of {total_pages} for chunk: {chunk}")
                # An empty page also ends the loop so a wrong count cannot spin forever.
                if page >= total_pages or not page_results:
                    break
                page += 1

    return pd.concat(data_frames, ignore_index=True) if data_frames else pd.DataFrame()

def get_seed_data(seed_id):
    """Fetch the OpenAlex work record for one seed paper.

    The request goes through ``create_session()`` so it gets the same retry
    policy and identifying headers as the other API calls in this notebook,
    and it carries a timeout so a stalled connection cannot hang the run.

    Args:
        seed_id: Work identifier appended to ``https://api.openalex.org/works/``.

    Returns:
        dict: the decoded JSON body of the response.

    Raises:
        requests.HTTPError: propagated from ``raise_for_status()``.
    """
    seed_url = f"https://api.openalex.org/works/{seed_id}"
    with create_session() as session:
        response = session.get(seed_url, timeout=30)
        response.raise_for_status()
        return response.json()

def process_seed(seed_id):
    """Collect the referenced and citing works of a single seed paper.

    Args:
        seed_id: Identifier of the seed work.

    Returns:
        tuple: ``(df_ref, df_cited)`` where ``df_ref`` is whatever
        ``fetch_ref`` returns for the seed's referenced works and ``df_cited``
        is the result of ``fetch_cited_by`` (an empty DataFrame when the seed
        record has no ``cited_by_api_url``).
    """
    record = get_seed_data(seed_id)

    # Split the reference list into groups of 50, each joined with '|'
    # and paired with the seed it belongs to.
    reference_ids = record.get('referenced_works', [])
    batches = []
    for start in range(0, len(reference_ids), 50):
        joined = "|".join(reference_ids[start:start + 50])
        batches.append((seed_id, joined))

    print(f"Processing referenced works for seed {seed_id}")
    referenced_frame = fetch_ref(batches)

    # Citing works come from the listing URL stored on the seed record.
    citing_url = record.get('cited_by_api_url')
    if not citing_url:
        print(f"No cited_by_api_url found for seed {seed_id}")
        return referenced_frame, pd.DataFrame()

    print(f"Processing cited_by works for seed {seed_id}")
    return referenced_frame, fetch_cited_by(citing_url)

def process_seeds(seed_ids):
    """Run ``process_seed`` for every seed and combine the results.

    Per-seed frames are collected in lists and concatenated once at the end,
    instead of re-concatenating a growing frame on every iteration. Empty
    frames are skipped so they cannot affect the dtypes of the result.

    Args:
        seed_ids: Iterable of seed work identifiers.

    Returns:
        pandas.DataFrame: referenced works followed by citing works, with a
        ``relationship`` column set to ``'referenced'`` or ``'cited_by'``.
        When nothing was found the frame has no rows but still has the
        ``relationship`` column.
    """
    def _stack(frames):
        # Concatenate the non-empty frames; an empty DataFrame if there are none.
        non_empty = [frame for frame in frames if not frame.empty]
        if not non_empty:
            return pd.DataFrame()
        return pd.concat(non_empty, ignore_index=True)

    ref_frames = []
    cited_frames = []
    for seed_id in seed_ids:
        print(f"\n=== Processing seed paper {seed_id} ===")
        df_ref, df_cited = process_seed(seed_id)
        ref_frames.append(df_ref)
        cited_frames.append(df_cited)

    # Tag the type of relationship for later chaining
    master_ref_df = _stack(ref_frames).assign(relationship='referenced')
    master_cited_df = _stack(cited_frames).assign(relationship='cited_by')

    # Combine both DataFrames
    parts = [frame for frame in (master_ref_df, master_cited_df) if not frame.empty]
    if not parts:
        # No rows at all: keep the 'relationship' column so callers see a stable schema.
        return master_ref_df
    return pd.concat(parts, ignore_index=True)

In [5]:
def reconstruct_text(inverted_index):
    """Rebuild plain text from a token -> positions mapping.

    Each token is written into every position listed for it, and the filled
    positions are then joined with single spaces. ``pd.NA`` is returned when
    the input is not a non-empty dict, when it lists no positions, when the
    rebuilt text is empty, or when any error occurs while rebuilding (the
    error message is printed in that case).
    """
    if not (isinstance(inverted_index, dict) and inverted_index):
        return pd.NA

    try:
        # Gather every position first to learn how long the text is.
        seen_positions = []
        for position_list in inverted_index.values():
            seen_positions.extend(position_list)
        if len(seen_positions) == 0:
            return pd.NA

        last_slot = max(seen_positions)
        slots = [None] * (last_slot + 1)

        for word, position_list in inverted_index.items():
            for where in position_list:
                if where <= last_slot:
                    slots[where] = word

        # Unfilled slots and empty tokens are left out of the joined text.
        text = ' '.join(word for word in slots if word)
        return text or pd.NA

    except Exception as e:
        print(f"Error reconstructing abstract: {str(e)}")
        return pd.NA

In [6]:
# First level: run citation chaining on the original seed papers.
# The list below is passed to process_seeds() and the combined result is kept
# as first_level_df for the cells that follow.
seed_ids = ['W2001526706']  # Replace or add your seed paper IDs here
# Seed IDs noted by the author:
#   'W85815303'   Immerwahr & Foleno (2000)
#   'W2001526706' Doyle (2016)
first_level_df = process_seeds(seed_ids)
print('DONE!')


=== Processing seed paper W2001526706 ===
Processing referenced works for seed W2001526706
Fetched page 1 of 1 for chunk: https://openalex.org/W105607339|https://openalex.org/W124952081|https://openalex.org/W139109474|https://openalex.org/W1481749143|https://openalex.org/W151672427|https://openalex.org/W1558393125|https://openalex.org/W162456649|https://openalex.org/W1717966377|https://openalex.org/W195552800|https://openalex.org/W1977015532|https://openalex.org/W1977825494|https://openalex.org/W1988323515|https://openalex.org/W1990244805|https://openalex.org/W1995278230|https://openalex.org/W2016382728|https://openalex.org/W2018291456|https://openalex.org/W2037927976|https://openalex.org/W2073116532|https://openalex.org/W2101285173|https://openalex.org/W2115074743|https://openalex.org/W2116399772|https://openalex.org/W2128229925|https://openalex.org/W2128416303|https://openalex.org/W2129345644|https://openalex.org/W2160460682|https://openalex.org/W2192791544|https://openalex.org/W228

In [7]:

# Extract unique work IDs from the first-level results (the 'id' column holds
# the OpenAlex work ID). You can also filter by relationship if needed.
new_seed_ids = first_level_df['id'].dropna().unique().tolist()

# Exclude the original seeds from the second-level seeds.
# The 'id' column holds full URLs (https://openalex.org/W...) while seed_ids
# may hold bare IDs (W...), so both sides are reduced to the text after the
# last '/' before comparing; a direct comparison would never match.
original_seed_keys = {str(sid).rsplit('/', 1)[-1] for sid in seed_ids}
second_level_seed_ids = [
    sid for sid in new_seed_ids
    if str(sid).rsplit('/', 1)[-1] not in original_seed_keys
]

print(f"Second-level seed IDs (new seeds): {second_level_seed_ids}")
print('DONE!')

Second-level seed IDs (new seeds): ['https://openalex.org/W1977825494', 'https://openalex.org/W2037927976', 'https://openalex.org/W2016382728', 'https://openalex.org/W2329648039', 'https://openalex.org/W2115074743', 'https://openalex.org/W1988323515', 'https://openalex.org/W1481749143', 'https://openalex.org/W1558393125', 'https://openalex.org/W645974420', 'https://openalex.org/W1977015532', 'https://openalex.org/W2319073104', 'https://openalex.org/W2018291456', 'https://openalex.org/W2101285173', 'https://openalex.org/W2160460682', 'https://openalex.org/W2116399772', 'https://openalex.org/W2129345644', 'https://openalex.org/W3128636248', 'https://openalex.org/W2320813392', 'https://openalex.org/W1717966377', 'https://openalex.org/W2192791544', 'https://openalex.org/W638958551', 'https://openalex.org/W1995278230', 'https://openalex.org/W2929511096', 'https://openalex.org/W151672427', 'https://openalex.org/W85815303', 'https://openalex.org/W2128416303', 'https://openalex.org/W1990244805

In [8]:
# Second level: repeat the citation chaining, seeded with the work IDs
# collected from the first-level results (second_level_seed_ids).
# NOTE(review): process_seeds() issues API requests for every ID in the list,
# each preceded by a one-second sleep, so this cell can run for a long time.
second_level_df = process_seeds(second_level_seed_ids)
print('DONE!')


=== Processing seed paper https://openalex.org/W1977825494 ===
Processing referenced works for seed https://openalex.org/W1977825494
Processing cited_by works for seed https://openalex.org/W1977825494
Fetched page 1 of 1 for cited_by work works?filter=cites:W1977825494

=== Processing seed paper https://openalex.org/W2037927976 ===
Processing referenced works for seed https://openalex.org/W2037927976
Processing cited_by works for seed https://openalex.org/W2037927976
Fetched page 1 of 1 for cited_by work works?filter=cites:W2037927976

=== Processing seed paper https://openalex.org/W2016382728 ===
Processing referenced works for seed https://openalex.org/W2016382728
Processing cited_by works for seed https://openalex.org/W2016382728
Fetched page 1 of 1 for cited_by work works?filter=cites:W2016382728

=== Processing seed paper https://openalex.org/W2329648039 ===
Processing referenced works for seed https://openalex.org/W2329648039
Processing cited_by works for seed https://openalex.o

  master_ref_df['relationship'] = 'referenced'
  master_cited_df['relationship'] = 'cited_by'


In [9]:
# Merge both chaining levels into one frame.
merged = pd.concat([first_level_df, second_level_df], ignore_index=True)

# Source columns that are replaced by the shorter citation_top_* flags below.
percentile_flag_cols = [
    'citation_normalized_percentile_is_in_top_1_percent',
    'citation_normalized_percentile_is_in_top_10_percent',
]

# Clean in a single pipeline:
#   1. drop retracted works and works nobody cites,
#   2. reduce id / seed_id to bare identifiers,
#   3. derive the boolean top-percentile flags and the plain-text abstract,
#   4. drop the replaced source columns and any column that is entirely missing.
merged_clean = (
    merged
    .query("is_retracted != True")
    .loc[lambda df: df['cited_by_count'] > 0]
    .assign(
        id=lambda df: df['id'].apply(clean_id),
        seed_id=lambda df: df['seed_id'].apply(clean_id),
        # .eq(True) maps missing values to False. .astype(bool) would turn
        # NaN into True and mark works without percentile data as top-ranked.
        citation_top_1=lambda df: df[percentile_flag_cols[0]].eq(True),
        citation_top_10=lambda df: df[percentile_flag_cols[1]].eq(True),
        abstract=lambda df: df['abstract_inverted_index'].apply(reconstruct_text),
    )
    # Explicit drop instead of popping inside the assign() lambdas, which
    # mutated the intermediate frame while it was being built.
    .drop(columns=percentile_flag_cols)
    .dropna(axis='columns', how='all')
)

In [10]:
# Create main works dataframe
works_cols = [
    'id', 'title', 'primary_location_landing_page_url', 'publication_date', 'publication_year',
    'language', 'type', 'open_access_is_oa', 'cited_by_count', 'fwci',
    'citation_top_1', 'citation_top_10', 'has_fulltext', 'fulltext_origin'
]

# reindex() rather than plain column selection: merged_clean has had its
# all-missing columns dropped, so an optional column may be absent; reindex
# brings it back as an all-NA column instead of raising KeyError.
#
# One row per work: 'id' is the primary key of the works table, so duplicates
# are removed on 'id' alone. The same work can be fetched more than once with
# slightly different values (e.g. cited_by_count); the first occurrence wins.
works = (
    merged_clean
    .reindex(columns=works_cols)
    .drop_duplicates(subset=['id'], keep='first')
)

In [11]:
# Build the authors dataframe: one row per entry in each work's 'authorships'
# list, carrying the work's 'id' along as metadata.
def _last_path_segment(value):
    """Return the text after the last '/' of a string; pass other values through."""
    return value.rsplit('/', 1)[-1] if isinstance(value, str) else value


authors = (
    pd.json_normalize(
        merged_clean.to_dict(orient='records'),
        record_path='authorships',
        meta=['id'],
    )
    .rename(columns={
        'id': 'work_id',
        'author.id': 'author_id',
        'author.display_name': 'author_display_name',
        'author.orcid': 'author_orcid',
    })
    # Reduce the author and ORCID URLs to their final path segment.
    .assign(
        author_id=lambda frame: frame['author_id'].apply(_last_path_segment),
        author_orcid=lambda frame: frame['author_orcid'].apply(_last_path_segment),
    )
)

authors_cols = [
    'author_id', 'author_display_name', 'author_orcid',
    'raw_author_name', 'author_position', 'is_corresponding'
]

# Keep the first row seen for each author_id.
authors_clean = authors.loc[:, authors_cols].drop_duplicates(subset=['author_id'])


In [12]:
# Abstracts dataframe: the reconstructed (un-inverted) abstract text per work,
# without rows that have no abstract and without exact duplicate rows.
abstracts = (
    merged_clean.loc[:, ['id', 'abstract']]
    .rename(columns={'id': 'work_id'})
    .dropna(subset=['abstract'])
    .drop_duplicates()
)

In [13]:
# Bridge dataframe: the distinct (work_id, author_id) pairs that link
# authors to works.
bridge = authors.loc[:, ['work_id', 'author_id']].drop_duplicates()

In [14]:
# Relationships dataframe: distinct (work_id, seed_id, relationship) rows
# recording how each work is connected to the seed it was found through.
relationships_cols = ['id', 'seed_id', 'relationship']
relationships = (
    merged_clean.loc[:, relationships_cols]
    .rename(columns={'id': 'work_id'})
    .drop_duplicates()
)

In [17]:
# Write all dataframes to the SQLite database "openalex.db".
# Every table is dropped and recreated, so re-running this cell replaces the
# previous contents. The try/finally guarantees the connection is closed even
# if a statement or an insert fails; otherwise the file handle would stay open
# in the kernel.
conn = sqlite3.connect("openalex.db")
try:
    cursor = conn.cursor()

    # Create the 'works' table
    cursor.execute("DROP TABLE IF EXISTS works;")
    cursor.execute("""
CREATE TABLE works (
    id TEXT PRIMARY KEY,
    title TEXT,
    primary_location_landing_page_url TEXT,
    publication_date TEXT,
    publication_year INTEGER,
    language TEXT,
    type TEXT,
    open_access_is_oa BOOLEAN,
    cited_by_count INTEGER,
    fwci REAL,
    has_fulltext BOOLEAN,
    fulltext_origin TEXT,
    citation_top_1 BOOLEAN,
    citation_top_10 BOOLEAN
);
""")

    # Create the 'authors' table
    cursor.execute("DROP TABLE IF EXISTS authors;")
    cursor.execute("""
CREATE TABLE authors (
    author_id TEXT,
    author_display_name TEXT,
    author_orcid TEXT,
    raw_author_name TEXT,
    author_position TEXT,
    is_corresponding BOOLEAN,
    PRIMARY KEY (author_id)
);
""")

    # Create the 'abstracts' table
    cursor.execute("DROP TABLE IF EXISTS abstracts;")
    cursor.execute("""
CREATE TABLE abstracts (
    work_id TEXT PRIMARY KEY,
    abstract TEXT,
    FOREIGN KEY (work_id) REFERENCES works(id)
);
""")

    # Create the 'bridge' table (work <-> author links)
    cursor.execute("DROP TABLE IF EXISTS bridge;")
    cursor.execute("""
CREATE TABLE bridge (
    work_id TEXT,
    author_id TEXT,
    PRIMARY KEY (work_id, author_id),
    FOREIGN KEY (work_id) REFERENCES works(id),
    FOREIGN KEY (author_id) REFERENCES authors(author_id)
);
""")

    # Create the 'relationships' table (work <-> seed links)
    cursor.execute("DROP TABLE IF EXISTS relationships;")
    cursor.execute("""
CREATE TABLE relationships (
    work_id TEXT,
    seed_id TEXT,
    relationship TEXT,
    PRIMARY KEY (work_id, seed_id, relationship),
    FOREIGN KEY (work_id) REFERENCES works(id),
    FOREIGN KEY (seed_id) REFERENCES works(id)
);
""")

    conn.commit()

    # Insert the dataframes into the matching tables; "append" keeps the
    # schema created above instead of letting pandas recreate the tables.
    works.to_sql("works", conn, if_exists="append", index=False)
    authors_clean.to_sql("authors", conn, if_exists="append", index=False)
    abstracts.to_sql("abstracts", conn, if_exists="append", index=False)
    bridge.to_sql("bridge", conn, if_exists="append", index=False)
    relationships.to_sql("relationships", conn, if_exists="append", index=False)

    conn.commit()
finally:
    conn.close()