In [13]:
import pandas as pd
import json
import yaml
import numpy as np
import requests
import re
import random
import time
from tqdm.notebook import tqdm


# Looking into the use of retired HGNCs in paper abstracts using the EuropePMC search API

## 1. Overview
Please write purpose of the analyses


## 2. Define query parameters, set up query
We use the EuropePMC search API call ([documentation](https://europepmc.org/RestfulWebService#!/Europe32PMC32Articles32RESTful32API/search)) to retrieve all matching publications for each of the identifiers. Because each query is limited to 25 results, we need to define a function that iterates over the page cursors until the whole list of matches is retrieved.

In [14]:
# Define the query and processing parameters
USER_AGENT = "Mozilla/5.0"
page_size = 1000
cursor_mark = '*'
format_type = 'json'
base_url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search?query={}&resultType=core&cursorMark={}&pageSize=25&format={}"
relevant_columns = ['pmid', 'pmcid', 'doi', 'title', 'pubYear', 'abstractText'] # set yourself to filter the result


In [15]:
def search_identifier(identifier, base_url=base_url, cursor_mark=cursor_mark, page_size=page_size, format_type=format_type, relevant_columns=relevant_columns):
    """Searches EuropePMC for the provided identifier, returns list with all data for matches"""
    # Initialize the list to store retracted articles
    matches = []
    # Make the initial request to get the total number of results
    url = base_url.format(identifier, cursor_mark, format_type)
    try:
        response = requests.get(url).json()
        total = response['hitCount']
        # Calculate the number of requests needed to retrieve all results
        matches.extend(response['resultList']['result'])
        num_requests = (total + page_size - 1) // page_size + 1
        # Iterate through each page and append the results to the list
        for i in range(num_requests):
            cursor_mark = response['nextCursorMark']
            if cursor_mark is None:
                break
            url = base_url.format(identifier, cursor_mark, format_type)
    
            try:
                response = requests.get(url).json()
                matches.extend(response['resultList']['result'])
            except Exception as e:
                #print("An error occurred: " + str(e))
                pass
            
    except Exception as e:
        #print("An error occurred: " + str(e))
        pass

    # Get the intersection of columns between the DataFrame and relevant_columns
    df = pd.DataFrame(matches)
    common_columns = list(set(df.columns) & set(relevant_columns))

    # Create a new DataFrame with only the common columns
    try:
        res = df.loc[:, common_columns]
    except KeyError as e:
        print(f"Error: {e} column not found.")
        res = df

    res = res.fillna("NA")

    return res



### Example

Uses one ifentifier to see how it works ('cbbm')

In [16]:
test = pd.DataFrame(search_identifier('cbbm'))
print('Columns:\n- ' + "\n- ".join(list(test.columns)))
test


Columns:
- pmcid
- doi
- abstractText
- pubYear
- title
- pmid


Unnamed: 0,pmcid,doi,abstractText,pubYear,title,pmid
0,PMC10176687,10.1186/s13068-023-02329-9,<h4>Background</h4>Anaerobic Saccharomyces cer...,2023,Quantification and mitigation of byproduct for...,37173767
1,,,Autotrophic carbon-fixing bacteria affect carb...,2023,Soil carbon-fixing bacterial communities respo...,
2,,10.1016/j.scitotenv.2023.164423,Grazing exclusion changes soil physical-chemic...,2023,Recovery through proper grazing exclusion prom...,37236486
3,,10.1007/s00415-023-11761-8,<h4>Background</h4>X-Linked dystonia-parkinson...,2023,Oculomotor abnormalities indicate early execut...,37191726
4,PMC10145897,10.3390/ma16082951,Adhesives are increasingly being employed in i...,2023,Development of a Unified Specimen for Adhesive...,37109787
...,...,...,...,...,...,...
70,,10.1111/jcpe.13273,<h4>Objectives</h4>To determine the volume sta...,2020,Volume stability of the augmented sinus using ...,32092169
71,PMC9040747,10.1128/mbio.03629-21,Carboxysomes are anabolic bacterial microcompa...,2022,Decoding the Absolute Stoichiometric Compositi...,35343789
72,PMC9198552,10.3389/fmed.2022.885187,Non-invasive spatially resolved functional ima...,2022,Phase-Sensitive Measurements of Depth-Dependen...,35721092
73,PMC9716276,10.3389/fphys.2022.1017381,Thyroid hormones (THs) play a major role regul...,2022,Temperature modulates systemic and central act...,36467699


## 3. Apply for all HGNCs
We have a table of HGNCs in [hgnc.tsv](data/hgnc.tsv). We'll apply `search_identifier` for each identifier, then remove all duplicated matches.

Load our dataset:

In [17]:
hgnc_df = pd.read_csv('data/hgnc.tsv', sep='\t')
hgnc_df.describe()

Unnamed: 0,primary_hgnc_id,secondary_hgnc_id,primary_symbol,secondary_symbol
count,78907,5184,78907,61036
unique,43726,5159,43726,59127
top,Entry Withdrawn,HGNC:21128,Entry Withdrawn,MT1
freq,1790,7,1790,11


Define the function to build a data frame with all unique hits. It will take a long time to run, partly because we wait some seconds between ids.

In [18]:
def multiple_search(column, hgnc_df=hgnc_df):
    results_df = pd.DataFrame()
    total = len(hgnc_df[column])
    print(f'Looking up: {column}')
    for i in tqdm(range(len(hgnc_df[column])), total=total, desc="Progress"):
        id = hgnc_df[column][i]
        if column == 'primary_symbol':
            other = 'secondary_symbol'
        if column == 'secondary_symbol':
            other = 'primary_symbol'
        other_id = hgnc_df[other][i]            
        search = search_identifier(identifier=id)
        search[column] = [id for i in range(len(search))]
        search[other] = [other_id for i in range(len(search))]
        # Generate a random number between 5 and 15 (inclusive)
        random_seconds = random.uniform(5, 15)
        # Pause the program for the random number of seconds
        time.sleep(random_seconds)
        results_df = pd.concat([results_df, search], ignore_index=True)
    results_df.drop_duplicates(inplace=True)
    results_df['type'] = [column for i in range(len(results_df))]
    return results_df


Try on a subset

In [19]:
subset = hgnc_df.head(10)
primaries = multiple_search('primary_symbol', subset)

Looking up: primary_symbol


Progress:   0%|          | 0/10 [00:00<?, ?it/s]

In [20]:
secondaries = multiple_search('secondary_symbol', subset)

Looking up: secondary_symbol


Progress:   0%|          | 0/10 [00:00<?, ?it/s]

In [21]:
primaries.describe()

Unnamed: 0,pmcid,doi,abstractText,pubYear,title,pmid,primary_symbol,secondary_symbol,type
count,580.0,580,580.0,580,580,580.0,580,580,580
unique,406.0,451,443.0,17,454,439.0,8,10,1
top,,10.1371/journal.pone.0187457,,2022,ExpressionDB: An open source platform for dist...,,A1BG-AS1,CPAMD9,primary_symbol
freq,59.0,6,19.0,149,6,22.0,180,75,580


In [22]:
primaries

Unnamed: 0,pmcid,doi,abstractText,pubYear,title,pmid,primary_symbol,secondary_symbol,type
0,,10.1007/s13577-021-00554-8,The dysregulated long non-coding RNA A1BG anti...,2021,Long non-coding RNA A1BG-AS1 promotes tumorige...,34115333,A1BG-AS1,NCRNA00181,primary_symbol
1,,10.1002/jcb.28315,Extensive evidence indicate that long noncodin...,2019,lncRNA A1BG-AS1 suppresses proliferation and i...,30556161,A1BG-AS1,NCRNA00181,primary_symbol
2,PMC9977555,10.1155/2023/6072438,<h4>Introduction</h4>Diabetes is a chronic inf...,2023,Bioinformatics Analysis of the Inflammation-As...,36874406,A1BG-AS1,NCRNA00181,primary_symbol
3,,10.21203/rs.3.rs-1928440/v1,<h4>Background: </h4> The present study mined ...,2022,Bioinformatics analysis of the inflammation-as...,,A1BG-AS1,NCRNA00181,primary_symbol
4,PMC9760568,10.1016/j.envres.2022.114828,<h4>Background</h4>DNA methylation programming...,2023,Epigenome-wide analysis of maternal exposure t...,36400229,A1BG-AS1,NCRNA00181,primary_symbol
...,...,...,...,...,...,...,...,...,...
575,PMC8282479,10.1016/j.jaut.2021.102687,The impact of SARS-CoV-2 infection in patients...,2021,COVID-19 infection among autoimmune rheumatic ...,34311142,AARD,C8orf85,primary_symbol
576,,10.21203/rs.3.rs-774165/v1,In this study we are reporting a prediction mo...,2021,Modeling of Carbon Dioxide Fixation Rate by Mi...,,AARD,C8orf85,primary_symbol
577,,10.1039/d1cp02033a,Among the different thermophysical properties ...,2021,A global transform for the general formulation...,34590662,AARD,C8orf85,primary_symbol
578,PMC8692556,10.1038/s41598-021-03596-y,The solubilities of clemastine fumarate in sup...,2021,Measurement and modeling of clemastine fumarat...,34934101,AARD,C8orf85,primary_symbol


In [23]:
secondaries.describe()

Unnamed: 0,pmcid,doi,abstractText,pubYear,title,pmid,secondary_symbol,primary_symbol,type
count,174.0,174.0,174.0,174,174,174.0,174,174,174
unique,157.0,167.0,171.0,23,173,170.0,9,8,1
top,,,,2023,Isoniazid prevents Nrf2 translocation by inhib...,,A1BG-AS,A1BG-AS1,secondary_symbol
freq,17.0,7.0,3.0,26,2,4.0,75,77,174


In [24]:
secondaries

Unnamed: 0,pmcid,doi,abstractText,pubYear,title,pmid,secondary_symbol,primary_symbol,type
0,PMC8006465,10.3389/fnagi.2021.639428,Alzheimer's disease (AD) is characterized by s...,2021,A Meta-Analysis of Brain DNA Methylation Acros...,33790779,NCRNA00181,A1BG-AS1,secondary_symbol
1,PMC4640166,10.1186/s12864-015-2034-y,<h4>Background</h4>DNA methylation is an impor...,2015,Sex differences in DNA methylation assessed by...,26553366,NCRNA00181,A1BG-AS1,secondary_symbol
2,,10.1002/lary.30840,<h4>Objective</h4>Biological data on the benef...,2023,Impact of Rehydration Following Systemic Dehyd...,37345579,A1BG-AS,A1BG-AS1,secondary_symbol
3,PMC10093134,10.3390/diagnostics13071318,"Nowadays, in the case of suspected prostate ca...",2023,Research of Prostate Cancer Urinary Diagnostic...,37046536,A1BG-AS,A1BG-AS1,secondary_symbol
4,PMC9977555,10.1155/2023/6072438,<h4>Introduction</h4>Diabetes is a chronic inf...,2023,Bioinformatics Analysis of the Inflammation-As...,36874406,A1BG-AS,A1BG-AS1,secondary_symbol
...,...,...,...,...,...,...,...,...,...
169,PMC5546599,10.1371/journal.pone.0181342,Decreased cell-substratum adhesion is cruciall...,2017,Establishment of highly metastatic KRAS mutant...,28786996,C8orf85,AARD,secondary_symbol
170,PMC2741491,10.1186/1471-2164-10-392,<h4>Background</h4>Recent transcriptomic analy...,2009,Identification of novel endogenous antisense t...,19698135,C8orf85,AARD,secondary_symbol
171,PMC5096291,10.1186/s13148-016-0281-7,<h4>Background</h4>Evidence is accumulating th...,2016,Effect of prenatal DHA supplementation on the ...,27822319,C8orf85,AARD,secondary_symbol
172,PMC4522592,10.1016/j.redox.2015.06.020,Isoniazid is used either alone or in combinati...,2015,Isoniazid prevents Nrf2 translocation by inhib...,26202867,C8orf85,AARD,secondary_symbol
