In [None]:
import pandas as pd
import numpy as np
import requests
import re
import random
import time
from tqdm.notebook import tqdm
import pandas as pd
import plotly.express as px
import ipywidgets

# Looking into the use of retired HGNCs in paper abstracts using the EuropePMC search API

## 1. Overview
TODO: describe the purpose of the analysis.


## 2. Define query parameters, set up query
We use the EuropePMC search API call ([documentation](https://europepmc.org/RestfulWebService#!/Europe32PMC32Articles32RESTful32API/search)) to retrieve all matching publications for each of the identifiers. Because each query is limited to 25 results, we need to define a function that iterates over the page cursors until the whole list of matches is retrieved.

In [None]:
# Define the query and processing parameters
# NOTE(review): USER_AGENT is not referenced by any request in the code visible in this notebook.
USER_AGENT = "Mozilla/5.0"
# NOTE(review): base_url below hard-codes pageSize=25 rather than using this value — confirm which page size is intended.
page_size = 1000
# Cursor value used for the first request of a search
cursor_mark = '*'
# Response format requested from the API
format_type = 'json'
# URL template; the three {} placeholders are filled, in order, with the query, the cursor mark and the response format
base_url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search?query={}&resultType=core&cursorMark={}&pageSize=25&format={}"
relevant_columns = ['pmid', 'pmcid', 'doi', 'title', 'pubYear', 'abstractText'] # result fields to keep; edit this list to change what is retained


In [None]:
def search_identifier(identifier, base_url=base_url, cursor_mark=cursor_mark, page_size=page_size, format_type=format_type, relevant_columns=relevant_columns):
    """Search EuropePMC for `identifier` and return every match as a DataFrame.

    Pages are requested one after another, following the cursor returned by
    the API, until a page comes back empty or no new cursor is offered.

    Args:
        identifier: Query string sent to the search endpoint.
        base_url: URL template. Its positional ``{}`` placeholders receive, in
            order, the query, the cursor mark and the response format. It may
            additionally contain a named ``{page_size}`` placeholder; templates
            without it are formatted exactly as before.
        cursor_mark: Cursor used for the first request.
        page_size: Value substituted for ``{page_size}`` when the template has
            that placeholder; otherwise unused.
        format_type: Response format requested from the API.
        relevant_columns: Result fields to keep, in the order they should
            appear in the output.

    Returns:
        A DataFrame with one row per match, restricted to the entries of
        `relevant_columns` that are present in the results, with missing
        values replaced by the string "NA". The DataFrame is empty when
        nothing matched. If a request fails part-way through, the rows
        collected so far are returned and a message is printed.
    """
    from urllib.parse import quote

    matches = []
    # Percent-encode values placed in the query string so that characters such
    # as '+', '/', '=', '&' or spaces are transmitted literally.
    encoded_query = quote(str(identifier), safe='')
    try:
        while True:
            url = base_url.format(
                encoded_query,
                quote(str(cursor_mark), safe=''),
                format_type,
                page_size=page_size,
            )
            response = requests.get(url, timeout=60)
            response.raise_for_status()
            payload = response.json()
            page = payload.get('resultList', {}).get('result', [])
            if not page:
                break
            matches.extend(page)
            next_mark = payload.get('nextCursorMark')
            # A missing or unchanged cursor means there is nothing further to
            # fetch; stopping here also guards against looping forever.
            if not next_mark or next_mark == cursor_mark:
                break
            cursor_mark = next_mark
    except (requests.RequestException, ValueError) as e:
        # Best effort: keep whatever was retrieved, but say that it is partial.
        print(f"Search for {identifier!r} stopped early: {e}")

    df = pd.DataFrame(matches)
    # Keep the caller's column order instead of an arbitrary set order.
    common_columns = [name for name in relevant_columns if name in df.columns]
    return df[common_columns].fillna("NA")


### Example

Use a single identifier (`cbbm`) to see how it works.

In [None]:
# Smoke test: run one search for the identifier 'cbbm' and display the resulting table
test = pd.DataFrame(search_identifier('cbbm'))
test


## 3. Apply for all HGNCs
We have a table of HGNCs in [hgnc.tsv](data/hgnc.tsv). We'll apply `search_identifier` for each identifier, then remove all duplicated matches.

Load our dataset:

In [None]:
# Read the tab-separated HGNC table and display summary statistics for it
# NOTE(review): with pandas' default NA handling, cells that read 'NA' are presumably loaded as NaN — confirm this is what the later 'NA' check expects
hgnc_df = pd.read_csv('data/hgnc.tsv', sep='\t')
hgnc_df.describe()

Define the function to build a data frame with all unique hits. It will take a long time to run, partly because we wait some seconds between ids.

In [None]:
def _flag_mentions(texts, symbol):
    """Return one 'yes'/'no' flag per text: does it contain `symbol` verbatim (case-sensitive)?"""
    if pd.isna(symbol):
        return ['no'] * len(texts)
    needle = str(symbol)
    return ['yes' if needle in str(text) else 'no' for text in texts]


def multiple_search(column, hgnc_df=hgnc_df, pause_range=(5, 15)):
    """Run `search_identifier` for every symbol in `column` and collect the unique hits.

    Args:
        column: Either 'primary_symbol' or 'secondary_symbol'; the column of
            `hgnc_df` whose values are searched.
        hgnc_df: Table with the columns 'primary_symbol', 'secondary_symbol',
            'primary_hgnc_id' and 'secondary_hgnc_id'.
        pause_range: ``(low, high)`` bounds, in seconds, of the random pause
            taken after each search.

    Returns:
        A DataFrame with the rows returned by `search_identifier`, each
        annotated with the searched symbol, its counterpart symbol, both HGNC
        ids, 'in_abstract' / 'other_in_abstract' flags ('yes' or 'no') and a
        'type' column holding `column`. Duplicate rows are removed. An empty
        DataFrame is returned when no search produced a hit.

    Raises:
        ValueError: If `column` is not one of the two supported columns.
    """
    counterparts = {
        'primary_symbol': 'secondary_symbol',
        'secondary_symbol': 'primary_symbol',
    }
    if column not in counterparts:
        raise ValueError(f"column must be one of {sorted(counterparts)}, got {column!r}")
    other = counterparts[column]

    total = len(hgnc_df)
    # Iterate positionally so the function does not depend on the table's index labels.
    rows = zip(
        hgnc_df[column],
        hgnc_df[other],
        hgnc_df['primary_hgnc_id'],
        hgnc_df['secondary_hgnc_id'],
    )
    batches = []
    for i, (symbol, other_symbol, primary_id, secondary_id) in enumerate(
        tqdm(rows, total=total, desc="Progress")
    ):
        print(f"\rProcessing {column}: {symbol} ({i} of {total}) ", end="", flush=True)
        # Skip placeholders and missing values; a NaN would otherwise be searched as "nan".
        if pd.isna(symbol) or symbol in ('NA', 'Entry Withdrawn'):
            continue

        search = search_identifier(identifier=symbol)
        if not search.empty:
            if 'abstractText' in search.columns:
                abstracts = list(search['abstractText'])
                in_abstract = _flag_mentions(abstracts, symbol)
                other_in_abstract = _flag_mentions(abstracts, other_symbol)
            else:
                # No abstracts were returned, so neither symbol can be found in one.
                in_abstract = ['no'] * len(search)
                other_in_abstract = ['no'] * len(search)
            search[column] = symbol
            search[other] = other_symbol
            search['primary_hgnc_id'] = primary_id
            search['secondary_hgnc_id'] = secondary_id
            search['in_abstract'] = in_abstract
            search['other_in_abstract'] = other_in_abstract
            batches.append(search)

        # Pause a random amount of time between searches.
        time.sleep(random.uniform(*pause_range))

    if not batches:
        return pd.DataFrame()
    # Concatenate and de-duplicate once, before 'type' is added, so identical
    # rows from different batches are recognised as duplicates.
    results_df = (
        pd.concat(batches, ignore_index=True)
        .drop_duplicates()
        .reset_index(drop=True)
    )
    results_df['type'] = column
    return results_df


#### 3.2. Searching for publications matching all identifiers in the table

We split the search into two runs: one for the primary symbols and one for the secondary symbols.

Construct results table for primary identifiers:

In [None]:
# Search for every value in the 'primary_symbol' column, then display summary statistics of the hits
primaries = multiple_search(column='primary_symbol', hgnc_df=hgnc_df)
primaries.describe()

Construct results table for secondary identifiers:

In [None]:
# Search for every value in the 'secondary_symbol' column, then display summary statistics of the hits
secondaries = multiple_search(column='secondary_symbol', hgnc_df=hgnc_df)
secondaries.describe()

Merge:

In [None]:
# Stack the primary and secondary result tables into a single frame.
# pd.concat takes the frames as one list; ignore_index gives the combined
# table a fresh 0..n-1 index instead of repeating each frame's own labels.
hgnc_abs = pd.concat([primaries, secondaries], ignore_index=True)
hgnc_abs.describe()

Generate some visualizations:

In [None]:
def generate_plotly_plot(dataframe):
    """Build a stacked bar chart of symbol occurrences per publication year.

    Rows are counted per ('pubYear', symbol) once for the 'primary_symbol'
    column and once for the 'secondary_symbol' column. Both sets of counts are
    placed in one shared 'symbol' column so that every bar segment has a
    colour value, and counts for the same year and symbol are summed.

    Args:
        dataframe: Table with the columns 'pubYear', 'primary_symbol' and
            'secondary_symbol'.

    Returns:
        The plotly figure; call ``.show()`` on it to display it.
    """
    per_column_counts = [
        dataframe.groupby(['pubYear', symbol_column])
        .size()
        .reset_index(name='count')
        .rename(columns={symbol_column: 'symbol'})
        for symbol_column in ('primary_symbol', 'secondary_symbol')
    ]
    # Merge the two count tables and add up entries sharing a year and symbol.
    combined_counts = (
        pd.concat(per_column_counts, ignore_index=True)
        .groupby(['pubYear', 'symbol'], as_index=False)['count']
        .sum()
    )
    fig = px.bar(combined_counts, x='pubYear', y='count', color='symbol',
                 labels={'count': 'Occurrences', 'pubYear': 'Publication Year', 'symbol': 'Symbol'},
                 title='Occurrences of Categorical Variables Over Time')
    return fig

In [None]:
# Build the bar chart from the merged results table
plot = generate_plotly_plot(hgnc_abs)

# Render the figure in the notebook output
plot.show()

Save results

In [None]:
# Write the merged results as a tab-separated file without the index column
# NOTE(review): assumes the 'results' directory already exists — confirm
hgnc_abs.to_csv('results/abs.tsv', sep = '\t', index = False)

Note: this method might lead to false positives — we are not actually confirming that the symbol is in the abstract. Maybe use a regex beforehand to filter out the false positives?