In [None]:
# Import Libraries
from pandas.api.types import is_numeric_dtype
from collections import defaultdict
import plotly.graph_objects as go
from Bio import Entrez, Medline
import matplotlib.pyplot as plt
from pathlib import Path
import pubchempy as pcp
from tqdm import tqdm
import networkx as nx
import pandas as pd
import numpy as np
import requests
import signal
import time
import re

# Contact address for the NCBI Entrez requests made through Bio.Entrez further down
# (the esearch/efetch calls). NCBI asks API users to identify themselves this way.
# NOTE(review): this is a personal address hard-coded in the notebook — consider
# reading it from configuration before sharing the file.
Entrez.email = 'sks6765@psu.edu'

In [None]:
def create_graph_viz(dataframe, column1, column2, label=1, seed=None):
    """Display an interactive Plotly drawing of the graph described by an edge list.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Edge list; every row contributes one undirected edge.
    column1, column2 : hashable
        Names of the columns holding the two endpoints of each edge.
    label : hashable, default 1
        Name of the column whose values are written at the edge midpoints.
        The default ``1`` is a sentinel meaning "no label column": every edge
        is then labelled with the constant ``1``.
    seed : int or None, default None
        Forwarded to ``nx.spring_layout``. ``None`` gives a fresh random layout
        on every call; an integer makes the picture reproducible.

    Returns
    -------
    None
        The figure is shown with ``fig.show()``; nothing is returned.
    """
    G = nx.Graph()

    # Iterate the columns directly instead of using ``iterrows``: it is faster and
    # does not coerce every row to a single common dtype.
    if label != 1:
        row_labels = dataframe[label]
    else:
        row_labels = [1] * len(dataframe)
    for source, target, row_label in zip(dataframe[column1], dataframe[column2], row_labels):
        # Repeated (source, target) pairs collapse into one edge; the last label wins.
        G.add_edge(source, target, label=row_label)

    # Position nodes using a (optionally seeded) spring layout
    pos = nx.spring_layout(G, seed=seed)

    node_x = [pos[node][0] for node in G.nodes()]
    node_y = [pos[node][1] for node in G.nodes()]

    # One pass over the edges collects the line segments (None breaks the line
    # between consecutive edges), the midpoints and the label drawn at each midpoint.
    edge_x, edge_y = [], []
    midpoint_x, midpoint_y, edge_text = [], [], []
    for source, target, attributes in G.edges(data=True):
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
        midpoint_x.append((x0 + x1) / 2)
        midpoint_y.append((y0 + y1) / 2)
        edge_text.append(attributes['label'])

    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        mode='lines',
        line_shape='spline',
        opacity=0.5,
        hoverinfo='none'
    )
    node_trace = go.Scatter(
        x=node_x,
        y=node_y,
        mode='markers',
        hoverinfo='text',
        hovertext=[f'Node {node}' for node in G.nodes()]
    )
    label_trace = go.Scatter(
        x=midpoint_x,
        y=midpoint_y,
        mode='text',
        text=edge_text,
        textposition='middle center',
        hoverinfo='none'
    )
    fig = go.Figure(data=[edge_trace, node_trace, label_trace])

    # 'closest' shows the hover text of the node under the cursor; the previous
    # 'x' mode grouped unrelated nodes that merely shared an x coordinate.
    fig.update_layout(
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))

    # Display the figure
    fig.show()

### DDI Sampling

In [None]:
# Load the drug-drug interaction table. The cells below build a graph from its
# 'STITCH 1' / 'STITCH 2' columns and merge it back in to recover the side-effect columns.
ddi = pd.read_csv("../base_data/bio-decagon-combo.csv")

In [None]:
class TimeoutError(Exception):
    """Raised by ``timeout_handler`` when the SIGALRM alarm goes off.

    The name is kept for backward compatibility; note that it shadows the
    built-in ``TimeoutError`` in this notebook.
    """


def timeout_handler(signum, frame):
    """SIGALRM handler: interrupt the running code by raising ``TimeoutError``."""
    raise TimeoutError


def CID_information(CID: "str | int", timeout: int = 2):
    """Look up the name and canonical SMILES of a PubChem compound.

    Parameters
    ----------
    CID : str or int
        Compound identifier. A leading ``CID`` prefix, optional ``m``/``s``
        markers and zero padding are stripped (``"CID000002244"`` -> ``2244``);
        a plain integer is accepted as well.
    timeout : int, default 2
        Seconds allowed for the whole lookup, enforced with ``signal.alarm``
        (so this only works on platforms that provide ``SIGALRM`` and when
        called from the main thread).

    Returns
    -------
    tuple or None
        ``(name, canonical_smiles)`` where ``name`` is the first synonym, or the
        IUPAC name when no synonym can be obtained. ``None`` if the lookup
        timed out.

    Raises
    ------
    ValueError
        If no integer remains after stripping the prefix.
    Exception
        Errors raised by ``pubchempy`` while fetching the compound propagate.
    """
    # Install the alarm handler, remembering the old one so it can be restored.
    previous_handler = signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(timeout)

    try:
        # "[ms]" rather than "[m|s]": inside a character class "|" is a literal pipe.
        cleaned_cid = int(re.sub(r"CID[ms]*0*", "", str(CID)))
        compound = pcp.Compound.from_cid(cleaned_cid)
        try:
            compound_name = compound.synonyms[0]
        except TimeoutError:
            # The alarm has already fired; falling back to another lookup here
            # would run without any time limit, so let the timeout propagate.
            raise
        except Exception:
            compound_name = compound.iupac_name
        return (compound_name, compound.canonical_smiles)
    except TimeoutError:
        print("Timeout: Function took too long to complete.")
        return None
    finally:
        # Disable the alarm and put the previous handler back.
        signal.alarm(0)
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)

In [None]:
random_state = 85
np.random.seed(random_state)

# Creating the graph from the ddi dataframe
G = nx.from_pandas_edgelist(ddi, 'STITCH 1', 'STITCH 2')

# Get the largest connected component (a set of node ids)
largest_component = max(nx.connected_components(G), key=len)

# Sample nodes from the largest component.
# The candidates are sorted first: the component is a set, and the iteration order
# of a set of strings changes between kernel sessions (hash randomisation), so
# sampling from list(set) would not be reproducible even with a fixed seed.
sample_size = 200
sample_nodes = np.random.choice(sorted(largest_component), size=sample_size, replace=False)

# Get the subgraph induced by the sampled nodes
subgraph = G.subgraph(sample_nodes)

# Convert the subgraph back to a DataFrame with the same column names as ddi
sampled_df = nx.to_pandas_edgelist(subgraph).rename(columns={"source": "STITCH 1", "target": "STITCH 2"})

# Merging the sampled df with base ddi to get the side effect names; one row is kept per drug pair.
# NOTE(review): the merge matches ('STITCH 1', 'STITCH 2') in the orientation networkx
# reports for each undirected edge — pairs reported the other way round than in ddi
# would be dropped here; confirm this is acceptable.
sampled_df = pd.merge(sampled_df, ddi).drop_duplicates(subset=["STITCH 1", "STITCH 2"])

In [None]:
# Sanity check of the sampled edge list

# Draw sampled_df as a graph so that disconnected edges, if any, can be spotted by eye
create_graph_viz(sampled_df, "STITCH 1", "STITCH 2")

# Number of distinct drugs, counted straight from the two endpoint columns
first_endpoints = set(sampled_df['STITCH 1'].to_list())
second_endpoints = set(sampled_df['STITCH 2'].to_list())
all_drugs_according_to_df = len(first_endpoints | second_endpoints)

# The same number, counted as nodes of the graph rebuilt from sampled_df
G = nx.from_pandas_edgelist(sampled_df, 'STITCH 1', 'STITCH 2')
number_of_drug_nodes = len(G.nodes())

# Both ways of counting must agree
assert all_drugs_according_to_df == number_of_drug_nodes

In [None]:
def save_dataframe(all_information):
    """Write the collected DDI records to ``DDI_subset.csv``, replacing the file.

    ``all_information`` is a list of 7-tuples in the column order used below.
    """
    columns = ["drug_1_CID", "drug_1_name", "drug_1_SMILES", "relationship",
               "drug_2_CID", "drug_2_name", "drug_2_SMILES"]
    pd.DataFrame(all_information, columns=columns).to_csv("DDI_subset.csv", index=False)


def _is_plain_drug_name(name):
    """True for a non-empty name made of word characters only (no digits, spaces,
    brackets or punctuation) that does not contain 'sulfate'."""
    return isinstance(name, str) and bool(name) and not re.search(r"[\W\d]|(sulfate)", name)


def _has_full_pubmed_page(drug_name, papers_needed=20):
    """True when PubMed returns ``papers_needed`` hits for the drug's background query,
    i.e. the later background-information step will find a full page of abstracts."""
    query = f"{drug_name} AND hasabstract AND Humans AND (AD OR AE OR PK OR PD OR CO OR TU OR DE)"
    with Entrez.esearch(db='pubmed', term=query, retmax=papers_needed, sort="relevance") as handle:
        return len(Entrez.read(handle)["IdList"]) == papers_needed


number_of_ddis_to_retain = 500
all_information = []

# tqdm wraps the row iterator (with an explicit total) so the bar shows real progress.
for idx, sample in enumerate(tqdm(sampled_df.itertuples(index=False), total=len(sampled_df))):
    drug_1_CID = sample[0]
    drug_2_CID = sample[1]
    relationship = sample[3]

    try:
        drug_1_info = CID_information(drug_1_CID)
        drug_2_info = CID_information(drug_2_CID)

        # CID_information returns None on timeout; such pairs are skipped.
        names_usable = (
            drug_1_info is not None
            and drug_2_info is not None
            and _is_plain_drug_name(drug_1_info[0])
            and _is_plain_drug_name(drug_2_info[0])
        )
        # `and` short-circuits, so the second PubMed query is only sent when the first passes.
        if names_usable and _has_full_pubmed_page(drug_1_info[0]) and _has_full_pubmed_page(drug_2_info[0]):
            all_information.append((drug_1_CID, drug_1_info[0], drug_1_info[1], relationship,
                                    drug_2_CID, drug_2_info[0], drug_2_info[1]))
    except Exception as error:
        # Best effort: a failing lookup skips this pair only. `Exception` (not a bare
        # except) keeps a kernel interrupt able to stop the loop.
        tqdm.write(f"Skipping pair ({drug_1_CID}, {drug_2_CID}): {error!r}")

    # Checkpoint and cool down every 50 rows, whether or not this row was kept.
    if (idx % 50 == 0) and (idx != 0):
        print(f"Samples collected so far: {len(all_information)}. Saving checkpoint and cooling down ...")
        save_dataframe(all_information)
        time.sleep(5)

    if len(all_information) >= number_of_ddis_to_retain:
        print("Necessary number of DDIs obtained... exiting")
        break

# Always write the final state: without this, records collected after the last
# checkpoint were lost whenever the rows ran out before the target was reached.
save_dataframe(all_information)

### DPI Sampling

In [None]:
"""
All drugs in my database: 105
Drugs that have protein interactions: 67
Drugs that have protein interactions according to the actions table: 52
Total proteins found (from the interactions, i.e., dpis_needed): 563
Total proteins I could query from STRING: 560

Total DPIs obtained from all drugs (dpis_needed): 3578
Total DPIs retained after querying STRING (annotated_dpis): 3539

After merging with the actions table - which I need to do to get the Drug-Protein relationship, 72 proteins remain.

>>Thus, finally, we have 105 total drugs, 52 that participate in DPI & 72 unique proteins.<<
"""

In [None]:
# Getting all the DPIs and their annotations

dpi = pd.read_csv("../base_data/bio-decagon-targets-all.csv")
ddi_subset = pd.read_csv("DDI_subset.csv")

all_drugs = np.unique(np.concatenate((ddi_subset["drug_1_CID"].unique(), ddi_subset["drug_2_CID"].unique())))

# This returns fewer drugs than all_drugs, since not every drug has DPIs.
dpis_needed = dpi.query("STITCH in @all_drugs")

string_api_url = "https://string-db.org/api/json/get_string_ids"

# Many drugs share the same target gene, so each gene is looked up once and the
# answer reused. Only successful lookups are cached; a gene whose request failed
# is tried again the next time it appears.
string_hits_by_gene = {}
dpis = []
for row in tqdm(dpis_needed.itertuples(), total=len(dpis_needed)):
    if row.Gene not in string_hits_by_gene:
        try:
            # Retrieving protein data from the given gene ID (human, species 9606).
            # The timeout stops one stalled connection from hanging the whole loop.
            response = requests.get(
                string_api_url,
                params={"identifiers": row.Gene, "species": 9606},
                timeout=30,
            )
            response.raise_for_status()
            best_hit = response.json()[0]
            string_hits_by_gene[row.Gene] = (
                best_hit["stringId"], best_hit["preferredName"], best_hit["annotation"]
            )
        except (requests.RequestException, ValueError, LookupError, TypeError):
            # Network/HTTP failure, non-JSON body, or no usable match: skip this row.
            continue
    dpis.append((row.STITCH, *string_hits_by_gene[row.Gene]))

# Saving so that I don't have to requery STRING
pd.DataFrame(dpis, columns=["item_id_a", "item_id_b", "protein_name", "protein_desc"]).to_csv("annotated_dpis.csv", index=False)

In [None]:
# Join the STRING-annotated DPIs with the actions table (on their shared columns)
# and switch to the column names used in the final DPI file.
annotated_dpis = pd.read_csv("annotated_dpis.csv")
actions_subset = pd.read_csv("../base_data/actions.csv")
merged = pd.merge(annotated_dpis, actions_subset).rename(
    columns={"item_id_a": "cid", "item_id_b": "stringId"}
)

# CID -> drug name lookups taken from both sides of the DDI subset;
# when a CID appears on both sides, the drug_2 name is the one kept.
d1_names = ddi_subset.set_index("drug_1_CID")["drug_1_name"].to_dict()
d2_names = ddi_subset.set_index("drug_2_CID")["drug_2_name"].to_dict()
cid_to_name = dict(d1_names)
cid_to_name.update(d2_names)
drug_cid_df = pd.DataFrame(cid_to_name.items(), columns=["cid", "drug_name"])

# Attach the drug names, drop exact duplicate rows and save.
dpi_with_names = pd.merge(merged, drug_cid_df).drop_duplicates()
dpi_with_names.to_csv("DPI_subset.csv", index=False)

## Collecting Background information

Keywords obtained from https://pubmed.ncbi.nlm.nih.gov/help/#proximity-searching

### Drug info

In [None]:
# Sorted array of every distinct drug name appearing on either side of a DDI.
ddi_subset = pd.read_csv("DDI_subset.csv")
names_per_side = [ddi_subset[side].unique() for side in ("drug_1_name", "drug_2_name")]
all_drugs = np.unique(np.concatenate(names_per_side))

In [None]:
def create_drug_info(drug: str):
    '''
    Collect PubMed background information for one drug.

    Searches PubMed for up to 20 human studies with an abstract that match the
    drug name and at least one of the keywords below, writes the abstracts to
    ``../background_information_data/drug_data/<drug>.txt`` (one per line group,
    UTF-8) and returns one metadata row per paper whose abstract was written.

    Keywords used in the query:
    AD - Administration and Dosage
    AE - Adverse Effects
    PK - Pharmacokinetics
    PD - Pharmacology
    CO - Complications
    TU - Therapeutic Use
    DE - Drug Effects

    Returns a list of tuples
    ``(drug, pubmed_id, title, authors, mesh_terms, paper_url)``. Papers missing
    a title, author list, MeSH terms or abstract are left out of both the
    metadata and the text file, so the two always describe the same papers.
    The list is empty (and the text file is empty) when the search has no hits.
    '''
    drug_query = f"{drug} AND hasabstract AND Humans AND (AD OR AE OR PK OR PD OR CO OR TU OR DE)"

    # Getting relevant papers for given drug
    with Entrez.esearch(db='pubmed', term=drug_query, retmax=20, sort="relevance") as handle:
        paper_list = Entrez.read(handle)["IdList"]

    # Getting a parsable record for each paper. The fetch is skipped when the
    # search found nothing, because efetch cannot be called with an empty id list.
    record_list = []
    if paper_list:
        with Entrez.efetch(db='pubmed', rettype='medline', retmode="text", id=paper_list) as handle:
            record_list = list(Medline.parse(handle))

    # Collecting relevant information
    metadata = []
    abstracts = []
    for searched_id, record in zip(paper_list, record_list):
        # Prefer the id stored in the record itself; pairing by position alone
        # would mislabel papers if the fetch ever returned fewer records.
        paper_id = record.get("PMID", searched_id)
        try:
            row = (drug, paper_id, record["TI"], record["AU"], record["MH"],
                   f"https://pubmed.ncbi.nlm.nih.gov/{paper_id}/")
            abstract = record["AB"]
        except KeyError:
            # Incomplete record: skip it entirely.
            continue
        metadata.append(row)
        abstracts.append(abstract + "\n")

    output_path = Path(f"../background_information_data/drug_data/{drug}.txt")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: abstracts routinely contain non-ASCII characters.
    with output_path.open("w", encoding="utf-8") as output_file:
        output_file.write("".join(abstracts))

    return metadata

In [None]:
# Collect the per-paper metadata rows of every drug (this also writes one abstracts file per drug)
metadata = []
for drug in tqdm(all_drugs):
    metadata += create_drug_info(drug)

# One row per paper, saved next to the abstracts files
drug_metadata_columns = ["drug_name", "pubmed_id", "title", "authors", "mesh_terms", "paper_url"]
drug_metadata_frame = pd.DataFrame(metadata, columns=drug_metadata_columns)
drug_metadata_frame.to_csv("../background_information_data/drug_data/metadata.csv")

### Protein info

In [None]:
# Map each protein name to its description; for a name that occurs on several
# rows the last description is the one kept.
dpi_subset = pd.read_csv("DPI_subset.csv")
description_by_protein = dpi_subset.set_index("protein_name")["protein_desc"]
all_proteins = description_by_protein.to_dict()

In [None]:
def create_protein_info(protein: str, protein_desc: str):
    '''
    Collect background information for one protein.

    Searches PubMed for up to 20 human studies with an abstract that match the
    protein name and at least one of the keywords below, then writes
    ``../background_information_data/protein_data/<protein>.txt`` (UTF-8): the
    description passed in ``protein_desc`` first, followed by the abstracts.
    Returns one metadata row per paper whose abstract was written.

    Keywords used in the query:
    CH - chemistry
    ME - metabolism
    PH - physiology
    GE - genetics

    Returns a list of tuples
    ``(protein, pubmed_id, title, authors, mesh_terms, paper_url)``. Papers
    missing a title, author list, MeSH terms or abstract are left out of both
    the metadata and the text file. When the search has no hits the list is
    empty and the file holds the description only. A ``protein_desc`` that is
    not a string (e.g. NaN from a blank CSV cell) is treated as "no description".
    '''
    protein_query = f"{protein} AND hasabstract AND Humans AND (CH OR ME OR PH OR GE)"

    # Getting relevant papers for given protein
    with Entrez.esearch(db='pubmed', term=protein_query, retmax=20, sort="relevance") as handle:
        paper_list = Entrez.read(handle)["IdList"]

    # Getting a parsable record for each paper. The fetch is skipped when the
    # search found nothing, because efetch cannot be called with an empty id list.
    record_list = []
    if paper_list:
        with Entrez.efetch(db='pubmed', rettype='medline', retmode="text", id=paper_list) as handle:
            record_list = list(Medline.parse(handle))

    # The file starts with the description supplied by the caller, when there is one.
    text_parts = []
    if isinstance(protein_desc, str):
        text_parts.append(protein_desc + "\n")

    # Collecting relevant information
    metadata = []
    for searched_id, record in zip(paper_list, record_list):
        # Prefer the id stored in the record itself; pairing by position alone
        # would mislabel papers if the fetch ever returned fewer records.
        paper_id = record.get("PMID", searched_id)
        try:
            row = (protein, paper_id, record["TI"], record["AU"], record["MH"],
                   f"https://pubmed.ncbi.nlm.nih.gov/{paper_id}/")
            abstract = record["AB"]
        except KeyError:
            # Incomplete record: skip it entirely.
            continue
        metadata.append(row)
        text_parts.append(abstract + "\n")

    output_path = Path(f"../background_information_data/protein_data/{protein}.txt")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: descriptions and abstracts routinely contain non-ASCII characters.
    with output_path.open("w", encoding="utf-8") as output_file:
        output_file.write("".join(text_parts))

    return metadata

In [None]:
# Collect the per-paper metadata rows of every protein (this also writes one text file per protein)
metadata = []
for protein_name, protein_desc in tqdm(all_proteins.items()):
    metadata += create_protein_info(protein_name, protein_desc)

# One row per paper, saved next to the protein text files
protein_metadata_columns = ["protein_name", "pubmed_id", "title", "authors", "mesh_terms", "paper_url"]
protein_metadata_frame = pd.DataFrame(metadata, columns=protein_metadata_columns)
protein_metadata_frame.to_csv("../background_information_data/protein_data/metadata.csv")