In [None]:
# Import Libraries
from pandas.api.types import is_numeric_dtype
from collections import defaultdict
import plotly.graph_objects as go
from chemspipy import ChemSpider
from Bio import Entrez, Medline
import matplotlib.pyplot as plt
from pathlib import Path
import pubchempy as pcp
from tqdm import tqdm
import networkx as nx
import pandas as pd
import numpy as np
import wikipedia
import requests
import signal
import pickle
import time
import re

import os

# Set email address (required for NCBI API usage).
# Can be overridden with the ENTREZ_EMAIL environment variable; the literal is only a fallback.
Entrez.email = os.environ.get("ENTREZ_EMAIL", 'sks6765@psu.edu')

# Chemspider API key.
# NOTE(review): credentials should not be committed in a notebook. Export CHEMSPIDER_API_KEY
# instead; the literal fallback only keeps existing runs working and should be dropped once
# the key has been rotated.
cs = ChemSpider(os.environ.get("CHEMSPIDER_API_KEY", 'AoAVRxmbgZ1ZSsk4Zlbua1jev4EWDFSI7XB2U19B'))

In [None]:
def create_graph_viz(dataframe, column1, column2, label=1):
    """Draw an interactive Plotly figure of the undirected graph described by `dataframe`.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        One row per edge.
    column1, column2 : str
        Columns holding the two endpoints of each edge.
    label : str or 1, optional
        Column whose value is used as the edge label. With the default (1),
        every edge is labelled with the constant 1.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    graph = nx.Graph()

    # One edge per row; fall back to the constant label 1 when no label column was requested
    for _, record in dataframe.iterrows():
        edge_label = record[label] if label != 1 else 1
        graph.add_edge(record[column1], record[column2], label=edge_label)

    # Re-apply the labels as an edge attribute
    labels_by_edge = {(u, v): attrs['label'] for u, v, attrs in graph.edges(data=True)}
    nx.set_edge_attributes(graph, labels_by_edge, 'label')

    # Spring layout gives a 2D coordinate per node
    layout = nx.spring_layout(graph)

    # Node coordinates
    node_x = [layout[n][0] for n in graph.nodes()]
    node_y = [layout[n][1] for n in graph.nodes()]

    # Edge segments: (start, end, None) triples so each edge is drawn as a separate line
    edge_x, edge_y, edge_text = [], [], []
    for src, dst in graph.edges():
        x0, y0 = layout[src]
        x1, y1 = layout[dst]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
        edge_text.append(graph.get_edge_data(src, dst)['label'])  # Use existing edge labels

    # Edge labels are placed at the midpoint of each edge
    label_x = [(layout[src][0] + layout[dst][0]) / 2 for src, dst in graph.edges()]
    label_y = [(layout[src][1] + layout[dst][1]) / 2 for src, dst in graph.edges()]

    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        mode='lines',
        line_shape='spline',
        opacity=0.5,
        hoverinfo='none'
    )
    node_trace = go.Scatter(
        x=node_x,
        y=node_y,
        mode='markers',
        hoverinfo='text',
        hovertext=[f'Node {node}' for node in graph.nodes()]
    )
    label_trace = go.Scatter(
        x=label_x,
        y=label_y,
        mode='text',
        text=edge_text,
        textposition='middle center',
        hoverinfo='none'
    )

    fig = go.Figure(data=[edge_trace, node_trace, label_trace])

    # Hide axes and legend so only the graph itself is visible
    hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
    fig.update_layout(
        showlegend=False,
        hovermode='x',
        margin=dict(b=20, l=5, r=5, t=40),
        xaxis=dict(hidden_axis),
        yaxis=dict(hidden_axis))

    # Display the figure
    fig.show()

# DDI Sampling

In [None]:
def search_cas_number(cas_number):
    """Look up the name recorded for a CAS registry number on CAS Common Chemistry.

    Parameters
    ----------
    cas_number : str
        CAS registry number, e.g. "50-78-2".

    Returns
    -------
    str or None
        The "name" field of the API response, or None when the request fails,
        the status is not 200, the body is not a JSON object, or the name is
        missing/empty.
    """
    url = "https://commonchemistry.cas.org/api/detail"
    try:
        # `params` URL-encodes the value; the timeout stops a stalled connection from hanging the loop
        response = requests.get(url, params={"cas_rn": cas_number}, timeout=30)
        if response.status_code != 200:
            return None
        payload = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or a body that is not valid JSON: treated like any other failed lookup
        return None

    if not isinstance(payload, dict):
        return None

    name = payload.get("name")
    return name if name else None

def _name_from_pubchem(compound):
    """Pick a readable name for `compound` from its PubChem synonyms (fallback when ChemSpider fails)."""
    synonyms = compound.synonyms
    if not synonyms:
        return compound.iupac_name

    # Prefer the first synonym made only of letters and whitespace (e.g. a common drug name).
    # The previous code tested `not re.search(...)`, which selected the first NON-alphabetic
    # synonym and made the CAS lookup below unreachable.
    for name in synonyms:
        if re.search(r"^[a-zA-Z\s]+$", name):
            return name

    compound_name = synonyms[0]

    # Checking if the returned name is a valid CAS number
    if re.search(r"\b\d{2,7}-\d{2}-\d\b", compound_name):
        cas_name = search_cas_number(compound_name)
        if (cas_name is not None) and (re.search(r"^[a-zA-Z\s]+$", cas_name)):
            compound_name = cas_name

    return compound_name


def CID_information(CID: str):
    """Resolve a STITCH-style compound identifier to a (name, SMILES) pair.

    Parameters
    ----------
    CID : str or int
        Identifier such as "CID000002244" (an optional "m"/"s" marker and
        leading zeros are stripped), or an already numeric PubChem CID.

    Returns
    -------
    tuple
        ``(compound_name, canonical_smiles)``. The name comes from ChemSpider's
        common name when that lookup succeeds; otherwise from the PubChem
        synonyms, and finally from the IUPAC name when there are no synonyms.
    """
    cleaned_cid = int(re.sub(r"CID[ms]*0*", "", str(CID)))
    compound = pcp.Compound.from_cid(cleaned_cid)

    try:
        hits = cs.filter_results(cs.filter_inchikey(compound.inchikey))
        compound_name = cs.get_compound(hits[0]).common_name
    except Exception:
        # Best-effort: any ChemSpider failure (no hit, API error, ...) falls back to PubChem data.
        # `Exception` rather than a bare except so KeyboardInterrupt still stops the notebook.
        compound_name = _name_from_pubchem(compound)

    return (compound_name, compound.canonical_smiles)

## Run this cell only once! - Create and save the deterministic subgraph dataframe

In [None]:
ddi = pd.read_csv("../data/base_data/bio-decagon-combo.csv")

G = nx.from_pandas_edgelist(ddi, 'STITCH 1', 'STITCH 2')

# Get the largest connected component
largest_component = set(max(nx.connected_components(G), key=len))

# This node has NO synonym, common name, chemspider entry, etc.
# discard() instead of remove() so the cell still runs if the node is not in the component.
largest_component.discard("CID006398525")

# Sample nodes from the largest component.
# Set iteration order is not stable between kernel sessions, so the candidates are sorted
# and drawn with a seeded generator: the same sample is produced on every run.
sample_size = 500
SAMPLING_SEED = 42
rng = np.random.default_rng(SAMPLING_SEED)
sample_nodes = rng.choice(sorted(largest_component), size=sample_size, replace=False)

# Subgraph induced by the sampled nodes (kept for inspection)
subgraph = G.subgraph(sample_nodes)

# Build the edge list straight from the base table instead of round-tripping through
# nx.to_pandas_edgelist + merge: an undirected graph may report an edge as (B, A) although
# the CSV row is (A, B), and an inner merge on both columns silently drops such edges.
pair_columns = ["STITCH 1", "STITCH 2"]
other_columns = [col for col in ddi.columns if col not in pair_columns]
both_endpoints_sampled = ddi["STITCH 1"].isin(sample_nodes) & ddi["STITCH 2"].isin(sample_nodes)

# Keep one row (and therefore one side effect) per drug pair, with the pair columns first
sampled_df = (
    ddi.loc[both_endpoints_sampled, pair_columns + other_columns]
    .drop_duplicates(subset=pair_columns)
    .reset_index(drop=True)
)

sampled_df.to_csv("../data/mined_data/sampled_ddi.csv", index=False)

In [None]:
# Sanity Check the output

# Plot sampled_df as a graph for a visual check that no edge is left disconnected
create_graph_viz(sampled_df, "STITCH 1", "STITCH 2")

# Number of distinct drugs counted directly from the two endpoint columns
all_drugs_according_to_df = len(set(sampled_df['STITCH 1']).union(sampled_df['STITCH 2']))

# The same count, obtained by rebuilding the graph from sampled_df
G = nx.from_pandas_edgelist(sampled_df, 'STITCH 1', 'STITCH 2')
number_of_drug_nodes = G.number_of_nodes()

assert all_drugs_according_to_df == number_of_drug_nodes

sampled_df

## Run these cells to get the drug info

In [None]:
# Reload the sampled DDI edge list from disk so the cells below can run without re-sampling
sampled_df = pd.read_csv("../data/mined_data/sampled_ddi.csv")

In [None]:
all_drugs = set(sampled_df["STITCH 1"].unique()).union(set(sampled_df["STITCH 2"].unique()))

annotations_path = Path("../data/mined_data/drug_annotations.pkl")

# Resume from a previous (possibly interrupted) run so already-annotated drugs are not
# re-queried. Delete the file to force a full refresh.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
if annotations_path.exists():
    with annotations_path.open("rb") as f:
        drug_info = pickle.load(f)
else:
    drug_info = {}

pending_drugs = sorted(all_drugs.difference(drug_info))
try:
    for drug_cid in tqdm(pending_drugs):
        drug_info[drug_cid] = CID_information(drug_cid)
finally:
    # Saving so that I don't have to requery API (rate limits).
    # Done in `finally` so a failure midway keeps everything fetched so far; the error
    # itself still propagates and re-running the cell continues where it stopped.
    with annotations_path.open("wb") as f:
        pickle.dump(drug_info, f)

In [None]:
# Load the cached drug annotations from the pickle file written by the annotation cell.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with Path("../data/mined_data/drug_annotations.pkl").open("rb") as f:
    drug_info = pickle.load(f)

In [None]:
enhanced_rows = []
for row in tqdm(sampled_df.itertuples(index=False)):
    # Positional access: columns 0 and 1 hold the two drug CIDs, column 3 the relationship
    drug_1_CID, drug_2_CID, relationship = row[0], row[1], row[3]

    # Each annotation is indexed as (name, SMILES)
    drug_1_name, drug_1_SMILES = drug_info[drug_1_CID][0], drug_info[drug_1_CID][1]
    drug_2_name, drug_2_SMILES = drug_info[drug_2_CID][0], drug_info[drug_2_CID][1]

    enhanced_rows.append(
        (drug_1_CID, drug_1_name, drug_1_SMILES, relationship, drug_2_CID, drug_2_name, drug_2_SMILES)
    )

# Write the annotated pairs out as the DDI subset
pd.DataFrame(
    data=enhanced_rows,
    columns=[
        "drug_1_CID", "drug_1_name", "drug_1_SMILES", "relationship",
        "drug_2_CID", "drug_2_name", "drug_2_SMILES",
    ],
).to_csv("../data/mined_data/DDI_subset.csv", index=False)

# DPI Sampling

In [None]:
# Getting all the DPIs and their annotations

dpi = pd.read_csv("../data/base_data/bio-decagon-targets-all.csv")
ddi_subset = pd.read_csv("../data/mined_data/DDI_subset.csv")

# Sorted, de-duplicated CIDs that appear on either side of a sampled DDI
all_drugs = np.union1d(ddi_subset["drug_1_CID"].unique(), ddi_subset["drug_2_CID"].unique())

# This will return fewer drugs than all_drugs, since NOT ALL DRUGS HAVE DPIS!
dpis_needed = dpi[dpi["STITCH"].isin(all_drugs)]

In [None]:
# Mapping Genes to proteins
unique_genes = dpis_needed["Gene"].unique()
gene_protein = {}
unmapped_genes = []  # genes that could not be resolved; kept so failures are visible
for gene in tqdm(unique_genes):
    # Retrieving protein data from the given gene ID
    try:
        response = requests.get(
            "https://string-db.org/api/json/get_string_ids",
            params={"identifiers": str(gene), "species": 9606},
            timeout=30,  # do not let one stalled request hang the whole loop
        ).json()[0]
        gene_protein[gene] = (response["stringId"], response["preferredName"], response["annotation"])
    except (requests.RequestException, ValueError, LookupError):
        # Network error, non-JSON body, empty result list or missing field: skip this gene,
        # but record it instead of swallowing every exception silently.
        unmapped_genes.append(gene)

print(f"{len(unmapped_genes)} of {len(unique_genes)} genes could not be mapped to a STRING protein")

with Path("../data/mined_data/protein_annotations.pkl").open("wb") as f:
    pickle.dump(gene_protein, f)

In [None]:
# Load the cached gene -> (stringID, protein_name, protein_desc) mapping
with Path("../data/mined_data/protein_annotations.pkl").open("rb") as f:
    gene_protein = pickle.load(f)

# One row per gene, with the tuple fields spread over named columns
gene_protein_df = (
    pd.DataFrame.from_dict(gene_protein, orient="index")
    .reset_index()
    .rename(columns={"index": "Gene", 0: "stringID", 1: "protein_name", 2: "protein_desc"})
)

# Attach the protein annotation to each DPI, then rename the id columns
annotated_dpis = pd.merge(dpis_needed, gene_protein_df).rename(
    columns={"STITCH": "item_id_a", "stringID": "item_id_b"}
)

# Merge with the actions table, then rename the id columns again
actions_subset = pd.read_csv("../data/base_data/actions.csv")
merged = pd.merge(annotated_dpis, actions_subset).rename(
    columns={"item_id_a": "cid", "item_id_b": "stringId"}
)

# Adding drug names to the DPI dataframe for completeness
d1_names = dict(zip(ddi_subset["drug_1_CID"], ddi_subset["drug_1_name"]))
d2_names = dict(zip(ddi_subset["drug_2_CID"], ddi_subset["drug_2_name"]))
drug_cid_df = pd.DataFrame(list({**d1_names, **d2_names}.items()), columns=["cid", "drug_name"])

(
    pd.merge(merged, drug_cid_df)
    .drop_duplicates()
    .to_csv("../data/mined_data/DPI_subset.csv", index=False)
)

# Collecting Background information

In [None]:
ddi_subset = pd.read_csv("../data/mined_data/DDI_subset.csv")
dpi_subset = pd.read_csv("../data/mined_data/DPI_subset.csv")

all_drugs = set(ddi_subset["drug_1_name"].unique()).union(set(ddi_subset["drug_2_name"].unique()))

# Had to filter the drugs since Wikipedia was erroneously mapping some identifiers which are
# PubChem specific, such as 12080-13-9, etc.
# Better to thus stick to commercially named drugs with a lot of info.
#
# - The isinstance guard skips names that are missing in the CSV: pandas reads them back as
#   NaN (a float), on which re.search would raise a TypeError.
# - sorted() gives a stable processing order; iterating a set of strings is not reproducible
#   between kernel sessions.
filtered_drugs = sorted(
    name for name in all_drugs
    if isinstance(name, str) and re.search(r"^[a-zA-Z\s]+$", name)
)

all_proteins = dpi_subset[["protein_name", "protein_desc"]].set_index("protein_name")["protein_desc"].to_dict()

In [None]:
# Headings at which a Wikipedia article is cut off, and the number of search hits to request
REFERENCE_SECTIONS = ["== References ==", "== See also ==", "== Further reading ==", "== External links =="]
SEARCH_RESULTS = 1

def clean_doc(doc):
    """Return the text of `doc` that precedes the first reference-style heading, stripped.

    The headings are the ones listed in REFERENCE_SECTIONS; when none occurs,
    the whole (stripped) text is returned.
    """
    heading_pattern = "|".join(REFERENCE_SECTIONS)
    # Only the text before the first heading is needed, so one split is enough
    body, *_ = re.split(heading_pattern, doc, maxsplit=1)
    return body.strip()

def get_wikipedia_content(entity):
    """Return the cleaned content of the top Wikipedia search hit for `entity`.

    Returns None when the search yields no hit, or when loading the page raises
    a disambiguation or page error.
    """
    try:
        hits = wikipedia.search(entity, results=SEARCH_RESULTS)
        # auto_suggest=False: load exactly the title returned by the search
        article = wikipedia.page(hits[0], auto_suggest=False)
        return clean_doc(article.content)
    except (IndexError, wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError):
        return None

def _fetch_pubmed_abstracts(entity: str, query: str, preamble: str = "", retmax: int = 20):
    """Run a PubMed search and collect abstracts and metadata (shared by the drug and protein helpers).

    Parameters
    ----------
    entity : str
        Name stored as the first field of every metadata tuple.
    query : str
        PubMed search term passed to Entrez.esearch.
    preamble : str
        Text placed before the collected abstracts.
    retmax : int
        Maximum number of papers requested from the search.

    Returns
    -------
    tuple or None
        ``(abstract_text, metadata)`` where metadata is a list of
        ``(entity, PMID, title, authors, MeSH terms, paper URL)`` tuples,
        or None when the search returns no paper.
    """
    # Getting relevant papers for the given query
    with Entrez.esearch(db='pubmed', term=query, retmax=retmax, sort="relevance") as handle:
        paper_list = Entrez.read(handle)["IdList"]

    if not paper_list:
        return None

    # Getting a parsable record for each paper
    with Entrez.efetch(db='pubmed', rettype='medline', retmode="text", id=paper_list) as handle:
        record_list = list(Medline.parse(handle))

    # Collecting relevant information
    metadata = []
    text_parts = [preamble]
    for record in record_list:
        # Keep only records that carry MeSH terms, none of which starts with "animal"
        if "MH" not in record:
            continue
        mesh_terms = record["MH"]
        if any(re.match("animals*", term, re.IGNORECASE) for term in mesh_terms):
            continue

        # Read every required field BEFORE recording anything, so a record with a missing
        # field is skipped as a whole (previously its metadata row could be kept while its
        # abstract was not).
        try:
            pmid, title, authors, abstract = record["PMID"], record["TI"], record["AU"], record["AB"]
        except KeyError:
            continue

        metadata.append((entity, pmid, title, authors, mesh_terms,
                         f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"))
        text_parts.append(abstract + "\n")

    return ("".join(text_parts), metadata)


def create_drug_info(drug: str, retmax: int = 20):
    '''
    Collect PubMed abstracts about `drug`.

    Tags used in the query, as listed by the original author:
    AD - Administration and Dosage
    AE - Adverse Effects
    PK - Pharmacokinetics
    PD - Pharmacology
    CO - Complications
    TU - Therapeutic Use
    DE - Drug Effects
    The query also uses TO, CT, DT and PO (presumably Toxicity, Contraindications,
    Drug Therapy and Poisoning - TODO confirm).

    NOTE(review): PubMed treats a bracketed tag as a search field, and [AD] is
    documented as the Affiliation field - confirm these tags select MeSH
    subheadings as intended.

    Returns (abstract_text, metadata), or None when no paper is found.
    '''
    drug_query = f"({drug}[TI]) AND (hasabstract) AND (english[la]) AND (medline[sb]) AND ({drug}[AD] OR \
    {drug}[AE] OR {drug}[PK] OR {drug}[PD] OR {drug}[CO] OR {drug}[TU] OR {drug}[DE] OR {drug}[TO] OR {drug}[CT] \
    OR {drug}[DT] OR {drug}[PO])"

    return _fetch_pubmed_abstracts(drug, drug_query, retmax=retmax)


def create_protein_info(protein: str, protein_desc: str, retmax: int = 20):
    '''
    Collect PubMed abstracts about `protein`, preceded by its STRING annotation.

    Tags used in the query, as listed by the original author:
    CH - chemistry
    ME - metabolism
    PH - physiology
    GE - genetics
    AN - Analysis
    BI - Biosynthesis
    CS - Chemical Synthesis
    DF - Deficiency

    Returns (text, metadata), or None when no paper is found.
    '''
    protein_query = f"({protein}[TI]) AND (hasabstract) AND (english[la]) AND (medline[sb]) AND ({protein}[CH] OR \
    {protein}[ME] OR {protein}[PH] OR {protein}[GE] OR {protein}[AN] OR {protein}[BI] OR {protein}[CS] OR {protein}[DF])"

    # Adding the annotation information from STRING. A description missing in the CSV is read
    # back as NaN (a float); it is left out rather than crashing the string concatenation.
    preamble = protein_desc + "\n" if isinstance(protein_desc, str) else ""

    return _fetch_pubmed_abstracts(protein, protein_query, preamble=preamble, retmax=retmax)

def write_content(entity, content, entity_type, source):
    """Write `content` to ../data/background_information_data/<entity_type>_data/<source>/<entity>.txt.

    Parameters
    ----------
    entity : str
        Name of the drug or protein; used as the file name.
    content : str
        Text to store.
    entity_type : str
        Prefix of the ``*_data`` directory (e.g. "drug", "protein").
    source : str
        Sub-directory naming where the text came from (e.g. "Wiki", "PubMed").

    I/O errors are reported with print() and not re-raised.
    """
    file_path = Path(f"../data/background_information_data/{entity_type}_data/{source}/{entity}.txt")
    try:
        # Create the target directory on first use instead of failing on a fresh checkout
        file_path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding: the platform default may not be able to encode the text
        with file_path.open("w", encoding="utf-8") as f:
            f.write(content)
    except IOError as e:
        # The message previously referenced the global `drug` rather than this function's argument
        print(f"Error writing to file for {entity}: {e}")

In [None]:
metadata = []
# enumerate(tqdm(...)) rather than tqdm(enumerate(...)): tqdm can then read len(filtered_drugs)
# and show a total / ETA instead of a bare counter.
for idx, drug in enumerate(tqdm(filtered_drugs)):
    wiki_content = get_wikipedia_content(drug)
    pubmed_content = create_drug_info(drug)
    if wiki_content is not None:
        write_content(drug, wiki_content, "drug", "Wiki")
    if pubmed_content is not None:
        # pubmed_content is (abstract text, metadata rows)
        write_content(drug, pubmed_content[0], "drug", "PubMed")
        metadata.extend(pubmed_content[1])
    # Pause after every 10th drug to stay polite with the remote APIs
    if (idx % 10 == 0) and (idx != 0):
        time.sleep(5)
pd.DataFrame(metadata, columns=["drug_name", "pubmed_id", "title", "authors", 
                                "mesh_terms", "paper_url"]).to_csv("../data/background_information_data/drug_data/PubMed/metadata.csv")

In [None]:
metadata = []
# enumerate(tqdm(...)) rather than tqdm(enumerate(...)): tqdm can then read the number of
# proteins and show a total / ETA instead of a bare counter.
for idx, (protein_name, protein_desc) in enumerate(tqdm(all_proteins.items())):
    wiki_content = get_wikipedia_content(protein_name)
    pubmed_content = create_protein_info(protein_name, protein_desc)
    if wiki_content is not None:
        write_content(protein_name, wiki_content, "protein", "Wiki")
    if pubmed_content is not None:
        # pubmed_content is (annotation + abstract text, metadata rows)
        write_content(protein_name, pubmed_content[0], "protein", "PubMed")
        metadata.extend(pubmed_content[1])
    # Pause after every 10th protein to stay polite with the remote APIs
    if (idx % 10 == 0) and (idx != 0):
        time.sleep(5)
pd.DataFrame(metadata, columns=["protein_name", "pubmed_id", "title", "authors", 
                                "mesh_terms", "paper_url"]).to_csv("../data/background_information_data/protein_data/PubMed/metadata.csv")