#### Import Libraries

In [None]:
import pubchempy as pcp
from Bio import Entrez
from tqdm import tqdm
import pandas as pd
import numpy as np
import requests
import re

In [None]:
def CID_information(CID: "str | int"):
    """Look up a compound's display name and SMILES string on PubChem.

    Args:
        CID: PubChem compound ID, either a plain integer or a prefixed
            string such as "CIDm00002173" / "CIDs00002173". The "CID"
            prefix, an optional run of "m"/"s" letters and the zero padding
            are stripped before the lookup.

    Returns:
        A ``(compound_name, canonical_smiles)`` tuple. The name is the first
        PubChem synonym; if reading the synonyms fails or the list is empty,
        the IUPAC name is used instead.

    Raises:
        ValueError: if no digits remain after stripping the prefix.
    """
    # str() lets callers pass a real int; re.sub raises TypeError on non-strings.
    cleaned_cid = int(re.sub(r"CID[ms]*0*", "", str(CID)))
    compound = pcp.Compound.from_cid(cleaned_cid)
    try:
        compound_name = compound.synonyms[0]
    except Exception:
        # Best-effort fallback (empty synonym list, failed lookup, ...) without
        # also swallowing KeyboardInterrupt / SystemExit like a bare except does.
        compound_name = compound.iupac_name
    return (compound_name, compound.canonical_smiles)

In [None]:
# Extra code - might use later. This is to retrieve info from the gene db.

def get_protein_info(record):
    """Extract the gene label, summary and protein data from a parsed gene record.

    Args:
        record: Sequence whose first element is a mapping with the keys
            "Entrezgene_gene", "Entrezgene_summary" and "Entrezgene_prot".

    Returns:
        A 3-tuple ``("<locus> (<description>)", summary, proteins)``.

    Raises:
        KeyError: if any of the fields read here is missing from the record.
    """
    locus = record[0]['Entrezgene_gene']['Gene-ref']['Gene-ref_locus']
    description = record[0]['Entrezgene_gene']['Gene-ref']['Gene-ref_desc']
    gene_label = f"{locus} ({description})"
    gene_summary = record[0]["Entrezgene_summary"]
    gene_proteins = record[0]["Entrezgene_prot"]
    return gene_label, gene_summary, gene_proteins

def fetch_gene_information(gene):
    """Fetch descriptive information for one gene from the NCBI Gene database.

    Written as a function so the snippet is valid on its own: the identifier
    is a parameter, and a failed lookup is reported through the return value
    so the calling loop can decide to skip the item.

    Args:
        gene: Identifier string. A leading "9606.ENSP" prefix and its zero
            padding are stripped and the remaining digits are sent as the
            Gene ID.
            NOTE(review): this assumes the remaining digits are a valid NCBI
            Gene ID -- confirm against the data source.

    Returns:
        A dict with the keys "gene name", "gene summary" and
        "proteins from gene", or None when the fetched record lacks one of
        the fields that get_protein_info reads.
    """
    # The dot is escaped so it only matches a literal "." after the taxon id.
    cleaned_gene_id = int(re.sub(r"9606\.ENSP0*", "", gene))
    Entrez.email = "sample_email@example.org"  # placeholder; NCBI asks for a real contact address
    handle = Entrez.efetch(db="gene", id=cleaned_gene_id, rettype="gb", retmode="xml")
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even if parsing fails.
        handle.close()

    try:
        gene_name, gene_summary, gene_proteins = get_protein_info(record)
    except (KeyError, IndexError, TypeError):
        # Record is empty or is missing an expected field.
        return None

    return {
        "gene name": gene_name,
        "gene summary": gene_summary,
        "proteins from gene": gene_proteins,
    }

# Worked example: fetch the record for Gene ID 7448 and summarise it.
Entrez.email = "sample_email@example.org"  # placeholder; NCBI asks for a real contact address
handle = Entrez.efetch(db="gene", id=7448, rettype="gb", retmode="xml")
try:
    record = Entrez.read(handle)
finally:
    # Release the connection even if parsing fails.
    handle.close()

# Reuse get_protein_info so the field paths are defined in one place only.
all_information = dict(
    zip(
        ("gene name", "gene summary", "proteins from gene"),
        get_protein_info(record),
    )
)
all_information

#### DDI sampling

In [None]:
# Drug-drug interaction table; the sampling cell reads its "STITCH 1",
# "STITCH 2" and "Side Effect Name" columns.
ddi_csv_path = "bio-decagon-combo.csv"
ddi = pd.read_csv(ddi_csv_path)

In [None]:
# Column accumulators for the sampled triples; one entry per accepted sample.
drug_1_name = []
drug_1_SMILES = []
relationships = []
drug_2_name = []
drug_2_SMILES = []

# A name is accepted only if it consists of letters, digits, whitespace and ':'.
# Anything else (brackets, hyphens, commas, ...) rejects the sample.
# NOTE(review): digits are allowed here although the stated goal was names
# without numbers -- confirm which behaviour is wanted.
disallowed_name_chars = re.compile(r"[^a-zA-Z\d\s:]")


def _is_plain_name(name):
    """Return True when `name` is a string free of disallowed characters."""
    # The explicit type check replaces a bare `except:` that was only there to
    # skip samples whose name lookup returned a non-string (e.g. None).
    return isinstance(name, str) and disallowed_name_chars.search(name) is None


for _ in tqdm(range(100)):
    # Keep resampling until both drugs of the pair have an acceptable name.
    while True:
        sampled_ddi = ddi.sample(n=1)
        drug_1_CID = sampled_ddi["STITCH 1"].to_list()[0]
        drug_2_CID = sampled_ddi["STITCH 2"].to_list()[0]
        relationship = sampled_ddi["Side Effect Name"].to_list()[0]

        drug_1_info = CID_information(drug_1_CID)
        drug_2_info = CID_information(drug_2_CID)

        if _is_plain_name(drug_1_info[0]) and _is_plain_name(drug_2_info[0]):
            break

    drug_1_name.append(drug_1_info[0])
    drug_1_SMILES.append(drug_1_info[1])
    relationships.append(relationship)
    drug_2_name.append(drug_2_info[0])
    drug_2_SMILES.append(drug_2_info[1])

In [None]:
# The same drug (e.g. 'ALENDRONIC ACID') may appear in more than one row;
# each of those rows has a different relationship with drug 2.
ddi_subset = pd.DataFrame(
    {
        "drug_1_name": drug_1_name,
        "drug_1_SMILES": drug_1_SMILES,
        "relationship": relationships,
        "drug_2_name": drug_2_name,
        "drug_2_SMILES": drug_2_SMILES,
    }
)
ddi_subset.to_csv("DDI_subset.csv", index=False)

In [131]:
from Bio import Entrez, Medline

# NCBI asks API clients to identify themselves with a contact address;
# replace this placeholder before running real queries.
Entrez.email = 'your_email@example.com'

# PubMed search term: papers mentioning the drug that also have an abstract.
# (Despite its name, this variable holds the whole query, not just the drug name.)
drug_name = '"alendronic acid" AND hasabstract'

# Step 1: look up at most 5 matching PubMed IDs.
handle = Entrez.esearch(db='pubmed', term=drug_name, retmax=5)
try:
    paper_list = Entrez.read(handle)["IdList"]
finally:
    handle.close()

# Step 2: download the MEDLINE-format records for those IDs.
handle = Entrez.efetch(db='pubmed', rettype='medline', retmode="text", id=paper_list)
try:
    # Medline.parse reads lazily from the handle. Materialising the records
    # here lets the handle be closed and makes `d` safe to iterate repeatedly.
    d = list(Medline.parse(handle))
finally:
    handle.close()

In [132]:
# Gather every record yielded by `d` into a list so it can be indexed.
s = list(d)

In [133]:
# Display the first record collected from `d`.
s[0]

{'PMID': '39856849',
 'OWN': 'NLM',
 'STAT': 'PubMed-not-MEDLINE',
 'LR': '20250130',
 'IS': '2304-8158 (Print) 2304-8158 (Electronic) 2304-8158 (Linking)',
 'VI': '14',
 'IP': '2',
 'DP': '2025 Jan 9',
 'TI': 'An Aptamer Sensor Based on Alendronic Acid-Modified Upconversion Nanoparticles Combined with Magnetic Separation for Rapid and Sensitive Detection of Thiamethoxam.',
 'LID': '10.3390/foods14020182 [doi] 182',
 'AB': 'The widespread use of thiamethoxam has led to pesticide residues that have sparked global concerns regarding ecological and human health risks. A pressing requirement exists for a detection method that is both swift and sensitive. Herein, we introduced an innovative fluorescence biosensor constructed from alendronic acid (ADA)-modified upconversion nanoparticles (UCNPs) linked with magnetic nanoparticles (MNPs) via aptamer recognition for the detection of thiamethoxam. Through base pairing, thiamethoxam-specific aptamer-functionalized MNPs (apt-MNPs) were integrated

In [100]:
# Show the first AbstractText entry of the second article in `d`.
# NOTE(review): this indexes `d` by the key "PubmedArticle", but the only
# visible assignment of `d` is the Medline.parse(...) result, which is consumed
# by iteration. The execution count (100) precedes that assignment (131), so
# this cell presumably ran against an earlier, differently-built `d` -- confirm
# and restore the cell that created it, otherwise Restart & Run All fails here.
d["PubmedArticle"][1]["MedlineCitation"]["Article"]["Abstract"]["AbstractText"][0]

StringElement('Pulmonary fibrosis (PF) results in a progressive decline of lung function due to scarring. Drugs are among the most common causes of PF. The objective of our study was to reveal the structure of drugs involved in PF development.', attributes={'Label': 'BACKGROUND/OBJECTIVES', 'NlmCategory': 'OBJECTIVE'})

In [91]:
from bs4 import BeautifulSoup

# Take the first AbstractText entry of the fourth article in `d`.
# NOTE(review): `d` is indexed by "PubmedArticle" here, while the only visible
# assignment of `d` is a Medline.parse(...) result that is consumed by
# iteration; this cell's execution count (91) precedes that assignment (131).
# It presumably relies on an earlier `d` that no visible cell creates -- confirm.
abstract_text = d["PubmedArticle"][3]["MedlineCitation"]["Article"]["Abstract"]["AbstractText"][0]
# Parse the abstract with the HTML parser and keep only its text content.
soup = BeautifulSoup(abstract_text, 'html.parser')
cleaned_text = soup.get_text()
print(cleaned_text)

Glucocorticoid-induced osteoporosis (GIOP) is the leading cause of iatrogenic osteoporosis due to the widespread clinical use of glucocorticoids (GC) as immunosuppressants. Previous research identified the proline-rich tyrosine kinase 2, Pyk2, as a critical mediator of GC-induced bone loss, and that blocking Pyk2 could protect the skeleton from adverse GC actions. However, systemic administration of current Pyk2 inhibitors causes harmful side effects, such as skin lesions. To address this, we developed bone-targeted (BT) Pyk2 inhibitors by conjugating them with bisphosphonates (BP), ensuring adherence to the bone matrix and reducing impact on noncalcified tissues. We synthesized BT-Amide by linking a derivative of TAE-226, a Pyk2 inhibitor, with alendronic acid. Oral administration (gavage) of BT-Amide prevented GC-induced bone loss in mice without causing skin lesions, or elevation of any organ toxicity markers. These findings introduce BT-Amide as the first orally effective bone-targ

#### DPI sampling

In [None]:
# I'm reading the STITCH database file directly since it already has the relationship label.
# Only the first 10,000 rows are loaded; with `nrows` pandas closes the file as
# soon as they are read instead of leaving a chunked reader open.
dpi = pd.read_csv("9606.actions.v5.0.tsv", sep="\t", nrows=10_000)

In [None]:
# DPI is a directed graph. The dataframe has a column saying whether A acts on
# B, and the sampling below leverages that:
# 1. Strong DPIs only (interaction score > 800) -- NOT applied; with this
#    filter there were too few hits.
# 2. item_a must act on item_b, i.e., a_is_acting == "t".
# 3. item_a must be a drug, i.e., item_id_a contains "CID".

MIN_ANNOTATION_LENGTH = 400   # minimum length of the protein annotation text
STRING_REQUEST_TIMEOUT = 30   # seconds; a stalled request must not hang the loop

# The filter is the same for every attempt, so build the candidate pool once.
acting_drug_rows = dpi[(dpi["a_is_acting"] == "t") & (dpi["item_id_a"].str.contains('CID', na=False))]

while True:
    sampled_dpi = acting_drug_rows.sample(n=1)
    all_information = {}

    drug = sampled_dpi["item_id_a"].to_list()[0]
    protein = sampled_dpi["item_id_b"].to_list()[0]
    relationship = sampled_dpi["mode"].to_list()[0]

    # Getting drug information
    drug_info = CID_information(drug)

    # Resample when the lookup gave no usable name, or when the name contains
    # anything besides letters, digits, whitespace and ':'.
    if not isinstance(drug_info[0], str) or re.search(r"[^a-zA-Z\d\s:]", drug_info[0]):
        continue

    all_information["Drug name"] = drug_info[0]
    all_information["Drug SMILES"] = drug_info[1]

    all_information["relationship"] = relationship

    # Getting protein information directly from STITCH DB since it is in a STITCH protein format.
    url = f"https://string-db.org/api/json/resolve?identifier={protein}&species=9606"
    response = requests.get(url, timeout=STRING_REQUEST_TIMEOUT).json()

    try:
        top_hit = response[0]
        annotation = top_hit["annotation"]
        if len(annotation) < MIN_ANNOTATION_LENGTH:
            continue
        all_information["protein name"] = top_hit["preferredName"]
        all_information["protein info"] = annotation
    except (IndexError, KeyError, TypeError):
        # Empty result list, missing field, or a payload of an unexpected
        # shape: try another sample.
        continue
    break

all_information

#### PPI sampling

In [None]:
# Protein-protein interaction table; the sampling cell reads its "Gene 1" and
# "Gene 2" columns.
ppi_csv_path = "bio-decagon-ppi.csv"
ppi = pd.read_csv(ppi_csv_path)

In [None]:
# I want strongly interacting proteins. I'm passing the gene ID directly since
# STRING maps these IDs automatically to the gene's main protein.
#
# Sub-scores reported by STRING for each interaction:
#   nscore - neighborhood score (computed from the inter-gene nucleotide count).
#   fscore - fusion score (derived from fused proteins in other species).
#   pscore - co-occurrence score of the phyletic profile (derived from similar
#            absence/presence patterns of genes).
#   hscore - homology score, the degree of homology of the interactors
#            (normally not reported in STRING).
#   ascore - coexpression score (derived from similar patterns of mRNA
#            expression measured by DNA arrays and similar technologies).
#   escore - experimental score (derived from experimental data, such as
#            affinity chromatography).
#   dscore - database score (derived from curated data of various databases).
#   tscore - textmining score (derived from the co-occurrence of gene/protein
#            names in abstracts).

PPI_MIN_SCORE = 0.8          # combined score a pair must exceed to be accepted
PPI_REQUEST_TIMEOUT = 30     # seconds; a stalled request must not hang the loop

while True:
    sampled_ppi = ppi.sample(n=1)
    all_information = {}

    gene_ids = [str(sampled_ppi["Gene 1"].to_list()[0]), str(sampled_ppi["Gene 2"].to_list()[0])]
    species = 9606  # Human

    # Multiple identifiers are separated by a URL-encoded carriage return (%0d).
    url = f"https://string-db.org/api/json/network?identifiers={'%0d'.join(gene_ids)}&species={species}"
    response = requests.get(url, timeout=PPI_REQUEST_TIMEOUT).json()
    # Only a non-empty list can be indexed with [0]; any other payload
    # (presumably an error object -- TODO confirm) is treated as "no hit".
    if isinstance(response, list) and response and response[0]["score"] > PPI_MIN_SCORE:
        print(response)
        break