# Interactor Finder
The `InteractorFinder` class identifies the nodes that are linked to a chosen starting node by a specific type of edge. Here, we show how to initialize this class for the MAPT protein with a phosphorylation modification and find all of its causally linked neighbors.

In [1]:
#!pip install drugintfinder
from drugintfinder.finder import InteractorFinder

import pandas as pd

# Describe the search: start at nodes named MAPT of type protein and follow
# causal edges to their neighbors (print_sql=True is kept from the example).
finder = InteractorFinder(
    node_name="MAPT",
    node_type="protein",
    neighbor_edge_type="causal",
    print_sql=True,
)

# Match the starting MAPT protein nodes and collect every interactor found
neighbors = finder.find_interactors()
neighbors

Unnamed: 0,id,pmod_type,target_bel,target_symbol,target_type,relation_type,interactor_bel,interactor_name,interactor_type,pmid,pmc,target_species
0,1,pho,"p(HGNC:""MAPT"",loc(MESHA:""Cerebrospinal Fluid"")...",MAPT,protein,increases,"p(HGNC:""TREM2"",var(""p.Arg47His""))",TREM2,protein,28768545,PMC5541421,9606
1,4,pho,"p(HGNC:""MAPT"",pmod(Ph,S,357))",MAPT,protein,increases,"act(p(HGNC:""GSK3B""),ma(kin))",,activity,17389597,,9606
2,5,pho,"p(HGNC:""MAPT"",pmod(Ph,S,357))",MAPT,protein,increases,"act(p(HGNC:""GSK3B""),ma(kin))",,activity,17360711,,9606
3,6,pho,"p(HGNC:""MAPT"",pmod(Ph,S,199))",MAPT,protein,increases,"a(MESHC:""calyculin A"")",calyculin A,abundance,21297267,,9606
4,7,pho,"p(HGNC:""MAPT"",pmod(Ph,S,199))",MAPT,protein,increases,"act(p(HGNC:""CDK5""),ma(kin))",,activity,12387894,,9606
...,...,...,...,...,...,...,...,...,...,...,...,...
36986,44072,,"p(HGNC:""MAPT"",pmod(Ph,T))",MAPT,protein,increases,"act(p(HGNC:""CDK5""),ma(kin))",,activity,23362255,PMC3597833,9606
36987,44073,,"p(HGNC:""MAPT"",pmod(Ph,T))",MAPT,protein,decreases,"p(HGNC:""PIN1"")",PIN1,protein,23362255,PMC3597833,9606
36988,44074,,"p(HGNC:""MAPT"",pmod(Ph,T))",MAPT,protein,decreases,"act(p(HGNC:""PPP2CA""),ma(phos))",,activity,23362255,PMC3597833,9606
36989,44075,,"p(HGNC:""MAPT"",loc(MESHA:""Brain""),pmod(Ph))",MAPT,protein,increases,"p(HGNC:""TREM2"",var(""?""))",TREM2,protein,28768545,PMC5541421,9606


### Druggable Interactors
While knowing the neighbors of select nodes is useful, knowing which ones can be targeted by drugs and compounds is even more informative. The `InteractorFinder` class has a method for finding those druggable neighbors. By default, these neighbors are restricted to proteins, as the KG restricts drug-target interactions to those occurring between proteins and compounds.

In [2]:
# Narrow the neighbors down to those that can be targeted by drugs or compounds
druggable_ints = finder.druggable_interactors()
druggable_ints

Unnamed: 0,id,drug,drugbank_id,chembl_id,pubchem_id,interactor_type,interactor_bel,interactor_name,capsule_interactor_bel,capsule_interactor_type,...,target_bel,target_symbol,target_type,relation_type,pmod_type,pmid,pmc,rel_rid,drug_rel_rid,drug_rel_actions
0,1,"N'-(Pyrrolidino[2,1-B]Isoindolin-4-On-8-Yl)-N-...",DB04186,CHEMBL141247,445840.0,protein,"p(HGNC:""CDK2"")",CDK2,,,...,"p(HGNC:""MAPT"",pmod(Ph,S,199))",MAPT,protein,directly_increases,pho,8282104,,#570:10,#1898:10624,
1,2,"1-(3,5-DICHLOROPHENYL)-5-METHYL-1H-1,2,4-TRIAZ...",DB07852,,2763754.0,protein,"p(HGNC:""CDK2"")",CDK2,,,...,"p(HGNC:""MAPT"",pmod(Ph,S,199))",MAPT,protein,directly_increases,pho,8282104,,#570:10,#1898:10917,
2,3,N(6)-dimethylallyladenine,DB08768,CHEMBL476189,92180.0,protein,"p(HGNC:""CDK2"")",CDK2,,,...,"p(HGNC:""MAPT"",pmod(Ph,S,199))",MAPT,protein,directly_increases,pho,8282104,,#570:10,#1898:11306,
3,4,"(5E)-2-Amino-5-(2-pyridinylmethylene)-1,3-thia...",DB07529,,46937079.0,protein,"p(HGNC:""CDK2"")",CDK2,,,...,"p(HGNC:""MAPT"",pmod(Ph,S,199))",MAPT,protein,directly_increases,pho,8282104,,#570:10,#1898:11332,
4,5,"4-{5-[(Z)-(2-IMINO-4-OXO-1,3-THIAZOLIDIN-5-YLI...",DB07534,CHEMBL233149,5729339.0,protein,"p(HGNC:""CDK2"")",CDK2,,,...,"p(HGNC:""MAPT"",pmod(Ph,S,199))",MAPT,protein,directly_increases,pho,8282104,,#570:10,#1899:10523,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57951,57952,Bufexamac,DB13346,CHEMBL94394,2466.0,protein,"p(HGNC:""HDAC6"")",HDAC6,"p(HGNC:""HDAC6"")",protein,...,"p(HGNC:""MAPT"",pmod(Ph,S,369))",MAPT,protein,increases,pho,30935091,PMC6480207,#557:788,#1900:11395,inhibitor
57952,57953,Debio-1347,,,,protein,"p(HGNC:""HDAC6"")",HDAC6,"p(HGNC:""HDAC6"")",protein,...,"p(HGNC:""MAPT"",pmod(Ph,S,369))",MAPT,protein,increases,pho,30935091,PMC6480207,#557:788,#1911:11461,inhibitor
57953,57954,indirubin-3'-monoxime,,,,protein,"p(HGNC:""HDAC6"")",HDAC6,"p(HGNC:""HDAC6"")",protein,...,"p(HGNC:""MAPT"",pmod(Ph,S,369))",MAPT,protein,increases,pho,30935091,PMC6480207,#557:788,#1912:11461,inhibitor
57954,57955,quercetin,,,,protein,"p(HGNC:""HDAC6"")",HDAC6,"p(HGNC:""HDAC6"")",protein,...,"p(HGNC:""MAPT"",pmod(Ph,S,369))",MAPT,protein,increases,pho,30935091,PMC6480207,#557:788,#1913:11461,inhibitor


# Ranker
The `Ranker` class is used to generate useful statistics about identified druggable interactors. Because this scoring is restricted to druggable interactors, only the starting node and pmods are needed to initialize.  

The ranking algorithm requires a bit of information to score everything. The first time it is used, it will need to download information on BioAssays and other resources, which it will then store in a locally created SQLite database. The total space used is less than 100 MB.  

The download will take a couple of minutes during the first ranking, but subsequent rankings will be much faster.

In [3]:
from drugintfinder.ranker import Ranker

# Set up the ranking for MAPT restricted to the "pho" modification type
ranker = Ranker(
    symbol="MAPT",
    pmods=["pho"],
    reward=1,
    penalty=-1,
)
ranker.rank()  # Run the ranking
summary = ranker.summarize()  # Build a summary of the statistics
summary

Counting edges: 100%|██████████| 80/80 [00:00<00:00, 734.57it/s]


Unnamed: 0,Drug,Target,Synergizes,Number of BioAssays for Target,Number of Causal Edges for Target,Drug Patent Ongoing,Generic Version of Drug Available,Number of Drug Targets
0,Procyclidine,CHRM1,,1115,9,No,Yes,4
1,"(2R)-1-[4-({6-[(2,6-Difluorophenyl)amino]-4-py...",CDK2,,1518,15,No,No,1
2,Olopatadine,S100B,,6,16,No,Yes,8
3,Methyldopa,DDC,,8,2,No,Yes,2
4,"2-{[(6-OXO-1,6-DIHYDROPYRIDIN-3-YL)METHYL]AMIN...",ABL1,,1929,10,No,No,1
...,...,...,...,...,...,...,...,...
853,Lithium carbonate,GSK3B,,1197,115,No,Yes,4
854,Semaglutide,GLP1R,Yes,120,6,No,No,1
855,5-[1-(4-methoxyphenyl)-1H-benzimidazol-6-yl]-1...,GSK3B,,1197,115,No,No,1
856,Resveratrol,SNCA,,64,47,No,No,26


In [4]:
# Slice the summary once per distinct target, keeping first-appearance order
rows_by_target = {
    target: summary[summary.Target == target]
    for target in summary.Target.unique()
}

# Per target: BioAssay count (taken from the target's first row) floor-divided
# by the number of distinct drugs listed for that target
bioassays_per_drug = [
    {
        "Protein": target,
        "BioAssays per Drug": (
            rows["Number of BioAssays for Target"].iloc[0] // len(rows.Drug.unique())
        ),
    }
    for target, rows in rows_by_target.items()
]

ratio_df = pd.DataFrame(bioassays_per_drug)
# Show the ten targets with the highest ratio
ratio_df.sort_values("BioAssays per Drug", ascending=False).head(10)

Unnamed: 0,Protein,BioAssays per Drug
70,F2,1096
57,STAT3,587
74,RPS6KB1,383
76,CSNK1D,362
35,GSK3A,356
41,MAPK11,233
52,MAPK13,205
43,CDK5R1,196
24,HDAC6,193
77,MARK1,185


In [5]:
import pandas as pd
# BioAssay count for APP, read from the first APP row of the summary
summary.loc[summary.Target == "APP", "Number of BioAssays for Target"].iloc[0]

1127

In [7]:
# Order the summary so the rows with the most BioAssays for their target come first
summary.sort_values(by="Number of BioAssays for Target", ascending=False)

Unnamed: 0,Drug,Target,Synergizes,Number of BioAssays for Target,Number of Causal Edges for Target,Drug Patent Ongoing,Generic Version of Drug Available,Number of Drug Targets
327,Romidepsin,HDAC6,,1938,180,Yes,Yes,5
497,quercetin,HDAC6,,1938,180,No,No,
651,indirubin-3'-monoxime,HDAC6,,1938,180,No,No,
382,Panobinostat,HDAC6,,1938,180,No,No,11
740,Pracinostat,HDAC6,,1938,180,No,No,4
...,...,...,...,...,...,...,...,...
586,"N,N-Bis(3-(D-gluconamido)propyl)deoxycholamide",IGF1,,1,59,No,No,2
261,"Zinc sulfate, unspecified form",APOB,,1,0,No,No,97
845,"N-Dodecyl-N,N-Dimethyl-3-Ammonio-1-Propanesulf...",IGF1,,1,59,No,No,2
476,Thiamine,TPK1,,1,0,No,Yes,1


# PPI Analysis
Next, we perform an analysis of the identified proteins using information gathered by [e(BE:L)](https://github.com/e-bel/ebel).
The following commands will download data from 4 major PPI databases: BioGRID, Pathway Commons, StringDB, and IntAct,
and check which pathways/interactions are known for every identified secondary target.

In [8]:
# Uncomment the following line if you need to install e(BE:L)
#!pip install ebel git+https://github.com/orientechnologies/pyorient

import pandas as pd
from ebel import Bel
# Create the e(BE:L) interface object; the PPI database updates further down go through it
bel = Bel()

ModuleNotFoundError: No module named 'ebel'

## Download PPI Information
The following cell downloads information from the PPI databases and inserts it into an RDBMS (SQLite [default] or MySQL).  
**WARNING** This step may take a while.

In [None]:
# Update each PPI source in turn: BioGRID, IntAct, StringDB, then Pathway Commons.
# Kept as four explicit calls so the cell still ends on the last update expression.
bel.biogrid.update()
bel.intact.update()
bel.stringdb.update()
bel.pathway_commons.update()

## Gather Hits
Now we check each PPI database for associated information on each secondary target.

In [None]:
# Distinct secondary-target symbols from the ranking summary; every PPI query reuses this list
proteins = list(summary.Target.unique())

### Pathway Commons

In [None]:
# Render the target symbols as a quoted, comma-separated SQL list.
# Interpolating the Python list directly would emit "['A', 'B']", which is not
# a valid SQL IN-list. Single quotes inside a symbol are doubled, and an empty
# list falls back to NULL so the IN clause stays valid and matches nothing.
pc_protein_list = ", ".join(
    "'" + str(symbol).replace("'", "''") + "'" for symbol in proteins
) or "NULL"

# Pathway Commons interactions between MAPT and any secondary target (either
# direction), with pathway names, sources and PMIDs aggregated per interaction.
sql = f"""Select
    pc.participant_a a,
    pc.interaction_type int_type,
    pc.participant_b b,
    group_concat(distinct pn.name) pathway_names,
    group_concat(distinct s.source) sources,
    group_concat(distinct p.pmid) pmids
from
    pathway_commons pc left join
    pathway_commons__pathway_name pc_pn on (pc.id=pc_pn.pathway_commons_id) left join
    pathway_commons_pathway_name pn on (pc_pn.pathway_commons_pathway_name_id = pn.id) left join
    pathway_commons__source pc_s on (pc.id=pc_s.pathway_commons_id) left join
    pathway_commons_source s on (pc_s.pathway_commons_source_id=s.id) left join
    pathway_commons_pmid p on (p.pathway_commons_id=pc.id)
where
    (pc.participant_a in ({pc_protein_list}) and pc.participant_b = 'MAPT') or
    (pc.participant_b in ({pc_protein_list}) and pc.participant_a = 'MAPT')
group by
    pc.participant_a, pc.interaction_type, pc.participant_b"""

# NOTE(review): `engine` is not defined in any visible cell - presumably the
# SQLAlchemy engine of the e(BE:L) RDBMS; confirm where it is created.
pc_hits = pd.read_sql(sql, engine)

### BioGRID

In [None]:
# Render the target symbols as a quoted, comma-separated SQL list.
# Interpolating the Python list directly would emit "['A', 'B']", which is not
# a valid SQL IN-list. Single quotes inside a symbol are doubled, and an empty
# list falls back to NULL so the IN clause stays valid and matches nothing.
biogrid_protein_list = ", ".join(
    "'" + str(symbol).replace("'", "''") + "'" for symbol in proteins
) or "NULL"

# BioGRID interactions between MAPT and any secondary target (either direction),
# together with the experimental system that reported them.
sql = f"""Select
    ia.symbol a,
    ib.symbol b,
    bes.experimental_system,
    bes.experimental_system_type
from
    biogrid b inner join
    biogrid_interactor ia on (b.biogrid_a_id=ia.biogrid_id) inner join
    biogrid_interactor ib on (b.biogrid_b_id=ib.biogrid_id) inner join
    biogrid_experimental_system bes on (b.experimental_system_id=bes.id)
where
    (ia.symbol = 'MAPT' and ib.symbol in ({biogrid_protein_list})) or
    (ib.symbol = 'MAPT' and ia.symbol in ({biogrid_protein_list}))"""

# NOTE(review): `engine` is not defined in any visible cell - presumably the
# SQLAlchemy engine of the e(BE:L) RDBMS; confirm where it is created.
biogrid_hits = pd.read_sql(sql, engine)

### IntAct

In [None]:
# Render the target symbols as a quoted, comma-separated SQL list.
# Interpolating the Python list directly would emit "['A', 'B']", which is not
# a valid SQL IN-list. Single quotes inside a symbol are doubled, and an empty
# list falls back to NULL so the IN clause stays valid and matches nothing.
intact_protein_list = ", ".join(
    "'" + str(symbol).replace("'", "''") + "'" for symbol in proteins
) or "NULL"

# IntAct interactions between MAPT and any secondary target (either direction);
# UniProt accessions are mapped to HGNC symbols, highest confidence first.
sql = f"""Select
    ha.symbol as symbol_a,
    hb.symbol as symbol_b,
    i.confidence_value, 
    i.detection_method, 
    i.interaction_type, 
    i.pmid
from 
    intact i inner join 
    hgnc_uniprot hua on (i.int_a_uniprot_id=hua.accession) inner join 
    hgnc ha on (hua.hgnc_id=ha.id) inner join 
    hgnc_uniprot hub on (i.int_b_uniprot_id=hub.accession) inner join 
    hgnc hb on (hub.hgnc_id=hb.id)
where 
    (ha.symbol='MAPT' and hb.symbol in ({intact_protein_list})) or
    (hb.symbol='MAPT' and ha.symbol in ({intact_protein_list}))
order by confidence_value desc
"""
# NOTE(review): `engine` is not defined in any visible cell - presumably the
# SQLAlchemy engine of the e(BE:L) RDBMS; confirm where it is created.
intact_hits = pd.read_sql(sql, engine)

### StringDB

In [None]:
# Render the target symbols as a quoted, comma-separated SQL list.
# Interpolating the Python list directly would emit "['A', 'B']", which is not
# a valid SQL IN-list. Single quotes inside a symbol are doubled, and an empty
# list falls back to NULL so the IN clause stays valid and matches nothing.
stringdb_protein_list = ", ".join(
    "'" + str(symbol).replace("'", "''") + "'" for symbol in proteins
) or "NULL"

# StringDB rows pairing MAPT with any secondary target (either direction),
# ordered by descending combined score.
sql = f"""Select * 
from 
    stringdb 
where 
    (symbol1='MAPT' and symbol2 in ({stringdb_protein_list})) or
    (symbol2='MAPT' and symbol1 in ({stringdb_protein_list}))
order by combined_score desc
"""
# NOTE(review): `engine` is not defined in any visible cell - presumably the
# SQLAlchemy engine of the e(BE:L) RDBMS; confirm where it is created.
stringdb_hits = pd.read_sql(sql, engine)

# Connecting to a Different Knowledge Graph
By default, this package connects to the Alzheimer's Disease based Knowledge Graph (KG) developed under the MAVO project, available at https://graphstore.scai.fraunhofer.de. There are other KGs available, however, and here you can choose to connect to a different one if desired.

The code below shows how one can connect to the COVID KG instead.

In [None]:
from ebel_rest import connect
# Point the connection at the "covid" database on the graphstore server.
# NOTE(review): the credentials are hardcoded - presumably public demo credentials; confirm before sharing.
connect(user="covid_user", password="covid", db_name="covid", server="https://graphstore.scai.fraunhofer.de")