In [13]:
import pandas as pd
import requests

In [18]:
df = pd.read_csv("assay_target_map.csv")

In [19]:
df

Unnamed: 0,ASSAY_ID,assay_chembl_id,target_chembl_id,target_type
0,737823,CHEMBL1741322,CHEMBL3356,SINGLE PROTEIN
1,737824,CHEMBL1741323,CHEMBL3622,SINGLE PROTEIN
2,688422,CHEMBL1614544,CHEMBL1293235,SINGLE PROTEIN
3,688810,CHEMBL1613836,CHEMBL1615322,NUCLEIC-ACID
4,688812,CHEMBL1613838,CHEMBL1293294,SINGLE PROTEIN
...,...,...,...,...
196,1301866,CHEMBL3214913,CHEMBL612545,UNCHECKED
197,1301890,CHEMBL3214967,CHEMBL2029197,SINGLE PROTEIN
198,1301859,CHEMBL3214906,CHEMBL2029198,SINGLE PROTEIN
199,1301893,CHEMBL3214970,CHEMBL612545,UNCHECKED


In [20]:
import requests
import pandas as pd
from pandarallel import pandarallel

# Initialize pandarallel with a progress bar
pandarallel.initialize(progress_bar=True)

# Function to get target details from ChEMBL API with the correct Accept header
def get_target_details(target_chembl_id):
    url = f'https://www.ebi.ac.uk/chembl/api/data/target/{target_chembl_id}'
    headers = {'Accept': 'application/json'}  # Request JSON response
    
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an exception for 4xx/5xx status codes
        
        if response.headers.get('Content-Type') == 'application/json':
            target_data = response.json()
            # Extract target name and additional details
            target_name = target_data.get('pref_name', 'Unknown')
            organism = target_data.get('organism', 'Unknown')
            return target_name, organism
        else:
            return 'Unknown', 'Unknown'
    except requests.exceptions.RequestException as e:
        return 'Unknown', 'Unknown'

# Function to apply to each row in parallel
def apply_get_target_details(target_chembl_id):
    return pd.Series(get_target_details(target_chembl_id))

# Apply the function to 'target_chembl_id' column in parallel
df[['target_name', 'organism']] = df["target_chembl_id"].parallel_apply(apply_get_target_details)

# Display the updated DataFrame
df


INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6), Label(value='0 / 6'))), HBox(c…

Unnamed: 0,ASSAY_ID,assay_chembl_id,target_chembl_id,target_type,target_name,organism
0,737823,CHEMBL1741322,CHEMBL3356,SINGLE PROTEIN,Cytochrome P450 1A2,Homo sapiens
1,737824,CHEMBL1741323,CHEMBL3622,SINGLE PROTEIN,Cytochrome P450 2C19,Homo sapiens
2,688422,CHEMBL1614544,CHEMBL1293235,SINGLE PROTEIN,Prelamin-A/C,Homo sapiens
3,688810,CHEMBL1613836,CHEMBL1615322,NUCLEIC-ACID,microRNA 21,Homo sapiens
4,688812,CHEMBL1613838,CHEMBL1293294,SINGLE PROTEIN,Ras-related protein Rab-9A,Homo sapiens
...,...,...,...,...,...,...
196,1301866,CHEMBL3214913,CHEMBL612545,UNCHECKED,Unchecked,
197,1301890,CHEMBL3214967,CHEMBL2029197,SINGLE PROTEIN,Rap guanine nucleotide exchange factor 3,Homo sapiens
198,1301859,CHEMBL3214906,CHEMBL2029198,SINGLE PROTEIN,Rap guanine nucleotide exchange factor 4,Homo sapiens
199,1301893,CHEMBL3214970,CHEMBL612545,UNCHECKED,Unchecked,


In [50]:
# UniProt API base URL
uniprot_api_base_url = "https://rest.uniprot.org/uniprotkb/search"

# Function to search UniProt for gene symbols by target name
def get_gene_symbol(target_name):
    query = f"query={target_name}&fields=gene_names"
    url = f"{uniprot_api_base_url}?{query}&format=json"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            results = response.json()
            if 'results' in results and results['results']:
                # Return the first gene symbol found
                return results['results'][0].get('genes', [{}])[0].get('geneName', {}).get('value', 'Unknown')
        return 'Unknown'
    except Exception as e:
        return 'Unknown'

# Initialize a progress bar
tqdm.pandas()

# Apply the gene symbol search to the DataFrame's target_name column
df['gene_symbol'] = df['target_name'].parallel_apply(lambda x: get_gene_symbol(x) if x != 'Unchecked' else 'Unknown')

# Display the updated DataFrame with gene symbols
df

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6), Label(value='0 / 6'))), HBox(c…

Unnamed: 0,ASSAY_ID,assay_chembl_id,target_chembl_id,target_type,target_name,organism,gene_symbol
0,737823,CHEMBL1741322,CHEMBL3356,SINGLE PROTEIN,Cytochrome P450 1A2,Homo sapiens,CYP1A2
1,737824,CHEMBL1741323,CHEMBL3622,SINGLE PROTEIN,Cytochrome P450 2C19,Homo sapiens,CYP2C19
2,688422,CHEMBL1614544,CHEMBL1293235,SINGLE PROTEIN,Prelamin-A/C,Homo sapiens,LMNA
3,688810,CHEMBL1613836,CHEMBL1615322,NUCLEIC-ACID,microRNA 21,Homo sapiens,TNRC6A
4,688812,CHEMBL1613838,CHEMBL1293294,SINGLE PROTEIN,Ras-related protein Rab-9A,Homo sapiens,RAB9A
...,...,...,...,...,...,...,...
196,1301866,CHEMBL3214913,CHEMBL612545,UNCHECKED,Unchecked,,Unknown
197,1301890,CHEMBL3214967,CHEMBL2029197,SINGLE PROTEIN,Rap guanine nucleotide exchange factor 3,Homo sapiens,RAPGEF6
198,1301859,CHEMBL3214906,CHEMBL2029198,SINGLE PROTEIN,Rap guanine nucleotide exchange factor 4,Homo sapiens,RAPGEF4
199,1301893,CHEMBL3214970,CHEMBL612545,UNCHECKED,Unchecked,,Unknown


In [92]:
df.to_csv("detailed_target_map.csv")

In [53]:
for i in df.gene_symbol.unique():
    print(i)

CYP1A2
CYP2C19
LMNA
TNRC6A
RAB9A
Smn1
TDP2
MAPT
RORC
ALDH1A1
Unknown
CFH
ampC
NPC1
TRAP
CYP2D6
CYP2C9
CYP3A4
dnaB
MTOR
HSPB1
NPPB
HSP82
Crhr2
AFAP1
LARP6
qoxA
Hsp83
PADI4
APOBEC3B
CCT4
WIPI1
CEP104
STXBP4
ENOX2
mab-10
F3
cdh-4
U2SURP
TKT10
RPS9A
DHRS11
MAP3K14
SNRPF
TRAK2
ATAD2
NAT2
gag-pro-pol
TSPO
GAPC1
dddP
C1RL
TRIM22
EIF1AD
ABL2
CD1D
HDC
CDC42SE2
DLD1
Bap1
ST3GAL4
UBE2E1
SENP3
XBP1
ALPG
DDIT4
GluPho
EGLN3
Kdm4A
arc
NFE2L2
GCN5
TP53
Mphosph8
Txnrd1
dim-5
CBX6
BAZ2B
ALOX15B
VDR
ATXN2L
PolI
RAN
FEN1
RGS9
SLC2A1
POLL
SMAD7
UCHL3
PMP2
GMNN
CLPP
Gls
DRD1
GCG
WRN
BRCA1
NR1I2
NUP98
TARDBP
Snca
RAPGEF4
PLK1
nth
RAPGEF6


In [61]:
interactions = pd.read_csv("string_interactions_short.tsv", sep='\t')
interactions

Unnamed: 0,#node1,node2,node1_string_id,node2_string_id,neighborhood_on_chromosome,gene_fusion,phylogenetic_cooccurrence,homology,coexpression,experimentally_determined_interaction,database_annotated,automated_textmining,combined_score
0,AASDH,POLI,9606.ENSP00000205214,9606.ENSP00000462664,0.000,0.0,0.0,0.0,0.354,0.000,0.000,0.166,0.439
1,AASDH,CYP2D6,9606.ENSP00000205214,9606.ENSP00000496150,0.048,0.0,0.0,0.0,0.064,0.309,0.188,0.160,0.503
2,AASDH,CYP2C19,9606.ENSP00000205214,9606.ENSP00000360372,0.048,0.0,0.0,0.0,0.064,0.309,0.188,0.160,0.503
3,AASDH,CYP27B1,9606.ENSP00000205214,9606.ENSP00000228606,0.048,0.0,0.0,0.0,0.064,0.309,0.188,0.166,0.507
4,AASDH,CYP2C9,9606.ENSP00000205214,9606.ENSP00000260682,0.048,0.0,0.0,0.0,0.064,0.309,0.188,0.179,0.514
...,...,...,...,...,...,...,...,...,...,...,...,...,...
176,TP53,XBP1,9606.ENSP00000269305,9606.ENSP00000216037,0.000,0.0,0.0,0.0,0.042,0.000,0.000,0.529,0.529
177,TP53,UBE2E1,9606.ENSP00000269305,9606.ENSP00000303709,0.000,0.0,0.0,0.0,0.000,0.306,0.000,0.277,0.477
178,TP53,TXNRD1,9606.ENSP00000269305,9606.ENSP00000434516,0.000,0.0,0.0,0.0,0.043,0.292,0.000,0.470,0.609
179,TP53,WRN,9606.ENSP00000269305,9606.ENSP00000298139,0.000,0.0,0.0,0.0,0.000,0.620,0.400,0.851,0.963


In [63]:
import json

# Load the JSON file containing the task splits
with open('data_split.json', 'r') as f:
    task_splits = json.load(f)

train_tasks = task_splits['train']
test_tasks = task_splits['test']

In [64]:
train_df = df[df.ASSAY_ID.isin(train_tasks)]
train_df

Unnamed: 0,ASSAY_ID,assay_chembl_id,target_chembl_id,target_type,target_name,organism,gene_symbol
1,737824,CHEMBL1741323,CHEMBL3622,SINGLE PROTEIN,Cytochrome P450 2C19,Homo sapiens,CYP2C19
2,688422,CHEMBL1614544,CHEMBL1293235,SINGLE PROTEIN,Prelamin-A/C,Homo sapiens,LMNA
3,688810,CHEMBL1613836,CHEMBL1615322,NUCLEIC-ACID,microRNA 21,Homo sapiens,TNRC6A
4,688812,CHEMBL1613838,CHEMBL1293294,SINGLE PROTEIN,Ras-related protein Rab-9A,Homo sapiens,RAB9A
5,688816,CHEMBL1613842,CHEMBL1293232,SINGLE PROTEIN,Survival motor neuron protein,Homo sapiens,Smn1
...,...,...,...,...,...,...,...
194,688671,CHEMBL1614211,CHEMBL5619,SINGLE PROTEIN,DNA-(apurinic or apyrimidinic site) lyase,Homo sapiens,nth
197,1301890,CHEMBL3214967,CHEMBL2029197,SINGLE PROTEIN,Rap guanine nucleotide exchange factor 3,Homo sapiens,RAPGEF6
198,1301859,CHEMBL3214906,CHEMBL2029198,SINGLE PROTEIN,Rap guanine nucleotide exchange factor 4,Homo sapiens,RAPGEF4
199,1301893,CHEMBL3214970,CHEMBL612545,UNCHECKED,Unchecked,,Unknown


In [80]:

test_df = df[df.ASSAY_ID.isin(test_tasks)]
target_terms= test_df.gene_symbol.to_list()
target_terms

['ampC',
 'TRAP',
 'CYP2C9',
 'CYP2C19',
 'CYP3A4',
 'Unknown',
 'Unknown',
 'GCN5',
 'BAZ2B',
 'VDR',
 'FEN1',
 'RGS9',
 'SMAD7',
 'PMP2',
 'ATXN2L',
 'BRCA1',
 'TARDBP',
 'Unknown']

In [89]:
test_df.gene_symbol.value_counts()

gene_symbol
Unknown    3
ampC       1
TRAP       1
CYP2C9     1
CYP2C19    1
CYP3A4     1
GCN5       1
BAZ2B      1
VDR        1
FEN1       1
RGS9       1
SMAD7      1
PMP2       1
ATXN2L     1
BRCA1      1
TARDBP     1
Name: count, dtype: int64

In [90]:
test_df[test_df["gene_symbol"]=="Unknown"]

Unnamed: 0,ASSAY_ID,assay_chembl_id,target_chembl_id,target_type,target_name,organism,gene_symbol
34,1495405,CHEMBL3562136,CHEMBL612545,UNCHECKED,Unchecked,,Unknown
118,737053,CHEMBL1738598,CHEMBL612545,UNCHECKED,Unchecked,,Unknown
195,845206,CHEMBL2114817,CHEMBL4377,SINGLE PROTEIN,"Guanine nucleotide-binding protein G(s), subun...",Homo sapiens,Unknown


In [88]:
import pandas as pd
import networkx as nx

# Create an undirected graph using NetworkX
G = nx.Graph()

# Add edges to the graph from the DataFrame
for index, row in interactions.iterrows():
    G.add_edge(row['#node1'], row['node2'])

# Function to find all nodes within 2 steps (including itself)
def find_nodes_within_two_steps(graph, gene):
    return set(nx.single_source_shortest_path_length(graph, gene, cutoff=2).keys())

# Example: Given a list of genes
genes_of_interest  = test_df.gene_symbol.unique()

# Find nodes within 2 steps for each gene in the list
results = {}
for gene in genes_of_interest:
    if gene in G:
        results[gene] = find_nodes_within_two_steps(G, gene)
    else:
        results[gene] = None  # If the gene isn't in the graph

# Print the results
for gene, neighbors in results.items():
    print(f"Genes within 2 steps of {gene}: {neighbors}")


Genes within 2 steps of ampC: None
Genes within 2 steps of TRAP: None
Genes within 2 steps of CYP2C9: {'SMN1', 'CYP2C9', 'AASDH', 'BRCA1', 'CYP2D6', 'APOBEC3B', 'POLL', 'KAT2A', 'MAPT', 'BAP1', 'SENP3', 'KDM4A', 'NFE2L2', 'CCT4', 'GLS', 'SMAD7', 'F3', 'NR1I2', 'WRN', 'LMNA', 'SNCA', 'MTOR', 'SLC2A1', 'TNRC6A', 'CYP27B1', 'XBP1', 'CFH', 'ALDH1A1', 'UBE2E1', 'POLI', 'FEN1', 'NPC1', 'PADI4', 'DDIT4', 'ACP5', 'CBX6', 'TP53', 'PLK1', 'TXNRD1', 'EGLN3', 'GCG', 'ALOX15B', 'CYP1A2', 'NAT2', 'HSPB1', 'TSPO', 'CYP2C19', 'CYP3A4'}
Genes within 2 steps of CYP2C19: {'CYP2C9', 'AASDH', 'CYP2D6', 'NFE2L2', 'F3', 'NR1I2', 'SNCA', 'MTOR', 'CYP27B1', 'ALDH1A1', 'POLI', 'NPC1', 'TP53', 'GCG', 'ALOX15B', 'CYP1A2', 'NAT2', 'TSPO', 'CYP2C19', 'CYP3A4'}
Genes within 2 steps of CYP3A4: {'SMN1', 'CYP2C9', 'AASDH', 'BRCA1', 'CYP2D6', 'APOBEC3B', 'POLL', 'KAT2A', 'MAPT', 'BAP1', 'SENP3', 'KDM4A', 'NFE2L2', 'CCT4', 'GLS', 'SMAD7', 'F3', 'NR1I2', 'WRN', 'LMNA', 'SNCA', 'MTOR', 'SLC2A1', 'TNRC6A', 'CYP27B1', 'XBP1'

In [81]:
import networkx as nx

# Create an undirected graph using NetworkX
G = nx.Graph()

# Add edges to the graph from the DataFrame
for index, row in interactions.iterrows():
    G.add_edge(row['#node1'], row['node2'])

# Function to find all nodes within 2 steps (including itself)
def find_nodes_within_two_steps(graph, gene):
    return set(nx.single_source_shortest_path_length(graph, gene, cutoff=2).keys())

# Example: Given a list of genes
genes_of_interest = test_df.gene_symbol.unique()

# Create an empty set to hold all unique genes within 2 steps
combined_genes_within_two_steps = set()

# Find nodes within 2 steps for each gene in the list and combine into one set
for gene in genes_of_interest:
    if gene in G:
        combined_genes_within_two_steps.update(find_nodes_within_two_steps(G, gene))

# Convert the set to a sorted list (optional, for easier reading)
combined_genes_list = sorted(list(combined_genes_within_two_steps))

# Print the combined result
print("Combined list of genes within 2 steps of all genes of interest:")
print(combined_genes_list)


Combined list of genes within 2 steps of all genes of interest:
['AASDH', 'ACP5', 'ALDH1A1', 'ALOX15B', 'APOBEC3B', 'ATAD2', 'ATXN2L', 'BAP1', 'BAZ2B', 'BRCA1', 'CBX6', 'CCT4', 'CFH', 'CLPP', 'CYP1A2', 'CYP27B1', 'CYP2C19', 'CYP2C9', 'CYP2D6', 'CYP3A4', 'DDIT4', 'EGLN3', 'F3', 'FEN1', 'GCG', 'GLS', 'GMNN', 'HSPB1', 'KAT2A', 'KDM4A', 'LMNA', 'MAPT', 'MTOR', 'NAT2', 'NFE2L2', 'NPC1', 'NR1I2', 'NUP98', 'PADI4', 'PLK1', 'POLI', 'POLL', 'RAN', 'RGS9', 'SENP3', 'SLC2A1', 'SMAD7', 'SMN1', 'SNCA', 'SNRPF', 'TARDBP', 'TDP2', 'TNRC6A', 'TP53', 'TRAK2', 'TSPO', 'TXNRD1', 'U2SURP', 'UBE2E1', 'UCHL3', 'WIPI1', 'WRN', 'XBP1']


In [82]:
combined_genes_list

['AASDH',
 'ACP5',
 'ALDH1A1',
 'ALOX15B',
 'APOBEC3B',
 'ATAD2',
 'ATXN2L',
 'BAP1',
 'BAZ2B',
 'BRCA1',
 'CBX6',
 'CCT4',
 'CFH',
 'CLPP',
 'CYP1A2',
 'CYP27B1',
 'CYP2C19',
 'CYP2C9',
 'CYP2D6',
 'CYP3A4',
 'DDIT4',
 'EGLN3',
 'F3',
 'FEN1',
 'GCG',
 'GLS',
 'GMNN',
 'HSPB1',
 'KAT2A',
 'KDM4A',
 'LMNA',
 'MAPT',
 'MTOR',
 'NAT2',
 'NFE2L2',
 'NPC1',
 'NR1I2',
 'NUP98',
 'PADI4',
 'PLK1',
 'POLI',
 'POLL',
 'RAN',
 'RGS9',
 'SENP3',
 'SLC2A1',
 'SMAD7',
 'SMN1',
 'SNCA',
 'SNRPF',
 'TARDBP',
 'TDP2',
 'TNRC6A',
 'TP53',
 'TRAK2',
 'TSPO',
 'TXNRD1',
 'U2SURP',
 'UBE2E1',
 'UCHL3',
 'WIPI1',
 'WRN',
 'XBP1']

In [83]:
selectedtraintasks = train_df[train_df["gene_symbol"].isin(combined_genes_list)]["ASSAY_ID"].to_list()
selectedtraintasks

['737824',
 '688422',
 '688810',
 '688724',
 '688739',
 '688238',
 '688157',
 '688360',
 '688653',
 '688422_1',
 '688238_1',
 '737822',
 '737823_1',
 '1301435',
 '737357',
 '809027',
 '809031',
 '809032',
 '809035',
 '809061',
 '809079',
 '809113',
 '809292',
 '809302',
 '809316',
 '809322',
 '809324',
 '809330',
 '809426',
 '737262',
 '737390',
 '737452',
 '737421',
 '752533',
 '737173',
 '737407',
 '736963',
 '752328',
 '752411',
 '752563',
 '752477',
 '845045',
 '845102',
 '845177',
 '845185',
 '845186',
 '845225',
 '954305',
 '1301428',
 '1301717']

In [84]:
import pickle

# Save the list to a file
with open("selectedtraintasks.pkl", "wb") as f:
    pickle.dump(selectedtraintasks, f)