In [6]:
import pandas as pd
import requests


def get_targets(aid_list, timeout=60):
    """Fetch target annotations from PubChem PUG REST for a collection of AIDs.

    Parameters
    ----------
    aid_list : iterable
        Assay identifiers (AIDs); each element is converted with ``str``.
    timeout : float, optional
        Value passed to ``requests.post`` as ``timeout`` (seconds) so the
        call cannot block forever. Defaults to 60.

    Returns
    -------
    The decoded JSON body of the response (``response.json()``).

    Raises
    ------
    ValueError
        If ``aid_list`` contains no identifiers.
    requests.HTTPError
        If the server answers with an error status.
    """
    # convert the identifiers to str so they can be joined into one field
    aid_strings = [str(aid) for aid in aid_list]
    if not aid_strings:
        raise ValueError("aid_list must contain at least one AID")

    # base URL for the PubChem POST request
    url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/targets/ProteinGI,ProteinName,GeneID,GeneSymbol/json"

    # Sent as a regular form-encoded body; the previously defined (and never
    # used) multipart Content-Type header did not match this payload.
    data = {'aid': ','.join(aid_strings)}

    response = requests.post(url, data=data, timeout=timeout)
    # Fail loudly on an error status instead of handing an error body to the caller.
    response.raise_for_status()

    return response.json()

In [7]:
# Read the AID file; it has no header row, so the column is named 'AIDS' here.
dr_aids = pd.read_table(
    'dr_aids.txt',
    names=['AIDS'],
    header=None,
)

In [9]:
# Request target annotations for every AID in the table.
target_json = get_targets(dr_aids['AIDS'].to_numpy())

# The per-assay records sit under InformationList -> Information.
information_records = target_json['InformationList']['Information']
targets = pd.DataFrame(information_records)

targets.head()

Unnamed: 0,AID,GI,GeneID,ProteinName,GeneSymbol
0,1508648,,,,
1,1508647,[130339],[24660],[Peripheral myelin protein 22],[Pmp22]
2,1508645,,,,
3,1508644,[130339],[24660],[Peripheral myelin protein 22],[Pmp22]
4,1508642,,,,


Each target column holds a list, because a single assay can have multiple targets (AID 743473 below is an example).  The pandas `explode` method is useful here: it expands a column of lists into one row per list element.

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.explode.html

In [33]:
# Example of an assay annotated with multiple targets.

targets[targets['AID'] == 743473]

Unnamed: 0,AID,GI,GeneID,ProteinName,GeneSymbol
506,743473,"[51831776, 119587431, 119587435, 408684256, 51...","[4224, 4225, 4318, 4313, 4317, 4319, 4322, 432...","[MEP1A protein, matrix metalloproteinase 13 (c...","[MEP1A, MEP1B, MMP9, MMP2, MMP8, MMP10, MMP13,..."


In [20]:
# Explode each list-valued target column on its own: one frame per column,
# each holding AID plus one row per list element of that column.
target_columns = ['GI', 'GeneID', 'ProteinName', 'GeneSymbol']
targets_by_aid = targets.set_index('AID')

dataframes = [
    targets_by_aid[column].explode().reset_index()
    for column in target_columns
]

In [24]:
from functools import reduce


def _merge_on_aid(left, right):
    """Join two exploded frames on their shared AID column."""
    return left.merge(right, on='AID')


# Fold the per-column frames into one table, then persist it.
# NOTE(review): AID repeats inside each exploded frame, so joining on AID
# alone is a many-to-many merge: an assay with k targets ends up with every
# combination of its GI / GeneID / ProteinName / GeneSymbol values (k**4 rows).
# Row-based counts on targets_clean are inflated accordingly - confirm whether
# this is intended before relying on them.
targets_clean = reduce(_merge_on_aid, dataframes)
targets_clean.to_csv('data/target_info.csv')
targets_clean.head()

Unnamed: 0,AID,GI,GeneID,ProteinName,GeneSymbol
0,1508648,,,,
1,1508647,130339.0,24660.0,Peripheral myelin protein 22,Pmp22
2,1508645,,,,
3,1508644,130339.0,24660.0,Peripheral myelin protein 22,Pmp22
4,1508642,,,,


In [27]:
# Number of distinct assays present in the merged table.
targets_clean['AID'].nunique()

3977

In [28]:
# Row count per gene symbol in the merged table.
# NOTE(review): this counts rows, not assays; the AID merge that builds
# targets_clean repeats rows for multi-target assays, so these numbers can
# greatly exceed the number of assays per gene.
target_counts = targets_clean.GeneSymbol.value_counts()
target_counts.head()

MMP14    732
MMP2     731
MMP8     729
MEP1B    729
MMP13    729
Name: GeneSymbol, dtype: int64

In [36]:
# Group the merged rows by gene symbol for the per-gene summaries that follow.
gene_groups = targets_clean.groupby(by='GeneSymbol')

Number of unique AIDs for each target gene (grouped by `GeneSymbol`):

In [38]:
# For each gene symbol, count the distinct AIDs that list it.
aid_gene_counts = gene_groups.AID.nunique()

In [39]:
# Show the per-gene assay counts from fewest to most.
aid_gene_counts.sort_values(ascending=True)

GeneSymbol
LDHA             1
HHV4tp2_gp31     1
HSD17B4          1
HSP90AB1         1
HSPA1A           1
                ..
PKM             20
CASP3           20
LOC116160065    21
Scarb1          23
Kcnq2           26
Name: AID, Length: 607, dtype: int64

In [40]:
# One entry per gene symbol, so the Series length is the number of genes.
print(f"There are {aid_gene_counts.shape[0]} unique genes")

There are 607 unique genes


In [45]:
# aid_gene_counts holds the number of distinct AIDs per gene symbol, so this
# keeps the genes that are listed as a target by more than one assay.
aid_gene_filtered = aid_gene_counts[aid_gene_counts > 1]
# The message names what is actually counted: AIDs per gene, not targets per gene.
print(f"There are {len(aid_gene_filtered)} genes that are targeted by more than 1 AID")

There are 442 genes that map to more than 1 target
