# ***RUN NOTEBOOK IN ORDER***

# Dealing with multicomponent compounds

# Data Cleaning Full (antag)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from requests import exceptions
import re
%matplotlib inline

import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup

#hiding warning messages
import warnings
warnings.filterwarnings("ignore")


Read the antagonist assay data into a data frame

In [2]:
# Load the data table exported from PubChem for the antagonist assay (AID 720725).
# The first three rows are skipped; presumably they are PubChem's result-descriptor
# rows rather than compound records -- TODO confirm against the CSV.
antag_pre = pd.read_csv('AID_720725_datatable_all.csv').iloc[3:]
antag_pre.tail(3)

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity Summary,Ratio Activity,Ratio Potency (uM),...,530 nm Activity,530 nm Potency (uM),530 nm Efficacy (%),460 nm Activity,460 nm Potency (uM),460 nm Efficacy (%),Viability Activity,Viability Potency (uM),Viability Efficacy (%),Sample Source
10486,10484,144214047.0,5794.0,Inactive,0.0,,,inactive,inactive,,...,inactive,,0,inactive,,0.0,inactive,,0,LightBiologicals
10487,10485,144214048.0,3034285.0,Inconclusive,10.0,,,inconclusive agonist,inconclusive agonist,23.4434,...,inactive,,0,inconclusive agonist,6.53319,77.9241,inactive,,0,LightBiologicals
10488,10486,144214049.0,6623.0,Active,48.0,,,active antagonist,active antagonist,40.0017,...,inactive,,0,inconclusive antagonist,40.3064,-89.3918,inactive,,0,SIGMA


In [3]:
# Keep only the compound identifier, the two activity calls, and the
# ratio-channel potency/efficacy columns.
antag_columns = [
    'PUBCHEM_CID',
    'PUBCHEM_ACTIVITY_OUTCOME',
    'Activity Summary',
    'Ratio Potency (uM)',
    'Ratio Efficacy (%)',
]
antag_pre = antag_pre[antag_columns]
antag_pre.tail(3)

Unnamed: 0,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,Activity Summary,Ratio Potency (uM),Ratio Efficacy (%)
10486,5794.0,Inactive,inactive,,0.0
10487,3034285.0,Inconclusive,inconclusive agonist,23.4434,91.4381
10488,6623.0,Active,active antagonist,40.0017,-90.7806


Deal with the NULL values: remove the rows without CIDs.

In [4]:
# Count missing values per column to see which columns need cleaning.
antag_pre.isnull().sum()

PUBCHEM_CID                  149
PUBCHEM_ACTIVITY_OUTCOME       0
Activity Summary               0
Ratio Potency (uM)          8433
Ratio Efficacy (%)             0
dtype: int64

In [5]:
# Rows without a CID cannot be matched to a compound, so discard them.
antag_pre = antag_pre.dropna(subset=['PUBCHEM_CID'])
antag_pre.shape

(10337, 5)

Convert the PUBCHEM_CID (floats) into an integer type.

In [6]:
# With the missing CIDs gone, the identifier column can be held as integers.
antag_pre = antag_pre.astype({'PUBCHEM_CID': int})
antag_pre.head(3)

Unnamed: 0,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,Activity Summary,Ratio Potency (uM),Ratio Efficacy (%)
3,12850184,Inactive,inactive,,0.0
4,89753,Inactive,inactive,,0.0
5,9403,Active,active antagonist,7.56242,-104.384


In [7]:
# Distribution of the detailed activity calls among the rows that have a CID.
antag_pre['Activity Summary'].value_counts()

inactive                               7640
inconclusive                           1025
active antagonist                       451
inconclusive antagonist (cytotoxic)     397
active agonist                          268
inconclusive agonist                    257
inconclusive antagonist                 242
inconclusive agonist (cytotoxic)         57
Name: Activity Summary, dtype: int64

In [8]:
# Distribution of the coarse PubChem outcome labels for the same rows.
antag_pre['PUBCHEM_ACTIVITY_OUTCOME'].value_counts()

Inactive        7640
Inconclusive    2246
Active           451
Name: PUBCHEM_ACTIVITY_OUTCOME, dtype: int64

In [9]:
# Re-check the frame size before querying PubChem for covalent unit counts.
antag_pre.shape

(10337, 5)

In [10]:
# Preview the cleaned antagonist frame.
antag_pre.head(3)

Unnamed: 0,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,Activity Summary,Ratio Potency (uM),Ratio Efficacy (%)
3,12850184,Inactive,inactive,,0.0
4,89753,Inactive,inactive,,0.0
5,9403,Active,active antagonist,7.56242,-104.384


# Identifying Covalent Unit Count (antag)

Compute the number of CID chunks based on the number of CIDs and the chunk size.

In [11]:
chunk_size = 200    # number of CIDs sent per PUG-REST request
num_cids = len(antag_pre['PUBCHEM_CID'])

# Ceiling division: a partially filled last chunk still needs its own request.
full_chunks, leftover = divmod(num_cids, chunk_size)
num_chunks = full_chunks + 1 if leftover else full_chunks

print("# CIDs = ", num_cids)
print("# CID Chunks = ", num_chunks, "(chunked by ", chunk_size, ")")

# CIDs =  10337
# CID Chunks =  52 (chunked by  200 )


Retrieve the covalent unit counts for each chunk of CIDs.

In [12]:
import time

frames = []  # one single-column data frame per PUG-REST request

for i in range(0, num_chunks) :

    idx1 = chunk_size * i
    idx2 = chunk_size * (i + 1)

    # .iloc makes the positional intent explicit: the frame's index labels
    # no longer start at 0 after the earlier row removals.
    chunk_cids = antag_pre['PUBCHEM_CID'].iloc[idx1:idx2]
    cidstr = ",".join(chunk_cids.astype(str).tolist())

    url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + cidstr + '/property/CovalentUnitCount/TXT')

    # The context manager closes the HTTP response once the body has been parsed.
    with urllib.request.urlopen(url) as response:
        data = pd.read_csv( response, header=None, names=['cov_units'] )

    # The counts are attached to the assay rows by position later on, so a
    # short or long reply would silently misalign them; stop here instead.
    if len(data) != len(chunk_cids) :
        raise ValueError("Chunk %d: requested %d CIDs but received %d covalent unit counts"
                         % (i, len(chunk_cids), len(data)))

    frames.append(data)

    # Uncomment this line to check the progress
    #print(i, idx1, idx2, len(data), sep="\t")

    if (i % 5 == 0 ) :
        time.sleep(1)  # To comply with the PubChem usage policy (no more than 5 requests per sec)

cov_units = pd.concat(frames,ignore_index=True)
print(cov_units.shape)

(10337, 1)


# Dealing with multicomponent molecules (antag)

In [13]:
# Assign by position (plain list) rather than by index: cov_units is indexed
# from 0, whereas antag_pre kept its original row labels.
antag_pre['CovalentUnitCount'] = list(cov_units['cov_units'])
antag_pre['CovalentUnitCount'].isnull().sum()

0

In [14]:
# Distribution of covalent unit counts; rows with a count above 1 are handled as multicomponent below.
antag_pre['CovalentUnitCount'].value_counts()

1     8308
2     1560
3      348
5       49
4       43
6       10
9        6
7        5
12       2
11       2
8        2
10       1
41       1
Name: CovalentUnitCount, dtype: int64

In [15]:
# CIDs of every row whose compound consists of more than one covalent unit.
is_multicompo = antag_pre['CovalentUnitCount'] > 1
cid_multicompo = antag_pre.loc[is_multicompo, 'PUBCHEM_CID'].tolist()
len(cid_multicompo)

2029

The multicomponent CID list has some duplicate CIDs.  Removing them will reduce the number of PUG-REST requests later.

In [16]:
from collections import Counter

# Tally the occurrences of every multicomponent CID in a single pass
# (calling list.count() once per CID rescans the whole list each time).
cid_occurrences = Counter(cid_multicompo)

cid_multicompo_uniq = sorted(cid_occurrences)
cid_multicompo_dup = [ mycid for mycid in cid_multicompo_uniq if cid_occurrences[mycid] > 1 ]

print(len(cid_multicompo_dup))
print(cid_multicompo_dup)

# From here on only the sorted, de-duplicated list is used.
cid_multicompo = cid_multicompo_uniq

238
[2767, 5795, 5807, 5935, 5946, 5963, 5974, 6014, 6099, 8152, 8478, 8638, 8667, 8691, 8715, 8722, 8753, 8813, 8816, 8870, 8980, 9279, 9280, 9351, 9373, 9409, 9703, 9787, 10866, 11048, 11057, 11065, 11224, 11313, 11545, 11693, 11933, 12447, 12456, 12484, 12525, 13144, 13266, 13506, 14184, 14250, 14710, 14842, 15106, 16013, 16015, 16230, 16961, 17170, 17730, 18340, 18487, 19379, 19458, 19518, 19604, 22420, 22584, 22960, 22985, 23392, 23394, 24434, 24482, 24502, 24633, 24639, 26041, 26758, 27461, 27503, 27872, 31202, 31280, 32731, 33286, 33557, 36605, 38852, 39424, 44072, 54891, 54900, 54911, 55182, 56704, 60496, 60560, 60714, 60754, 60822, 60934, 60962, 61100, 61444, 62311, 62581, 62655, 62882, 62884, 62935, 64142, 64927, 66245, 68589, 68624, 71412, 71587, 75311, 83823, 90473, 92151, 92965, 102428, 107882, 108005, 108938, 135242, 155434, 161803, 164457, 166033, 235227, 441244, 441308, 441325, 441337, 441374, 443939, 516871, 516919, 517121, 517326, 517383, 517414, 517546, 522325, 53372

Because downloading parent compound information through PUG-REST takes a long time, we want to store the data in a file when they are downloaded for later use. When the cell is run next time, it will read the data from the file, rather than downloading them from PubChem.

In [17]:
import os.path
import json
import time
import urllib.error

cid_parent= {}
file_parent = 'cid_parent.json'   # local cache of the CID -> parent CID mapping
max_tries = 3                     # attempts per CID before a transient error is given up on

if os.path.isfile(file_parent) and os.path.getsize(file_parent) :

    # A non-empty cache file exists: reuse it instead of querying PubChem again.
    with open(file_parent) as f:
        cid_parent = json.load(f)

    # By default, the keys will be loaded as strings, but
    # they are expected to be integers later in the notebook.
    cid_parent = { int(k):v for k,v in cid_parent.items()}

    if ( len(cid_parent) != len(cid_multicompo) ) :
        print("CID counts mismatch:", len(cid_multicompo), "cid_multicompo", len(cid_parent), "cid_parent")

else:

    cid_failed = []   # CIDs whose lookup kept failing for a reason other than "no parent"

    for i, mycid in enumerate(cid_multicompo) :

        url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str(mycid) + '/cids/TXT?cids_type=parent')
        cid_parent[ mycid ] = None   # stays None unless a parent CID is parsed below

        for attempt in range(1, max_tries + 1) :

            last_error = None

            try:
                with urllib.request.urlopen(url) as res:
                    cid_parent[ mycid ] = int(res.read().decode().rstrip())

            except ValueError:
                # The reply is not a single integer: no usable parent CID.
                pass

            except urllib.error.HTTPError as err:
                # A client-side error (other than 429 "too many requests") will not
                # change on retry. NOTE(review): presumably this is how PubChem
                # reports "no parent compound" -- confirm against the PUG-REST docs.
                if not ( 400 <= err.code < 500 and err.code != 429 ) :
                    last_error = err

            except OSError as err:
                # Connection problems, timeouts, ...: worth another attempt.
                last_error = err

            if last_error is None : break
            if attempt < max_tries : time.sleep(attempt)

        if last_error is not None :
            cid_failed.append(mycid)
            print("Warning: lookup failed for CID", mycid, "after", max_tries, "attempts:", last_error)

        if (i % 5 == 0 ) : time.sleep(1)
        if (i % 200 == 0 ) : print("processing ", i , "of ", len(cid_multicompo))

    print("# Download of parent CIDs has been complete")

    if cid_failed :
        # Do not cache: a transient failure would otherwise be stored as
        # "no parent" and silently drop those compounds on every later run.
        print("Warning:", len(cid_failed), "CID(s) could not be resolved; the results were not cached in", file_parent)
    else :
        with open(file_parent, 'w') as f:
            json.dump(cid_parent, f)

print("# len(cid_parent):", len(cid_parent) )

# len(cid_parent): 1748


In [18]:
# Collect the CIDs for which no parent compound was recorded.
cid_noparent = []
for key, val in cid_parent.items() :
    if not val :
        cid_noparent.append(key)
print(len(cid_noparent))

207


In [19]:
# Replace the CID of every multicomponent row (more than one covalent unit)
# with the CID of its parent compound, in one vectorized step.
is_multicompo = antag_pre['CovalentUnitCount'] > 1

# Every multicomponent CID must have an entry in cid_parent (possibly None);
# stop with a clear error if the lookup table does not cover the frame.
cid_unmapped = set(antag_pre.loc[is_multicompo, 'PUBCHEM_CID']) - set(cid_parent)
if cid_unmapped :
    raise KeyError("No cid_parent entry for CID(s): " + str(sorted(cid_unmapped)[:10]))

# CIDs whose parent is None map to NaN; those rows are removed in a later cell.
parent_cids = antag_pre['PUBCHEM_CID'].map(cid_parent)
antag_pre['PUBCHEM_CID'] = antag_pre['PUBCHEM_CID'].mask(is_multicompo, parent_cids)

In [20]:
# Preview after the substitution (multicomponent rows now carry their parent CID).
antag_pre.head(3)

Unnamed: 0,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,Activity Summary,Ratio Potency (uM),Ratio Efficacy (%),CovalentUnitCount
3,5460352.0,Inactive,inactive,,0.0,3
4,10690.0,Inactive,inactive,,0.0,3
5,9403.0,Active,active antagonist,7.56242,-104.384,1


In [21]:
# Verify the substitution: none of the original multicomponent CIDs
# should still appear in the CID column.
count_no_sub = 0
for mycid in cid_multicompo :
    n_left = int( (antag_pre['PUBCHEM_CID'] == mycid).sum() )
    if n_left :
        count_no_sub += 1
        print("Warning:", mycid, "is still found", n_left)

if count_no_sub :
    print("Warning:", count_no_sub, "compound(s) have not been replaced")
else :
    print("All multicomponent compounds have been replaced!")

All multicomponent compounds have been replaced!


In [22]:
# Drop the rows left without a CID by the substitution (no parent compound),
# then restore the integer dtype of the identifier column.
antag_pre = antag_pre.dropna(subset=['PUBCHEM_CID']).astype({'PUBCHEM_CID': int})
antag_pre.shape

(10101, 6)

In [23]:
# Preview after removing rows without a parent CID and casting the CIDs back to integers.
antag_pre.head(3)

Unnamed: 0,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,Activity Summary,Ratio Potency (uM),Ratio Efficacy (%),CovalentUnitCount
3,5460352,Inactive,inactive,,0.0,3
4,10690,Inactive,inactive,,0.0,3
5,9403,Active,active antagonist,7.56242,-104.384,1


# Dealing with conflicts (antag)

In [24]:
# Re-check missing values per column after the parent-CID substitution.
antag_pre.isnull().sum()

PUBCHEM_CID                    0
PUBCHEM_ACTIVITY_OUTCOME       0
Activity Summary               0
Ratio Potency (uM)          8119
Ratio Efficacy (%)             0
CovalentUnitCount              0
dtype: int64

In [25]:
# Outcome distribution after the parent-CID substitution.
antag_pre['PUBCHEM_ACTIVITY_OUTCOME'].value_counts()

Inactive        7449
Inconclusive    2205
Active           447
Name: PUBCHEM_ACTIVITY_OUTCOME, dtype: int64

In [26]:
# Detailed activity-call distribution after the parent-CID substitution.
antag_pre['Activity Summary'].value_counts()

inactive                               7449
inconclusive                           1008
active antagonist                       447
inconclusive antagonist (cytotoxic)     386
active agonist                          266
inconclusive agonist                    252
inconclusive antagonist                 237
inconclusive agonist (cytotoxic)         56
Name: Activity Summary, dtype: int64

In [27]:
# Peek at the first CIDs; a value that appears more than once means one compound has several rows.
antag_pre[ 'PUBCHEM_CID'].head(10)

3      5460352
4        10690
5         9403
6     13218779
8        16043
9        16043
10       11295
11     2724372
12      637566
13         994
Name: PUBCHEM_CID, dtype: int32

In [28]:
# Spot check: the distinct activity calls recorded for one CID (994).
set(antag_pre.loc[antag_pre['PUBCHEM_CID'] == 994, 'Activity Summary'])

{'inactive'}

In [29]:
# CIDs of all remaining rows (the list may contain repeated CIDs).
antag_cid = antag_pre['PUBCHEM_CID'].tolist()

# Drop the CIDs with conflicting activity declaration: count the distinct
# 'Activity Summary' values per CID once, broadcast the count back to the rows,
# and keep only the rows whose CID has a single label.
# (The column has no missing values at this point, so ignoring NaN is harmless.)
n_labels_per_cid = antag_pre.groupby('PUBCHEM_CID')['Activity Summary'].transform('nunique')
antag_pre = antag_pre[ n_labels_per_cid == 1 ]

In [30]:
# Preview after removing CIDs with conflicting activity calls.
antag_pre.head(3)

Unnamed: 0,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,Activity Summary,Ratio Potency (uM),Ratio Efficacy (%),CovalentUnitCount
3,5460352,Inactive,inactive,,0.0,3
4,10690,Inactive,inactive,,0.0,3
5,9403,Active,active antagonist,7.56242,-104.384,1


In [31]:
# Frame size after removing CIDs with conflicting activity calls.
antag_pre.shape

(8816, 6)

In [32]:
# Activity-call distribution after removing CIDs with conflicting activity calls.
antag_pre['Activity Summary'].value_counts()

inactive                               6949
inconclusive                            681
active antagonist                       341
inconclusive antagonist (cytotoxic)     289
active agonist                          222
inconclusive agonist                    153
inconclusive antagonist                 143
inconclusive agonist (cytotoxic)         38
Name: Activity Summary, dtype: int64

# Data Cleaning Full (ag)

Read the agonist assay data into a data frame

In [33]:
# Load the data table exported from PubChem for the agonist assay (AID 720719).
# The first three rows are skipped; presumably they are PubChem's result-descriptor
# rows rather than compound records -- TODO confirm against the CSV.
ag_pre = pd.read_csv('AID_720719_datatable_all.csv').iloc[3:]
ag_pre.tail(3)

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity Summary,Ratio Activity,Ratio Potency (uM),Ratio Efficacy (%),530 nm Activity,530 nm Potency (uM),530 nm Efficacy (%),460 nm Activity,460 nm Potency (uM),460 nm Efficacy (%),Blue (460 nm) auto fluorescence outcome,Sample Source
10487,10484,144214047.0,5794.0,Inactive,0.0,,,inactive,inactive,,0,inactive,,0,inactive,,0,inactive,LightBiologicals
10488,10485,144214048.0,3034285.0,Inactive,0.0,,,inactive,inactive,,0,inactive,,0,inactive,,0,inactive,LightBiologicals
10489,10486,144214049.0,6623.0,Inactive,0.0,,,inactive,inactive,,0,inactive,,0,inactive,,0,inactive,SIGMA


In [34]:
# Keep only the compound identifier, the two activity calls, and the
# ratio-channel potency/efficacy columns.
ag_columns = [
    'PUBCHEM_CID',
    'PUBCHEM_ACTIVITY_OUTCOME',
    'Activity Summary',
    'Ratio Potency (uM)',
    'Ratio Efficacy (%)',
]
ag_pre = ag_pre[ag_columns]
ag_pre.tail(3)

Unnamed: 0,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,Activity Summary,Ratio Potency (uM),Ratio Efficacy (%)
10487,5794.0,Inactive,inactive,,0
10488,3034285.0,Inactive,inactive,,0
10489,6623.0,Inactive,inactive,,0


Deal with the NULL values: remove the rows without CIDs.

In [35]:
# Count missing values per column to see which columns need cleaning.
ag_pre.isnull().sum()

PUBCHEM_CID                  150
PUBCHEM_ACTIVITY_OUTCOME       1
Activity Summary               1
Ratio Potency (uM)          9685
Ratio Efficacy (%)             1
dtype: int64

In [36]:
# Rows without a CID cannot be matched to a compound, so discard them.
ag_pre = ag_pre.dropna(subset=['PUBCHEM_CID'])
ag_pre.shape

(10337, 5)

Convert the PUBCHEM_CID (floats) into an integer type.

In [37]:
# With the missing CIDs gone, the identifier column can be held as integers.
ag_pre = ag_pre.astype({'PUBCHEM_CID': int})
ag_pre.head(3)

Unnamed: 0,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,Activity Summary,Ratio Potency (uM),Ratio Efficacy (%)
4,12850184,Inactive,inactive,,0
5,89753,Inactive,inactive,,0
6,9403,Inconclusive,inconclusive,,0


In [38]:
# Distribution of the detailed activity calls among the rows that have a CID.
ag_pre['Activity Summary'].value_counts()

inactive                              9043
inconclusive                           882
active agonist                         211
inconclusive agonist                    81
active antagonist                       53
inconclusive antagonist                 44
inconclusive agonist (fluorescent)      23
Name: Activity Summary, dtype: int64

In [39]:
# Distribution of the coarse PubChem outcome labels for the same rows.
ag_pre['PUBCHEM_ACTIVITY_OUTCOME'].value_counts()

Inactive        9043
Inconclusive    1083
Active           211
Name: PUBCHEM_ACTIVITY_OUTCOME, dtype: int64

In [40]:
# Re-check the frame size before querying PubChem for covalent unit counts.
ag_pre.shape

(10337, 5)

In [41]:
# Preview the cleaned agonist frame.
ag_pre.head(3)

Unnamed: 0,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,Activity Summary,Ratio Potency (uM),Ratio Efficacy (%)
4,12850184,Inactive,inactive,,0
5,89753,Inactive,inactive,,0
6,9403,Inconclusive,inconclusive,,0


# Identifying Covalent Unit Count (ag)

Compute the number of CID chunks based on the number of CIDs and the chunk size.

In [42]:
chunk_size = 200    # number of CIDs sent per PUG-REST request
num_cids = len(ag_pre['PUBCHEM_CID'])

# Ceiling division: a partially filled last chunk still needs its own request.
full_chunks, leftover = divmod(num_cids, chunk_size)
num_chunks = full_chunks + 1 if leftover else full_chunks

print("# CIDs = ", num_cids)
print("# CID Chunks = ", num_chunks, "(chunked by ", chunk_size, ")")

# CIDs =  10337
# CID Chunks =  52 (chunked by  200 )


Retrieve the covalent unit counts for each chunk of CIDs.

In [43]:
import time

frames = []  # one single-column data frame per PUG-REST request

for i in range(0, num_chunks) :

    idx1 = chunk_size * i
    idx2 = chunk_size * (i + 1)

    # .iloc makes the positional intent explicit: the frame's index labels
    # no longer start at 0 after the earlier row removals.
    chunk_cids = ag_pre['PUBCHEM_CID'].iloc[idx1:idx2]
    cidstr = ",".join(chunk_cids.astype(str).tolist())

    url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + cidstr + '/property/CovalentUnitCount/TXT')

    # The context manager closes the HTTP response once the body has been parsed.
    with urllib.request.urlopen(url) as response:
        data = pd.read_csv( response, header=None, names=['cov_units'] )

    # The counts are attached to the assay rows by position later on, so a
    # short or long reply would silently misalign them; stop here instead.
    if len(data) != len(chunk_cids) :
        raise ValueError("Chunk %d: requested %d CIDs but received %d covalent unit counts"
                         % (i, len(chunk_cids), len(data)))

    frames.append(data)

    # Uncomment this line to check the progress
    #print(i, idx1, idx2, len(data), sep="\t")

    if (i % 5 == 0 ) :
        time.sleep(1)  # To comply with the PubChem usage policy (no more than 5 requests per sec)

cov_units = pd.concat(frames,ignore_index=True)
print(cov_units.shape)

(10337, 1)


# Dealing with Multicomponent Molecules (ag)

In [44]:
# Assign by position (plain list) rather than by index: cov_units is indexed
# from 0, whereas ag_pre kept its original row labels.
ag_pre['CovalentUnitCount'] = list(cov_units['cov_units'])
ag_pre['CovalentUnitCount'].isnull().sum()

0

In [45]:
# Distribution of covalent unit counts; rows with a count above 1 are handled as multicomponent below.
ag_pre['CovalentUnitCount'].value_counts()

1     8308
2     1560
3      348
5       49
4       43
6       10
9        6
7        5
12       2
11       2
8        2
10       1
41       1
Name: CovalentUnitCount, dtype: int64

In [46]:
# CIDs of every row whose compound consists of more than one covalent unit.
is_multicompo = ag_pre['CovalentUnitCount'] > 1
cid_multicompo = ag_pre.loc[is_multicompo, 'PUBCHEM_CID'].tolist()
len(cid_multicompo)

2029

The multicomponent CID list has some duplicate CIDs.  Removing them will reduce the number of PUG-REST requests later.

In [47]:
from collections import Counter

# Tally the occurrences of every multicomponent CID in a single pass
# (calling list.count() once per CID rescans the whole list each time).
cid_occurrences = Counter(cid_multicompo)

cid_multicompo_uniq = sorted(cid_occurrences)
cid_multicompo_dup = [ mycid for mycid in cid_multicompo_uniq if cid_occurrences[mycid] > 1 ]

print(len(cid_multicompo_dup))
print(cid_multicompo_dup)

# From here on only the sorted, de-duplicated list is used.
cid_multicompo = cid_multicompo_uniq

238
[2767, 5795, 5807, 5935, 5946, 5963, 5974, 6014, 6099, 8152, 8478, 8638, 8667, 8691, 8715, 8722, 8753, 8813, 8816, 8870, 8980, 9279, 9280, 9351, 9373, 9409, 9703, 9787, 10866, 11048, 11057, 11065, 11224, 11313, 11545, 11693, 11933, 12447, 12456, 12484, 12525, 13144, 13266, 13506, 14184, 14250, 14710, 14842, 15106, 16013, 16015, 16230, 16961, 17170, 17730, 18340, 18487, 19379, 19458, 19518, 19604, 22420, 22584, 22960, 22985, 23392, 23394, 24434, 24482, 24502, 24633, 24639, 26041, 26758, 27461, 27503, 27872, 31202, 31280, 32731, 33286, 33557, 36605, 38852, 39424, 44072, 54891, 54900, 54911, 55182, 56704, 60496, 60560, 60714, 60754, 60822, 60934, 60962, 61100, 61444, 62311, 62581, 62655, 62882, 62884, 62935, 64142, 64927, 66245, 68589, 68624, 71412, 71587, 75311, 83823, 90473, 92151, 92965, 102428, 107882, 108005, 108938, 135242, 155434, 161803, 164457, 166033, 235227, 441244, 441308, 441325, 441337, 441374, 443939, 516871, 516919, 517121, 517326, 517383, 517414, 517546, 522325, 53372

Because downloading parent compound information through PUG-REST takes a long time, we want to store the data in a file when they are downloaded for later use. When the cell is run next time, it will read the data from the file, rather than downloading them from PubChem.

In [48]:
import os.path
import json
import time
import urllib.error

cid_parent= {}
file_parent = 'cid_parent_ag.json'   # local cache of the CID -> parent CID mapping
max_tries = 3                        # attempts per CID before a transient error is given up on

if os.path.isfile(file_parent) and os.path.getsize(file_parent) :

    # A non-empty cache file exists: reuse it instead of querying PubChem again.
    with open(file_parent) as f:
        cid_parent = json.load(f)

    # By default, the keys will be loaded as strings, but
    # they are expected to be integers later in the notebook.
    cid_parent = { int(k):v for k,v in cid_parent.items()}

    if ( len(cid_parent) != len(cid_multicompo) ) :
        print("CID counts mismatch:", len(cid_multicompo), "cid_multicompo", len(cid_parent), "cid_parent")

else:

    cid_failed = []   # CIDs whose lookup kept failing for a reason other than "no parent"

    for i, mycid in enumerate(cid_multicompo) :

        url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str(mycid) + '/cids/TXT?cids_type=parent')
        cid_parent[ mycid ] = None   # stays None unless a parent CID is parsed below

        for attempt in range(1, max_tries + 1) :

            last_error = None

            try:
                with urllib.request.urlopen(url) as res:
                    cid_parent[ mycid ] = int(res.read().decode().rstrip())

            except ValueError:
                # The reply is not a single integer: no usable parent CID.
                pass

            except urllib.error.HTTPError as err:
                # A client-side error (other than 429 "too many requests") will not
                # change on retry. NOTE(review): presumably this is how PubChem
                # reports "no parent compound" -- confirm against the PUG-REST docs.
                if not ( 400 <= err.code < 500 and err.code != 429 ) :
                    last_error = err

            except OSError as err:
                # Connection problems, timeouts, ...: worth another attempt.
                last_error = err

            if last_error is None : break
            if attempt < max_tries : time.sleep(attempt)

        if last_error is not None :
            cid_failed.append(mycid)
            print("Warning: lookup failed for CID", mycid, "after", max_tries, "attempts:", last_error)

        if (i % 5 == 0 ) : time.sleep(1)
        if (i % 200 == 0 ) : print("processing ", i , "of ", len(cid_multicompo))

    print("# Download of parent CIDs has been complete")

    if cid_failed :
        # Do not cache: a transient failure would otherwise be stored as
        # "no parent" and silently drop those compounds on every later run.
        print("Warning:", len(cid_failed), "CID(s) could not be resolved; the results were not cached in", file_parent)
    else :
        with open(file_parent, 'w') as f:
            json.dump(cid_parent, f)

print("# len(cid_parent):", len(cid_parent) )

# len(cid_parent): 1748


In [49]:
# Collect the CIDs for which no parent compound was recorded.
cid_noparent = []
for key, val in cid_parent.items() :
    if not val :
        cid_noparent.append(key)
print(len(cid_noparent))

220


In [50]:
# Replace the CID of every multicomponent row (more than one covalent unit)
# with the CID of its parent compound, in one vectorized step.
is_multicompo = ag_pre['CovalentUnitCount'] > 1

# Every multicomponent CID must have an entry in cid_parent (possibly None);
# stop with a clear error if the lookup table does not cover the frame.
cid_unmapped = set(ag_pre.loc[is_multicompo, 'PUBCHEM_CID']) - set(cid_parent)
if cid_unmapped :
    raise KeyError("No cid_parent entry for CID(s): " + str(sorted(cid_unmapped)[:10]))

# CIDs whose parent is None map to NaN; those rows are removed in a later cell.
parent_cids = ag_pre['PUBCHEM_CID'].map(cid_parent)
ag_pre['PUBCHEM_CID'] = ag_pre['PUBCHEM_CID'].mask(is_multicompo, parent_cids)

In [51]:
# Preview after the substitution (multicomponent rows now carry their parent CID).
ag_pre.head(3)

Unnamed: 0,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,Activity Summary,Ratio Potency (uM),Ratio Efficacy (%),CovalentUnitCount
4,5460352.0,Inactive,inactive,,0,3
5,10690.0,Inactive,inactive,,0,3
6,9403.0,Inconclusive,inconclusive,,0,1


In [52]:
# Verify the substitution: none of the original multicomponent CIDs
# should still appear in the CID column.
count_no_sub = 0
for mycid in cid_multicompo :
    n_left = int( (ag_pre['PUBCHEM_CID'] == mycid).sum() )
    if n_left :
        count_no_sub += 1
        print("Warning:", mycid, "is still found", n_left)

if count_no_sub :
    print("Warning:", count_no_sub, "compound(s) have not been replaced")
else :
    print("All multicomponent compounds have been replaced!")

All multicomponent compounds have been replaced!


In [53]:
# Drop the rows left without a CID by the substitution (no parent compound),
# then restore the integer dtype of the identifier column.
ag_pre = ag_pre.dropna(subset=['PUBCHEM_CID']).astype({'PUBCHEM_CID': int})
ag_pre.shape

(10088, 6)

In [54]:
# Preview after removing rows without a parent CID and casting the CIDs back to integers.
ag_pre.head(3)

Unnamed: 0,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,Activity Summary,Ratio Potency (uM),Ratio Efficacy (%),CovalentUnitCount
4,5460352,Inactive,inactive,,0,3
5,10690,Inactive,inactive,,0,3
6,9403,Inconclusive,inconclusive,,0,1


# Dealing with conflicts (ag)

In [55]:
# Re-check missing values per column after the parent-CID substitution.
ag_pre.isnull().sum()

PUBCHEM_CID                    0
PUBCHEM_ACTIVITY_OUTCOME       0
Activity Summary               0
Ratio Potency (uM)          9352
Ratio Efficacy (%)             0
CovalentUnitCount              0
dtype: int64

In [56]:
# Outcome distribution after the parent-CID substitution.
ag_pre['PUBCHEM_ACTIVITY_OUTCOME'].value_counts()

Inactive        8828
Inconclusive    1057
Active           203
Name: PUBCHEM_ACTIVITY_OUTCOME, dtype: int64

In [57]:
# Detailed activity-call distribution after the parent-CID substitution.
ag_pre['Activity Summary'].value_counts()

inactive                              8828
inconclusive                           862
active agonist                         203
inconclusive agonist                    80
active antagonist                       48
inconclusive antagonist                 44
inconclusive agonist (fluorescent)      23
Name: Activity Summary, dtype: int64

In [58]:
# Peek at the first CIDs; a value that appears more than once means one compound has several rows.
ag_pre[ 'PUBCHEM_CID'].head(10)

4      5460352
5        10690
6         9403
7     13218779
9        16043
10       16043
11       11295
12     2724372
13      637566
14         994
Name: PUBCHEM_CID, dtype: int32

In [59]:
# CIDs of all remaining rows (the list may contain repeated CIDs).
ag_cid = ag_pre['PUBCHEM_CID'].tolist()

# Drop the CIDs with conflicting activity declaration: count the distinct
# 'Activity Summary' values per CID once, broadcast the count back to the rows,
# and keep only the rows whose CID has a single label.
# (The column has no missing values at this point, so ignoring NaN is harmless.)
n_labels_per_cid = ag_pre.groupby('PUBCHEM_CID')['Activity Summary'].transform('nunique')
ag_pre = ag_pre[ n_labels_per_cid == 1 ]

In [60]:
# Preview after removing CIDs with conflicting activity calls.
ag_pre.head(3)

Unnamed: 0,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,Activity Summary,Ratio Potency (uM),Ratio Efficacy (%),CovalentUnitCount
4,5460352,Inactive,inactive,,0,3
5,10690,Inactive,inactive,,0,3
6,9403,Inconclusive,inconclusive,,0,1


In [61]:
# Frame size after removing CIDs with conflicting activity calls.
ag_pre.shape

(9466, 6)

In [62]:
# Activity-call distribution after removing CIDs with conflicting activity calls.
ag_pre['Activity Summary'].value_counts()

inactive                              8525
inconclusive                           651
active agonist                         172
inconclusive agonist                    50
active antagonist                       31
inconclusive agonist (fluorescent)      19
inconclusive antagonist                 18
Name: Activity Summary, dtype: int64

# Combining Dataframes and Cleaning (both)

## Method 1

In [63]:
# CIDs that survived the cleaning in each assay, and their union
# (the order of all_cid is arbitrary).
antag_cid = antag_pre['PUBCHEM_CID'].tolist()
ag_cid = ag_pre['PUBCHEM_CID'].tolist()
all_cid = list( set(antag_cid).union(ag_cid) )
len(all_cid)

7374

In [64]:
# Method 1: a CID keeps its label only if every label it received, across
# both assays, is the same one. All labels are stacked into one table and
# summarised per CID in a single pass, instead of filtering both frames
# once for every CID.
label_columns = ['PUBCHEM_CID', 'Activity Summary']
labels_both = pd.concat([ antag_pre[label_columns], ag_pre[label_columns] ], ignore_index=True)

labels_per_cid = labels_both.groupby('PUBCHEM_CID')['Activity Summary'].agg(['nunique', 'first'])

# CID -> its single, unanimous label.
final_act = labels_per_cid.loc[ labels_per_cid['nunique'] == 1, 'first' ].to_dict()

df_activity = pd.DataFrame(list(final_act.items()), columns=['cid', 'activity'])
df_activity = df_activity.sort_values('cid').reset_index(drop=True)

# Only the unambiguous classes are kept; inconclusive calls are discarded.
df_activity = df_activity[ df_activity['activity'].isin(['active agonist', 'active antagonist', 'inactive']) ]
df_activity.activity.value_counts()

inactive             5639
active agonist        104
active antagonist      30
Name: activity, dtype: int64

## Method 2 (the method used for the final dataset)

In [65]:
#antag_cid_df = antag_pre[ antag_pre['Activity Summary'] == 'active antagonist' ]
#ag_cid_df = ag_pre[ ag_pre['Activity Summary'] == 'active agonist' ]
#inact_cid_df_antag = antag_pre[ antag_pre['Activity Summary'] == 'inactive' ]
#inact_cid_df_ag = ag_pre[ ag_pre['Activity Summary'] == 'inactive' ]

#df_activity2 = pd.concat([ antag_cid_df, ag_cid_df, inact_cid_df_antag, inact_cid_df_ag])

In [66]:
# Actives are taken from the assay that measures them; inactives from both assays.
is_ant = antag_pre['Activity Summary'] == 'active antagonist'
is_ago = ag_pre['Activity Summary'] == 'active agonist'
cids_ant = antag_pre.loc[is_ant, 'PUBCHEM_CID'].tolist()
cids_ago = ag_pre.loc[is_ago, 'PUBCHEM_CID'].tolist()
cids_inact1 = antag_pre.loc[antag_pre['Activity Summary'] == 'inactive', 'PUBCHEM_CID'].tolist()
cids_inact2 = ag_pre.loc[ag_pre['Activity Summary'] == 'inactive', 'PUBCHEM_CID'].tolist()

# A CID that is active in both assays is ambiguous: remove it from both lists.
cids_conflict = np.intersect1d(cids_ant, cids_ago)
cids_ant = np.setdiff1d( cids_ant, cids_conflict )
cids_ago = np.setdiff1d( cids_ago, cids_conflict )

# A CID counts as inactive only if it is inactive in both assays.
cids_inact = np.intersect1d(cids_inact1, cids_inact2)

# CID -> final label, filled in the order antagonists, agonists, inactives.
final_act = {}
final_act.update( dict.fromkeys(cids_ant, 'active antagonist') )
final_act.update( dict.fromkeys(cids_ago, 'active agonist') )
final_act.update( dict.fromkeys(cids_inact, 'inactive') )

df_activity2 = pd.DataFrame(final_act.items(), columns=['cid', 'activity'])
df_activity2 = df_activity2.sort_values('cid').reset_index(drop=True)
df_activity2.activity.value_counts()

inactive             5263
active antagonist     274
active agonist        135
Name: activity, dtype: int64

In [76]:
# Rename the columns to the PubChem-style names used by the per-assay frames above.
df_activity2 = df_activity2.rename(columns = {'cid' : 'PUBCHEM_CID', 'activity' : 'Activity Summary'})

In [77]:
# Write the merged, cleaned activity table (df_activity2) to disk in CSV format, without the index column.
# NOTE(review): the output file name has no ".csv" extension -- confirm that this is intended.

df_activity2.to_csv('FINAL_Merged_Cleaned_CSV_7-19', index = False)