# ***RUN NOTEBOOK IN ORDER

# Dealing with multicomponent compounds

# Data Cleaning Full (antag)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from requests import exceptions
import re
%matplotlib inline

import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup

#hiding warning messages
import warnings
warnings.filterwarnings("ignore")


Read the antagonist assay data into a data frame

In [2]:
#Reading in Summary ANTAGONIST CSV
antag_pre = pd.read_csv('AID_588533_datatable_all.csv')
antag_pre = antag_pre.iloc[5:]
antag_pre.shape

(2858, 114)

In [3]:
antag_pre = antag_pre[['PUBCHEM_CID', 'PUBCHEM_ACTIVITY_OUTCOME']]
antag_pre.tail(3)

Unnamed: 0,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME
2860,10788.0,Inactive
2861,12901.0,Inactive
2862,6872.0,Inconclusive


Deal with the NULL values: remove the rows without CIDs.

In [4]:
antag_pre.isnull().sum()

PUBCHEM_CID                 14
PUBCHEM_ACTIVITY_OUTCOME     0
dtype: int64

In [5]:
antag_pre = antag_pre[ antag_pre['PUBCHEM_CID'].notnull() ]
antag_pre.shape

(2844, 2)

Convert the PUBCHEM_CID (floats) into an integer type.

In [6]:
antag_pre['PUBCHEM_CID'] = antag_pre['PUBCHEM_CID'].astype(int)
antag_pre = antag_pre.rename(columns = {'PUBCHEM_ACTIVITY_OUTCOME' : 'Activity Summary'})
antag_pre.head(3)

Unnamed: 0,PUBCHEM_CID,Activity Summary
5,5995,Active
6,26041,Inconclusive
7,5281576,Inconclusive


In [7]:
antag_pre['Activity Summary'].value_counts()

Inactive        2066
Inconclusive     712
Active            66
Name: Activity Summary, dtype: int64

In [8]:
antag_pre.shape

(2844, 2)

In [9]:
antag_pre.head(3)

Unnamed: 0,PUBCHEM_CID,Activity Summary
5,5995,Active
6,26041,Inconclusive
7,5281576,Inconclusive


# Identifying Covalent Unit Count (antag)

Compute the number of CID chunks based on the number of CIDs and the chunk size.

In [10]:
chunk_size = 200
num_cids = len(antag_pre['PUBCHEM_CID'])

if num_cids % chunk_size == 0 :
    num_chunks = int( num_cids / chunk_size )
else :
    num_chunks = int( num_cids / chunk_size ) + 1

print("# CIDs = ", num_cids)
print("# CID Chunks = ", num_chunks, "(chunked by ", chunk_size, ")")

# CIDs =  2844
# CID Chunks =  15 (chunked by  200 )


Retrieve the covalent unit counts for each chunk of CIDs.

In [11]:
import time

frames = []  # temporary list to store data frames from each PUG-REST request.

#print("chunkid", 'idx1', 'idx2', 'length', sep="\t")

for i in range(0, num_chunks) :
    
    idx1 = chunk_size * i
    idx2 = chunk_size * (i + 1)
    cidstr = ",".join(antag_pre.PUBCHEM_CID[idx1:idx2].astype(str).tolist())

    url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + cidstr + '/property/CovalentUnitCount/TXT')
    data = pd.read_csv( urllib.request.urlopen(url), header=None, names=['cov_units'] )
    frames.append(data)
    
    # Uncomment this line to check the progress
    #print(i, idx1, idx2, len(data), sep="\t")
    
    if (i % 5 == 0 ) :
        time.sleep(1)  # To comply with the PubChem usage polocy (no more than 5 requests per sec)

#    if ( i == 2 ) : break  #- for debugging

cov_units = pd.concat(frames,ignore_index=True)
print(cov_units.shape)

(2844, 1)


# Dealing with multicomponent molecules (antag)

In [12]:
antag_pre['CovalentUnitCount'] = cov_units.cov_units.tolist()
antag_pre.CovalentUnitCount.isnull().sum()

0

In [13]:
antag_pre['CovalentUnitCount'].value_counts()

1     2616
2      159
3       41
5       16
4        5
9        3
6        3
10       1
Name: CovalentUnitCount, dtype: int64

In [14]:
cid_multicompo = antag_pre[ antag_pre.CovalentUnitCount > 1 ].PUBCHEM_CID.tolist()
len(cid_multicompo)

228

The multicomponent CID list has some duplicate CIDs.  Removing them will reduce the number of PUG-REST requests later.

In [15]:
cid_multicompo.sort()
 
cid_multicompo_uniq = sorted( set(cid_multicompo) )
cid_multicompo_dup = []
    
for mycid in cid_multicompo_uniq:
        if ( cid_multicompo.count( mycid ) > 1 ):
            cid_multicompo_dup.append( mycid )

print(len(cid_multicompo_dup))
print(cid_multicompo_dup)

cid_multicompo = cid_multicompo_uniq

31
[5807, 5963, 8478, 8691, 8722, 8813, 11057, 11065, 11933, 13266, 15106, 22420, 22456, 23392, 62882, 108005, 443939, 2733525, 3032581, 3423265, 5284441, 5284484, 11074431, 11972286, 23668193, 23668195, 23668198, 23675274, 23678874, 44134384, 54680782]


Because downloading parent compound information through PUG-REST takes a long time, we want to store the data in a file when they are downloaded for later use. When the cell is run next time, it will read the data from the file, rather than downloading them from PubChem.

In [16]:
import os.path
import json

cid_parent= {}
file_parent = 'cid_parent.json'

if os.path.isfile(file_parent) and os.path.getsize(file_parent) :
    
    with open('cid_parent.json') as f:
        cid_parent = json.load(f)

    # By default, the keys will be loaded as strings, but
    # they are expected to be integers later in the notebook.
    cid_parent = { int(k):v for k,v in cid_parent.items()}
        
    if ( len(cid_parent) != len(cid_multicompo) ) :
        print("CID counts mismatch:", len(cid_multicompo), "cid_multicompo", len(cid_parent), "cid_parent")

else:

    for i in range(0, len(cid_multicompo)) :

        try:
        
            mycid = cid_multicompo[i]
            url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str(mycid) + '/cids/TXT?cids_type=parent')
            res = urllib.request.urlopen(url)
            parent = res.read()
            cid_parent[ mycid ] = int(parent.decode().rstrip())
    
        except:
    
            cid_parent[ mycid ] = None
    
        if (i % 5 == 0 ) : time.sleep(1)
        if (i % 200 == 0 ) : print("processing ", i , "of ", len(cid_multicompo))

    print("# Download of parent CIDs has been complete")

    with open('cid_parent.json', 'w') as f:
        json.dump(cid_parent, f)
    
print("# len(cid_parent):", len(cid_parent) )

CID counts mismatch: 195 cid_multicompo 1748 cid_parent
# len(cid_parent): 1748


In [17]:
# Store CIDs without parent compounds.
cid_noparent = [ key for key,val in cid_parent.items() if not val  ]
print(len(cid_noparent))

207


In [18]:
#Making copy of antag_pre
antag_cov = antag_pre

#dropping rows where covalent unit count is = 1
antag_cov = antag_cov[antag_cov.CovalentUnitCount != 1]
antag_cov.shape
#228 multicomponent CIDs (2844 total)

(228, 3)

In [19]:
#getting CID list of multicomponent compounds
CID_list_multi = antag_cov['PUBCHEM_CID'].tolist()

In [20]:
#Getting list of covalent unit number for the multicomponent compounds
list_cov_multi = antag_cov['CovalentUnitCount'].tolist()

parent_CID_list = []
i = 0
while i < len(CID_list_multi):
    try:
    # (list_cov_multi[i] == '2'):
        url1 = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str(CID_list_multi[i]) + '/cids/TXT?cids_type=parent'
        html1 = urlopen(url1) 
        soup1 = BeautifulSoup(html1, 'lxml')
        parent1 = soup1.get_text()
        #print(parent1)
        parent_CID_list.append(parent1)
        i = i + 1
       # else:
        #    i = i + 1
    except:
        #contine
        #print('eek')
        #if err.code == ('404: PUGREST.NotFound'):
        parent_CID_list.append('0')
        i = i + 1
        



len(parent_CID_list)

228

In [21]:
#Removing '\n from the end of each CID in parent_CID_list'
bad_chars = ['\n']

i = 0
while i < len(parent_CID_list):
    if parent_CID_list[i] == '0':
        i = i + 1
    else:
        for j in bad_chars: 
            parent_CID_list[i] = parent_CID_list[i].replace(j, '')
            i = i + 1

In [22]:
#adding parent_CID's to antag_cov DF
antag_cov['Parent_CIDs'] = parent_CID_list

In [23]:
#now that we have a list of the parent compounds for each multiple component molecule we:
#delete all values with a parent of 0
#make a new df called antag_multi_to_parent
antag_multi_to_parent = antag_cov.drop(antag_cov.loc[antag_cov['Parent_CIDs'] == '0'].index)

##replace PUBCHEM_CID with parent CID's
###Drop CID column and rename Parent_CID's to just PUBCHEM_CID
antag_multi_to_parent = antag_multi_to_parent.drop(columns = ['PUBCHEM_CID'], axis = 1)
antag_multi_to_parent = antag_multi_to_parent.rename(index=str, columns={"Parent_CIDs" : "PUBCHEM_CID"})

In [24]:
#Removing all CID's with multicomponent compounds from antag_pre (original DF of all antag compounds)
antag_pre_minus_multi = antag_pre[~antag_pre.PUBCHEM_CID.isin(CID_list_multi)]

In [25]:
#Combining antag_pre with antag_multi_to_parent
antag = antag_pre_minus_multi.append(antag_multi_to_parent)

In [26]:
antag_pre = antag
antag_pre.shape

(2811, 3)

# Dealing with conflicts (antag)

In [27]:
antag_pre['Activity Summary'].value_counts()

Inactive        2046
Inconclusive     702
Active            63
Name: Activity Summary, dtype: int64

In [28]:
antag_pre[ 'PUBCHEM_CID'].head(10)

5        5995
7     5281576
8     5281575
9     5280795
10       7529
11       8419
13      10215
14      61186
15       8061
16    6440940
Name: PUBCHEM_CID, dtype: object

In [29]:
# Generate unique CID list
antag_cid = antag_pre['PUBCHEM_CID'].tolist()

# Drop the CIDs with conflicting activity declaration.
for mycid in antag_cid :
    
    activities = set( antag_pre[ antag_pre['PUBCHEM_CID'] == mycid].loc[:,'Activity Summary'].tolist())
    if ( len(activities) > 1 ):
        #print(mycid, len(activities), activities)
        antag_pre = antag_pre[ antag_pre['PUBCHEM_CID'] != mycid ]

In [30]:
antag_pre.shape

(2484, 3)

In [33]:
antag_pre['Activity Summary'].value_counts()

Inactive        1887
Inconclusive     550
Active            47
Name: Activity Summary, dtype: int64

# Data Cleaning Full (ag)

Read the antagonist assay data into a data frame

In [42]:
#Reading in Summary ANTAGONIST CSV
ag_pre = pd.read_csv('AID_588532_datatable_all.csv')
ag_pre = ag_pre.iloc[5:]
ag_pre.head(3)

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Phenotype,Potency,Efficacy,...,W530-Activity at 76.79 uM,W530-Activity at 182.0 uM,W530-Activity at 407.0 uM,W530-Activity at 910.0 uM,W530-Activity at 2034.7 uM,W530-Activity at 4832.4 uM,W530-Activity at 10805.3 uM,W530-Activity at 24160.5 uM,W530-Activity at 54023.0 uM,Compound QC
5,1,11112138.0,5995.0,Inconclusive,10.0,http://assay.nih.gov/htsws/rest/display/nucrec...,,Inconclusive,25.1189,40.4377,...,6.7296,,,,,,,,,QC'd by Prestwick
6,2,17388662.0,26041.0,Inconclusive,10.0,http://assay.nih.gov/htsws/rest/display/nucrec...,,Inconclusive,1.4125,34.0858,...,-15.5195,,,,,,,,,QC'd by NIEHS/NTP
7,3,17388663.0,5281576.0,Inactive,0.0,http://assay.nih.gov/htsws/rest/display/nucrec...,,Inactive,,,...,-20.9504,,,,,,,,,QC'd by NIEHS/NTP


In [43]:
ag_pre = ag_pre[['PUBCHEM_CID', 'PUBCHEM_ACTIVITY_OUTCOME']]
ag_pre.head(3)

Unnamed: 0,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME
5,5995.0,Inconclusive
6,26041.0,Inconclusive
7,5281576.0,Inactive


Deal with the NULL values: remove the rows without CIDs.

In [44]:
ag_pre.isnull().sum()

PUBCHEM_CID                 14
PUBCHEM_ACTIVITY_OUTCOME     0
dtype: int64

In [45]:
ag_pre = ag_pre[ ag_pre['PUBCHEM_CID'].notnull() ]
ag_pre.shape

(2844, 2)

In [49]:
ag_pre['PUBCHEM_CID'] = ag_pre['PUBCHEM_CID'].astype(int)
ag_pre = ag_pre.rename(columns = {'PUBCHEM_ACTIVITY_OUTCOME' : 'Activity Summary'})
ag_pre.head(3)

Unnamed: 0,PUBCHEM_CID,Activity Summary
5,5995,Inconclusive
6,26041,Inconclusive
7,5281576,Inactive


Convert the PUBCHEM_CID (floats) into an integer type.

In [50]:
ag_pre['PUBCHEM_CID'] = ag_pre['PUBCHEM_CID'].astype(int)
ag_pre.head(3)

Unnamed: 0,PUBCHEM_CID,Activity Summary
5,5995,Inconclusive
6,26041,Inconclusive
7,5281576,Inactive


In [52]:
ag_pre['Activity Summary'].value_counts()

Inactive        2398
Inconclusive     438
Active             8
Name: Activity Summary, dtype: int64

In [54]:
ag_pre.shape

(2844, 2)

In [55]:
ag_pre.head(3)

Unnamed: 0,PUBCHEM_CID,Activity Summary
5,5995,Inconclusive
6,26041,Inconclusive
7,5281576,Inactive


# Identifying Covalent Unit Count (ag)

Compute the number of CID chunks based on the number of CIDs and the chunk size.

In [56]:
chunk_size = 200
num_cids = len(ag_pre['PUBCHEM_CID'])

if num_cids % chunk_size == 0 :
    num_chunks = int( num_cids / chunk_size )
else :
    num_chunks = int( num_cids / chunk_size ) + 1

print("# CIDs = ", num_cids)
print("# CID Chunks = ", num_chunks, "(chunked by ", chunk_size, ")")

# CIDs =  2844
# CID Chunks =  15 (chunked by  200 )


Retrieve the covalent unit counts for each chunk of CIDs.

In [57]:
import time

frames = []  # temporary list to store data frames from each PUG-REST request.

#print("chunkid", 'idx1', 'idx2', 'length', sep="\t")

for i in range(0, num_chunks) :
    
    idx1 = chunk_size * i
    idx2 = chunk_size * (i + 1)
    cidstr = ",".join(ag_pre.PUBCHEM_CID[idx1:idx2].astype(str).tolist())

    url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + cidstr + '/property/CovalentUnitCount/TXT')
    data = pd.read_csv( urllib.request.urlopen(url), header=None, names=['cov_units'] )
    frames.append(data)
    
    # Uncomment this line to check the progress
    #print(i, idx1, idx2, len(data), sep="\t")
    
    if (i % 5 == 0 ) :
        time.sleep(1)  # To comply with the PubChem usage polocy (no more than 5 requests per sec)

#    if ( i == 2 ) : break  #- for debugging

cov_units = pd.concat(frames,ignore_index=True)
print(cov_units.shape)

(2844, 1)


# Dealing with Multicomponent Molecules (ag)

In [58]:
ag_pre['CovalentUnitCount'] = cov_units.cov_units.tolist()
ag_pre.CovalentUnitCount.isnull().sum()

0

In [59]:
ag_pre['CovalentUnitCount'].value_counts()

1     2616
2      159
3       41
5       16
4        5
9        3
6        3
10       1
Name: CovalentUnitCount, dtype: int64

In [60]:
cid_multicompo = ag_pre[ ag_pre.CovalentUnitCount > 1 ].PUBCHEM_CID.tolist()
len(cid_multicompo)

228

The multicomponent CID list has some duplicate CIDs.  Removing them will reduce the number of PUG-REST requests later.

In [61]:
cid_multicompo.sort()
 
cid_multicompo_uniq = sorted( set(cid_multicompo) )
cid_multicompo_dup = []
    
for mycid in cid_multicompo_uniq:
        if ( cid_multicompo.count( mycid ) > 1 ):
            cid_multicompo_dup.append( mycid )

print(len(cid_multicompo_dup))
print(cid_multicompo_dup)

cid_multicompo = cid_multicompo_uniq

31
[5807, 5963, 8478, 8691, 8722, 8813, 11057, 11065, 11933, 13266, 15106, 22420, 22456, 23392, 62882, 108005, 443939, 2733525, 3032581, 3423265, 5284441, 5284484, 11074431, 11972286, 23668193, 23668195, 23668198, 23675274, 23678874, 44134384, 54680782]


Because downloading parent compound information through PUG-REST takes a long time, we want to store the data in a file when they are downloaded for later use. When the cell is run next time, it will read the data from the file, rather than downloading them from PubChem.

In [62]:
import os.path
import json

cid_parent= {}
file_parent = 'cid_parent_ag.json'

if os.path.isfile(file_parent) and os.path.getsize(file_parent) :
    
    with open(file_parent) as f:
        cid_parent = json.load(f)

    # By default, the keys will be loaded as strings, but
    # they are expected to be integers later in the notebook.
    cid_parent = { int(k):v for k,v in cid_parent.items()}
        
    if ( len(cid_parent) != len(cid_multicompo) ) :
        print("CID counts mismatch:", len(cid_multicompo), "cid_multicompo", len(cid_parent), "cid_parent")

else:

    for i in range(0, len(cid_multicompo)) :

        try:
        
            mycid = cid_multicompo[i]
            url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str(mycid) + '/cids/TXT?cids_type=parent')
            res = urllib.request.urlopen(url)
            parent = res.read()
            cid_parent[ mycid ] = int(parent.decode().rstrip())
    
        except:
    
            cid_parent[ mycid ] = None
    
        if (i % 5 == 0 ) : time.sleep(1)
        if (i % 200 == 0 ) : print("processing ", i , "of ", len(cid_multicompo))

    print("# Download of parent CIDs has been complete")

    with open(file_parent, 'w') as f:
        json.dump(cid_parent, f)
    
print("# len(cid_parent):", len(cid_parent) )

CID counts mismatch: 195 cid_multicompo 1748 cid_parent
# len(cid_parent): 1748


In [73]:
# Store CIDs without parent compounds.
cid_noparent = [ key for key,val in cid_parent.items() if not val  ]
print(len(cid_noparent))

220


In [77]:
#Making copy of antag_pre
ag_cov = ag_pre

#dropping rows where covalent unit count is = 1
ag_cov = ag_cov[ag_cov.CovalentUnitCount != 1]
ag_cov.shape
#228 multicomponent CIDs (2844 total)

(228, 3)

In [78]:
#getting CID list of multicomponent compounds
CID_list_multi = ag_cov['PUBCHEM_CID'].tolist()

In [79]:
#Getting list of covalent unit number for the multicomponent compounds
list_cov_multi = ag_cov['CovalentUnitCount'].tolist()

parent_CID_list = []
i = 0
while i < len(CID_list_multi):
    try:
    # (list_cov_multi[i] == '2'):
        url1 = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str(CID_list_multi[i]) + '/cids/TXT?cids_type=parent'
        html1 = urlopen(url1) 
        soup1 = BeautifulSoup(html1, 'lxml')
        parent1 = soup1.get_text()
        #print(parent1)
        parent_CID_list.append(parent1)
        i = i + 1
       # else:
        #    i = i + 1
    except:
        #contine
        #print('eek')
        #if err.code == ('404: PUGREST.NotFound'):
        parent_CID_list.append('0')
        i = i + 1
        



len(parent_CID_list)

228

In [84]:
#Removing '\n from the end of each CID in parent_CID_list'
bad_chars = ['\n']

i = 0
while i < len(parent_CID_list):
    if parent_CID_list[i] == '0':
        i = i + 1
    else:
        for j in bad_chars: 
            parent_CID_list[i] = parent_CID_list[i].replace(j, '')
            i = i + 1

In [85]:
#adding parent_CID's to antag_cov DF
ag_cov['Parent_CIDs'] = parent_CID_list

In [86]:
#now that we have a list of the parent compounds for each multiple component molecule we:
#delete all values with a parent of 0
#make a new df called antag_multi_to_parent
ag_multi_to_parent = ag_cov.drop(ag_cov.loc[ag_cov['Parent_CIDs'] == '0'].index)

##replace PUBCHEM_CID with parent CID's
###Drop CID column and rename Parent_CID's to just PUBCHEM_CID
ag_multi_to_parent = ag_multi_to_parent.drop(columns = ['PUBCHEM_CID'], axis = 1)
ag_multi_to_parent = ag_multi_to_parent.rename(index=str, columns={"Parent_CIDs" : "PUBCHEM_CID"})

In [88]:
#Removing all CID's with multicomponent compounds from antag_pre (original DF of all antag compounds)
ag_pre_minus_multi = ag_pre[~ag_pre.PUBCHEM_CID.isin(CID_list_multi)]

In [89]:
#Combining ag_pre with ag_multi_to_parent
ag = ag_pre_minus_multi.append(ag_multi_to_parent)

In [90]:
ag_pre = ag
ag_pre.shape

(2616, 3)

# Dealing with conflicts (ag)

In [92]:
ag_pre.isnull().sum()

Activity Summary     0
CovalentUnitCount    0
PUBCHEM_CID          0
dtype: int64

In [95]:
ag_pre['Activity Summary'].value_counts()

Inactive        2218
Inconclusive     391
Active             7
Name: Activity Summary, dtype: int64

In [96]:
ag_pre['Activity Summary'].value_counts()

Inactive        2218
Inconclusive     391
Active             7
Name: Activity Summary, dtype: int64

In [97]:
ag_pre[ 'PUBCHEM_CID'].head(10)

5        5995.0
7     5281576.0
8     5281575.0
9     5280795.0
10       7529.0
11       8419.0
13      10215.0
14      61186.0
15       8061.0
16    6440940.0
Name: PUBCHEM_CID, dtype: float64

In [98]:
# Generate unique CID list
ag_cid = ag_pre['PUBCHEM_CID'].tolist()

# Drop the CIDs with conflicting activity declaration.
for mycid in ag_cid :
    
    activities = set( ag_pre[ ag_pre['PUBCHEM_CID'] == mycid].loc[:,'Activity Summary'].tolist())
    if ( len(activities) > 1 ):
        #print(mycid, len(activities), activities)
        ag_pre = ag_pre[ ag_pre['PUBCHEM_CID'] != mycid ]

In [99]:
ag_pre.head(3)

Unnamed: 0,Activity Summary,CovalentUnitCount,PUBCHEM_CID
5,Inconclusive,1,5995.0
8,Inconclusive,1,5281575.0
9,Inactive,1,5280795.0


In [100]:
ag_pre.shape

(2376, 3)

In [108]:
ag_pre['Activity Summary'].value_counts()

Inactive        2088
Inconclusive     284
Active             4
Name: Activity Summary, dtype: int64

# Combining Dataframes and Cleaning (both)

## Method 1

In [246]:
antag_cid = antag_pre['PUBCHEM_CID'].tolist()
ag_cid = ag_pre['PUBCHEM_CID'].tolist()
all_cid = list(set( antag_cid + ag_cid ))
len(all_cid)

2275

In [254]:
final_act = {}

for mycid in list(all_cid) :
    
    antag_act = list(set( antag_pre[ antag_pre['PUBCHEM_CID'] == mycid].loc[:,'Activity Summary'].tolist()))
    ag_act = list(set( ag_pre[ ag_pre['PUBCHEM_CID'] == mycid].loc[:,'Activity Summary'].tolist()))
    all_act = list(set( antag_act + ag_act ))
    
    if ( len(all_act) == 1) :
        final_act[ mycid ] = all_act[0]

df_activity = pd.DataFrame(final_act.items(), columns=['PUBCHEM_CID', 'Activity Summary'])
df_activity = df_activity['PUBCHEM_CID'].astype(int)
df_activity = df_activity.sort_values('PUBCHEM_CID').reset_index(drop=True)
df_activity = df_activity[ (df_activity['Activity Summary'] == 'Active') | \
                           (df_activity['Activity Summary'] == 'Active' ) | \
                           (df_activity['Activity Summary'] == 'Inactive') ]
df_activity['Activity Summary'].value_counts()

ValueError: No axis named PUBCHEM_CID for object type <class 'type'>

## Method 2 - USING

In [65]:
#antag_cid_df = antag_pre[ antag_pre['Activity Summary'] == 'active antagonist' ]
#ag_cid_df = ag_pre[ ag_pre['Activity Summary'] == 'active agonist' ]
#inact_cid_df_antag = antag_pre[ antag_pre['Activity Summary'] == 'inactive' ]
#inact_cid_df_ag = ag_pre[ ag_pre['Activity Summary'] == 'inactive' ]

#df_activity2 = pd.concat([ antag_cid_df, ag_cid_df, inact_cid_df_antag, inact_cid_df_ag])

In [280]:
len(cids_ago)

4

In [291]:
cids_ant = antag_pre[ antag_pre['Activity Summary'] == 'Active' ].PUBCHEM_CID.tolist()
cids_ago = ag_pre[ ag_pre['Activity Summary'] == 'Active' ].PUBCHEM_CID.tolist()
cids_inact1 = antag_pre[ antag_pre['Activity Summary'] == 'Inactive' ].PUBCHEM_CID.tolist()
cids_inact2 = ag_pre[ ag_pre['Activity Summary'] == 'Inactive' ].PUBCHEM_CID.tolist()

#cids_inact = [i for i in set(cids_inact1) if i in cids_inact2]
cids_inact = cids_inact1 + cids_inact2

final_act = {}

for mycid in cids_ant :
    final_act[ mycid ] = 'Active'

    
for mycid in cids_ago :
    final_act[ mycid ] = 'Active'
    

for mycid in cids_inact :
    final_act[ mycid ] = 'Inactive'

df_activity2 = pd.DataFrame(final_act.items(), columns=['PUBCHEM_CID', 'Activity Summary']) 
df_activity2['PUBCHEM_CID'] = df_activity2['PUBCHEM_CID'].astype(int)    
#df_activity2 = df_activity2.duplicated('PUBCHEM_CID', keep = 'first')
df_activity2 = df_activity2.sort_values('PUBCHEM_CID').reset_index(drop=True)
df_activity2['Activity Summary'].value_counts()

Inactive    2101
Active        23
Name: Activity Summary, dtype: int64

In [233]:
#Removing all CID's in Tox21 Data

#Reading in Molecular Properties CSV
tox21 = pd.read_csv('Molecular_Properties_CSV')
tox21 = tox21.astype(float, errors = 'ignore')

#getting list of CID's from tox21 data
tox21_CIDs = tox21['PUBCHEM_CID']
tox21_CIDs = tox21_CIDs.tolist()
df_activity2.shape

(2124, 2)

In [234]:
#Removing all CID's in tox21 data from NCGC cleaned data (final_clean)
final_ncgc = df_activity2[~df_activity2.PUBCHEM_CID.isin(tox21_CIDs)]
final_ncgc.shape
final_ncgc['Activity Summary'].value_counts()
#only 46 active values total

Inactive    723
Active       16
Name: Activity Summary, dtype: int64

In [235]:
#Taking Dataframe of only Active values
##Dropping all inactive values
df_active = final_ncgc.drop(final_ncgc.loc[final_ncgc['Activity Summary'] == 'Inactive'].index)
df_active.shape

(16, 2)

In [260]:
final_dups = final_ncgc[final_ncgc.duplicated('PUBCHEM_CID', keep = False)] 
final_dups

Unnamed: 0,PUBCHEM_CID,Activity Summary
76,1486,Inactive
77,1486,Inactive
561,7242,Inactive
562,7242,Inactive
713,7732,Inactive
714,7732,Inactive
877,8411,Inactive
878,8411,Inactive
1241,15553,Inactive
1242,15553,Inactive


In [77]:
#Writing df_activity2 to csv

df_activity2.to_csv('FINAL_Merged_Cleaned_CSV_7-19', index = False)