# ***RUN NOTEBOOK IN ORDER

# Dealing with multicomponent compounds

# Data Cleaning Full (antag)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from requests import exceptions
import re
%matplotlib inline

import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup

#hiding warning messages
import warnings
warnings.filterwarnings("ignore")

#Reading in Summary ANTAGONIST CSV
antag_pre = pd.read_csv('AID_720725_datatable_all.csv')
antag_pre = antag_pre.iloc[3:]
antag_pre

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity Summary,Ratio Activity,Ratio Potency (uM),...,530 nm Activity,530 nm Potency (uM),530 nm Efficacy (%),460 nm Activity,460 nm Potency (uM),460 nm Efficacy (%),Viability Activity,Viability Potency (uM),Viability Efficacy (%),Sample Source
3,1,144203552.0,12850184.0,Inactive,0.0,,,inactive,inactive,,...,inactive,,0,inactive,,0,inactive,,0,NCI
4,2,144203553.0,89753.0,Inactive,0.0,,,inactive,inactive,,...,inactive,,0,inactive,,0,inactive,,0,NCI
5,3,144203554.0,9403.0,Active,57.0,,,active antagonist,active antagonist,7.56242,...,inactive,,0,active antagonist,7.56242,-95.0751,inconclusive antagonist,,0,NCI
6,4,144203555.0,13218779.0,Inactive,0.0,,,inactive,inactive,,...,inactive,,0,inactive,,0,inactive,,0,NCI
7,5,144203556.0,142766.0,Inactive,0.0,,,inactive,inactive,,...,inactive,,0,inactive,,0,inactive,,0,NCI
8,6,144203557.0,16043.0,Inactive,0.0,,,inactive,inactive,,...,inactive,,0,inactive,,0,inconclusive antagonist,,0,NCI
9,7,144203558.0,16043.0,Inconclusive,25.0,,,inconclusive antagonist (cytotoxic),active antagonist,11.9856,...,inconclusive antagonist,23.9145,-61.7417,active antagonist,17.5925,-145.137,active antagonist,23.9145,-99.9932,SigmaAldrich
10,8,144203559.0,2724411.0,Inconclusive,25.0,,,inconclusive antagonist (cytotoxic),inconclusive antagonist,16.9301,...,inactive,,0,inconclusive antagonist,16.9301,-132.937,inconclusive antagonist,12.6958,-45.2176,NCI
11,9,144203560.0,2724372.0,Inactive,0.0,,,inactive,inactive,,...,inactive,,0,inactive,,0,inactive,,0,NCI
12,10,144203561.0,637566.0,Inactive,0.0,,,inactive,inactive,,...,inactive,,0,inconclusive agonist,0.0848517,36.2494,inactive,,0,NCI


In [2]:
#Removing inconclusives
#antag_pre = antag_pre.drop(antag_pre.loc[antag_pre['Activity Summary'] == 'inconclusive'].index)
#antag_pre = antag_pre.drop(antag_pre.loc[antag_pre['Activity Summary'] == 'inconclusive agonist'].index)
#antag_pre = antag_pre.drop(antag_pre.loc[antag_pre['Activity Summary'] == 'inconclusive antagonist'].index)
#antag_pre = antag_pre.drop(antag_pre.loc[antag_pre['Activity Summary'] == 'inconclusive agonist (cytotoxic)'].index)
#antag_pre = antag_pre.drop(antag_pre.loc[antag_pre['Activity Summary'] == 'inconclusive antagonist (cytotoxic)'].index)

#Keeping only CID, Activity Summary, Ratio Potency (uM), and Ratio Efficacy
antag_pre = antag_pre[['PUBCHEM_CID', 'Activity Summary', 'Ratio Potency (uM)', 'Ratio Efficacy (%)']]
antag_pre.fillna(0, inplace = True)

##Dropping values with no CID
antag_pre = antag_pre.drop(antag_pre.loc[antag_pre['PUBCHEM_CID'] == 0].index)

#Sorting CID's in ascending order
antag_pre = antag_pre.sort_values(by = ['PUBCHEM_CID'], ascending = True, inplace = False)

In [3]:
antag_pre.shape

(10337, 4)

# Identifying Covalent Unit Count (antag)

In [4]:
#List of all CID's
CID_list = antag_pre['PUBCHEM_CID'].tolist()

#Making them integers (removing decimals)
CID_list = list(map(int, CID_list))

In [5]:
#Splitting CID_list into smaller strings (800 is around the max size for the full url)
list1 = CID_list[:800]
list2 = CID_list[800:1600]
list3 = CID_list[1600:2400]
#Switched to intervals of 700 (error if any larger)
list4 = CID_list[2400:3100]
#Switched to intervals of 600 (error if any larger)
list5 = CID_list[3100:3700]
list6 = CID_list[3700:4300]
list7 = CID_list[4300:4900]
list8 = CID_list[4900:5500]
list9 = CID_list[5500:6100]
#switched to intervals of 500 (integers get larger)
list10 = CID_list[6100:6600]
list11 = CID_list[6600:7100]
#switched to intervals of 400 (integers get larger)
list12 = CID_list[7100:7500]
list13 = CID_list[7500:7900]
list14 = CID_list[7900:8100]
list15 = CID_list[8100:8600]
list16 = CID_list[8600:9100]
list17 = CID_list[9100:9500]
list18 = CID_list[9500:9900]
list19 = CID_list[9900:10000]
list20 = CID_list[10000:]

#Changing list to str, dropping start/end brackets, removing spaces
str1 = (str(list1)[1:-1])
str1 = str1.replace(' ', '')
str2 = (str(list2)[1:-1])
str2 = str2.replace(' ', '')
str3 = (str(list3)[1:-1])
str3 = str3.replace(' ', '')
str4 = (str(list4)[1:-1])
str4 = str4.replace(' ', '')
str5 = (str(list5)[1:-1])
str5 = str5.replace(' ', '')
str6 = (str(list6)[1:-1])
str6 = str6.replace(' ', '')
str7 = (str(list7)[1:-1])
str7 = str7.replace(' ', '')
str8 = (str(list8)[1:-1])
str8 = str8.replace(' ', '')
str9 = (str(list9)[1:-1])
str9 = str9.replace(' ', '')
str10 = (str(list10)[1:-1])
str10 = str10.replace(' ', '')
str11 = (str(list11)[1:-1])
str11 = str11.replace(' ', '')
str12 = (str(list12)[1:-1])
str12 = str12.replace(' ', '')
str13 = (str(list13)[1:-1])
str13 = str13.replace(' ', '')
str14 = (str(list14)[1:-1])
str14 = str14.replace(' ', '')
str15 = (str(list15)[1:-1])
str15 = str15.replace(' ', '')
str16 = (str(list16)[1:-1])
str16 = str16.replace(' ', '')
str17 = (str(list17)[1:-1])
str17 = str17.replace(' ', '')
str18 = (str(list18)[1:-1])
str18 = str18.replace(' ', '')
str19 = (str(list19)[1:-1])
str19 = str19.replace(' ', '')
str20 = (str(list20)[1:-1])
str20 = str20.replace(' ', '')

In [6]:
#Getting the CovalentUnitCount for each CID
##IF it is greater than 1, it is a multicomponent compound

#opening and reading the 16 CovalentUnitCount URLs
url1 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str1 + '/property/CovalentUnitCount/TXT')
html1 = urlopen(url1) 
soup1 = BeautifulSoup(html1, 'lxml')
url2 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str2 + '/property/CovalentUnitCount/TXT')
html2 = urlopen(url2) 
soup2 = BeautifulSoup(html2, 'lxml')
url3 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str3 + '/property/CovalentUnitCount/TXT')
html3 = urlopen(url3) 
soup3 = BeautifulSoup(html3, 'lxml')
url4 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str4 + '/property/CovalentUnitCount/TXT')
html4 = urlopen(url4) 
soup4 = BeautifulSoup(html4, 'lxml')
url5 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str5 + '/property/CovalentUnitCount/TXT')
html5 = urlopen(url5) 
soup5 = BeautifulSoup(html5, 'lxml')
url6 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str6 + '/property/CovalentUnitCount/TXT')
html6 = urlopen(url6) 
soup6 = BeautifulSoup(html6, 'lxml')

cov_units1 = soup1.get_text()
cov_units2 = soup2.get_text()
cov_units3 = soup3.get_text()
cov_units4 = soup4.get_text()
cov_units5 = soup5.get_text()
cov_units6 = soup6.get_text()

In [7]:
url7 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str7 + '/property/CovalentUnitCount/TXT')
html7 = urlopen(url7) 
soup7 = BeautifulSoup(html7, 'lxml')
url8 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str8 + '/property/CovalentUnitCount/TXT')
html8 = urlopen(url8)
soup8 = BeautifulSoup(html8, 'lxml')
url9 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str9 + '/property/CovalentUnitCount/TXT')
html9 = urlopen(url9) 
soup9 = BeautifulSoup(html9, 'lxml')
url10 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str10 + '/property/CovalentUnitCount/TXT')
html10 = urlopen(url10) 
soup10 = BeautifulSoup(html10, 'lxml')
url11 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str11 + '/property/CovalentUnitCount/TXT')
html11 = urlopen(url11) 
soup11 = BeautifulSoup(html11, 'lxml')
url12 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str12 + '/property/CovalentUnitCount/TXT')
html12 = urlopen(url12) 
soup12 = BeautifulSoup(html12, 'lxml')

cov_units7 = soup7.get_text()
cov_units8 = soup8.get_text()                      
cov_units9 = soup9.get_text()
cov_units10 = soup10.get_text()
cov_units11 = soup11.get_text()
cov_units12 = soup12.get_text()

In [8]:
url13 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str13 + '/property/CovalentUnitCount/TXT')
html13 = urlopen(url13) 
soup13 = BeautifulSoup(html13, 'lxml')
url14 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str14 + '/property/CovalentUnitCount/TXT')
html14 = urlopen(url14) 
soup14 = BeautifulSoup(html14, 'lxml')
url15 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str15 + '/property/CovalentUnitCount/TXT')
html15 = urlopen(url15) 
soup15 = BeautifulSoup(html15, 'lxml')
url16 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str16 + '/property/CovalentUnitCount/TXT')
html16 = urlopen(url16) 
soup16 = BeautifulSoup(html16, 'lxml')

cov_units13 = soup13.get_text()
cov_units14 = soup14.get_text()
cov_units15 = soup15.get_text()
cov_units16 = soup16.get_text()


In [9]:
url17 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str17 + '/property/CovalentUnitCount/TXT')
html17 = urlopen(url17) 
soup17 = BeautifulSoup(html17, 'lxml')
url18 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str18 + '/property/CovalentUnitCount/TXT')
html18 = urlopen(url18) 
soup18 = BeautifulSoup(html18, 'lxml')
url19 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str19 + '/property/CovalentUnitCount/TXT')
html19 = urlopen(url19) 
soup19 = BeautifulSoup(html19, 'lxml')
url20 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str20 + '/property/CovalentUnitCount/TXT')
html20 = urlopen(url20) 
soup20 = BeautifulSoup(html20, 'lxml')

cov_units17 = soup17.get_text()
cov_units18 = soup18.get_text()
cov_units19 = soup19.get_text()
cov_units20 = soup20.get_text()

#total cov_units string
cov_units = cov_units1 + cov_units2 + cov_units3 + cov_units4 + cov_units5 + cov_units6 + cov_units7 + cov_units8 + cov_units9 + cov_units10 + cov_units11 + cov_units12 + cov_units13 + cov_units14 + cov_units15 + cov_units16 + cov_units17 + cov_units18 + cov_units19 + cov_units20 

#cov_units string to cov_units list
cov_units = cov_units.split()
#len(cov_units)

# Dealing with multicomponent molecules (antag)

In [10]:
#adding cov_units column to antag
antag_pre['CovalentUnitCount'] = cov_units

#Making copy of antag_pre
antag_cov = antag_pre

#dropping rows where covalent unit count is = 1
antag_cov = antag_cov[antag_cov.CovalentUnitCount != '1']
#2029 CID's that are multicomponent
antag_cov.shape

(2029, 5)

In [11]:
#getting CID list of multicomponent compounds
CID_list_multi = antag_cov['PUBCHEM_CID'].tolist()

#Removing decimals
CID_list_multi = list(map(int, CID_list_multi))


In [12]:
#Getting list of covalent unit number for the multicomponent compounds
#takes like an hr to run
list_cov_multi = antag_cov['CovalentUnitCount'].tolist()

parent_CID_list = []
i = 0
while i < len(CID_list_multi):
    try:
    # (list_cov_multi[i] == '2'):
        url1 = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str(CID_list_multi[i]) + '/cids/TXT?cids_type=parent'
        html1 = urlopen(url1) 
        soup1 = BeautifulSoup(html1, 'lxml')
        parent1 = soup1.get_text()
        #print(parent1)
        parent_CID_list.append(parent1)
        i = i + 1
       # else:
        #    i = i + 1
    except:
        #contine
        #print('eek')
        #if err.code == ('404: PUGREST.NotFound'):
        parent_CID_list.append('0')
        i = i + 1
        



len(parent_CID_list)

2029

In [13]:
#Removing '\n from the end of each CID in parent_CID_list'
bad_chars = ['\n']

i = 0
while i < len(parent_CID_list):
    if parent_CID_list[i] == '0':
        i = i + 1
    else:
        for j in bad_chars: 
            parent_CID_list[i] = parent_CID_list[i].replace(j, '')
            i = i + 1


In [14]:
#adding parent_CID's to antag_cov DF
antag_cov['Parent_CIDs'] = parent_CID_list

In [15]:
#now that we have a list of the parent compounds for each multiple component molecule we:
#delete all values with a parent of 0
#make a new df called antag_multi_to_parent
antag_multi_to_parent = antag_cov.drop(antag_cov.loc[antag_cov['Parent_CIDs'] == '0'].index)

##replace PUBCHEM_CID with parent CID's
###Drop CID column and rename Parent_CID's to just PUBCHEM_CID
antag_multi_to_parent = antag_multi_to_parent.drop(columns = ['PUBCHEM_CID'], axis = 1)
antag_multi_to_parent = antag_multi_to_parent.rename(index=str, columns={"Parent_CIDs" : "PUBCHEM_CID"})

In [16]:
#Removing all CID's with multicomponent compounds from antag_pre (original DF of all antag compounds)
antag_pre_minus_multi = antag_pre[~antag_pre.PUBCHEM_CID.isin(CID_list_multi)]

In [17]:
#Combining antag_pre with antag_multi_to_parent
antag = antag_pre_minus_multi.append(antag_multi_to_parent)

# Dealing with conflicts (antag)

In [18]:
#Dropping duplicates from original antag dataframe
antag_no_dups = antag.drop_duplicates(subset = 'PUBCHEM_CID', keep = False)

In [19]:
##Finding DF of all duplicates
antag_dups = antag[antag.duplicated('PUBCHEM_CID', keep = False)] 
antag_dups = antag_dups.sort_values(by = ['PUBCHEM_CID', 'Activity Summary'], ascending = True, inplace = False)

In [20]:
#Creating two seperate datasets for the three different cases (third case is all deleted)

In [21]:
##df_inactive = both activity scores inactive
###Drop all active agonists and active antagonists, and drop any values that are no longer dups.  
###This will leave CID dups with outcomes that are both inactive.
df_inactive = antag_dups.drop(antag_dups.loc[antag_dups['Activity Summary'] == 'active agonist'].index)
df_inactive = df_inactive.drop(df_inactive.loc[antag_dups['Activity Summary'] == 'active antagonist'].index)
df_inactive = df_inactive[df_inactive.duplicated(subset = 'PUBCHEM_CID', keep = False)]

#Only leaving one of each inactive value
df_inactive_clean = df_inactive.drop_duplicates(subset = 'PUBCHEM_CID', keep = 'first')

In [22]:
##df_active = both activity scores are active
###Drop all inactive values and drop any values that are no longer dups

df_active = antag_dups.drop(antag_dups.loc[antag_dups['Activity Summary'] == 'inactive'].index)
df_active = df_active[df_active.duplicated(subset = 'PUBCHEM_CID', keep = False)]

###Rank by ascending CID AND by ascending potency
###Drop duplicates and leave the FIRST value, which is the lowest potency!!!!!
df_active = df_active.sort_values(by = ['PUBCHEM_CID','Ratio Potency (uM)'], ascending = True, inplace = False)
df_active_clean = df_active.drop_duplicates(subset = 'PUBCHEM_CID', keep = 'first')

In [23]:
##df_disagree = the activity scores disagree
##We removed all the rest because they didn't disagree

In [24]:
#Combining antag_no_dups with df_active_clean and df_inactive_clean to create the final database with no dups
frames1 = [antag_no_dups, df_active_clean, df_inactive_clean]

#FINAL CLEAN DF
antag_clean = pd.concat(frames1)
antag_clean.shape

(7954, 5)

In [25]:
#Removing inconclusives (just to see)
antag_clean_no_inc = antag_clean.drop(antag_clean.loc[antag_clean['Activity Summary'] == 'inconclusive'].index)
antag_clean_no_inc = antag_clean.drop(antag_clean.loc[antag_clean['Activity Summary'] == 'inconclusive agonist'].index)
antag_clean_no_inc = antag_clean.drop(antag_clean.loc[antag_clean['Activity Summary'] == 'inconclusive antagonist'].index)
antag_clean_no_inc = antag_clean.drop(antag_clean.loc[antag_clean['Activity Summary'] == 'inconclusive agonist (cytotoxic)'].index)
antag_clean_no_inc = antag_clean.drop(antag_clean.loc[antag_clean['Activity Summary'] == 'inconclusive antagonist (cytotoxic)'].index)
antag_clean_no_inc.shape

(7639, 5)

# Data Cleaning Full (ag)

In [26]:
#Reading in Summary AGONIST CSV
ag_pre = pd.read_csv('AID_720719_datatable_all.csv')
ag_pre = ag_pre.iloc[4:]
#ag_pre.head()

In [27]:
#Removing inconclusives
#ag_pre = ag_pre.drop(ag_pre.loc[ag_pre['Activity Summary'] == 'inconclusive'].index)
#ag_pre = ag_pre.drop(ag_pre.loc[ag_pre['Activity Summary'] == 'inconclusive agonist'].index)
#ag_pre = ag_pre.drop(ag_pre.loc[ag_pre['Activity Summary'] == 'inconclusive agonist (fluorescent)'].index)
#ag_pre = ag_pre.drop(ag_pre.loc[ag_pre['Activity Summary'] == 'inconclusive antagonist'].index)
#ag_pre = ag_pre.drop(ag_pre.loc[ag_pre['Activity Summary'] == 'inconclusive agonist (cytotoxic)'].index)
#ag_pre = ag_pre.drop(ag_pre.loc[ag_pre['Activity Summary'] == 'inconclusive antagonist (cytotoxic)'].index)

#Keeping only CID, Activity Summary, Ratio Potency (uM), and Ratio Efficacy
ag_pre = ag_pre[['PUBCHEM_CID', 'Activity Summary', 'Ratio Potency (uM)', 'Ratio Efficacy (%)']]
ag_pre.fillna(0, inplace = True)

##Dropping values with no CID
ag_pre = ag_pre.drop(ag_pre.loc[ag_pre['PUBCHEM_CID'] == 0].index)

#Sorting CID's in ascending order
ag_pre = ag_pre.sort_values(by = ['PUBCHEM_CID'], ascending = True, inplace = False)

In [28]:
ag_pre.shape

(10337, 4)

# Identifying Covalent Unit Count (ag)

In [29]:
#List of all CID's
CID_list = ag_pre['PUBCHEM_CID'].tolist()

In [30]:
#Making them integers (removing decimals)
CID_list = [int(i) for i in CID_list]

In [31]:
#Splitting CID_list into smaller strings (800 is around the max size for the full url)
list1 = CID_list[:800]
list2 = CID_list[800:1600]
list3 = CID_list[1600:2400]
#Switched to intervals of 700 (error if any larger)
list4 = CID_list[2400:3100]
#Switched to intervals of 600 (error if any larger)
list5 = CID_list[3100:3700]
list6 = CID_list[3700:4300]
list7 = CID_list[4300:4900]
list8 = CID_list[4900:5500]
list9 = CID_list[5500:6100]
#switched to intervals of 500 (integers get larger)
list10 = CID_list[6100:6600]
list11 = CID_list[6600:7100]
#switched to intervals of 400 (integers get larger)
list12 = CID_list[7100:7500]
list13 = CID_list[7500:7900]
list14 = CID_list[7900:8400]
list15 = CID_list[8400:8800]
list16 = CID_list[8800:9200]
list17 = CID_list[9200:9600]
list18 = CID_list[9600:10000]
list19 = CID_list[10000:10338]

#Changing list to str, dropping start/end brackets, removing spaces
str1 = (str(list1)[1:-1])
str1 = str1.replace(' ', '')
str2 = (str(list2)[1:-1])
str2 = str2.replace(' ', '')
str3 = (str(list3)[1:-1])
str3 = str3.replace(' ', '')
str4 = (str(list4)[1:-1])
str4 = str4.replace(' ', '')
str5 = (str(list5)[1:-1])
str5 = str5.replace(' ', '')
str6 = (str(list6)[1:-1])
str6 = str6.replace(' ', '')
str7 = (str(list7)[1:-1])
str7 = str7.replace(' ', '')
str8 = (str(list8)[1:-1])
str8 = str8.replace(' ', '')
str9 = (str(list9)[1:-1])
str9 = str9.replace(' ', '')
str10 = (str(list10)[1:-1])
str10 = str10.replace(' ', '')
str11 = (str(list11)[1:-1])
str11 = str11.replace(' ', '')
str12 = (str(list12)[1:-1])
str12 = str12.replace(' ', '')
str13 = (str(list13)[1:-1])
str13 = str13.replace(' ', '')
str14 = (str(list14)[1:-1])
str14 = str14.replace(' ', '')
str15 = (str(list15)[1:-1])
str15 = str15.replace(' ', '')
str16 = (str(list16)[1:-1])
str16 = str16.replace(' ', '')
str17 = (str(list17)[1:-1])
str17 = str17.replace(' ', '')
str18 = (str(list18)[1:-1])
str18 = str18.replace(' ', '')
str19 = (str(list19)[1:-1])
str19 = str19.replace(' ', '')


In [32]:
#Getting the CovalentUnitCount for each CID
##IF it is greater than 1, it is a multicomponent compound

#opening and reading the 16 CovalentUnitCount URLs
url1 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str1 + '/property/CovalentUnitCount/TXT')
html1 = urlopen(url1) 
soup1 = BeautifulSoup(html1, 'lxml')
url2 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str2 + '/property/CovalentUnitCount/TXT')
html2 = urlopen(url2) 
soup2 = BeautifulSoup(html2, 'lxml')
url3 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str3 + '/property/CovalentUnitCount/TXT')
html3 = urlopen(url3) 
soup3 = BeautifulSoup(html3, 'lxml')
url4 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str4 + '/property/CovalentUnitCount/TXT')
html4 = urlopen(url4) 
soup4 = BeautifulSoup(html4, 'lxml')
url5 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str5 + '/property/CovalentUnitCount/TXT')
html5 = urlopen(url5) 
soup5 = BeautifulSoup(html5, 'lxml')
url6 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str6 + '/property/CovalentUnitCount/TXT')
html6 = urlopen(url6) 
soup6 = BeautifulSoup(html6, 'lxml')

cov_units1 = soup1.get_text()
cov_units2 = soup2.get_text()
cov_units3 = soup3.get_text()
cov_units4 = soup4.get_text()
cov_units5 = soup5.get_text()
cov_units6 = soup6.get_text()

In [33]:
url7 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str7 + '/property/CovalentUnitCount/TXT')
html7 = urlopen(url7) 
soup7 = BeautifulSoup(html7, 'lxml')
url8 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str8 + '/property/CovalentUnitCount/TXT')
html8 = urlopen(url8)
soup8 = BeautifulSoup(html8, 'lxml')
url9 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str9 + '/property/CovalentUnitCount/TXT')
html9 = urlopen(url9) 
soup9 = BeautifulSoup(html9, 'lxml')
url10 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str10 + '/property/CovalentUnitCount/TXT')
html10 = urlopen(url10) 
soup10 = BeautifulSoup(html10, 'lxml')
url11 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str11 + '/property/CovalentUnitCount/TXT')
html11 = urlopen(url11) 
soup11 = BeautifulSoup(html11, 'lxml')
url12 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str12 + '/property/CovalentUnitCount/TXT')
html12 = urlopen(url12) 
soup12 = BeautifulSoup(html12, 'lxml')

cov_units7 = soup7.get_text()
cov_units8 = soup8.get_text()                      
cov_units9 = soup9.get_text()
cov_units10 = soup10.get_text()
cov_units11 = soup11.get_text()
cov_units12 = soup12.get_text()

In [34]:
url13 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str13 + '/property/CovalentUnitCount/TXT')
html13 = urlopen(url13) 
soup13 = BeautifulSoup(html13, 'lxml')
url14 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str14 + '/property/CovalentUnitCount/TXT')
html14 = urlopen(url14) 
soup14 = BeautifulSoup(html14, 'lxml')
url15 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str15 + '/property/CovalentUnitCount/TXT')
html15 = urlopen(url15) 
soup15 = BeautifulSoup(html15, 'lxml')
url16 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str16 + '/property/CovalentUnitCount/TXT')
html16 = urlopen(url16) 
soup16 = BeautifulSoup(html16, 'lxml')
url17 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str17 + '/property/CovalentUnitCount/TXT')
html17 = urlopen(url17) 
soup17 = BeautifulSoup(html17, 'lxml')

cov_units13 = soup13.get_text()
cov_units14 = soup14.get_text()
cov_units15 = soup15.get_text()
cov_units16 = soup16.get_text()
cov_units17 = soup17.get_text()

In [35]:
url18 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str18 + '/property/CovalentUnitCount/TXT')
html18 = urlopen(url18) 
soup18 = BeautifulSoup(html18, 'lxml')
url19 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str19 + '/property/CovalentUnitCount/TXT')
html19 = urlopen(url19) 
soup19 = BeautifulSoup(html19, 'lxml')

cov_units18 = soup18.get_text()
cov_units19 = soup19.get_text()


#total cov_units string
cov_units = cov_units1 + cov_units2 + cov_units3 + cov_units4 + cov_units5 + cov_units6 + cov_units7 + cov_units8 + cov_units9 + cov_units10 + cov_units11 + cov_units12 + cov_units13 + cov_units14 + cov_units15 + cov_units16 + cov_units17 + cov_units18 + cov_units19 

#cov_units string to cov_units list
cov_units = cov_units.split()
len(cov_units)

10337

# Dealing with Multicomponent Molecules (ag)

In [36]:
#adding cov_units column to ag
ag_pre['CovalentUnitCount'] = cov_units

#Making copy of antag_pre
ag_cov = ag_pre

#dropping rows where covalent unit count is = 1
ag_cov = ag_cov[ag_cov.CovalentUnitCount != '1']
#2029 CID's that are multicomponent
ag_cov.shape

(2029, 5)

In [37]:
#getting CID list of multicomponent compounds
CID_list_multi = ag_cov['PUBCHEM_CID'].tolist()

#Making them integers (removing decimals)
CID_list_multi = [int(i) for i in CID_list_multi]

In [38]:
#Getting list of covalent unit number for the multicomponent compounds
#takes like an hr to run
list_cov_multi = ag_cov['CovalentUnitCount'].tolist()

parent_CID_list = []
i = 0
while i < len(CID_list_multi):
    try:
    # (list_cov_multi[i] == '2'):
        url1 = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str(CID_list_multi[i]) + '/cids/TXT?cids_type=parent'
        html1 = urlopen(url1) 
        soup1 = BeautifulSoup(html1, 'lxml')
        parent1 = soup1.get_text()
        #print(parent1)
        parent_CID_list.append(parent1)
        i = i + 1
       # else:
        #    i = i + 1
    except:
        #contine
        #print('eek')
        #if err.code == ('404: PUGREST.NotFound'):
        parent_CID_list.append('0')
        i = i + 1

len(parent_CID_list)

2029

In [39]:
#Removing '\n from the end of each CID in parent_CID_list'
bad_chars = ['\n']

i = 0
while i < len(parent_CID_list):
    if parent_CID_list[i] == '0':
        i = i + 1
    else:
        for j in bad_chars: 
            parent_CID_list[i] = parent_CID_list[i].replace(j, '')
            i = i + 1


In [40]:
#adding parent_CID's to antag_cov DF
ag_cov['Parent_CIDs'] = parent_CID_list

In [41]:
#now that we have a list of the parent compounds for each multiple component molecule we:
#delete all values with a parent of 0
#make a new df called ag_multi_to_parent
ag_multi_to_parent = ag_cov.drop(ag_cov.loc[ag_cov['Parent_CIDs'] == '0'].index)


##replace PUBCHEM_CID with parent CID's
###Drop CID column and rename Parent_CID's to just PUBCHEM_CID
ag_multi_to_parent = ag_multi_to_parent.drop(columns = ['PUBCHEM_CID'], axis = 1)
ag_multi_to_parent = ag_multi_to_parent.rename(index=str, columns={"Parent_CIDs" : "PUBCHEM_CID"})

In [42]:
#Removing all CID's with multicomponent compounds from ag_pre (original DF of all antag compounds)
ag_pre_minus_multi = ag_pre[~ag_pre.PUBCHEM_CID.isin(CID_list_multi)]


In [43]:
#Combining ag_pre with ag_multi_to_parent
ag = ag_pre_minus_multi.append(ag_multi_to_parent)

# Dealing with conflicts (ag)

In [44]:
#Dropping duplicates from original ag dataframe
ag_no_dups = ag.drop_duplicates(subset = 'PUBCHEM_CID', keep = False)

In [45]:
##Finding DF of all duplicates
ag_dups = ag[ag.duplicated('PUBCHEM_CID', keep = False)] 
ag_dups = ag_dups.sort_values(by = ['PUBCHEM_CID', 'Activity Summary'], ascending = True, inplace = False)

In [46]:
#Creating two seperate datasets for the three different cases (third case is all deleted)

In [47]:
##df_inactive_ag = both activity scores inactive
###Drop all active agonists and active antagonists, and drop any values that are no longer dups.  
###This will leave CID dups with outcomes that are both active.
df_inactive_ag = ag_dups.drop(ag_dups.loc[ag_dups['Activity Summary'] == 'active agonist'].index)
df_inactive_ag = df_inactive_ag.drop(df_inactive_ag.loc[ag_dups['Activity Summary'] == 'active antagonist'].index)
df_inactive_ag = df_inactive_ag[df_inactive_ag.duplicated(subset = 'PUBCHEM_CID', keep = False)]

#Only leaving one of each inactive value
df_inactive_ag_clean = df_inactive_ag.drop_duplicates(subset = 'PUBCHEM_CID', keep = 'first')

In [48]:
##df_active_ag = both activity scores are active
###Drop all inactive values and drop any values that are no longer dups

df_active_ag = ag_dups.drop(ag_dups.loc[ag_dups['Activity Summary'] == 'inactive'].index)
df_active_ag = df_active_ag[df_active_ag.duplicated(subset = 'PUBCHEM_CID', keep = False)]

###Rank by descending CID AND by descending potency
###Drop duplicates and leave the FIRST value, which is the lowest potency!!!!!
df_active_ag = df_active_ag.sort_values(by = ['PUBCHEM_CID', 'Ratio Potency (uM)'], ascending = True, inplace = False)
df_active_ag_clean = df_active_ag.drop_duplicates(subset = 'PUBCHEM_CID', keep = 'first')

In [49]:
##df_disagree = the activity scores disagree
##We removed all the rest because they didn't disagree

In [50]:
#Combining antag_no_dups with df_active_clean and df_inactive_clean to create the final database with no dups
frames2 = [ag_no_dups, df_active_ag_clean, df_inactive_ag_clean]
#FINAL CLEAN DF
ag_clean = pd.concat(frames2)
ag_clean.shape
#(ag_clean['Activity Summary'] == 'inactive').sum()

(7857, 5)

In [51]:
#Removing inconclusives (just to see)
ag_clean_no_inc = ag_clean.drop(ag_clean.loc[ag_clean['Activity Summary'] == 'inconclusive'].index)
ag_clean_no_inc = ag_clean.drop(ag_clean.loc[ag_clean['Activity Summary'] == 'inconclusive agonist'].index)
ag_clean_no_inc = ag_clean.drop(ag_clean.loc[ag_clean['Activity Summary'] == 'inconclusive antagonist'].index)
ag_clean_no_inc = ag_clean.drop(ag_clean.loc[ag_clean['Activity Summary'] == 'inconclusive agonist (cytotoxic)'].index)
ag_clean_no_inc = ag_clean.drop(ag_clean.loc[ag_clean['Activity Summary'] == 'inconclusive antagonist (cytotoxic)'].index)
ag_clean_no_inc.shape

(7857, 5)

# Combining Dataframes and Cleaning (both)

In [52]:
#Combining antag_clean and ag_clean
frames3 = [antag_clean, ag_clean]
final_concat_covcount = pd.concat(frames3)

#Dropping Covalent Unit Column
final_concat = final_concat_covcount.drop(columns = ['CovalentUnitCount'], axis = 1)

#Changing CID's to int instead of string
final_concat['PUBCHEM_CID'] = (final_concat['PUBCHEM_CID']).astype(int)
#final_concat.dtypes

In [53]:
#Dropping duplicates from original final dataframe
final_no_dups = final_concat.drop_duplicates(subset = 'PUBCHEM_CID', keep = False)

In [54]:
##Finding DF of all duplicates
final_dups = final_concat[final_concat.duplicated('PUBCHEM_CID', keep = False)] 
final_dups = final_dups.sort_values(by = ['PUBCHEM_CID'], ascending = True, inplace = False)
final_dups = final_dups.reset_index(drop = True)

In [55]:
##df_inactive_final = both activity scores inactive
###Drop all active agonists and active antagonists, and drop any values that are no longer dups.  
###This will leave CID dups with outcomes that are both inactive.
df_inactive_final = final_dups.drop(final_dups.loc[final_dups['Activity Summary'] == 'active agonist'].index)
df_inactive_final = df_inactive_final.drop(df_inactive_final.loc[final_dups['Activity Summary'] == 'active antagonist'].index)
df_inactive_final = df_inactive_final[df_inactive_final.duplicated(subset = 'PUBCHEM_CID', keep = False)]

#Only leaving one of each inactive value
df_inactive_final_clean = df_inactive_final.drop_duplicates(subset = 'PUBCHEM_CID', keep = 'first')

In [56]:
##df_active_ag_final = both activity scores are active agonist
###Drop all inactive values and drop any values that are no longer dups
###Drop all active antagonist values

df_active_ag_final = final_dups.drop(final_dups.loc[final_dups['Activity Summary'] == 'inactive'].index)
df_active_ag_final = df_active_ag_final.drop(final_dups.loc[final_dups['Activity Summary'] == 'active antagonist'].index)
df_active_ag_final = df_active_ag_final[df_active_ag_final.duplicated(subset = 'PUBCHEM_CID', keep = False)]

###Rank by descending CID AND by descending potency
###Drop duplicates and leave the FIRST value, which is the lowest potency!!!!!
df_active_ag_final = df_active_ag_final.sort_values(by = ['PUBCHEM_CID', 'Ratio Potency (uM)'], ascending = True, inplace = False)
df_active_ag_final_clean = df_active_ag_final.drop_duplicates(subset = 'PUBCHEM_CID', keep = 'first')

In [57]:
##df_active_antag_final = both activity scores are active antagonists
###Drop all inactive values and drop any values that are no longer dups
###Drop all active agonist values

df_active_antag_final = final_dups.drop(final_dups.loc[final_dups['Activity Summary'] == 'inactive'].index)
df_active_antag_final = df_active_antag_final.drop(final_dups.loc[final_dups['Activity Summary'] == 'active agonist'].index)
df_active_antag_final = df_active_antag_final[df_active_antag_final.duplicated(subset = 'PUBCHEM_CID', keep = False)]

###Rank by descending CID AND by descending potency
###Drop duplicates and leave the FIRST value, which is the lowest potency!!!!!
df_active_antag_final = df_active_antag_final.sort_values(by = ['PUBCHEM_CID', 'Ratio Potency (uM)'], ascending = True, inplace = False)
df_active_antag_final_clean = df_active_antag_final.drop_duplicates(subset = 'PUBCHEM_CID', keep = 'first')

In [58]:
#getting lists of CID's of rows that fall into othr categories
inactive_list = df_inactive_final['PUBCHEM_CID'].tolist()
active_ag_list = df_active_ag_final['PUBCHEM_CID'].tolist()
active_antag_list = df_active_antag_final['PUBCHEM_CID'].tolist()
#combining lists
index_list = inactive_list + active_ag_list + active_antag_list

#Dropping those CID's from full dups dataframe final_dups
##This will create a dataframe with CID pairs that are either inactive/active or active/active
##Any remaining active/active CID pairs must be active agonist/antagonist pairs, and are inconclusive
##We will delete those values
final_dups_newindex = final_dups.set_index('PUBCHEM_CID')
df_disagree = final_dups_newindex.drop(labels = index_list)
df_disagree = df_disagree.reset_index()

In [59]:
#Sorting by CID
df_disagree = df_disagree.sort_values(by = ['PUBCHEM_CID', 'Activity Summary'], ascending = True, inplace = False)

In [60]:
#We remove all inactive values (BC we are keeping active)
df_disagree = df_disagree.drop(df_disagree.loc[df_disagree['Activity Summary'] == 'inactive'].index)
#We then delete any remaining duplicates, because that means they were active agonist/antagonist pairs
df_disagree_clean = df_disagree.drop_duplicates(subset = 'PUBCHEM_CID', keep = False)

In [61]:
#We combine all the pieces into the full cleaned combined dataframe
frames4 = [final_no_dups, df_inactive_final_clean, df_active_ag_final_clean, df_active_antag_final_clean, df_disagree_clean]
final_clean = pd.concat(frames4)
final_clean.shape

(8714, 4)

In [62]:
##NEWLY ADDED STEP
#NOW REMOVING INCONCLUSIVES AT END
#Removing inconclusives (just to see)
final_clean = final_clean.drop(final_clean.loc[final_clean['Activity Summary'] == 'inconclusive'].index)
final_clean = final_clean.drop(final_clean.loc[final_clean['Activity Summary'] == 'inconclusive agonist'].index)
final_clean = final_clean.drop(final_clean.loc[final_clean['Activity Summary'] == 'inconclusive antagonist'].index)
final_clean = final_clean.drop(final_clean.loc[final_clean['Activity Summary'] == 'inconclusive agonist (cytotoxic)'].index)
final_clean = final_clean.drop(final_clean.loc[final_clean['Activity Summary'] == 'inconclusive antagonist (cytotoxic)'].index)
final_clean = final_clean.drop(final_clean.loc[final_clean['Activity Summary'] == 'inconclusive agonist (fluorescent)'].index)
final_clean.shape

(6472, 4)

In [63]:
final_clean['Activity Summary'].value_counts()

inactive             6043
active antagonist     232
active agonist        197
Name: Activity Summary, dtype: int64

In [64]:
#Writing to csv
final_clean.to_csv('TEST_Final_Merged_Cleaned_CSV_7-1', index = False)


In [1]:
final = pd.read_csv('TEST_Final_Merged_Cleaned_CSV_7-1')
final.shape

NameError: name 'pd' is not defined