# ***RUN NOTEBOOK IN ORDER

# Dealing with multicomponent compounds

# Antagonist Uncleaned Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from requests import exceptions
import re
%matplotlib inline

import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup

#hiding warning messages
import warnings
warnings.filterwarnings("ignore")

#Reading in Summary ANTAGONIST CSV
antag_pre = pd.read_csv('AID_720725_datatable_all.csv')
antag_pre = antag_pre.iloc[3:]
antag_pre.head()

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity Summary,Ratio Activity,Ratio Potency (uM),...,530 nm Activity,530 nm Potency (uM),530 nm Efficacy (%),460 nm Activity,460 nm Potency (uM),460 nm Efficacy (%),Viability Activity,Viability Potency (uM),Viability Efficacy (%),Sample Source
3,1,144203552.0,12850184.0,Inactive,0.0,,,inactive,inactive,,...,inactive,,0,inactive,,0.0,inactive,,0,NCI
4,2,144203553.0,89753.0,Inactive,0.0,,,inactive,inactive,,...,inactive,,0,inactive,,0.0,inactive,,0,NCI
5,3,144203554.0,9403.0,Active,57.0,,,active antagonist,active antagonist,7.56242,...,inactive,,0,active antagonist,7.56242,-95.0751,inconclusive antagonist,,0,NCI
6,4,144203555.0,13218779.0,Inactive,0.0,,,inactive,inactive,,...,inactive,,0,inactive,,0.0,inactive,,0,NCI
7,5,144203556.0,142766.0,Inactive,0.0,,,inactive,inactive,,...,inactive,,0,inactive,,0.0,inactive,,0,NCI


In [2]:
#Removing inconclusives
antag_pre = antag_pre.drop(antag_pre.loc[antag_pre['Activity Summary'] == 'inconclusive'].index)
antag_pre = antag_pre.drop(antag_pre.loc[antag_pre['Activity Summary'] == 'inconclusive agonist'].index)
antag_pre = antag_pre.drop(antag_pre.loc[antag_pre['Activity Summary'] == 'inconclusive antagonist'].index)
antag_pre = antag_pre.drop(antag_pre.loc[antag_pre['Activity Summary'] == 'inconclusive agonist (cytotoxic)'].index)
antag_pre = antag_pre.drop(antag_pre.loc[antag_pre['Activity Summary'] == 'inconclusive antagonist (cytotoxic)'].index)

#Keeping only CID, Activity Summary, Ratio Potency (uM), and Ratio Efficacy
antag_pre = antag_pre[['PUBCHEM_CID', 'Activity Summary', 'Ratio Potency (uM)', 'Ratio Efficacy (%)']]
antag_pre.fillna(0, inplace = True)

##Dropping values with no CID
antag_pre = antag_pre.drop(antag_pre.loc[antag_pre['PUBCHEM_CID'] == 0].index)

#Sorting CID's in ascending order
antag_pre = antag_pre.sort_values(by = ['PUBCHEM_CID'], ascending = True, inplace = False)

# Identifying Covalent Unit Count Antagonists

In [3]:
#List of all CID's
CID_list = antag_pre['PUBCHEM_CID'].tolist()

#Making them integers (removing decimals)
CID_list = list(map(int, CID_list))

In [4]:
#Splitting CID_list into smaller strings (800 is around the max size for the full url)
list1 = CID_list[:800]
list2 = CID_list[800:1600]
list3 = CID_list[1600:2400]
#Switched to intervals of 700 (error if any larger)
list4 = CID_list[2400:3100]
#Switched to intervals of 600 (error if any larger)
list5 = CID_list[3100:3700]
list6 = CID_list[3700:4300]
list7 = CID_list[4300:4900]
list8 = CID_list[4900:5500]
list9 = CID_list[5500:6100]
#switched to intervals of 500 (integers get larger)
list10 = CID_list[6100:6600]
list11 = CID_list[6600:7100]
#switched to intervals of 400 (integers get larger)
list12 = CID_list[7100:7500]
list13 = CID_list[7500:7900]
list14 = CID_list[7900:8100]
list15 = CID_list[8100:]
#Already used all CID's, don't need additional list
#list16 = CID_list[7600:7700]

#Changing list to str, dropping start/end brackets, removing spaces
str1 = (str(list1)[1:-1])
str1 = str1.replace(' ', '')
str2 = (str(list2)[1:-1])
str2 = str2.replace(' ', '')
str3 = (str(list3)[1:-1])
str3 = str3.replace(' ', '')
str4 = (str(list4)[1:-1])
str4 = str4.replace(' ', '')
str5 = (str(list5)[1:-1])
str5 = str5.replace(' ', '')
str6 = (str(list6)[1:-1])
str6 = str6.replace(' ', '')
str7 = (str(list7)[1:-1])
str7 = str7.replace(' ', '')
str8 = (str(list8)[1:-1])
str8 = str8.replace(' ', '')
str9 = (str(list9)[1:-1])
str9 = str9.replace(' ', '')
str10 = (str(list10)[1:-1])
str10 = str10.replace(' ', '')
str11 = (str(list11)[1:-1])
str11 = str11.replace(' ', '')
str12 = (str(list12)[1:-1])
str12 = str12.replace(' ', '')
str13 = (str(list13)[1:-1])
str13 = str13.replace(' ', '')
str14 = (str(list14)[1:-1])
str14 = str14.replace(' ', '')
str15 = (str(list15)[1:-1])
str15 = str15.replace(' ', '')
#str16 = (str(list16)[1:-1])
#str16 = str16.replace(' ', '')



In [5]:
#Getting the CovalentUnitCount for each CID
##IF it is greater than 1, it is a multicomponent compound

#opening and reading the 16 CovalentUnitCount URLs
url1 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str1 + '/property/CovalentUnitCount/TXT')
html1 = urlopen(url1) 
soup1 = BeautifulSoup(html1, 'lxml')
url2 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str2 + '/property/CovalentUnitCount/TXT')
html2 = urlopen(url2) 
soup2 = BeautifulSoup(html2, 'lxml')
url3 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str3 + '/property/CovalentUnitCount/TXT')
html3 = urlopen(url3) 
soup3 = BeautifulSoup(html3, 'lxml')
url4 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str4 + '/property/CovalentUnitCount/TXT')
html4 = urlopen(url4) 
soup4 = BeautifulSoup(html4, 'lxml')
url5 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str5 + '/property/CovalentUnitCount/TXT')
html5 = urlopen(url5) 
soup5 = BeautifulSoup(html5, 'lxml')
url6 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str6 + '/property/CovalentUnitCount/TXT')
html6 = urlopen(url6) 
soup6 = BeautifulSoup(html6, 'lxml')

cov_units1 = soup1.get_text()
cov_units2 = soup2.get_text()
cov_units3 = soup3.get_text()
cov_units4 = soup4.get_text()
cov_units5 = soup5.get_text()
cov_units6 = soup6.get_text()

In [6]:
url7 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str7 + '/property/CovalentUnitCount/TXT')
html7 = urlopen(url7) 
soup7 = BeautifulSoup(html7, 'lxml')
url8 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str8 + '/property/CovalentUnitCount/TXT')
html8 = urlopen(url8)
soup8 = BeautifulSoup(html8, 'lxml')
url9 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str9 + '/property/CovalentUnitCount/TXT')
html9 = urlopen(url9) 
soup9 = BeautifulSoup(html9, 'lxml')
url10 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str10 + '/property/CovalentUnitCount/TXT')
html10 = urlopen(url10) 
soup10 = BeautifulSoup(html10, 'lxml')
url11 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str11 + '/property/CovalentUnitCount/TXT')
html11 = urlopen(url11) 
soup11 = BeautifulSoup(html11, 'lxml')
url12 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str12 + '/property/CovalentUnitCount/TXT')
html12 = urlopen(url12) 
soup12 = BeautifulSoup(html12, 'lxml')

cov_units7 = soup7.get_text()
cov_units8 = soup8.get_text()                      
cov_units9 = soup9.get_text()
cov_units10 = soup10.get_text()
cov_units11 = soup11.get_text()
cov_units12 = soup12.get_text()

In [7]:
url13 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str13 + '/property/CovalentUnitCount/TXT')
html13 = urlopen(url13) 
soup13 = BeautifulSoup(html13, 'lxml')
url14 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str14 + '/property/CovalentUnitCount/TXT')
html14 = urlopen(url14) 
soup14 = BeautifulSoup(html14, 'lxml')
url15 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str15 + '/property/CovalentUnitCount/TXT')
html15 = urlopen(url15) 
soup15 = BeautifulSoup(html15, 'lxml')
#url16 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str16 + '/property/CovalentUnitCount/TXT')
#html16 = urlopen(url16) 
#soup16 = BeautifulSoup(html16, 'lxml')

cov_units13 = soup13.get_text()
cov_units14 = soup14.get_text()
cov_units15 = soup15.get_text()
#cov_units16 = soup16.get_text()

#total cov_units string
cov_units = cov_units1 + cov_units2 + cov_units3 + cov_units4 + cov_units5 + cov_units6 + cov_units7 + cov_units8 + cov_units9 + cov_units10 + cov_units11 + cov_units12 + cov_units13 + cov_units14 + cov_units15 

#cov_units string to cov_units list
cov_units = cov_units.split()
#len(cov_units)

# Dealing with multicomponent molecules

In [11]:
#adding cov_units column to antag
antag_pre['CovalentUnitCount'] = cov_units

#Making copy of antag_pre
antag_cov = antag_pre

#dropping rows where covalent unit count is = 1
antag_cov = antag_cov[antag_cov.CovalentUnitCount != '1']
#1581 CID's that are multicomponent

In [12]:
#getting CID list of multicomponent compounds
CID_list_multi = antag_cov['PUBCHEM_CID'].tolist()

#Removing decimals
CID_list_multi = list(map(int, CID_list_multi))

#Splitting CID_list into smaller strings (600 is around the max size for the full url)
list1 = CID_list_multi[:600]
#Switched to intervals of 500 (error if any larger)
list2 = CID_list_multi[600:1100]
#Switched to intervals of 400 (error if any larger)
list3 = CID_list_multi[1100:1500]
#Final interval
list4 = CID_list_multi[1500:]

#Changing list to str, dropping start/end brackets, removing spaces
str1 = (str(list1)[1:-1])
str1 = str1.replace(' ', '')
str2 = (str(list2)[1:-1])
str2 = str2.replace(' ', '')
str3 = (str(list3)[1:-1])
str3 = str3.replace(' ', '')
str4 = (str(list4)[1:-1])
str4 = str4.replace(' ', '')

In [29]:
#if/els Getting list of covalent unit number for the multicomponent compounds
list_cov_multi = antag_cov['CovalentUnitCount'].tolist()

list = []
i = 0
while i < len(CID_list_multi):
    if (list_cov_multi[i]) == 2:
    # if (list_cov_multi[i] == '2'):
        url1 = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str(CID_list_multi[i]) + '/cids/TXT?cids_type=parent'
        html1 = urlopen(url1) 
        soup1 = BeautifulSoup(html1, 'lxml')
        parent1 = soup1.get_text()
        print(parent1)
        list.append(parent1)
        i = i + 1
       # else:
        #    i = i + 1
    elif (list_cov_multi[i]) == 3:
    # if (list_cov_multi[i] == '2'):
        url1 = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str(CID_list_multi[i]) + '/cids/TXT?cids_type=parent'
        html1 = urlopen(url1) 
        soup1 = BeautifulSoup(html1, 'lxml')
        parent1 = soup1.get_text()
        print(parent1)
        list.append(parent1)
        i = i + 1
       # else:
        #    i = i + 1
        
    else:
        #print('eek')
        #if err.code == ('404: PUGREST.NotFound'):
        #list.append('0')
        i = i + 1
        

        
            
        
#parent1 = soup1.get_text()



len(list)

0

In [42]:
#Getting list of covalent unit number for the multicomponent compounds
list_cov_multi = antag_cov['CovalentUnitCount'].tolist()

parent_CID_list = []
i = 0
while i < len(CID_list_multi):
    try:
    # (list_cov_multi[i] == '2'):
        url1 = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str(CID_list_multi[i]) + '/cids/TXT?cids_type=parent'
        html1 = urlopen(url1) 
        soup1 = BeautifulSoup(html1, 'lxml')
        parent1 = soup1.get_text()
        #print(parent1)
        parent_CID_list.append(parent1)
        i = i + 1
       # else:
        #    i = i + 1
    except:
        #contine
        #print('eek')
        #if err.code == ('404: PUGREST.NotFound'):
        parent_CID_list.append('0')
        i = i + 1
        

        
            
        
#parent1 = soup1.get_text()


len(parent_CID_list)

1581

In [24]:
#Getting the Parent Compound for each Multicomponent CID

#opening and reading the 4 CovalentUnitCount URLs
url1 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str1 + '/cids/TXT?cids_type=parent')
html1 = urlopen(url1) 
soup1 = BeautifulSoup(html1, 'lxml')
url2 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str2 + '/cids/TXT?cids_type=parent')
html2 = urlopen(url2) 
soup2 = BeautifulSoup(html2, 'lxml')
url3 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str3 + '/cids/TXT?cids_type=parent')
html3 = urlopen(url3) 
soup3 = BeautifulSoup(html3, 'lxml')
url4 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str4 + '/cids/TXT?cids_type=parent')
html4 = urlopen(url4) 
soup4 = BeautifulSoup(html4, 'lxml')

parent1 = soup1.get_text()
parent2 = soup2.get_text()
parent3 = soup3.get_text()
parent4 = soup4.get_text()

parent = parent1 + parent2 + parent3 + parent4

#parent string to parent list
parent = parent.split()
len(parent)


1149

In [97]:
#Removing '\n from the end of each CID in parent_CID_list'
bad_chars = ['\n']

i = 0
while i < len(parent_CID_list):
    if parent_CID_list[i] == '0':
        i = i + 1
    else:
        for j in bad_chars: 
            parent_CID_list[i] = parent_CID_list[i].replace(j, '')
            i = i + 1


In [99]:
#adding parent_CID's to antag_cov DF
antag_cov['Parent_CIDs'] = parent_CID_list

ValueError: Length of values does not match length of index

In [107]:
#now that we have a list of the parent compounds for each multiple component molecule we:
#delete all values with a parent of 0
antag_cov = antag_cov.drop(antag_cov.loc[antag_cov['Parent_CIDs'] == '0'].index)

#make a new df called antag
##replace CID's with parent CID's
###Drop CID column and rename Parent_CID's to just PUBCHEM_CID
antag = antag_cov.drop(columns = ['PUBCHEM_CID'], axis = 1)
antag = antag.rename(index=str, columns={"Parent_CIDs" : "PUBCHEM_CID"})

6113

In [109]:
#Removing all CID's with multicomponent compounds from antag_pre (original DF of all antag compounds)
i = 0

while i < len(antag_pre.index):
    if 
#Combining antag_pre with antag


Int64Index([8470, 6113, 7329, 4958, 7918, 6812,   18, 4565, 5536, 3814,
            ...
            7223, 8665, 7875, 2626, 3514, 2562, 2532, 2479, 3062,  762],
           dtype='int64', length=8359)

In [None]:
##Finding DF of all duplicates
antag_dups = antag[antag.duplicated('PUBCHEM_CID', keep = False)] 
antag_dups = antag_dups.sort_values(by = ['PUBCHEM_CID', 'Activity Summary'], ascending = True, inplace = False)

In [None]:
#Creating two seperate datasets for the three different cases (third case is all deleted)

In [None]:
##df_inactive = both activity scores inactive
###Drop all active agonists and active antagonists, and drop any values that are no longer dups.  
###This will leave CID dups with outcomes that are both inactive.
df_inactive = antag_dups.drop(antag_dups.loc[antag_dups['Activity Summary'] == 'active agonist'].index)
df_inactive = df_inactive.drop(df_inactive.loc[antag_dups['Activity Summary'] == 'active antagonist'].index)
df_inactive = df_inactive[df_inactive.duplicated(subset = 'PUBCHEM_CID', keep = False)]

#Only leaving one of each inactive value
df_inactive_clean = df_inactive.drop_duplicates(subset = 'PUBCHEM_CID', keep = 'first')

In [None]:
##df_active = both activity scores are active
###Drop all inactive values and drop any values that are no longer dups

df_active = antag_dups.drop(antag_dups.loc[antag_dups['Activity Summary'] == 'inactive'].index)
df_active = df_active[df_active.duplicated(subset = 'PUBCHEM_CID', keep = False)]

###Rank by ascending CID AND by ascending potency
###Drop duplicates and leave the FIRST value, which is the lowest potency!!!!!
df_active = df_active.sort_values(by = ['PUBCHEM_CID','Ratio Potency (uM)'], ascending = True, inplace = False)
df_active_clean = df_active.drop_duplicates(subset = 'PUBCHEM_CID', keep = 'first')

In [None]:
##df_disagree = the activity scores disagree
##We removed all the rest because they didn't disagree

In [None]:
#Combining antag_no_dups with df_active_clean and df_inactive_clean to create the final database with no dups
frames1 = [antag_no_dups, df_active_clean, df_inactive_clean]
#FINAL CLEAN DF
antag_clean = pd.concat(frames1)

# Cleaning Agonist Data

In [None]:
#Reading in Summary AGONIST CSV
ag_pre = pd.read_csv('AID_720719_datatable_all.csv')
ag_pre = ag_pre.iloc[4:]
ag_pre = ag_pre.reset_index()
ag_pre.head()

In [None]:
#Removing inconclusives
ag_pre = ag_pre.drop(ag_pre.loc[ag_pre['Activity Summary'] == 'inconclusive'].index)
ag_pre = ag_pre.drop(ag_pre.loc[ag_pre['Activity Summary'] == 'inconclusive agonist'].index)
ag_pre = ag_pre.drop(ag_pre.loc[ag_pre['Activity Summary'] == 'inconclusive agonist (fluorescent)'].index)
ag_pre = ag_pre.drop(ag_pre.loc[ag_pre['Activity Summary'] == 'inconclusive antagonist'].index)
ag_pre = ag_pre.drop(ag_pre.loc[ag_pre['Activity Summary'] == 'inconclusive agonist (cytotoxic)'].index)
ag_pre = ag_pre.drop(ag_pre.loc[ag_pre['Activity Summary'] == 'inconclusive antagonist (cytotoxic)'].index)

#Keeping only CID, Activity Summary, Ratio Potency (uM), and Ratio Efficacy
ag_pre = ag_pre[['PUBCHEM_CID', 'Activity Summary', 'Ratio Potency (uM)', 'Ratio Efficacy (%)']]
ag_pre.fillna(0, inplace = True)

##Dropping values with no CID
ag_pre = ag_pre.drop(ag_pre.loc[ag_pre['PUBCHEM_CID'] == 0].index)

#Sorting CID's in ascending order
ag_pre = ag_pre.sort_values(by = ['PUBCHEM_CID'], ascending = True, inplace = False)

# Identifying Covalent Unit Count Agonist

In [None]:
#List of all CID's
CID_list = antag_pre['PUBCHEM_CID'].tolist()

#Making them integers (removing decimals)
CID_list = list(map(int, CID_list))

In [None]:
#Splitting CID_list into smaller strings (800 is around the max size for the full url)
list1 = CID_list[:800]
list2 = CID_list[800:1600]
list3 = CID_list[1600:2400]
#Switched to intervals of 700 (error if any larger)
list4 = CID_list[2400:3100]
#Switched to intervals of 600 (error if any larger)
list5 = CID_list[3100:3700]
list6 = CID_list[3700:4300]
list7 = CID_list[4300:4900]
list8 = CID_list[4900:5500]
list9 = CID_list[5500:6100]
#switched to intervals of 500 (integers get larger)
list10 = CID_list[6100:6600]
list11 = CID_list[6600:7100]
#switched to intervals of 400 (integers get larger)
list12 = CID_list[7100:7500]
list13 = CID_list[7500:7900]
list14 = CID_list[7900:8100]
list15 = CID_list[8100:]
#Already used all CID's, don't need additional list
#list16 = CID_list[7600:7700]

#Changing list to str, dropping start/end brackets, removing spaces
str1 = (str(list1)[1:-1])
str1 = str1.replace(' ', '')
str2 = (str(list2)[1:-1])
str2 = str2.replace(' ', '')
str3 = (str(list3)[1:-1])
str3 = str3.replace(' ', '')
str4 = (str(list4)[1:-1])
str4 = str4.replace(' ', '')
str5 = (str(list5)[1:-1])
str5 = str5.replace(' ', '')
str6 = (str(list6)[1:-1])
str6 = str6.replace(' ', '')
str7 = (str(list7)[1:-1])
str7 = str7.replace(' ', '')
str8 = (str(list8)[1:-1])
str8 = str8.replace(' ', '')
str9 = (str(list9)[1:-1])
str9 = str9.replace(' ', '')
str10 = (str(list10)[1:-1])
str10 = str10.replace(' ', '')
str11 = (str(list11)[1:-1])
str11 = str11.replace(' ', '')
str12 = (str(list12)[1:-1])
str12 = str12.replace(' ', '')
str13 = (str(list13)[1:-1])
str13 = str13.replace(' ', '')
str14 = (str(list14)[1:-1])
str14 = str14.replace(' ', '')
str15 = (str(list15)[1:-1])
str15 = str15.replace(' ', '')
#str16 = (str(list16)[1:-1])
#str16 = str16.replace(' ', '')



In [None]:
#Getting the CovalentUnitCount for each CID
##IF it is greater than 1, it is a multicomponent compound

#opening and reading the 16 CovalentUnitCount URLs
url1 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str1 + '/property/CovalentUnitCount/TXT')
html1 = urlopen(url1) 
soup1 = BeautifulSoup(html1, 'lxml')
url2 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str2 + '/property/CovalentUnitCount/TXT')
html2 = urlopen(url2) 
soup2 = BeautifulSoup(html2, 'lxml')
url3 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str3 + '/property/CovalentUnitCount/TXT')
html3 = urlopen(url3) 
soup3 = BeautifulSoup(html3, 'lxml')
url4 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str4 + '/property/CovalentUnitCount/TXT')
html4 = urlopen(url4) 
soup4 = BeautifulSoup(html4, 'lxml')
url5 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str5 + '/property/CovalentUnitCount/TXT')
html5 = urlopen(url5) 
soup5 = BeautifulSoup(html5, 'lxml')
url6 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str6 + '/property/CovalentUnitCount/TXT')
html6 = urlopen(url6) 
soup6 = BeautifulSoup(html6, 'lxml')

cov_units1 = soup1.get_text()
cov_units2 = soup2.get_text()
cov_units3 = soup3.get_text()
cov_units4 = soup4.get_text()
cov_units5 = soup5.get_text()
cov_units6 = soup6.get_text()

In [None]:
url7 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str7 + '/property/CovalentUnitCount/TXT')
html7 = urlopen(url7) 
soup7 = BeautifulSoup(html7, 'lxml')
url8 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str8 + '/property/CovalentUnitCount/TXT')
html8 = urlopen(url8)
soup8 = BeautifulSoup(html8, 'lxml')
url9 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str9 + '/property/CovalentUnitCount/TXT')
html9 = urlopen(url9) 
soup9 = BeautifulSoup(html9, 'lxml')
url10 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str10 + '/property/CovalentUnitCount/TXT')
html10 = urlopen(url10) 
soup10 = BeautifulSoup(html10, 'lxml')
url11 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str11 + '/property/CovalentUnitCount/TXT')
html11 = urlopen(url11) 
soup11 = BeautifulSoup(html11, 'lxml')
url12 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str12 + '/property/CovalentUnitCount/TXT')
html12 = urlopen(url12) 
soup12 = BeautifulSoup(html12, 'lxml')

cov_units7 = soup7.get_text()
cov_units8 = soup8.get_text()                      
cov_units9 = soup9.get_text()
cov_units10 = soup10.get_text()
cov_units11 = soup11.get_text()
cov_units12 = soup12.get_text()

In [None]:
url13 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str13 + '/property/CovalentUnitCount/TXT')
html13 = urlopen(url13) 
soup13 = BeautifulSoup(html13, 'lxml')
url14 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str14 + '/property/CovalentUnitCount/TXT')
html14 = urlopen(url14) 
soup14 = BeautifulSoup(html14, 'lxml')
url15 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str15 + '/property/CovalentUnitCount/TXT')
html15 = urlopen(url15) 
soup15 = BeautifulSoup(html15, 'lxml')
#url16 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str16 + '/property/CovalentUnitCount/TXT')
#html16 = urlopen(url16) 
#soup16 = BeautifulSoup(html16, 'lxml')

cov_units13 = soup13.get_text()
cov_units14 = soup14.get_text()
cov_units15 = soup15.get_text()
#cov_units16 = soup16.get_text()

#total cov_units string
cov_units = cov_units1 + cov_units2 + cov_units3 + cov_units4 + cov_units5 + cov_units6 + cov_units7 + cov_units8 + cov_units9 + cov_units10 + cov_units11 + cov_units12 + cov_units13 + cov_units14 + cov_units15 

#cov_units string to cov_units list
cov_units = cov_units.split()
#len(cov_units)

# Dealing with Multicomponent Molecules