# Dealing with multicomponent compounds

# Data Cleaning Full (antag)

In [205]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from requests import exceptions
import re
%matplotlib inline

import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup

#hiding warning messages
import warnings
warnings.filterwarnings("ignore")

#Reading in Summary ANTAGONIST CSV
antag_pre = pd.read_csv('AID_588533_datatable_all.csv')
antag_pre = antag_pre.iloc[5:]
list1 = antag_pre['PUBCHEM_CID']

In [206]:
#Removing inconclusives
antag_pre = antag_pre.drop(antag_pre.loc[antag_pre['Phenotype'] == 'Inconclusive'].index)
antag_pre = antag_pre.drop(antag_pre.loc[antag_pre['Phenotype'] == 'Cytotoxic'].index)
antag_pre = antag_pre.drop(antag_pre.loc[antag_pre['Phenotype'] == 'Fluorescent'].index)

In [207]:
#Changing "Phenotype" Column to Activity Summary
antag_pre = antag_pre.rename(index=str, columns={"Phenotype": "Activity Summary"})

#Changing "Potency" Column to Ratio Potency (uMJ)
antag_pre = antag_pre.rename(index=str, columns={"Potency": "Ratio Potency (uM)"})

#Changing "Efficacy" Column to Ratio Efficacy (%)
antag_pre = antag_pre.rename(index=str, columns={"Efficacy": "Ratio Efficacy (%)"})

#Keeping only CID, Activity Summary, Ratio Potency (uM), and Ratio Efficacy
antag_pre = antag_pre[['PUBCHEM_CID', 'Activity Summary', 'Ratio Potency (uM)', 'Ratio Efficacy (%)']]
antag_pre.fillna(0, inplace = True)

##Dropping values with no CID
antag_pre = antag_pre.drop(antag_pre.loc[antag_pre['PUBCHEM_CID'] == 0].index)

#Sorting CID's in ascending order
antag_pre = antag_pre.sort_values(by = ['PUBCHEM_CID'], ascending = True, inplace = False)

In [208]:
#Replacing Inhibitor with active agonist title
#Replacing Activator with active agonist title
antag_pre = antag_pre.replace({'Inhibitor': 'active antagonist', 'Activator': 'active agonist', 'Inactive' : 'inactive'})

# Identifying Covalent Unit Count (antag)

In [209]:
#List of all CID's
CID_list = antag_pre['PUBCHEM_CID'].tolist()

#Making them integers (removing decimals)
CID_list = [int(i) for i in CID_list]
len(CID_list)

2307

In [210]:
#Splitting CID_list into smaller strings (800 is around the max size for the full url)
list1 = CID_list[:800]
list2 = CID_list[800:1500]
list3 = CID_list[1500:2100]
list4 = CID_list[2100:]

#Changing list to str, dropping start/end brackets, removing spaces
str1 = (str(list1)[1:-1])
str1 = str1.replace(' ', '')
str2 = (str(list2)[1:-1])
str2 = str2.replace(' ', '')
str3 = (str(list3)[1:-1])
str3 = str3.replace(' ', '')
str4 = (str(list4)[1:-1])
str4 = str4.replace(' ', '')

In [211]:
#Getting the CovalentUnitCount for each CID
##IF it is greater than 1, it is a multicomponent compound

#opening and reading the 16 CovalentUnitCount URLs
url1 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str1 + '/property/CovalentUnitCount/TXT')
html1 = urlopen(url1) 
soup1 = BeautifulSoup(html1, 'lxml')
url2 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str2 + '/property/CovalentUnitCount/TXT')
html2 = urlopen(url2) 
soup2 = BeautifulSoup(html2, 'lxml')
url3 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str3 + '/property/CovalentUnitCount/TXT')
html3 = urlopen(url3) 
soup3 = BeautifulSoup(html3, 'lxml')
url4 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str4 + '/property/CovalentUnitCount/TXT')
html4 = urlopen(url4) 
soup4 = BeautifulSoup(html4, 'lxml')

cov_units1 = soup1.get_text()
cov_units2 = soup2.get_text()
cov_units3 = soup3.get_text()
cov_units4 = soup4.get_text()

#total cov_units string
cov_units = cov_units1 + cov_units2 + cov_units3 + cov_units4

#cov_units string to cov_units list
cov_units = cov_units.split()
len(cov_units)

2307

# Dealing with multicomponent molecules (antag)

In [214]:
#adding cov_units column to antag
antag_pre['CovalentUnitCount'] = cov_units

#Making copy of antag_pre
antag_cov = antag_pre

#dropping rows where covalent unit count is = 1
antag_cov = antag_cov[antag_cov.CovalentUnitCount != '1']
antag_cov.shape
#168 multicomponent CIDs (2307 total)

(168, 5)

In [215]:
#getting CID list of multicomponent compounds
CID_list_multi = antag_cov['PUBCHEM_CID'].tolist()

#Removing decimals
CID_list_multi = [int(i) for i in CID_list_multi]

In [216]:
#Getting list of covalent unit number for the multicomponent compounds
list_cov_multi = antag_cov['CovalentUnitCount'].tolist()

parent_CID_list = []
i = 0
while i < len(CID_list_multi):
    try:
    # (list_cov_multi[i] == '2'):
        url1 = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str(CID_list_multi[i]) + '/cids/TXT?cids_type=parent'
        html1 = urlopen(url1) 
        soup1 = BeautifulSoup(html1, 'lxml')
        parent1 = soup1.get_text()
        #print(parent1)
        parent_CID_list.append(parent1)
        i = i + 1
       # else:
        #    i = i + 1
    except:
        #contine
        #print('eek')
        #if err.code == ('404: PUGREST.NotFound'):
        parent_CID_list.append('0')
        i = i + 1
        



len(parent_CID_list)

168

In [217]:
#Removing '\n from the end of each CID in parent_CID_list'
bad_chars = ['\n']

i = 0
while i < len(parent_CID_list):
    if parent_CID_list[i] == '0':
        i = i + 1
    else:
        for j in bad_chars: 
            parent_CID_list[i] = parent_CID_list[i].replace(j, '')
            i = i + 1

In [218]:
#adding parent_CID's to antag_cov DF
antag_cov['Parent_CIDs'] = parent_CID_list

In [219]:
#now that we have a list of the parent compounds for each multiple component molecule we:
#delete all values with a parent of 0
#make a new df called antag_multi_to_parent
antag_multi_to_parent = antag_cov.drop(antag_cov.loc[antag_cov['Parent_CIDs'] == '0'].index)

##replace PUBCHEM_CID with parent CID's
###Drop CID column and rename Parent_CID's to just PUBCHEM_CID
antag_multi_to_parent = antag_multi_to_parent.drop(columns = ['PUBCHEM_CID'], axis = 1)
antag_multi_to_parent = antag_multi_to_parent.rename(index=str, columns={"Parent_CIDs" : "PUBCHEM_CID"})

In [220]:
#Removing all CID's with multicomponent compounds from antag_pre (original DF of all antag compounds)
antag_pre_minus_multi = antag_pre[~antag_pre.PUBCHEM_CID.isin(CID_list_multi)]

In [221]:
#Combining antag_pre with antag_multi_to_parent
antag = antag_pre_minus_multi.append(antag_multi_to_parent)

# Dealing with conflicts (antag)

In [222]:
#Dropping duplicates from original antag dataframe
antag_no_dups = antag.drop_duplicates(subset = 'PUBCHEM_CID', keep = False)

In [223]:
##Finding DF of all duplicates
antag_dups = antag[antag.duplicated('PUBCHEM_CID', keep = False)] 
antag_dups = antag_dups.sort_values(by = ['PUBCHEM_CID', 'Activity Summary'], ascending = True, inplace = False)

In [224]:
#Creating two seperate datasets for the three different cases (third case is all deleted)

In [225]:
##df_inactive = both activity scores inactive
###Drop all active agonists and active antagonists, and drop any values that are no longer dups.  
###This will leave CID dups with outcomes that are both inactive.
df_inactive = antag_dups.drop(antag_dups.loc[antag_dups['Activity Summary'] == 'active agonist'].index)
df_inactive = df_inactive.drop(df_inactive.loc[antag_dups['Activity Summary'] == 'active antagonist'].index)
df_inactive = df_inactive[df_inactive.duplicated(subset = 'PUBCHEM_CID', keep = False)]

#Only leaving one of each inactive value
df_inactive_clean = df_inactive.drop_duplicates(subset = 'PUBCHEM_CID', keep = 'first')

In [226]:
##df_active = both activity scores are active
###Drop all inactive values and drop any values that are no longer dups

df_active = antag_dups.drop(antag_dups.loc[antag_dups['Activity Summary'] == 'inactive'].index)
df_active = df_active[df_active.duplicated(subset = 'PUBCHEM_CID', keep = False)]

###Rank by ascending CID AND by ascending potency
###Drop duplicates and leave the FIRST value, which is the lowest potency!!!!!
df_active = df_active.sort_values(by = ['PUBCHEM_CID','Ratio Potency (uM)'], ascending = True, inplace = False)
df_active_clean = df_active.drop_duplicates(subset = 'PUBCHEM_CID', keep = 'first')

In [227]:
##df_disagree = the activity scores disagree
##We removed all the rest because they didn't disagree

In [229]:
#Combining antag_no_dups with df_active_clean and df_inactive_clean to create the final database with no dups
frames1 = [antag_no_dups, df_active_clean, df_inactive_clean]

#FINAL CLEAN DF
antag_clean = pd.concat(frames1)
antag_clean.shape

(1901, 5)

# Data Cleaning Full (ag)

In [232]:
#Data has no active antagonist values

In [233]:
#Reading in Summary ANTAGONIST CSV
ag_pre = pd.read_csv('AID_588532_datatable_all.csv')
ag_pre = ag_pre.iloc[5:]
list1 = ag_pre['PUBCHEM_CID']

In [234]:
#Removing inconclusives
ag_pre = ag_pre.drop(ag_pre.loc[ag_pre['Phenotype'] == 'Inconclusive'].index)
ag_pre = ag_pre.drop(ag_pre.loc[ag_pre['Phenotype'] == 'Fluorescent'].index)

In [235]:
#Changing "Phenotype" Column to Activity Summary
ag_pre = ag_pre.rename(index=str, columns={"Phenotype": "Activity Summary"})

#Changing "Potency" Column to Ratio Potency (uMJ)
ag_pre = ag_pre.rename(index=str, columns={"Potency": "Ratio Potency (uM)"})

#Changing "Efficacy" Column to Ratio Efficacy (%)
ag_pre = ag_pre.rename(index=str, columns={"Efficacy": "Ratio Efficacy (%)"})

#Keeping only CID, Activity Summary, Ratio Potency (uM), and Ratio Efficacy
ag_pre = ag_pre[['PUBCHEM_CID', 'Activity Summary', 'Ratio Potency (uM)', 'Ratio Efficacy (%)']]
ag_pre.fillna(0, inplace = True)

##Dropping values with no CID
ag_pre = ag_pre.drop(ag_pre.loc[ag_pre['PUBCHEM_CID'] == 0].index)

#Sorting CID's in ascending order
ag_pre = ag_pre.sort_values(by = ['PUBCHEM_CID'], ascending = True, inplace = False)

In [236]:
#Replacing Activator with active agonist title
ag_pre = ag_pre.replace({'Activator': 'active agonist', 'Inactive' : 'inactive'})

# Identifying Covalent Unit Count (ag)

In [240]:
#List of all CID's
CID_list = ag_pre['PUBCHEM_CID'].tolist()

#Making them integers (removing decimals)
CID_list = [int(i) for i in CID_list]
len(CID_list)

2421

In [241]:
#Splitting CID_list into smaller strings (800 is around the max size for the full url)
list1 = CID_list[:800]
list2 = CID_list[800:1500]
list3 = CID_list[1500:2100]
list4 = CID_list[2100:]

#Changing list to str, dropping start/end brackets, removing spaces
str1 = (str(list1)[1:-1])
str1 = str1.replace(' ', '')
str2 = (str(list2)[1:-1])
str2 = str2.replace(' ', '')
str3 = (str(list3)[1:-1])
str3 = str3.replace(' ', '')
str4 = (str(list4)[1:-1])
str4 = str4.replace(' ', '')

In [242]:
#Getting the CovalentUnitCount for each CID
##IF it is greater than 1, it is a multicomponent compound

#opening and reading the CovalentUnitCount URLs
url1 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str1 + '/property/CovalentUnitCount/TXT')
html1 = urlopen(url1) 
soup1 = BeautifulSoup(html1, 'lxml')
url2 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str2 + '/property/CovalentUnitCount/TXT')
html2 = urlopen(url2) 
soup2 = BeautifulSoup(html2, 'lxml')
url3 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str3 + '/property/CovalentUnitCount/TXT')
html3 = urlopen(url3) 
soup3 = BeautifulSoup(html3, 'lxml')
url4 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str4 + '/property/CovalentUnitCount/TXT')
html4 = urlopen(url4) 
soup4 = BeautifulSoup(html4, 'lxml')

cov_units1 = soup1.get_text()
cov_units2 = soup2.get_text()
cov_units3 = soup3.get_text()
cov_units4 = soup4.get_text()

#total cov_units string
cov_units = cov_units1 + cov_units2 + cov_units3 + cov_units4

#cov_units string to cov_units list
cov_units = cov_units.split()
len(cov_units)

2421

# Dealing with multicomponent molecules (antag)

In [245]:
#adding cov_units column to antag
ag_pre['CovalentUnitCount'] = cov_units

#Making copy of antag_pre
ag_cov = ag_pre

#dropping rows where covalent unit count is = 1
ag_cov = ag_cov[ag_cov.CovalentUnitCount != '1']
ag_cov.shape
#182 multicomponent CIDs (2421 total)

(182, 5)

In [246]:
#getting CID list of multicomponent compounds
CID_list_multi = ag_cov['PUBCHEM_CID'].tolist()

#Removing decimals
CID_list_multi = [int(i) for i in CID_list_multi]

In [247]:
#Getting list of covalent unit number for the multicomponent compounds
list_cov_multi = ag_cov['CovalentUnitCount'].tolist()

parent_CID_list = []
i = 0
while i < len(CID_list_multi):
    try:
    # (list_cov_multi[i] == '2'):
        url1 = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str(CID_list_multi[i]) + '/cids/TXT?cids_type=parent'
        html1 = urlopen(url1) 
        soup1 = BeautifulSoup(html1, 'lxml')
        parent1 = soup1.get_text()
        #print(parent1)
        parent_CID_list.append(parent1)
        i = i + 1
       # else:
        #    i = i + 1
    except:
        #contine
        #print('eek')
        #if err.code == ('404: PUGREST.NotFound'):
        parent_CID_list.append('0')
        i = i + 1
        



len(parent_CID_list)

182

In [251]:
#Removing '\n from the end of each CID in parent_CID_list'
bad_chars = ['\n']

i = 0
while i < len(parent_CID_list):
    if parent_CID_list[i] == '0':
        i = i + 1
    else:
        for j in bad_chars: 
            parent_CID_list[i] = parent_CID_list[i].replace(j, '')
            i = i + 1

In [253]:
#adding parent_CID's to antag_cov DF
ag_cov['Parent_CIDs'] = parent_CID_list

In [254]:
#now that we have a list of the parent compounds for each multiple component molecule we:
#delete all values with a parent of 0
#make a new df called antag_multi_to_parent
ag_multi_to_parent = ag_cov.drop(ag_cov.loc[ag_cov['Parent_CIDs'] == '0'].index)

##replace PUBCHEM_CID with parent CID's
###Drop CID column and rename Parent_CID's to just PUBCHEM_CID
ag_multi_to_parent = ag_multi_to_parent.drop(columns = ['PUBCHEM_CID'], axis = 1)
ag_multi_to_parent = ag_multi_to_parent.rename(index=str, columns={"Parent_CIDs" : "PUBCHEM_CID"})

In [257]:
#Removing all CID's with multicomponent compounds from antag_pre (original DF of all antag compounds)
ag_pre_minus_multi = ag_pre[~ag_pre.PUBCHEM_CID.isin(CID_list_multi)]

In [261]:
#Combining antag_pre with antag_multi_to_parent
ag = ag_pre_minus_multi.append(ag_multi_to_parent)

# Dealing with conflicts (ag)

In [262]:
#Dropping duplicates from original antag dataframe
ag_no_dups = ag.drop_duplicates(subset = 'PUBCHEM_CID', keep = False)

In [263]:
##Finding DF of all duplicates
ag_dups = ag[ag.duplicated('PUBCHEM_CID', keep = False)] 
ag_dups = ag_dups.sort_values(by = ['PUBCHEM_CID', 'Activity Summary'], ascending = True, inplace = False)

In [264]:
#Creating two seperate datasets for the three different cases (third case is all deleted)

In [265]:
##df_inactive = both activity scores inactive
###Drop all active agonists and active antagonists, and drop any values that are no longer dups.  
###This will leave CID dups with outcomes that are both inactive.
df_inactive = ag_dups.drop(ag_dups.loc[ag_dups['Activity Summary'] == 'active agonist'].index)
df_inactive = df_inactive.drop(df_inactive.loc[ag_dups['Activity Summary'] == 'active antagonist'].index)
df_inactive = df_inactive[df_inactive.duplicated(subset = 'PUBCHEM_CID', keep = False)]

#Only leaving one of each inactive value
df_inactive_clean = df_inactive.drop_duplicates(subset = 'PUBCHEM_CID', keep = 'first')

In [266]:
##df_active = both activity scores are active
###Drop all inactive values and drop any values that are no longer dups

df_active = ag_dups.drop(ag_dups.loc[ag_dups['Activity Summary'] == 'inactive'].index)
df_active = df_active[df_active.duplicated(subset = 'PUBCHEM_CID', keep = False)]

###Rank by ascending CID AND by ascending potency
###Drop duplicates and leave the FIRST value, which is the lowest potency!!!!!
df_active = df_active.sort_values(by = ['PUBCHEM_CID','Ratio Potency (uM)'], ascending = True, inplace = False)
df_active_clean = df_active.drop_duplicates(subset = 'PUBCHEM_CID', keep = 'first')

In [267]:
##df_disagree = the activity scores disagree
##We removed all the rest because they didn't disagree

In [268]:
#Combining antag_no_dups with df_active_clean and df_inactive_clean to create the final database with no dups
frames1 = [ag_no_dups, df_active_clean, df_inactive_clean]

#FINAL CLEAN DF
ag_clean = pd.concat(frames1)
ag_clean.shape

(1997, 5)

# Combining Dataframes and Cleaning (both)

In [269]:
#Combining antag_clean and ag_clean
frames3 = [antag_clean, ag_clean]
final_concat_covcount = pd.concat(frames3)

#Dropping Covalent Unit Column
final_concat = final_concat_covcount.drop(columns = ['CovalentUnitCount'], axis = 1)

#Changing CID's to int instead of string
final_concat['PUBCHEM_CID'] = (final_concat['PUBCHEM_CID']).astype(int)
#final_concat.dtypes

In [270]:
#Dropping duplicates from original final dataframe
final_no_dups = final_concat.drop_duplicates(subset = 'PUBCHEM_CID', keep = False)

In [271]:
##Finding DF of all duplicates
final_dups = final_concat[final_concat.duplicated('PUBCHEM_CID', keep = False)] 
final_dups = final_dups.sort_values(by = ['PUBCHEM_CID'], ascending = True, inplace = False)
final_dups = final_dups.reset_index(drop = True)

In [272]:
##df_inactive_final = both activity scores inactive
###Drop all active agonists and active antagonists, and drop any values that are no longer dups.  
###This will leave CID dups with outcomes that are both inactive.
df_inactive_final = final_dups.drop(final_dups.loc[final_dups['Activity Summary'] == 'active agonist'].index)
df_inactive_final = df_inactive_final.drop(df_inactive_final.loc[final_dups['Activity Summary'] == 'active antagonist'].index)
df_inactive_final = df_inactive_final[df_inactive_final.duplicated(subset = 'PUBCHEM_CID', keep = False)]

#Only leaving one of each inactive value
df_inactive_final_clean = df_inactive_final.drop_duplicates(subset = 'PUBCHEM_CID', keep = 'first')

In [273]:
##df_active_ag_final = both activity scores are active agonist
###Drop all inactive values and drop any values that are no longer dups
###Drop all active antagonist values

df_active_ag_final = final_dups.drop(final_dups.loc[final_dups['Activity Summary'] == 'inactive'].index)
df_active_ag_final = df_active_ag_final.drop(final_dups.loc[final_dups['Activity Summary'] == 'active antagonist'].index)
df_active_ag_final = df_active_ag_final[df_active_ag_final.duplicated(subset = 'PUBCHEM_CID', keep = False)]

###Rank by descending CID AND by descending potency
###Drop duplicates and leave the FIRST value, which is the lowest potency!!!!!
df_active_ag_final = df_active_ag_final.sort_values(by = ['PUBCHEM_CID', 'Ratio Potency (uM)'], ascending = True, inplace = False)
df_active_ag_final_clean = df_active_ag_final.drop_duplicates(subset = 'PUBCHEM_CID', keep = 'first')

In [274]:
##df_active_antag_final = both activity scores are active antagonists
###Drop all inactive values and drop any values that are no longer dups
###Drop all active agonist values

df_active_antag_final = final_dups.drop(final_dups.loc[final_dups['Activity Summary'] == 'inactive'].index)
df_active_antag_final = df_active_antag_final.drop(final_dups.loc[final_dups['Activity Summary'] == 'active agonist'].index)
df_active_antag_final = df_active_antag_final[df_active_antag_final.duplicated(subset = 'PUBCHEM_CID', keep = False)]

###Rank by descending CID AND by descending potency
###Drop duplicates and leave the FIRST value, which is the lowest potency!!!!!
df_active_antag_final = df_active_antag_final.sort_values(by = ['PUBCHEM_CID', 'Ratio Potency (uM)'], ascending = True, inplace = False)
df_active_antag_final_clean = df_active_antag_final.drop_duplicates(subset = 'PUBCHEM_CID', keep = 'first')

In [275]:
#getting lists of CID's of rows that fall into othr categories
inactive_list = df_inactive_final['PUBCHEM_CID'].tolist()
active_ag_list = df_active_ag_final['PUBCHEM_CID'].tolist()
active_antag_list = df_active_antag_final['PUBCHEM_CID'].tolist()
#combining lists
index_list = inactive_list + active_ag_list + active_antag_list

#Dropping those CID's from full dups dataframe final_dups
##This will create a dataframe with CID pairs that are either inactive/active or active/active
##Any remaining active/active CID pairs must be active agonist/antagonist pairs, and are inconclusive
##We will delete those values
final_dups_newindex = final_dups.set_index('PUBCHEM_CID')
df_disagree = final_dups_newindex.drop(labels = index_list)
df_disagree = df_disagree.reset_index()

In [276]:
#Sorting by CID
df_disagree = df_disagree.sort_values(by = ['PUBCHEM_CID', 'Activity Summary'], ascending = True, inplace = False)

In [277]:
#We remove all inactive values (BC we are keeping active)
df_disagree = df_disagree.drop(df_disagree.loc[df_disagree['Activity Summary'] == 'inactive'].index)
#We then delete any remaining duplicates, because that means they were active agonist/antagonist pairs
df_disagree_clean = df_disagree.drop_duplicates(subset = 'PUBCHEM_CID', keep = False)

In [278]:
#We combine all the pieces into the full cleaned combined dataframe
frames4 = [final_no_dups, df_inactive_final_clean, df_active_ag_final_clean, df_active_antag_final_clean, df_disagree_clean]
final_clean = pd.concat(frames4)


# Deleting any CID's that are in Tox21 data

In [287]:
#Reading in Molecular Properties CSV
tox21 = pd.read_csv('Molecular_Properties_CSV')
tox21 = tox21.astype(float, errors = 'ignore')

#getting list of CID's from tox21 data
tox21_CIDs = tox21['PUBCHEM_CID']
tox21_CIDs = tox21_CIDs.tolist()
final_clean.shape

(2182, 4)

In [294]:
#Removing all CID's in tox21 data from NCGC cleaned data (final_clean)
final_ncgc = final_clean[~final_clean.PUBCHEM_CID.isin(tox21_CIDs)]
final_ncgc.shape
#only 46 active values total

(328, 4)

In [295]:
#Taking Dataframe of only Active values
##Dropping all inactive values
df_active = final_ncgc.drop(final_ncgc.loc[final_ncgc['Activity Summary'] == 'inactive'].index)
df_active.shape

(46, 4)

In [291]:
#Writing to csv
final_ncgc.to_csv('NCGC_clean', index = False)
