In [1]:
#from chemspipy import ChemSpider
import sqlite3
from sqlite3 import Error
import re
from urllib.request import urlopen as ureq, urlretrieve as uret
from bs4 import BeautifulSoup as soup
from itertools import chain 
import pandas as pd
import fitz
from multiprocessing import Pool, Manager, Process

#cs = ChemSpider('ffff2079-4c5d-443b-8add-00206a946ed2')

In [2]:
def create_connection(db_file):
    try:
        conn = sqlite3.connect('file:{}?mode=rwc'.format(db_file),uri=True) #Read, Write & Create mode, so connecting to a non-existing database will cause it to be created
        print('sqlite3 version: '+sqlite3.version+'\n==============\n')
        return conn
    except Error as e:
        print(e)

def create_table(conn):
    c = conn.cursor()
    c.execute('''CREATE TABLE IF NOT EXISTS CStable (ChemSpider_ID INTEGER PRIMARY KEY, InChI_string TEXT, Std_InChI TEXT, 
                 Molecular_Formula TEXT, Average_Mass REAL, SMILES TEXT, Common_Name TEXT, Systematic_Name TEXT,
                 logP REAL,H_Bond_Donors INTEGER,H_Bond_Acceptors INTEGER,Num_Rota_Bonds INTEGER,Lipinski_Rule_5 INTEGER,
                 Polar_Surface_Area TEXT,Enthalpy_Vap REAL,Density REAL,Boiling_Point REAL)''')
    return c.lastrowid
#PSA is text because attached is the unit, angstrom which is text

def data_entry(conn,listOfProps,n,k):
    sql = '''INSERT INTO CStable (ChemSpider_ID,InChI_string,Std_InChI,Molecular_Formula,Average_Mass,
             SMILES,Common_Name,Systematic_Name,logP,H_Bond_Donors,H_Bond_Acceptors,Num_Rota_Bonds, 
             Lipinski_Rule_5,Polar_Surface_Area,Enthalpy_Vap,Density,Boiling_Point) 
             VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)'''
    c = conn.cursor()
    c.execute(sql,listOfProps)
    print('({}) {} -> Saved to SQL Table'.format(n,k))
    #n is counter, k is key from CSDict keys
def update_name(conn,nmID):
    sql = ''' UPDATE CStable SET Common_Name = ? WHERE ChemSpider_ID = ? '''
    c = conn.cursor()
    c.execute(sql,nmID) #list of names (new,CSID)
    conn.commit()
    print('Common Name for CS_ID: {} was changed to "{}"'.format(nmID[1],nmID[0]))
def update_data(conn, listOfProps,n,k):
    sql = ''' UPDATE CStable
              SET InChI_string = ?,Std_InChI = ?,Molecular_Formula = ?,Average_Mass =?,
              SMILES=?, Common_Name=?, Systematic_Name=?, logP=?, H_Bond_Donors=?, H_Bond_Acceptors=?, Num_Rota_Bonds=?, 
              Lipinski_Rule_5=?, Polar_Surface_Area=?, Enthalpy_Vap=?, Density=?,Boiling_Point=?
              WHERE ChemSpider_ID = ?'''
    c = conn.cursor()
    c.execute(sql, listOfProps)
    conn.commit()
    print('({}) {} -> Updated SQL Table'.format(n,k))
    
def delete_all_entries(conn):
    sql = '''DELETE FROM CStable'''
    c = conn.cursor()
    c.execute(sql)
    conn.commit()
def drop_table(conn):
    sql = '''DROP TABLE CStable'''
    c = conn.cursor()
    c.execute(sql)
    conn.commit()  
def get_sqlTable(conn):
    with conn:
        c = conn.cursor()
        c.execute("SELECT * FROM CStable")
        print(c.fetchall())
        
def unlock_db(db_filename):
    connection = create_connection(db_filename)
    connection.commit()
    connection.interrupt()
    connection.close()
    print('Connection closed')

In [3]:
def process_single(soupPage,drug):
    PropDict = dict()
    try:
        PropDict['H Bond Acceptors'] = soupPage.find('td',{'class':'prop_title'},text='#H bond acceptors:').next_sibling.next_element.text.strip()
        PropDict['H Bond Donors'] = soupPage.find('td',{'class':'prop_title'},text='#H bond donors:').next_sibling.next_element.text.strip()
        PropDict['LogP'] = soupPage.find('a',{'href':'http://www.acdlabs.com/logp'}).next_sibling.next_element.next_sibling.text.strip()
        PropDict['Num. Rotatable Bonds'] = soupPage.find('td',{'class':'prop_title'},text='#Freely Rotating Bonds:').next_sibling.next_element.text.strip()
        PropDict['Lipinski Rule of 5'] = soupPage.find('td',{'class':'prop_title'},text = '#Rule of 5 Violations:').next_sibling.next_element.text.strip()
        PropDict['Polar Surface Area'] = soupPage.find('td',{'class':'prop_title'},text='Polar Surface Area:').next_sibling.next_element.text.strip()
        # if any of the above are '' or None, escape and move on to next drug in list
        molec_descriptors = ['H Bond Acceptors','H Bond Donors','LogP','Num. Rotatable Bonds','Lipinski Rule of 5','Polar Surface Area']
        check_blank = bool([a for a in PropDict.values() if a == '']) ##OR any([PropDict[i] == '' for i in drugdict.keys()]) 
        if check_blank:
            raise Exception('Blank Descriptors')
        else:
            PropDict['ChemSpider ID'] = soupPage.find('span',{'class':'prop_title'},text='ChemSpider ID').next_sibling
            PropDict['SMILES'] = soupPage.find('span',{'id':'ctl00_ctl00_ContentSection_ContentPlaceHolder1_RecordViewDetails_rptDetailsView_ctl00_moreDetails_WrapControl2'}).text
            PropDict['Std InChI'] = '{}{}'.format(soupPage.find('a',{'id':'ctl00_ctl00_ContentSection_ContentPlaceHolder1_RecordViewDetails_rptDetailsView_ctl00_moreDetails_StdInChIKey_conn'}).text,
                                                     soupPage.find('a',{'id':'ctl00_ctl00_ContentSection_ContentPlaceHolder1_RecordViewDetails_rptDetailsView_ctl00_moreDetails_StdInChIKey_rest'}).text)
            PropDict['InChI string'] = soupPage.find('span',{'id':"ctl00_ctl00_ContentSection_ContentPlaceHolder1_RecordViewDetails_rptDetailsView_ctl00_moreDetails_WrapControl4"}).text
            check_diffName = str(soupPage.find('span',{'id':"ctl00_ctl00_ContentSection_ContentPlaceHolder1_RecordViewDetails_rptDetailsView_ctl00_WrapTitle"}).text)
            if drug.lower() != check_diffName.lower():
                PropDict['Common Name'] = '{} / {}'.format(drug,check_diffName)
            else:
                PropDict['Common Name'] = drug
            #
            PropDict['Systematic Name'] = soupPage.find('span',{'id':'ctl00_ctl00_ContentSection_ContentPlaceHolder1_RecordViewDetails_rptDetailsView_ctl00_moreDetails_WrapSysName'}).text
            PropDict['Molecular Formula'] = soupPage.find("span",{'id':'ctl00_ctl00_ContentSection_ContentPlaceHolder1_RecordViewDetails_rptDetailsView_ctl00_prop_MF'}).text 
            PropDict['Average Mass (Da)'] = soupPage.find('span',text='Average mass').next_sibling.replace(' Da','')
            PropDict['Density'] = soupPage.find('td',{'class':'prop_title'},text='Density:').next_sibling.next_element.text.strip()
            PropDict['Boiling Point'] = soupPage.find('td',{'class':'prop_title'},text='Boiling Point:').next_sibling.next_element.text.strip()
            PropDict['Enthalpy of Vapourisation'] = soupPage.find('td',{'class':'prop_title'},text='Enthalpy of Vaporization:').next_sibling.next_element.text.strip()                
            return PropDict
        
    except AttributeError as a:
        if soupPage.find('h3') == None or soupPage.find('h3').text == 'Found 0 results':
            print('\nThere was no search result for {} - {}\n'.format(drug,a))
        return False

    except Exception as e:
        diffName = soupPage.find('span',{'id':"ctl00_ctl00_ContentSection_ContentPlaceHolder1_RecordViewDetails_rptDetailsView_ctl00_WrapTitle"}).text
        if drug != diffName:
            print('\nThe molecular descriptors for {} ({}) were empty - {}\n'.format(diffName,drug,e))
        else:
            print('\nThe molecular descriptors for {} were empty - {}\n'.format(drug,e))
        return False

In [10]:
def is_multi(soupPage):
    try:
        if soupPage.tbody:
            search_result = soupPage.findAll("td",{"align":"center","class":"search-id-column"})
            numSrchRslt = len(search_result)
            if numSrchRslt > 1:
                return True
        return False
    except AttributeError:
        return False
    
def multi_hitSites(soupPage):
    list_htmlcontent=[]
    multi_hitList=[]
    for a in soupPage.tbody.findAll("td",{'class':'search-id-column'}):
        list_htmlcontent.append('{}{}'.format('http://www.chemspider.com',a.a['href']))
    for sites in list_htmlcontent:
        uClient = ureq(sites)
        pagehtml= uClient.read()
        uClient.close()
        soupPage_eachSite = soup(pagehtml,'html.parser')
        multi_hitList.append(soupPage_eachSite)
    return multi_hitList

def process_search(drug,soupPage): #,soupPage,outerDict,count,emptyDrugList):
    global count
    drug = drug.replace('+',' ')
    if is_multi(soupPage):
        print('There was more than one search result for {}\n'.format(drug)) 
        for eachItem in multi_hitSites(soupPage):
            selected_data = process_single(eachItem,drug)                
            if selected_data:
                if '~' in [item for item in selected_data['Common Name']]:
                    trunc_name = selected_data['Common Name'].split('~')[0].strip()
                    #outerDict[trunc_name] = selected_data
                    #outerDict.pop('Common Name')
                    outerDict[selected_data['ChemSpider ID']] = selected_data
                    outerDict[selected_data['ChemSpider ID']]['Common Name'] = trunc_name
                    print('\t({}) {} was found in search result of {} and saved to the dictionary.'.format(count,trunc_name,drug))
                else:
                    #tmpName = selected_data['Common Name']
                    outerDict[selected_data['ChemSpider ID']] = selected_data
                    print('\t({}) {} was found and saved to the dictionary'.format(count,drug))
                count += 1 
            else:
                diff_name = eachItem.find('span',{'id':"ctl00_ctl00_ContentSection_ContentPlaceHolder1_RecordViewDetails_rptDetailsView_ctl00_WrapTitle"}).text
                emptyDrugList.append(diff_name)
    else:
        try:
            selected_data = process_single(soupPage,drug)
            outerDict[selected_data['ChemSpider ID']] = selected_data
            print('({}) {} was found and saved to the dictionary \n'.format(count,drug))
            count += 1
        except Error as e:
            print(e)
            emptyDrugList.append(drug)
            print('{} was not found in the ChemSpider DB \n'.format(drug))
 #   return count

def loopupdate(outerDict,n,k):
    loop =True
    while loop:
        updatentries = input('Do you want to update the TABLE entries for {}? (Y/N)'.format(k))
        if updatentries.lower() == 'y':
            try:
                update_data(conn,(outerDict[k]['InChI string'],outerDict[k]['Std InChI'],
                            outerDict[k]['Molecular Formula'],outerDict[k]['Average Mass (Da)'].replace(' Da',''),
                            outerDict[k]['SMILES'],outerDict[k]['Common Name'],outerDict[k]['Systematic Name'],
                            outerDict[k]['LogP'],outerDict[k]['H Bond Donors'],outerDict[k]['H Bond Acceptors'],
                            outerDict[k]['Num. Rotatable Bonds'],outerDict[k]['Lipinski Rule of 5'],
                            outerDict[k]['Polar Surface Area'],
                            outerDict[k]['Enthalpy of Vapourisation'],outerDict[k]['Density'],outerDict[k]['Boiling Point'],outerDict[k]['ChemSpider ID']), n, k)
                n+=1
                loop=False
            except Error as e:
                print(e)
                loop=False
        elif updatentries.lower() == 'n':
                loop=False
                pass
        else:
            print('\n\tEnter a valid entry.\n')
            print(updatentries)
    return n

In [19]:
searchlink = 'http://www.chemspider.com/Search.aspx?q='
count = 1
outerDict = dict()
emptyDrugList = []
for eachdrug in plaintxt_druglist:
        uClient=ureq(searchlink+eachdrug)
        pagehtml = uClient.read()
        soupPage_eachDrug = soup(pagehtml,'html.parser')
        count = process_search(eachdrug,soupPage_eachDrug,outerDict,count,emptyDrugList)

(1) Abemaciclib was found and saved to the dictionary 

(2) Abiraterone Acetate was found and saved to the dictionary 

(3) Abraxane was found and saved to the dictionary 


The molecular descriptors for ABVD were empty - Blank Descriptors

ABVD was not found in the ChemSpider DB 


There was no search result for ABVE - 'NoneType' object has no attribute 'next_sibling'

ABVE was not found in the ChemSpider DB 


There was no search result for ABVE-PC - 'NoneType' object has no attribute 'next_sibling'

ABVE-PC was not found in the ChemSpider DB 


The molecular descriptors for Actinium (AC) were empty - Blank Descriptors

AC was not found in the ChemSpider DB 

There was more than one search result for Acalabrutinib

	(4) Acalabrutinib was found and saved to the dictionary
	(5) Acalabrutinib was found and saved to the dictionary

There was no search result for AC-T - 'NoneType' object has no attribute 'next_sibling'

AC-T was not found in the ChemSpider DB 


There was no search result

	(34) Bosulif was found and saved to the dictionary

The molecular descriptors for bosutinib hydrate (Bosulif) were empty - Blank Descriptors

(35) Bosutinib was found and saved to the dictionary 


There was no search result for Braftovi - 'NoneType' object has no attribute 'next_sibling'

Braftovi was not found in the ChemSpider DB 


There was no search result for Brentuximab Vedotin - 'NoneType' object has no attribute 'next_sibling'

Brentuximab Vedotin was not found in the ChemSpider DB 

Brigatinib was not found in the ChemSpider DB 


There was no search result for BuMel - 'NoneType' object has no attribute 'next_sibling'

BuMel was not found in the ChemSpider DB 

(36) Busulfan was found and saved to the dictionary 

(37) Busulfex was found and saved to the dictionary 

Cabazitaxel was not found in the ChemSpider DB 


The molecular descriptors for cabozantinib (S)-malate (Cabometyx) were empty - Blank Descriptors

Cabometyx was not found in the ChemSpider DB 


The molecular 


There was no search result for Defitelio - 'NoneType' object has no attribute 'next_sibling'

Defitelio was not found in the ChemSpider DB 

(71) Degarelix was found and saved to the dictionary 


There was no search result for Denileukin Diftitox - 'NoneType' object has no attribute 'next_sibling'

Denileukin Diftitox was not found in the ChemSpider DB 


There was no search result for Denosumab - 'NoneType' object has no attribute 'next_sibling'

Denosumab was not found in the ChemSpider DB 

(72) DepoCyt was found and saved to the dictionary 

(73) Dexamethasone was found and saved to the dictionary 


The molecular descriptors for 5346058Q7S (Dexrazoxane Hydrochloride) were empty - Blank Descriptors

Dexrazoxane Hydrochloride was not found in the ChemSpider DB 


There was no search result for Dinutuximab - 'NoneType' object has no attribute 'next_sibling'

Dinutuximab was not found in the ChemSpider DB 

(74) Docetaxel was found and saved to the dictionary 

There was more than o



The molecular descriptors for [3,3',3''-{18-[(2R)-2-Hydroxy-2-(3-hydroxy-3-dioxiranyl)ethyl]-3,8,13,17-tetramethyl-2,7,12-porphyrintriyl-κ2N21,N23}tripropanoato(2-)]iron (FEC) were empty - Blank Descriptors


The molecular descriptors for Iron(2+) 2,7,13,17-tetrakis(2-carboxyethyl)-3,8,12,18-tetramethylporphine-21,23-diide (FEC) were empty - Blank Descriptors

(99) Femara was found and saved to the dictionary 


There was no search result for Filgrastim - 'NoneType' object has no attribute 'next_sibling'

Filgrastim was not found in the ChemSpider DB 

(100) Firmagon was found and saved to the dictionary 

(101) Fludarabine Phosphate was found and saved to the dictionary 


There was no search result for Fluoroplex - 'NoneType' object has no attribute 'next_sibling'

Fluoroplex was not found in the ChemSpider DB 


There was no search result for Fluorouracil Injection - 'NoneType' object has no attribute 'next_sibling'

Fluorouracil Injection was not found in the ChemSpider DB 


The

(124) Idelalisib was found and saved to the dictionary 


There was no search result for Idhifa - 'NoneType' object has no attribute 'next_sibling'

Idhifa was not found in the ChemSpider DB 

(125) Ifex was found and saved to the dictionary 

There was more than one search result for Ifosfamide

	(126) Ifosfamide was found and saved to the dictionary
	(127) Ifosfamide was found and saved to the dictionary
	(128) Ifosfamide was found and saved to the dictionary

There was no search result for IL-2 - 'NoneType' object has no attribute 'next_sibling'

IL-2 was not found in the ChemSpider DB 


The molecular descriptors for Gleevec (Imatinib Mesylate) were empty - Blank Descriptors

Imatinib Mesylate was not found in the ChemSpider DB 

(129) Imbruvica was found and saved to the dictionary 


There was no search result for Imfinzi - 'NoneType' object has no attribute 'next_sibling'

Imfinzi was not found in the ChemSpider DB 

(130) Imiquimod was found and saved to the dictionary 


There


The molecular descriptors for MESNA (Mesna) were empty - Blank Descriptors

Mesna was not found in the ChemSpider DB 


The molecular descriptors for MESNA (Mesnex) were empty - Blank Descriptors

Mesnex was not found in the ChemSpider DB 

(152) Methotrexate was found and saved to the dictionary 

There was more than one search result for Methylnaltrexone Bromide


The molecular descriptors for Methylnaltrexone bromide (Methylnaltrexone Bromide) were empty - Blank Descriptors


The molecular descriptors for Methylnaltrexone bromide (Methylnaltrexone Bromide) were empty - Blank Descriptors

(153) Midostaurin was found and saved to the dictionary 

(154) Mitomycin C was found and saved to the dictionary 


The molecular descriptors for Mitoxantrone hydrochloride (Mitoxantrone Hydrochloride) were empty - Blank Descriptors

Mitoxantrone Hydrochloride was not found in the ChemSpider DB 


There was no search result for MOPP - 'NoneType' object has no attribute 'next_sibling'

MOPP was not

	(179) PEB was found and saved to the dictionary
	(180) PEB was found and saved to the dictionary
	(181) PEB was found and saved to the dictionary
	(182) PEB was found and saved to the dictionary
	(183) PEB was found and saved to the dictionary
	(184) PEB was found and saved to the dictionary
	(185) PEB was found and saved to the dictionary
	(186) PEB was found and saved to the dictionary
	(187) PEB was found and saved to the dictionary
(188) Pegaspargase was found and saved to the dictionary 


The molecular descriptors for Pefloxacin mesylate (Pegfilgrastim) were empty - Blank Descriptors

Pegfilgrastim was not found in the ChemSpider DB 


There was no search result for Peginterferon Alfa-2b - 'NoneType' object has no attribute 'next_sibling'

Peginterferon Alfa-2b was not found in the ChemSpider DB 


There was no search result for PEG-Intron - 'NoneType' object has no attribute 'next_sibling'

PEG-Intron was not found in the ChemSpider DB 


There was no search result for Pembroli

	(207) Sancuso was found and saved to the dictionary

There was no search result for Sclerosol Intrapleural Aerosol - 'NoneType' object has no attribute 'next_sibling'

Sclerosol Intrapleural Aerosol was not found in the ChemSpider DB 


There was no search result for Siltuximab - 'NoneType' object has no attribute 'next_sibling'

Siltuximab was not found in the ChemSpider DB 


There was no search result for Sipuleucel-T - 'NoneType' object has no attribute 'next_sibling'

Sipuleucel-T was not found in the ChemSpider DB 


The molecular descriptors for Somatuline (Somatuline Depot) were empty - Blank Descriptors

Somatuline Depot was not found in the ChemSpider DB 

(208) Sonidegib was found and saved to the dictionary 


The molecular descriptors for Sorafenib tosylate (Sorafenib Tosylate) were empty - Blank Descriptors

Sorafenib Tosylate was not found in the ChemSpider DB 

(209) Sprycel was found and saved to the dictionary 


There was no search result for STANFORD V - 'NoneType'


The molecular descriptors for Arsenic trioxide (Trisenox) were empty - Blank Descriptors

Trisenox was not found in the ChemSpider DB 


The molecular descriptors for Lapatinib Ditosylate (Tykerb) were empty - Blank Descriptors

Tykerb was not found in the ChemSpider DB 


There was no search result for Unituxin - 'NoneType' object has no attribute 'next_sibling'

Unituxin was not found in the ChemSpider DB 

(247) Uridine Triacetate was found and saved to the dictionary 

There was more than one search result for VAC

	(248) VAC was found and saved to the dictionary
	(249) VAC was found and saved to the dictionary
	(250) VAC was found and saved to the dictionary
(251) Valrubicin was found and saved to the dictionary 

(252) Valstar was found and saved to the dictionary 

(253) Vandetanib was found and saved to the dictionary 


There was no search result for VAMP - 'NoneType' object has no attribute 'next_sibling'

VAMP was not found in the ChemSpider DB 


The molecular descriptors 

In [None]:
import csv #import output cell into csv file
with open('/Users/Downloads/CSDB_output.csv','w',newline="") as f:
    writecsv = csv.writer(f,delimiter=',')
    writecsv.writerow(_oh[66]) #_oh is the previous output of a cell

In [6]:
#obtain drug list for iterative search
drug_list = 'https://www.cancer.gov/about-cancer/treatment/drugs'
uClient_drug = ureq(drug_list)
drug_html= uClient_drug.read()
uClient_drug.close()
soup_drug = soup(drug_html,'html.parser')
tmp_drugList = []
for drug_names in soup_drug.findAll('ul',{'class':"no-bullets no-description"}):
    tmp_drugList.append(drug_names.text.strip())
drugList = []
for drugs in tmp_drugList:
    drugList.append(drugs.split('\n'))
drugList = list(chain(*drugList))
plaintxt_druglist = []
for eachdrug in drugList:
    if eachdrug != '[No Entries]':
        plaintxt_druglist.append(eachdrug.split('(')[0].strip().replace(' ','+'))
#append each drug name after splitting string from the text in parentheses and replacing space with a + sign

In [232]:
n = 1
create_connection("CSdb.db")
with conn:
    cur=conn.cursor()
    create_table(conn)
    skip=True
    while skip:
        try:
            skiprepeats = str(input('Do you want to skip all the existing entries and only add new ones? '))
        except ValueError:
            print('\n\tEnter a valid entry (y/n).\n')
            continue
        if skiprepeats.lower() not in ['y','n']:
            print('\n\tEnter a valid entry (y/n).\n')
            continue
        else:
            skip=False
    for k in outerDict.keys():
        cur.execute("SELECT count(*) FROM CStable WHERE ChemSpider_ID = ?", (k,))
        data=cur.fetchone()[0]
        if data==0:
            try:
                data_entry(conn,(outerDict[k]['ChemSpider ID'],outerDict[k]['InChI string'],outerDict[k]['Std InChI'],
                            outerDict[k]['Molecular Formula'],outerDict[k]['Average Mass (Da)'].replace(' Da',''),
                            outerDict[k]['SMILES'],outerDict[k]['Common Name'],outerDict[k]['Systematic Name'],
                            outerDict[k]['LogP'],outerDict[k]['H Bond Donors'],outerDict[k]['H Bond Acceptors'],
                            outerDict[k]['Num. Rotatable Bonds'],outerDict[k]['Lipinski Rule of 5'],
                            outerDict[k]['Polar Surface Area'],
                            outerDict[k]['Enthalpy of Vapourisation'],outerDict[k]['Density'],outerDict[k]['Boiling Point']), n, k)
                n+=1
            except sqlite3.IntegrityError:
                print('\n{} - {} is already in CStable database, unique constraint failed'.format(n,k))
                loopupdate(outerDict,n,k)
        elif data!=0 and skiprepeats =='y': #enty exists in Table and skip all repeats
            pass
            print('\n({}) No new entires to add to the SQL Table.'.format(n))
            n+=1
        elif data!=0 and skiprepeats=='n': #if the CSID exists, and want to update
            loopupdate(outerDict,n,k)


sqlite3 version: 2.6.0

Do you want to skip all the existing entries and only add new ones? y
(1) 64870107 -> Saved to SQL DB
(2) 3913 -> Saved to SQL DB
(3) 59069 -> Saved to SQL DB
(4) 5293751 -> Saved to SQL DB
(5) 25069683 -> Saved to SQL DB
(6) 71301 -> Saved to SQL DB
(7) 19879943 -> Saved to SQL DB
(8) 21106301 -> Saved to SQL DB
(9) 38772329 -> Saved to SQL DB
(10) 38201 -> Saved to SQL DB
(11) 65323053 -> Saved to SQL DB
(12) 154044 -> Saved to SQL DB
(13) 103 -> Saved to SQL DB
(14) 67690 -> Saved to SQL DB
(15) 39117 -> Saved to SQL DB
(16) 54847 -> Saved to SQL DB
(17) 25027391 -> Saved to SQL DB
(18) 5293228 -> Saved to SQL DB
(19) 181006 -> Saved to SQL DB
(20) 7999567 -> Saved to SQL DB
(21) 5784 -> Saved to SQL DB
(22) 140 -> Saved to SQL DB
(23) 571356 -> Saved to SQL DB
(24) 4067 -> Saved to SQL DB
(25) 4953629 -> Saved to SQL DB
(26) 24531930 -> Saved to SQL DB
(27) 130656 -> Saved to SQL DB
(28) 8289501 -> Saved to SQL DB
(29) 393879 -> Saved to SQL DB
(30) 8486772 

In [15]:
#conn=create_connection('CSdb.db')
with conn:
    cnt=2
    key='33395'
    update_data(conn,(outerDict[key]['InChI string'],outerDict[key]['Std InChI'],
                outerDict[key]['Molecular Formula'],outerDict[key]['Average Mass (Da)'].replace(' Da',''),
                outerDict[key]['SMILES'],'quest que tu fais?',outerDict[key]['Systematic Name'],
                outerDict[key]['LogP'],outerDict[key]['H Bond Donors'],outerDict[key]['H Bond Acceptors'],
                outerDict[key]['Num. Rotatable Bonds'],outerDict[key]['Lipinski Rule of 5'],
                outerDict[key]['Polar Surface Area'],
                outerDict[key]['Enthalpy of Vapourisation'],outerDict[key]['Density'],outerDict[key]['Boiling Point'],key), cnt, key)

(2) 33395 -> Updated SQL DB


In [12]:
with conn:
    cnt=2
    key='29340700'
    update_name(conn,('testing',key))

Common Name for CS_ID: 29340700 was changed to testing


In [78]:
unlock_db('CSdb.db')

sqlite3 version: 2.6.0

Connection closed


In [11]:
#conn = create_connection('CSdb.db')
with conn:
    key='33395'
    cursor = conn.cursor()
    cursor.execute("SELECT * FROM CStable WHERE ChemSpider_ID = {};".format(key))
    rows = cursor.fetchall()
    if rows:
        for row in rows:
            print(row)

(33395, 'InChI=1S/C47H51NO14/c1-25-31(60-43(56)36(52)35(28-16-10-7-11-17-28)48-41(54)29-18-12-8-13-19-29)23-47(57)40(61-42(55)30-20-14-9-15-21-30)38-45(6,32(51)22-33-46(38,24-58-33)62-27(3)50)39(53)37(59-26(2)49)34(25)44(47,4)5/h7-21,31-33,35-38,40,51-52,57H,22-24H2,1-6H3,(H,48,54)/t31-,32-,33+,35-,36+,37-,38-,40-,45+,46-,47+/m0/s1', 'RCINICONZNJXQFLOQTUHTGSA-N', 'C47H51NO14', 853.906, 'CC1=C2[C@@H](C(=O)[C@@]3([C@H](C[C@@H]4[C@]([C@H]3[C@@H]([C@@](C2(C)C)(C[C@@H]1OC(=O)[C@@H]([C@H](c5ccccc5)NC(=O)c6ccccc6)O)O)OC(=O)c7ccccc7)(CO4)OC(=O)C)O)C)OC(=O)C', 'Abraxane / (5β,7β,10α,13α)-4,10-bis(acetyloxy)-13-{[(2R,3S)-3-(benzoylamino)-2-hydroxy-3-phenylpropanoyl]oxy}-1,7-dihydroxy-9-oxo-5,20-epoxytax-11-en-2-yl benzoate', '(2alpha,5beta,7beta,10alpha,13alpha)-4,10-Diacetoxy-13-{[(2R,3S)-3-(benzoylamino)-2-hydroxy-3-phenylpropanoyl]oxy}-1,7-dihydroxy-9-oxo-5,20-epoxytax-11-en-2-yl benzoate', 7.38, 4, 15, 14, 3, '221 Å2', '146.0±3.0 kJ/mol', '1.4±0.1 g/cm3', '957.1±65.0 °C at 760 mmHg')


In [240]:
#conn=create_connection('CSdb.db')       Common_Name,ChemSpider_ID
pd.read_sql('''SELECT * FROM CStable''',conn)


Unnamed: 0,ChemSpider_ID,InChI_string,Std_InChI,Molecular_Formula,Average_Mass,SMILES,Common_Name,Systematic_Name,logP,H_Bond_Donors,H_Bond_Acceptors,Num_Rota_Bonds,Lipinski_Rule_5,Polar_Surface_Area,Enthalpy_Vap,Density,Boiling_Point
0,103,"InChI=1S/C3H5O7P/c4-2(3(5)6)1-10-11(7,8)9/h1H2...",LFLUCDOSQPJJBEUHFFFAOYSA-N,C3H5O7P,184.041,C(C(=O)C(=O)O)OP(=O)(O)O,HPV / 3-Phosphonooxypyruvic acid,2-Oxo-3-(phosphonooxy)propanoic acid,-2.91,3,7,4,0,131 Å2,75.5±6.0 kJ/mol,1.9±0.1 g/cm3,432.4±47.0 °C at 760 mmHg
1,134,"InChI=1S/C5H9NO3/c6-3-4(7)1-2-5(8)9/h1-3,6H2,(...",ZGXJTSGNIOSYLOUHFFFAOYSA-N,C5H9NO3,131.130,C(CC(=O)O)C(=O)CN,Aminolevulinic Acid,5-Amino-4-oxopentanoic acid,-0.93,3,4,4,0,80 Å2,59.2±6.0 kJ/mol,1.2±0.1 g/cm3,298.4±20.0 °C at 760 mmHg
2,140,InChI=1S/C20H23N7O7/c21-20-25-16-15(18(32)26-2...,VVIAGPKUTFNRDUUHFFFAOYSA-N,C20H23N7O7,473.439,c1cc(ccc1C(=O)NC(CCC(=O)O)C(=O)O)NCC2CNc3c(c(=...,Leucovorin / folinic acid,"N-(4-{[(2-Amino-5-formyl-4-oxo-1,4,5,6,7,8-hex...",-3.19,8,14,9,2,216 Å2,,1.7±0.1 g/cm3,
3,185,InChI=1S/C5H5N5/c6-4-3-5(9-1-7-3)10-2-8-4/h1-2...,GFFGJBXGBJISGVUHFFFAOYSA-N,C5H5N5,135.127,c1[nH]c(c-2ncnc2n1)N,ADE / Adenine,1H-Purin-6-amine,-2.12,3,5,0,0,75 Å2,48.0±3.0 kJ/mol,1.9±0.1 g/cm3,243.2±50.0 °C at 760 mmHg
4,292,InChI=1S/C11H12Cl2N2O5/c12-10(13)11(18)14-8(5-...,WIIZWVCIJKGZOKUHFFFAOYSA-N,C11H12Cl2N2O5,323.129,c1cc(ccc1C(C(CO)NC(=O)C(Cl)Cl)O)[N+](=O)[O-],"CAF / 2,2-Dichloro-N-[1,3-dihydroxy-1-(4-nitro...","2,2-Dichloro-N-[1,3-dihydroxy-1-(4-nitrophenyl...",1.02,3,7,6,0,115 Å2,100.0±3.0 kJ/mol,1.5±0.1 g/cm3,644.9±55.0 °C at 760 mmHg
5,659,InChI=1S/C2H6OS/c1-4(2)3/h1-2H3,IAZDPXIOMUYVGZUHFFFAOYSA-N,C2H6OS,78.133,CS(=O)C,Sclerosol / Dimethyl sulfoxide,(Methylsulfinyl)methane,-1.35,0,1,0,0,36 Å2,40.8±3.0 kJ/mol,1.1±0.1 g/cm3,189.0±9.0 °C at 760 mmHg
6,937,InChI=1S/H2O/h1H2,XLYOFNOQVPJJNPUHFFFAOYSA-N,H2O,18.015,O,ICE / Water,Water,-1.38,2,1,0,0,0 Å2,40.7±0.0 kJ/mol,1.0±0.1 g/cm3,100.0±9.0 °C at 760 mmHg
7,989,"InChI=1S/C5H14NO4P/c1-6(2,3)4-5-10-11(7,8)9/h4...",YHHSONZFOIEMCPUHFFFAOYSA-O,C5H15NO4P,184.150,C[N+](C)(C)CCOP(=O)(O)O,CHOP / Phosphocholine conjugate acid,"N,N,N-Trimethyl-2-(phosphonooxy)ethanaminium",-4.99,2,5,4,0,77 Å2,,,
8,1628,InChI=1S/C27H29NO11/c1-10-22(31)13(28)6-17(38-...,AOJJSUZBOXZQNBUHFFFAOYSA-N,C27H29NO11,543.519,CC1C(C(CC(O1)OC2CC(Cc3c2c(c4c(c3O)C(=O)c5cccc(...,"Doxil / 3-Glycoloyl-3,5,12-trihydroxy-10-metho...","3-Glycoloyl-3,5,12-trihydroxy-10-methoxy-6,11-...",2.82,7,12,5,3,206 Å2,123.5±3.0 kJ/mol,1.6±0.1 g/cm3,810.3±65.0 °C at 760 mmHg
9,1739,InChI=1S/C8H12N4O5/c9-7-10-2-12(8(16)11-7)6-5(...,NMUSYJAQQFHJEWUHFFFAOYSA-N,C8H12N4O5,244.205,c1nc(nc(=O)n1C2C(C(C(O2)CO)O)O)N,"Vidaza / 4-Amino-1-pentofuranosyl-1,3,5-triazi...","4-Amino-1-pentofuranosyl-1,3,5-triazin-2(1H)-one",-1.99,5,9,2,1,141 Å2,93.2±6.0 kJ/mol,2.1±0.1 g/cm3,534.5±60.0 °C at 760 mmHg


### get pdf from site and then scrape from pdf

In [9]:
topdrugurl = 'https://njardarson.lab.arizona.edu/content/top-pharmaceuticals-poster'
outputloc = "/Users/Documents/Python/"
uClientmain = ureq(topdrugurl)
htmlpg= uClientmain.read()
uClientmain.close()
bsoup_pg = soup(htmlpg,'html.parser')
bsoup_pg

<!DOCTYPE html>

<html dir="ltr" lang="en" prefix="content: http://purl.org/rss/1.0/modules/content/  dc: http://purl.org/dc/terms/  foaf: http://xmlns.com/foaf/0.1/  og: http://ogp.me/ns#  rdfs: http://www.w3.org/2000/01/rdf-schema#  schema: http://schema.org/  sioc: http://rdfs.org/sioc/ns#  sioct: http://rdfs.org/sioc/types#  skos: http://www.w3.org/2004/02/skos/core#  xsd: http://www.w3.org/2001/XMLSchema# ">
<head>
<meta charset="utf-8"/>
<meta content="Drupal 8 (https://www.drupal.org)" name="Generator"/>
<meta content="width" name="MobileOptimized"/>
<meta content="true" name="HandheldFriendly"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<link href="/sites/njardarson.lab.arizona.edu/themes/njardarson/favicon.ico" rel="shortcut icon" type="image/vnd.microsoft.icon"/>
<link href="/content/top-pharmaceuticals-poster" rel="canonical"/>
<link href="/node/140" rel="shortlink"/>
<link href="/content/top-pharmaceuticals-poster" rel="revision"/>
<title>Top P

In [10]:
#https://njardarson.lab.arizona.edu/sites/njardarson.lab.arizona.edu/files/2016Top200PharmaceuticalPrescriptionSalesPosterLowResV2.pdf
#https://njardarson.lab.arizona.edu/sites/njardarson.lab.arizona.edu/files/Top200%20Pharmacetical%20Products%20by%20US%20Prescription%20in%202012_0.pdf
dl_links={}
for a in bsoup_pg.find_all(href=re.compile('.pdf')):
    filename = a.next
    link = a['href']
    if link.startswith('/sites/'):
        link = 'https://' + link[7:].replace(' ','%').replace('files','sites/njardarson.lab.arizona.edu/files')
        dl_links[filename]=link

In [228]:
dl_links

{'Top 100 US Prescription and Brand Name Drugs Products': 'https://njardarson.lab.arizona.edu/sites/njardarson.lab.arizona.edu/files/Top%20US%20Pharmaceutical%20Products%20of%202013.pdf',
 'Top 200 Brand Name Drugs by Prescription in 2016': 'https://njardarson.lab.arizona.edu/sites/njardarson.lab.arizona.edu/files/2016Top200PharmaceuticalPrescriptionSalesPosterLowResV2.pdf',
 'Top 200 Brand Name Drugs by Retail Sales in 2015': 'https://njardarson.lab.arizona.edu/sites/njardarson.lab.arizona.edu/files/Top200PharmaceuticalProductsRetailSales2015LowRes.pdf',
 'Top 200 Brand Name Drugs by Retail Sales in 2016': 'https://njardarson.lab.arizona.edu/sites/njardarson.lab.arizona.edu/files/2016Top200PharmaceuticalRetailSalesPosterLowResV3_0.pdf',
 'Top 200 Brand Name Drugs by Total US Prescriptions in 2010': 'https://njardarson.lab.arizona.edu/sites/njardarson.lab.arizona.edu/files/Top%200%Brand-name%Drugs%by%Total%US%Prescriptions%in%2010sm_0.pdf',
 'Top 200 Brand Name Drugs by Total US Prescr

In [233]:
'''download each pdf'''
import urllib.request
for fname,dlink in dl_links.items():
    try:
        with open('{}.pdf'.format(fname),'wb'):
            uret(dlink,'{}.pdf'.format(fname))
        print('Saved {} successfully!'.format(fname))
    except urllib.error.HTTPError:
        with open('{}.pdf'.format(fname),'wb'):
            uret(dlink.replace('%','%20'),'{}.pdf'.format(fname))
        print('Saved {} successfully (after tweaking url)!'.format(fname))

Saved Top 200 Brand Name Drugs by Prescription in 2016 successfully!
Saved Top 200 Brand Name Drugs by Retail Sales in 2016 successfully!
Saved Top 200 Brand Name Drugs by Retail Sales in 2015 successfully!
Saved Top 100 US Prescription and Brand Name Drugs Products successfully!
Saved Top 200 Brand Name Drugs by US Retail Sales in 2012 successfully (after tweaking url)!
Saved Top 200 Brand Name Drugs by Total US Prescriptions in 2012 successfully (after tweaking url)!
Saved Top 200 Brand Name Drugs by US Retail Sales in 2011 successfully (after tweaking url)!
Saved Top 200 Brand Name Drugs by Total US Prescriptions in 2011 successfully (after tweaking url)!
Saved Top 200 Brand Name Drugs by Total US Prescriptions in 2010 successfully (after tweaking url)!
Saved Top 200 Brand Name Drugs by US Retail Sales in 2010 successfully (after tweaking url)!
Saved Top 200 Pharmaceutical Products by Worldwide Sales in 2009 successfully!
Saved Top 200 Pharmaceutical Products by Worldwide Sales in 2

In [11]:
def getPDF(path):
    pdf=fitz.open(path)
    return pdf

def getMeta(pdf):    
    meta=pdf.metadata
    return meta

def getContent(pdf):
    content = ""
    if pdf.pageCount == 1:
        content = pdf.getPageText(0,"text")
    else:
        for i in range(0, pdf.pageCount-1):
            content += pdf.getPageText(i,"text")
    return content.strip().split(' ')

def dplyContent(filepath):
    try:
        pdf=getPDF(filepath)
        meta=getMeta(pdf)
        t=getContent(pdf)
        return t
    except Error as e:
        print(e)

In [12]:
def tidytxt(pdfile):
    filecontent = dplyContent(pdfile)
    drglst=[]
    for i in filecontent:
        if re.search('\d,\d{1,4}K',i) or re.match('\d',i) or re.search('[(|)]',i):
            pass
        elif re.match('Scripts',i,re.IGNORECASE):
            pass
        elif i == i.upper() and len(i)>4:
            pass 
        else:
            drglst.append(i)

    drglst_ = []
    for x in [x for x in drglst if x]:
        if re.search('Scripts\n',x):
            x = x.split('\n')
            drglst_.append(x[1])
        elif re.search('.\n',x):
            drglst_.append(x.split('\n')[0])
        elif '/' in x:
            drg = x.split('/')
            drglst_.append(drg[0])
            drglst_.append(drg[1])
        elif x.isalnum():
            drglst_.append(x)
    try:
        drglst_ = [x for x in drglst_ if x and not rexfind(x,rmkeywrd) and firstCap(x)]
    except:
        drglst_ = [x for x in drglst_ if x and firstCap(x)]
        drglst_ = [x for x in drglst_ if not rexfind(x,rmkeywrd)]
    return drglst_

'''deal with case sensitivity'''
def rexfind(word,rmlst):
    r = re.compile('{}'.format(word),re.IGNORECASE)
    rmatches = list(filter(r.match, rmlst))
    if len(rmatches) >0:
        return True
    else:
        return False

def firstCap(word):
    try:
        firstletter = re.match('^([A-Z]){1}([a-z]{3,})$',word)
        if firstletter:
            return True
        else:
            return False
    except:
        return False

In [15]:
combDrglst=[]
for title in dl_links.keys():
    tmplst = tidytxt('{}.pdf'.format(title))
    combDrglst.append(tmplst)
combDrglst = set([x for l in combDrglst for x in l])
combDrglst

{'Adoxa',
 'Alfa',
 'Intuniv',
 'Rosuvastatin',
 'Patanol',
 'Psychostimulants',
 'Vasotec',
 'Seroquel',
 'Cilostazol',
 'Palivizumab',
 'Robaxin',
 'Flecainide',
 'Avastin',
 'Nitrofurantoin',
 'Oxybutynin',
 'Lexiscan',
 'Chlorhexidine',
 'Vimpat',
 'Lamivudine',
 'Votrient',
 'Nifedical',
 'Propranolol',
 'Etexilate',
 'Entecavir',
 'Goserelin',
 'Farxiga',
 'Carvedilol',
 'Actiq',
 'Caspofungin',
 'Ofloxacin',
 'Lexapro',
 'Otezla',
 'Alprazolam',
 'Dapagliflozin',
 'Valium',
 'Acarbose',
 'Diclofenac',
 'Virus',
 'Prometrium',
 'Topicort',
 'Fingolimod',
 'Recombinate',
 'Pristiq',
 'Berlex',
 'Revlimid',
 'Raltegravir',
 'Trulicity',
 'Amoxil',
 'Clozaril',
 'Nilstat',
 'Imiglucerase',
 'Lidex',
 'Cephalon',
 'Tiotropium',
 'Budesonide',
 'Besylate',
 'Entyvio',
 'Piperacillin',
 'Apresoline',
 'Trandate',
 'Amitriptyline',
 'Azacitidine',
 'Tekturna',
 'Procrit',
 'Sandostatin',
 'Guanfacine',
 'Advil',
 'Arimidex',
 'Salamol',
 'Neurontin',
 'Desmopressin',
 'Synagis',
 'Tussi

In [16]:
'''write main drug list as text file so no need to re-load all cells again'''
import csv 
with open('combDrgLst.txt','w',newline='') as f:
    writecsv = csv.writer(f)
    for dr in combDrglst:
        writecsv.writerow([dr]) 

In [5]:
import csv
combDrgLst_=[]
with open('combDrgLst.txt','r',newline='') as f:
    readcsv=csv.reader(f,delimiter='\n')
    for dr in readcsv:
        combDrgLst_.append(dr)
combDrgLst_ = [x for sl in combDrgLst_ for x in sl]
combDrgLst_

['Adoxa',
 'Alfa',
 'Intuniv',
 'Rosuvastatin',
 'Patanol',
 'Psychostimulants',
 'Vasotec',
 'Seroquel',
 'Cilostazol',
 'Palivizumab',
 'Robaxin',
 'Flecainide',
 'Avastin',
 'Nitrofurantoin',
 'Oxybutynin',
 'Lexiscan',
 'Chlorhexidine',
 'Vimpat',
 'Lamivudine',
 'Votrient',
 'Nifedical',
 'Propranolol',
 'Etexilate',
 'Entecavir',
 'Goserelin',
 'Farxiga',
 'Carvedilol',
 'Actiq',
 'Caspofungin',
 'Ofloxacin',
 'Lexapro',
 'Otezla',
 'Alprazolam',
 'Dapagliflozin',
 'Valium',
 'Acarbose',
 'Diclofenac',
 'Virus',
 'Prometrium',
 'Topicort',
 'Fingolimod',
 'Recombinate',
 'Pristiq',
 'Berlex',
 'Revlimid',
 'Raltegravir',
 'Trulicity',
 'Amoxil',
 'Clozaril',
 'Nilstat',
 'Imiglucerase',
 'Lidex',
 'Cephalon',
 'Tiotropium',
 'Budesonide',
 'Besylate',
 'Entyvio',
 'Piperacillin',
 'Apresoline',
 'Trandate',
 'Amitriptyline',
 'Azacitidine',
 'Tekturna',
 'Procrit',
 'Sandostatin',
 'Guanfacine',
 'Advil',
 'Arimidex',
 'Salamol',
 'Neurontin',
 'Desmopressin',
 'Synagis',
 'Tussi

In [14]:
'''newest rmkeywrd list'''
with open('rmkeywrd_2.txt','r') as f:
    rmkeywrd = f.readlines()
rmkeywrd = [x.strip() for x in rmkeywrd]
rmkeywrd = set(rmkeywrd)
rmkeywrd = [x for x in rmkeywrd if x]
rmkeywrd

['Gilead',
 'Organ',
 'Antihypocalcemic',
 'Plough',
 'Antihyperlipidermic',
 'Blood',
 'Ophthalmology',
 'Complement',
 'Antihyperlipidemic',
 'Preps',
 'Ortho',
 'Novo',
 'Sulfate',
 'Inhibitor',
 'Pack',
 'Factor',
 'Forming',
 'Lindsay',
 'Laura',
 'Zeneca',
 'Antibacterial',
 'Reference',
 'Cytostatic',
 'Pharmaceutical',
 'Tract',
 'Biogen',
 'Anticonvulsant',
 'Immunology',
 'Glaucoma',
 'Contraceptive',
 'Erik',
 'Sclerosis',
 'Systemic',
 'Sankyo',
 'Corticoids',
 'Mood',
 'Comb',
 'Omnicef',
 'Million',
 'Dietary',
 'Gastrointestinal',
 'Aldara',
 'Produced',
 'Strips',
 'Roche',
 'Used',
 'Hour',
 'Fibrosis',
 'Dysfunction',
 'Essential',
 'Hoffmann',
 'Haziq',
 'Magnesium',
 'Solution',
 'Hormonal',
 'Formulas',
 'Hypothyroidism',
 'Incorporated',
 'Potassium',
 'Regulators',
 'Disorders',
 'Respiratory',
 'Maleate',
 'Vaccines',
 'Opioid',
 'Granules',
 'Review',
 'Urinary',
 'Syringe',
 'Alexandra',
 'Alimentary',
 'Jason',
 'Celgene',
 'Analgesic',
 'Relaxant',
 'Cardiov

In [105]:
'''remove words that didnt have a captial first letter in text file'''
rmlst=[]
for i in rmkeywrd:
    if firstCap(i):
        rmlst.append(i)
    else:
        pass
with open('rmkeywrd_2.txt','w') as f:
    for i in rmlst:
        f.write(i+'\n')

# multiprocess search new drug list

In [6]:
searchlink = 'http://www.chemspider.com/Search.aspx?q='
count = 1
manager = Manager()
emptyDrugList = manager.list()
outerDict = manager.dict()
tmpComDrgLst = combDrgLst_[:16]

def procDrgSrch(eachdrug,emptyDrugList,outerDict):
    uClient=ureq(searchlink+eachdrug)
    pagehtml = uClient.read()
    soupPage = soup(pagehtml,'html.parser')
    process_search(eachdrug,soupPage)
    #return (outerDict,emptyDrugList)

In [11]:
emdl =[]
d=dict()
procDrgSrch('taxol',emdl,d)

RecursionError: maximum recursion depth exceeded while calling a Python object

In [7]:
for i in range(0,len(tmpComDrgLst),4):
        p1 = Process(target=procDrgSrch, args=(tmpComDrgLst[i],emptyDrugList,outerDict))
        p2 = Process(target=procDrgSrch, args=(tmpComDrgLst[i+1],emptyDrugList,outerDict))
        p3 = Process(target=procDrgSrch, args=(tmpComDrgLst[i+2],emptyDrugList,outerDict))
        p4 = Process(target=procDrgSrch, args=(tmpComDrgLst[i+3],emptyDrugList,outerDict))
        p1.start()
        p2.start()
        p3.start()
        p4.start()
        p1.join()
        p2.join()
        p3.join()
        p4.join()
        
print(outerDict)

Adoxa was not found in the ChemSpider DB 

Rosuvastatin was not found in the ChemSpider DB 


There was no search result for Alfa - 'NoneType' object has no attribute 'next_sibling'

Alfa was not found in the ChemSpider DB 

There was more than one search result for Intuniv


The molecular descriptors for guanfacine hydrochloride (Intuniv) were empty - Blank Descriptors



Process Process-4:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-6-47c511e938db>", line 12, in procDrgSrch
    process_search(eachdrug,soupPage)
  File "<ipython-input-4-020ff8e972eb>", line 42, in process_search
    outerDict[selected_data['ChemSpider ID']] = selected_data
  File "<string>", line 2, in __setitem__
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/managers.py", line 756, in _callmethod
    conn.send((self._id, methodname, args, kwds))
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/connection.py", line 206, in send
    self._send_bytes(_ForkingPickler.dumps(obj))
  File "/Library/Fra


The molecular descriptors for VASOTEC (Vasotec) were empty - Blank Descriptors

Vasotec was not found in the ChemSpider DB 

There was more than one search result for Seroquel

There was more than one search result for Patanol


There was no search result for Psychostimulants - 'NoneType' object has no attribute 'next_sibling'

Psychostimulants was not found in the ChemSpider DB 



Process Process-9:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-6-47c511e938db>", line 12, in procDrgSrch
    process_search(eachdrug,soupPage)
  File "<ipython-input-4-020ff8e972eb>", line 42, in process_search
    outerDict[selected_data['ChemSpider ID']] = selected_data
  File "<string>", line 2, in __setitem__
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/managers.py", line 756, in _callmethod
    conn.send((self._id, methodname, args, kwds))
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/connection.py", line 206, in send
    self._send_bytes(_ForkingPickler.dumps(obj))
  File "/Library/Fra


The molecular descriptors for AL 4943A (Patanol) were empty - Blank Descriptors



Process Process-6:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-6-47c511e938db>", line 12, in procDrgSrch
    process_search(eachdrug,soupPage)
  File "<ipython-input-4-020ff8e972eb>", line 42, in process_search
    outerDict[selected_data['ChemSpider ID']] = selected_data
  File "<string>", line 2, in __setitem__
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/managers.py", line 756, in _callmethod
    conn.send((self._id, methodname, args, kwds))
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/connection.py", line 206, in send
    self._send_bytes(_ForkingPickler.dumps(obj))
  File "/Library/Fra

Robaxin was not found in the ChemSpider DB 

Flecainide was not found in the ChemSpider DB 


There was no search result for Palivizumab - 'NoneType' object has no attribute 'next_sibling'

Palivizumab was not found in the ChemSpider DB 

Cilostazol was not found in the ChemSpider DB 

There was more than one search result for Lexiscan

Oxybutynin was not found in the ChemSpider DB 


There was no search result for Avastin - 'NoneType' object has no attribute 'next_sibling'

Avastin was not found in the ChemSpider DB 

Nitrofurantoin was not found in the ChemSpider DB 



Process Process-17:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-6-47c511e938db>", line 12, in procDrgSrch
    process_search(eachdrug,soupPage)
  File "<ipython-input-4-020ff8e972eb>", line 42, in process_search
    outerDict[selected_data['ChemSpider ID']] = selected_data
  File "<string>", line 2, in __setitem__
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/managers.py", line 756, in _callmethod
    conn.send((self._id, methodname, args, kwds))
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/connection.py", line 206, in send
    self._send_bytes(_ForkingPickler.dumps(obj))
  File "/Library/Fr

{}


In [67]:
procDrgSrch('taxol')

(1) taxol was found and saved to the dictionary 



In [75]:
with Pool() as p:
    p.map(procDrgSrch,combDrgLst_[:3])

(1) Adoxa was found and saved to the dictionary 


There was no search result for Alfa - 'NoneType' object has no attribute 'next_sibling'

Alfa was not found in the ChemSpider DB 

There was more than one search result for Intuniv


The molecular descriptors for guanfacine hydrochloride (Intuniv) were empty - Blank Descriptors

	(1) Intuniv was found and saved to the dictionary


In [45]:
def group(iterator,count):
    itr = iter(iterator)
    while True:
        yield list([itr.__next__() for i in range(count)])

In [48]:
'''remove water from dictionary'''
for s in outerDict.keys():
    q = outerDict[s]['Common Name']
    m = re.search('water',q,re.IGNORECASE)
    if m:
        print(s)
        outerDict.pop(s) #or del outerDict['937]

In [43]:
emptyDrugList

['ABVD',
 'ABVE',
 'ABVE-PC',
 'AC',
 'AC-T',
 'Actemra',
 'Adcetris',
 'Ado-Trastuzumab Emtansine',
 'Afatinib Dimaleate',
 'Akynzeo',
 'Aldesleukin',
 'Alectinib hydrochloride',
 'Alemtuzumab',
 'Alimta',
 'Aliqopa',
 'Alkeran for Injection',
 'Alkeran Tablets',
 'Alunbrig',
 'Ameluz',
 'Aranesp',
 'Pamidronate disodium',
 'Sodium (3-amino-1-hydroxy-1,1-propanediyl)bis[hydrogen (phosphonate)] hydrate (2:1:5)',
 'Arsenic trioxide',
 'Arsenic trioxide',
 'Arsenic trioxide',
 'Arzerra',
 'Asparaginase Erwinia chrysanthemi',
 'Atezolizumab',
 'Avastin',
 'Avelumab',
 'Axicabtagene Ciloleucel',
 'Azedra',
 'Bavencio',
 'BEACOPP',
 'Belinostat',
 'Bendamustine Hydrochloride',
 'Bendeka',
 'BEP',
 'Besponsa',
 'Bevacizumab',
 'Bleomycin',
 'Blinatumomab',
 'Blincyto',
 'bosutinib hydrate',
 'Braftovi',
 'Brentuximab Vedotin',
 'Brigatinib',
 'BuMel',
 'Cabazitaxel',
 'Cabometyx',
 'Cabozantinib-S-Malate',
 'Campath',
 '06X131E4OE',
 'CAPOX',
 'Carboplatin',
 'CARBOPLATIN-TAXOL',
 'Carmustin

In [63]:
#testing 
rmidx=[]
for i,j in enumerate(emptyDrugList):
    if '[' in j or ']' in j:
        
        rmidx.append(i)
string = [e for i,e in enumerate(emptyDrugList) if i not in rmidx]
' '.join(e for e in string if e.isalnum() and re.search('^[A-Za-z]{3,}',e))

'ABVD ABVE Actemra Adcetris Akynzeo Aldesleukin Alemtuzumab Alimta Aliqopa Alunbrig Ameluz Aranesp Arzerra Atezolizumab Avastin Avelumab Azedra Bavencio BEACOPP Belinostat Bendeka BEP Besponsa Bevacizumab Bleomycin Blinatumomab Blincyto Braftovi Brigatinib BuMel Cabazitaxel Cabometyx Campath CAPOX Carboplatin Cerubidine Cervarix CEV Chlorambucil CMF Cobimetinib COPDAC COPP Cyclophosphamide Cyramza Dactinomycin Daratumumab Darzalex Dasatinib Defitelio Denosumab Dinutuximab Durvalumab Eligard Elitek Ellence Elotuzumab Oxaliplatin Oxaliplatin Empliciti Enzalutamide EPOCH Epogen Erbitux Erleada Erwinaze Evacet Evomela Filgrastim Fluoroplex FOLFIRI FOLFIRINOX FOLFOX Gardasil Gazyva Gleevec Glucarpidase Granix Herceptin Idhifa Imfinzi Imlygic Jakafi JEB Jevtana Kadcyla Kepivance Keytruda Kisqali Kymriah Lartruvo Lenvima LipoDox Lonsurf ELIGARD Lutathera Lutetium Lutetium Mektovi Mesna Mesnex MOPP MVAC Mylotarg Navelbine Necitumumab Nerlynx Neulasta Neupogen Nexavar Ninlaro Nivolumab Nplate O

In [208]:
def tidyTxtEmptyLst(strname):
    try:
        if re.search("^\((.+)\)",strname):
            tstrname = strname.split(')')[1]
            if ' ' in tstrname:
                tstrname = tstrname.split(' ')
                dictnword = {'word{}'.format(i): [] for i in range(len(tstrname))}
                tstrf=[]
                for n in range(len(tstrname)):
                    dictnword['word{}'.format(n)] = [strs for strs in tstrname[n] if strs.isalnum()] #check if alphanumeric
                    tstrf.append(dictnword['word{}'.format(n)])
                tstrname = ' '.join([''.join(x) for x in tstrf]) #join each element in inner list, then join with spaces for the outer list
                try: 
                    tstrname = re.split('\d',tstrname)[0]
                except:
                    return None
        elif ' ' in strname:    
            tstrname = strname.split(' ')[0]
        elif re.search('(.-.)', strname):
            tstrname = strname.replace(' ','').split('-')[0]
            if ',' in tstrname:
                tstrname=tstrname.split(',')[0]
            elif ';' in tstrname:
                tstrname=tstrname.split(';')[0]
        else:
            return strname
        return tstrname
    except Error as e:
        print(e)

In [224]:
'''clean up text of the list of no serach results in order to feed into web-scraping function'''
repeatDrgLst=[]
repeatDrgSet =set()
for i,j in enumerate(emptyDrugList):
    if ('[' in j or ']' in j):
        pass
    else:
        tmpdrug =cleantxt(j)
        if tmpdrug not in repeatDrgSet and re.search('^[A-Za-z]{3,}',tmpdrug) and tmpdrug.isalnum():
            repeatDrgLst.append(tmpdrug)
        repeatDrgSet.add(tmpdrug)
repeatDrgLst

['ABVD',
 'ABVE',
 'Actemra',
 'Adcetris',
 'Afatinib',
 'Akynzeo',
 'Aldesleukin',
 'Alectinib',
 'Alemtuzumab',
 'Alimta',
 'Aliqopa',
 'Alkeran',
 'Alunbrig',
 'Ameluz',
 'Aranesp',
 'Pamidronate',
 'Arsenic',
 'Arzerra',
 'Asparaginase',
 'Atezolizumab',
 'Avastin',
 'Avelumab',
 'Axicabtagene',
 'Azedra',
 'Bavencio',
 'BEACOPP',
 'Belinostat',
 'Bendamustine',
 'Bendeka',
 'BEP',
 'Besponsa',
 'Bevacizumab',
 'Bleomycin',
 'Blinatumomab',
 'Blincyto',
 'bosutinib',
 'Braftovi',
 'Brentuximab',
 'Brigatinib',
 'BuMel',
 'Cabazitaxel',
 'Cabometyx',
 'Cabozantinib',
 'Campath',
 'CAPOX',
 'Carboplatin',
 'CARBOPLATIN',
 'Carmustine',
 'Cerubidine',
 'Cervarix',
 'Indacaterol',
 'CEV',
 'Chlorambucil',
 'CHLORAMBUCIL',
 'CMF',
 'Cobimetinib',
 'cabozantinib',
 'Copanlisib',
 'COPDAC',
 'COPP',
 'Actinomycin',
 'Cyclophosphamide',
 'Cyramza',
 'Cytarabine',
 'Dactinomycin',
 'Daratumumab',
 'Darbepoetin',
 'Darzalex',
 'Dasatinib',
 'Daunorubicin',
 'Defibrotide',
 'Defitelio',
 'Den

In [225]:
secEmpDrgLst=[]
for eachdrug in repeatDrgLst:
        uClient=ureq(searchlink+eachdrug)
        pagehtml = uClient.read()
        soupPage_eachDrug = soup(pagehtml,'html.parser')
        count = process_search(eachdrug,soupPage_eachDrug,outerDict,count,secEmpDrgLst)


The molecular descriptors for ABVD were empty - Blank Descriptors

ABVD was not found in the ChemSpider DB 


There was no search result for ABVE - 'NoneType' object has no attribute 'next_sibling'

ABVE was not found in the ChemSpider DB 


There was no search result for Actemra - 'NoneType' object has no attribute 'next_sibling'

Actemra was not found in the ChemSpider DB 


There was no search result for Adcetris - 'NoneType' object has no attribute 'next_sibling'

Adcetris was not found in the ChemSpider DB 

There was more than one search result for Afatinib

	(279) Afatinib was found and saved to the dictionary
	(280) Afatinib was found and saved to the dictionary

The molecular descriptors for 2-[3,5-Bis(trifluoromethyl)phenyl]-N,2-dimethyl-N-{6-(4-methyl-1-piperazinyl)-4-[(3Z)-1,3-pentadien-3-yl]-3-pyridinyl}propanamide - (3aS)-2-[(3S)-1-azabicyclo[2.2.2]oct-3-yl]-2,3,3a,4,5,6-hexahydro-1H
-benzo[de]isoquinolin-1-one (1:1) (Akynzeo) were empty - Blank Descriptors

Akynzeo was 

	(291) Dasatinib was found and saved to the dictionary

The molecular descriptors for Dasatinib were empty - Blank Descriptors

(292) Daunorubicin was found and saved to the dictionary 


There was no search result for Defibrotide - 'NoneType' object has no attribute 'next_sibling'

Defibrotide was not found in the ChemSpider DB 


There was no search result for Defitelio - 'NoneType' object has no attribute 'next_sibling'

Defitelio was not found in the ChemSpider DB 


There was no search result for Denileukin - 'NoneType' object has no attribute 'next_sibling'

Denileukin was not found in the ChemSpider DB 


There was no search result for Denosumab - 'NoneType' object has no attribute 'next_sibling'

Denosumab was not found in the ChemSpider DB 

(293) Dexrazoxane was found and saved to the dictionary 


There was no search result for Dinutuximab - 'NoneType' object has no attribute 'next_sibling'

Dinutuximab was not found in the ChemSpider DB 

(294) Doxorubicin was found and sav


There was no search result for Kepivance - 'NoneType' object has no attribute 'next_sibling'

Kepivance was not found in the ChemSpider DB 


There was no search result for Keytruda - 'NoneType' object has no attribute 'next_sibling'

Keytruda was not found in the ChemSpider DB 


There was no search result for Kisqali - 'NoneType' object has no attribute 'next_sibling'

Kisqali was not found in the ChemSpider DB 


There was no search result for Kymriah - 'NoneType' object has no attribute 'next_sibling'

Kymriah was not found in the ChemSpider DB 

(315) Lanreotide was found and saved to the dictionary 

(316) Lapatinib was found and saved to the dictionary 


There was no search result for Lartruvo - 'NoneType' object has no attribute 'next_sibling'

Lartruvo was not found in the ChemSpider DB 

(317) Lenvatinib was found and saved to the dictionary 


The molecular descriptors for lenvatinib mesylate (Lenvima) were empty - Blank Descriptors

Lenvima was not found in the ChemSpider


There was no search result for Perjeta - 'NoneType' object has no attribute 'next_sibling'

Perjeta was not found in the ChemSpider DB 


There was no search result for Pertuzumab - 'NoneType' object has no attribute 'next_sibling'

Pertuzumab was not found in the ChemSpider DB 

(335) Ponatinib was found and saved to the dictionary 


There was no search result for Portrazza - 'NoneType' object has no attribute 'next_sibling'

Portrazza was not found in the ChemSpider DB 

(336) Procarbazine was found and saved to the dictionary 


There was no search result for Procrit - 'NoneType' object has no attribute 'next_sibling'

Procrit was not found in the ChemSpider DB 


There was no search result for Proleukin - 'NoneType' object has no attribute 'next_sibling'

Proleukin was not found in the ChemSpider DB 


There was no search result for Prolia - 'NoneType' object has no attribute 'next_sibling'

Prolia was not found in the ChemSpider DB 


The molecular descriptors for Eltrombopag ol


There was no search result for Voraxaze - 'NoneType' object has no attribute 'next_sibling'

Voraxaze was not found in the ChemSpider DB 


The molecular descriptors for Pazopanib Hydrochloride (Votrient) were empty - Blank Descriptors

Votrient was not found in the ChemSpider DB 


There was no search result for Vyxeos - 'NoneType' object has no attribute 'next_sibling'

Vyxeos was not found in the ChemSpider DB 


There was no search result for XELIRI - 'NoneType' object has no attribute 'next_sibling'

XELIRI was not found in the ChemSpider DB 


There was no search result for XELOX - 'NoneType' object has no attribute 'next_sibling'

XELOX was not found in the ChemSpider DB 


There was no search result for Xgeva - 'NoneType' object has no attribute 'next_sibling'

Xgeva was not found in the ChemSpider DB 


The molecular descriptors for Radium Ra 223 dichloride (Xofigo) were empty - Blank Descriptors

Xofigo was not found in the ChemSpider DB 


There was no search result for Yer

In [226]:
len(outerDict.keys())

243

In [103]:
#conn = create_connection("CSdb.db")
CSdf = pd.read_sql('''SELECT ChemSpider_ID,Common_Name,logP,H_Bond_Acceptors,
                   H_Bond_Donors,Num_Rota_Bonds,Lipinski_Rule_5,Polar_Surface_Area FROM CStable''',conn)
CSdf['Polar_Surface_Area'] = CSdf['Polar_Surface_Area'].replace(' \u212B2','',regex=True)
#CSdf['Polar_Surface_Area'] = CSdf['Polar_Surface_Area'].apply(lambda s: int(s.replace(' \u212B2','')))
CSdf

Unnamed: 0,ChemSpider_ID,Common_Name,logP,H_Bond_Acceptors,H_Bond_Donors,Num_Rota_Bonds,Lipinski_Rule_5,Polar_Surface_Area
0,185,"ADE,(Adenine)",-2.12,5,3,0,0,75
1,29400,"Adriamycin,(Doxorubicin)",2.82,12,7,5,3,206
2,33395,"Abraxane,((5β,7β,10α,13α)-4,10-bis(acetyloxy)-...",7.38,15,4,14,3,221
3,51809,"Aldara,(imiquimod)",3.46,4,2,2,0,57
4,7997598,"Abiraterone+Acetate,(Abiraterone acetate)",6.55,3,0,3,1,39
5,21106307,"Afinitor,(everolimus)",3.35,15,3,9,2,205
6,29340700,Abemaciclib,2.74,8,1,7,1,75


In [36]:
conn = create_connection('CSdb.db')
CSdf = pd.read_sql('''SELECT * FROM CStable''',conn)
conn.close()
CSdf

sqlite3 version: 2.6.0



Unnamed: 0,ChemSpider_ID,InChI_string,Std_InChI,Molecular_Formula,Average_Mass,SMILES,Common_Name,Systematic_Name,logP,H_Bond_Donors,H_Bond_Acceptors,Num_Rota_Bonds,Lipinski_Rule_5,Polar_Surface_Area,Enthalpy_Vap,Density,Boiling_Point
0,103,"InChI=1S/C3H5O7P/c4-2(3(5)6)1-10-11(7,8)9/h1H2...",LFLUCDOSQPJJBEUHFFFAOYSA-N,C3H5O7P,184.041,C(C(=O)C(=O)O)OP(=O)(O)O,HPV / 3-Phosphonooxypyruvic acid,2-Oxo-3-(phosphonooxy)propanoic acid,-2.91,3,7,4,0,131 Å2,75.5±6.0 kJ/mol,1.9±0.1 g/cm3,432.4±47.0 °C at 760 mmHg
1,134,"InChI=1S/C5H9NO3/c6-3-4(7)1-2-5(8)9/h1-3,6H2,(...",ZGXJTSGNIOSYLOUHFFFAOYSA-N,C5H9NO3,131.130,C(CC(=O)O)C(=O)CN,Aminolevulinic Acid,5-Amino-4-oxopentanoic acid,-0.93,3,4,4,0,80 Å2,59.2±6.0 kJ/mol,1.2±0.1 g/cm3,298.4±20.0 °C at 760 mmHg
2,140,InChI=1S/C20H23N7O7/c21-20-25-16-15(18(32)26-2...,VVIAGPKUTFNRDUUHFFFAOYSA-N,C20H23N7O7,473.439,c1cc(ccc1C(=O)NC(CCC(=O)O)C(=O)O)NCC2CNc3c(c(=...,Leucovorin / folinic acid,"N-(4-{[(2-Amino-5-formyl-4-oxo-1,4,5,6,7,8-hex...",-3.19,8,14,9,2,216 Å2,,1.7±0.1 g/cm3,
3,185,InChI=1S/C5H5N5/c6-4-3-5(9-1-7-3)10-2-8-4/h1-2...,GFFGJBXGBJISGVUHFFFAOYSA-N,C5H5N5,135.127,c1[nH]c(c-2ncnc2n1)N,ADE / Adenine,1H-Purin-6-amine,-2.12,3,5,0,0,75 Å2,48.0±3.0 kJ/mol,1.9±0.1 g/cm3,243.2±50.0 °C at 760 mmHg
4,292,InChI=1S/C11H12Cl2N2O5/c12-10(13)11(18)14-8(5-...,WIIZWVCIJKGZOKUHFFFAOYSA-N,C11H12Cl2N2O5,323.129,c1cc(ccc1C(C(CO)NC(=O)C(Cl)Cl)O)[N+](=O)[O-],"CAF / 2,2-Dichloro-N-[1,3-dihydroxy-1-(4-nitro...","2,2-Dichloro-N-[1,3-dihydroxy-1-(4-nitrophenyl...",1.02,3,7,6,0,115 Å2,100.0±3.0 kJ/mol,1.5±0.1 g/cm3,644.9±55.0 °C at 760 mmHg
5,659,InChI=1S/C2H6OS/c1-4(2)3/h1-2H3,IAZDPXIOMUYVGZUHFFFAOYSA-N,C2H6OS,78.133,CS(=O)C,Sclerosol / Dimethyl sulfoxide,(Methylsulfinyl)methane,-1.35,0,1,0,0,36 Å2,40.8±3.0 kJ/mol,1.1±0.1 g/cm3,189.0±9.0 °C at 760 mmHg
6,937,InChI=1S/H2O/h1H2,XLYOFNOQVPJJNPUHFFFAOYSA-N,H2O,18.015,O,ICE / Water,Water,-1.38,2,1,0,0,0 Å2,40.7±0.0 kJ/mol,1.0±0.1 g/cm3,100.0±9.0 °C at 760 mmHg
7,989,"InChI=1S/C5H14NO4P/c1-6(2,3)4-5-10-11(7,8)9/h4...",YHHSONZFOIEMCPUHFFFAOYSA-O,C5H15NO4P,184.150,C[N+](C)(C)CCOP(=O)(O)O,CHOP / Phosphocholine conjugate acid,"N,N,N-Trimethyl-2-(phosphonooxy)ethanaminium",-4.99,2,5,4,0,77 Å2,,,
8,1628,InChI=1S/C27H29NO11/c1-10-22(31)13(28)6-17(38-...,AOJJSUZBOXZQNBUHFFFAOYSA-N,C27H29NO11,543.519,CC1C(C(CC(O1)OC2CC(Cc3c2c(c4c(c3O)C(=O)c5cccc(...,"Doxil / 3-Glycoloyl-3,5,12-trihydroxy-10-metho...","3-Glycoloyl-3,5,12-trihydroxy-10-methoxy-6,11-...",2.82,7,12,5,3,206 Å2,123.5±3.0 kJ/mol,1.6±0.1 g/cm3,810.3±65.0 °C at 760 mmHg
9,1739,InChI=1S/C8H12N4O5/c9-7-10-2-12(8(16)11-7)6-5(...,NMUSYJAQQFHJEWUHFFFAOYSA-N,C8H12N4O5,244.205,c1nc(nc(=O)n1C2C(C(C(O2)CO)O)O)N,"Vidaza / 4-Amino-1-pentofuranosyl-1,3,5-triazi...","4-Amino-1-pentofuranosyl-1,3,5-triazin-2(1H)-one",-1.99,5,9,2,1,141 Å2,93.2±6.0 kJ/mol,2.1±0.1 g/cm3,534.5±60.0 °C at 760 mmHg


In [None]:
#using Chemspider API (Chemspipy)
conn = create_connection("mydb.db")
with conn:
    create_table()
    for result in cs.search(''):
        data_entry((result.csid,result.inchi,result.molecular_formula,result.molecular_weight,result.smiles,result.common_name,result.alogp))
conn.close()  

In [None]:
conn = create_connection("mydb.db")
c = conn.cursor()
list(c.execute('SELECT * FROM CStable'))
conn.close()
#
conn = create_connection("mydb.db")
c = conn.cursor()
list(c.execute('''SELECT * FROM CStable WHERE MW >=300 AND MW <=450; '''))
#
c.execute('''SELECT substr(InChI_string,7) FROM CStable;''') #works to remove InChI string header
print(c.fetchone())
#
c.execute('''UPDATE CStable SET InChI_string = substr(InChI_string,7)''')
#
c.execute('''SELECT InChI_string FROM CStable''')
print(c.fetchall())

In [3]:
#list of results
cs_url = 'http://www.chemspider.com/Search.aspx?rid=0f05b070-7a47-4ec3-b853-3accd7d69890&page_num=0'
uClient = ureq(cs_url)
page_html= uClient.read()
uClient.close()
page_soup = soup(page_html,'html.parser')

In [None]:
cs_url = 'http://www.chemspider.com/Search.aspx?q=Ado-Trastuzumab+Emtansine'
uClient = ureq(cs_url)
page_html= uClient.read()
uClient.close()
page_soup = soup(page_html,'html.parser')
page_soup

In [78]:
cs_url = 'http://www.chemspider.com/Search.aspx?q=Alecensa'
uClient = ureq(cs_url)
page_html= uClient.read()
uClient.close()
page_soup2 = soup(page_html,'html.parser')


In [None]:
if page_soup2.find('h3') == None or page_soup.find('h3').text == 'Found 0 results':
    if is_multi(page_soup2):
        diffName = page_soup2.find('span',{'id':"ctl00_ctl00_ContentSection_ContentPlaceHolder1_RecordViewDetails_rptDetailsView_ctl00_WrapTitle"}).text
        print('\n There was no search result for {} - {}'.format(diffName,a))
    else:
        print('\n it was single')

In [9]:
for div in page_soup.findAll('div',{"class":"mol-thumb w150"}): #InChI strings
    print(div.a.img['alt'])

InChI=1/C20H32O3/c1-17(2)14-6-9-19-10-13(20(23,11-19)12-21)4-5-15(19)18(14,3)8-7-16(17)22/h13-15,21,23H,4-12H2,1-3H3/t13-,14-,15+,18-,19+,20-/m1/s1
InChI=1/C20H28O3/c1-11-9-20-6-5-14-18(2,3)17(23)13(21)10-19(14,4)15(20)7-12(11)8-16(20)22/h12,14-15,17,23H,1,5-10H2,2-4H3/t12-,14-,15+,17-,19-,20+/m1/s1
InChI=1/C20H32O3/c1-17(2)14-5-9-19-8-4-13(20(23,11-19)12-21)10-15(19)18(14,3)7-6-16(17)22/h13-15,21,23H,4-12H2,1-3H3/t13-,14-,15+,18-,19-,20-/m1/s1
InChI=1/C20H32O4/c1-17(2)14-6-7-19-8-12(20(24,10-19)11-21)4-5-15(19)18(14,3)9-13(22)16(17)23/h12,14-16,21,23-24H,4-11H2,1-3H3/t12-,14-,15+,16-,18-,19+,20-/m1/s1


In [10]:
for tr in page_soup.tbody.findAll("tr"): #CS_IDs
    print(tr.a.text.strip())

347851
347853
399127
347852


In [11]:
lst=[] #MW and formula
for td in page_soup.tbody.findAll("td",{"align":"center","class":None}):
    lst.append(td.text)
z=range(0,len(search_result)) 
for i in z:
    print(lst[i*6],lst[(i*6)+1])

C20H32O3 320.4663
C20H28O3 316.4345
C20H32O3 320.4663
C20H32O4 336.4657


In [169]:
lst_site=[] ##concatenate inner sites list
for a in page_soup.tbody.findAll("td",{'class':'search-id-column'}):
    lst_site.append('{}{}'.format('http://www.chemspider.com',a.a['href']))

In [10]:
#single search
#uClient = ureq('http://www.chemspider.com//Search.aspx?q=Abitrexate')
uClient = ureq('http://www.chemspider.com/Search.aspx?q=Ado-Trastuzumab+Emtansine')
page_html=uClient.read()
page_soup=soup(page_html,'html.parser')
#page_soup.find('td',{'class':'prop_title'},text='#H bond acceptors:').next_sibling.next_element.text.strip()
import csv
with open('c:/users/trinh/desktop/import.csv','w') as f:
    writecsv = csv.writer(f,delimiter = ',')
    writecsv.writerow(page_soup)

In [37]:
#SMILES (several items)
for i in list_sites[0].findAll("span",{'id':'ctl00_ctl00_ContentSection_ContentPlaceHolder1_RecordViewDetails_rptDetailsView_ctl00_moreDetails_WrapControl2'}):
    print(i.text)
#SMILES (one item)    
list_sites[0].find("span",{'id':'ctl00_ctl00_ContentSection_ContentPlaceHolder1_RecordViewDetails_rptDetailsView_ctl00_moreDetails_WrapControl2'}).text

C[C@@]12CCC(=O)C([C@H]1CC[C@]34[C@H]2CC[C@H](C3)[C@@](C4)(CO)O)(C)C


In [None]:
sites_num = ['site_'+str(string) for string in range(len(search_result))] #nested dictionary {site_X:properties of compound}
dict_site_props=dict()
j=0
for each in list_sites:
    prop_values = [i.text.strip() for i in each.findAll("td",{"class":"prop_value_nowrap"})]
    prop_titles = [i.text.strip() for i in each.findAll("td",{"class":"prop_title"})]
    inner_dict_site_props=dict(zip(prop_titles,prop_values))
    dict_site_props[sites_num[j]]=inner_dict_site_props
    j+=1

In [None]:
#def deep_get(dictionary, *keys):
#    return reduce(lambda d, key: d.get(key) if d else None, keys, dictionary)
dict_site_props['site_1']['Density:']
dict_site_props.get('site_1').get('Density:')

In [None]:
###for search result list####

def InChIstring(soupPage_forDrug): #InChI strings
    for div in soupPage_forDrug.findAll('div',{"class":"mol-thumb w150"}): 
        return div.a.img['alt']

def CSID(soupPage_forDrug): #CS_IDs
    for tr in soupPage_forDrug.tbody.findAll("tr"): 
        return tr.a.text.strip()

def select_MWform(lst): #select molecular formula and MW from list 
    new_list=[]
    for i in lst:
        try:
            if not int(i) and int(i) !=0:
                new_list.append(i)
        except:
            new_list.append(i)
    return new_list
            
def MW_formula(soupPage_forDrug): #MW and formula
    lst=[] 
    for td in soupPage_forDrug.tbody.findAll("td",{"align":"center","class":None}):
        lst.append(td.text) 
    return select_MWform(lst)
