# Query the database of organic molecules based on TCI scraped data

## Imports

In [2]:
import pandas as pd
import numpy as np
from rdkit.Chem import AllChem as Chem
from tqdm.notebook import tqdm

## Connect to the TCI RDKit cartridge

In [55]:
import os
from getpass import getpass

# Connection settings for the PostgreSQL instance queried by this notebook.
# Each value can be overridden with an environment variable, so the notebook
# can be pointed at another instance without editing this cell.
host = os.environ.get('TCI_DB_HOST', 'molecule-db-instance-1.czixbih3kolx.us-west-2.rds.amazonaws.com')
# Alternative endpoint kept from the original cell (the "cluster-ro" name
# suggests a read-only cluster endpoint -- TODO confirm):
#host = 'molecule-db.cluster-ro-czixbih3kolx.us-west-2.rds.amazonaws.com'
port = int(os.environ.get('TCI_DB_PORT', '5432'))
database = os.environ.get('TCI_DB_NAME', 'smallmoleculedb')
user = os.environ.get('TCI_DB_USER', 'MoleculeMaster')

# SECURITY: the password must never be stored in the notebook. It is taken from
# TCI_DB_PASSWORD when set, otherwise requested interactively (getpass does not
# echo the input and the value is not written to the cell output).
# The password that used to be hardcoded here has been exposed and should be rotated.
password = os.environ.get('TCI_DB_PASSWORD') or getpass(f'Password for {user}@{host}: ')

In [56]:
from urllib.parse import quote

from sqlalchemy import create_engine

# Build the connection URL. The user name and password are percent-encoded so
# that reserved URL characters in the credentials (e.g. '@', ':' or '/') cannot
# corrupt the URL; SQLAlchemy decodes them again when it parses the URL.
engine = create_engine(
    f"postgresql://{quote(user, safe='')}:{quote(password, safe='')}@{host}:{port}/{database}"
)

In [57]:
%%time
#retrieve a list
list_retrieved = engine.execute("SELECT * FROM rdk.mols").fetchall()

CPU times: user 146 ms, sys: 39 ms, total: 185 ms
Wall time: 5.4 s


In [58]:
# Preview the first ten rows of the fetched list.
list_retrieved[0:10]

[('A0001', 'CC(C)C1=CC2=CCC3C(C)(C(=O)O)CCCC3(C)C2CC1'),
 ('A0002', 'CCOC(=O)C1(C)CCCC2(C)C3CCC(C(C)C)=CC3=CCC12'),
 ('A0003', 'c1cc2c3c(cccc3c1)CC2'),
 ('A0135', 'c1cc2c3c(cccc3c1)CC2'),
 ('A0004', 'O=C1C(=O)c2cccc3cccc1c23'),
 ('A0005', 'C1=Cc2cccc3cccc1c23'),
 ('A0006', 'CC=NO'),
 ('A0007', 'CC(N)=O'),
 ('A0008', 'CC(=N)N.Cl'),
 ('A0009', 'CC(=O)Nc1ccc(C=O)cc1')]

In [7]:
%%time
#retrieve a pandas dataframe
df_retrieved = pd.read_sql("SELECT * FROM rdk.mols", engine)

CPU times: user 169 ms, sys: 21 ms, total: 190 ms
Wall time: 2.75 s


In [8]:
# Show ten randomly chosen rows. The fixed random_state makes the preview
# reproducible, so re-running the notebook displays the same rows.
df_retrieved.sample(10, random_state=0)

Unnamed: 0,tci_id,m
23479,P0243,C[N+](C)(C)c1ccccc1.[Br-]
12428,D2927,NCCc1ccc(Cl)c(Cl)c1
7106,B6060,CC1CN(Cc2ccc(F)cc2)CCN1C(=O)COc1ccc(Cl)cc1NC(N)=O
11559,D1900,O=[N+]([O-])c1ccc(Cl)cc1[N+](=O)[O-]
26661,T1052,CC#CC(=O)OCC
11064,D1305,CCNCCCCCCNCC
25570,S0137,O.O.O=C(O)c1cc(S(=O)(=O)O)ccc1O
28743,U0096,CC(C)(C)OC(=O)NOCC(=O)ON1C(=O)CCC1=O
23740,P0613,C=COC(=O)CC
3384,B1118,OCC1CCCCC1NCc1ccccc1


### Query data from the cartridge

The operators and functions available for cartridge queries are listed in the RDKit cartridge reference guide: https://www.rdkit.org/docs/Cartridge.html#reference-guide

In [10]:
%%time
df_query = pd.read_sql("select * from rdk.mols where m@>'Nc1ncnc(N)n1' limit 10;", engine)

CPU times: user 7.34 ms, sys: 3.79 ms, total: 11.1 ms
Wall time: 929 ms


In [11]:
df_query

Unnamed: 0,tci_id,m
0,B0727,CCCCNc1nc(N)nc(N)n1
1,A0676,Nc1nc(N)[nH]c(=O)n1
2,A1650,CCNc1nc(Cl)nc(NC(C)C)n1
3,V0057,C=Cc1nc(N)nc(N)n1
4,B6186,CSc1nc(NC(C)C)nc(NC(C)C)n1
5,D0111,Nc1nc(N)nc(-c2ccccc2)n1
6,T2700,c1ccc2c(c1)c1ccccc1n2-c1nc(-n2c3ccccc3c3ccccc3...
7,T2059,COCN(COC)c1nc(N(COC)COC)nc(N(COC)COC)n1
8,D3238,Cc1nccn1CCc1nc(N)nc(N)n1
9,D3239,CCCCCCCCCCCc1nccn1CCc1nc(N)nc(N)n1


## Query other tables

In [128]:
# SQL joining the TCI catalogue tables with the RDKit molecule table.
# - The three LEFT JOINs attach stock, general information and the molecule
#   column `m` to each compound via its TCI product code.
# - The WHERE clause keeps rows whose molecule matches the substructure pattern
#   'Nc1ncnc(N)n1' and whose "purity" is above 60 (presumably percent -- confirm
#   against the table definition).
# - The ten rows with the smallest "lowestPriceOption" are returned.
# Written as a plain multi-line literal: SQL ignores the line breaks, so no
# backslash continuations are needed (those silently fuse tokens if the
# indentation of the following line is ever removed).
query = '''
SELECT *
FROM tci_compound_identifications
LEFT JOIN tci_available_stock
       ON tci_compound_identifications.code = tci_available_stock.code
LEFT JOIN tci_general_information
       ON tci_compound_identifications.code = tci_general_information.code
LEFT JOIN rdk.mols
       ON tci_compound_identifications.code = rdk.mols.tci_id
WHERE ( m@>'Nc1ncnc(N)n1' )
AND   ( "purity" > 60 )
ORDER BY "lowestPriceOption" ASC
LIMIT 10;
'''

In [129]:
# Run the joined query; this rebinds df_query, replacing the earlier
# substructure-search result held under the same name.
df_query = pd.read_sql(sql=query, con=engine)

In [131]:
# Display the result without the columns in which every value is missing.
# dropna returns a new frame here; df_query itself is left unchanged.
df_query.dropna(axis='columns', how='all')

Unnamed: 0,name,CAS,code,ProductNumber,CasRN,reaxysRegistryNumber,pubchemSubstanceId,SMILESPubChem,merckIndex14,mdlNumber,...,MolecularFormula,Molecular Weight,purityAnalysisMethod,appearance,solubilityWater,storeUnderInertGas,storageTemperature,purity,tci_id,m
0,Melamine Monomer,108-78-1,T0337,t0337,108-78-1,124341.0,87576309.0,C1(=NC(=NC(=N1)N)N)N,5811.0,mfcd00006055,...,C__3H__6N__6,126.12,>98.0%(t)(hplc),white powder to crystal,practically insoluble,,,98.0,T0337,Nc1nc(N)nc(N)n1
1,",4-Diamino-6-methyl-1,3,5-triazine",542-02-9,D0583,d0583,542-02-9,118348.0,87567233.0,CC1=NC(=NC(=N1)N)N,,mfcd00023192,...,C__4H__7N__5,125.14,>98.0%(t),white to almost white powder to crystal,,,,98.0,D0583,Cc1nc(N)nc(N)n1
2,Trichloromelamine,7673-09-8,T0384,t0384,7673-09-8,524508.0,87576343.0,C1(=NC(=NC(=N1)NCl)NCl)NCl,,mfcd00006047,...,C__3H__3Cl__3N__6,229.45,>95.0%(t),white to light yellow powder to crystal,slightly soluble,store under inert gas,0-10°c,95.0,T0384,ClNc1nc(NCl)nc(NCl)n1
3,Benzoguanamine,91-76-9,D0111,d0111,91-76-9,153223.0,87566851.0,C1=CC=C(C=C1)C2=NC(=NC(=N2)N)N,1089.0,mfcd00023187,...,C__9H__9N__5,187.21,>99.0%(hplc),white to almost white powder to crystal,insoluble,,,99.0,D0111,Nc1nc(N)nc(-c2ccccc2)n1
4,"2,4-Diamino-6-[2-(2-methyl-1-imidazolyl)ethyl]...",38668-46-1,D3238,d3238,38668-46-1,6973600.0,87558092.0,CC1=NC=CN1CCC2=NC(=NC(=N2)N)N,,,...,C__9H__1__3N__7,219.25,>98.0%(t)(hplc),white to almost white powder to crystal,practically insoluble,,,98.0,D3238,Cc1nccn1CCc1nc(N)nc(N)n1
5,",9-Bis[2-(3,5-diamino-2,4,6-triazaphenyl)ethyl...",22535-90-6,B1724,b1724,22535-90-6,594660.0,87564623.0,C1C2(COC(O1)CCC3=NC(=NC(=N3)N)N)COC(OC2)CCC4=N...,,mfcd00191395,...,C__1__7H__2__6N__1__0O__4,434.46,>98.0%(t),white to almost white powder to crystal,,,,98.0,B1724,Nc1nc(N)nc(CCC2OCC3(CO2)COC(CCc2nc(N)nc(N)n2)O...
6,",4-Diamino-6-[2-(2-undecyl-1-imidazolyl)ethyl]...",50729-75-4,D3239,d3239,50729-75-4,,87558177.0,CCCCCCCCCCCC1=NC=CN1CCC2=NC(=NC(=N2)N)N,,,...,C__1__9H__3__3N__7,359.52,>99.0%(t),white to almost white powder to crystal,insoluble,,,99.0,D3239,CCCCCCCCCCCc1nccn1CCc1nc(N)nc(N)n1
7,Cyromazine,66215-27-8,C2366,c2366,66215-27-8,882879.0,87561454.0,C1CC1NC2=NC(=NC(=N2)N)N,2775.0,mfcd00078650,...,C__6H__1__0N__6,166.19,>98.0%(t)(hplc),white to almost white powder to crystal,,,,98.0,C2366,Nc1nc(N)nc(NC2CC2)n1
8,(tert-Butylamino)-4-(cyclopropylamino)-6-(meth...,28159-98-0,I0842,i0842,28159-98-0,792218.0,160871206.0,CC(C)(C)NC1=NC(=NC(=N1)NC2CC2)SC,,mfcd01863779,...,C__1__1H__1__9N__5S,253.37,>98.0%(t),white to almost white powder to crystal,insoluble,,,98.0,I0842,CSc1nc(NC2CC2)nc(NC(C)(C)C)n1
9,"2,4-Diamino-6-diallylamino-1,3,5-triazine",91-77-0,D1075,d1075,91-77-0,15452.0,87567640.0,C=CCN(CC=C)C1=NC(=NC(=N1)N)N,,mfcd00047347,...,C__9H__1__4N__6,206.25,>98.0%(t)(hplc),white to almost white powder to crystal,,,,98.0,D1075,C=CCN(CC=C)c1nc(N)nc(N)n1
