# Create a database of organic molecules based on TCI scraped data

## Imports

In [1]:
import pandas as pd
import numpy as np
from rdkit.Chem import AllChem as Chem
from tqdm.notebook import tqdm
from utils import *
#from config import host, port, database, user, password

# Load the data

In [2]:
%cd data
!ls 

/home/stephy/Learning_DataScience/Chemical_db/TCI_database_app/data
'TCI_available stock.json'		'TCI_Related Laws.json'
'TCI_compound identifications.json'	 TCI.smi
'TCI_general information.json'		 TCI_specifications.json
'TCI_GHS precautionary statement.json'	'TCI_Transport information.json'
 TCI_Other.json				 TCI.txt
 TCI_properties.json


## Compound Identifications Data

In [18]:
df = pd.read_json('TCI_compound identifications.json', orient ='records', compression = 'infer')

In [19]:
df

Unnamed: 0,name,CAS,reaxys registry number,pubchem substance id,sdbs (aist spectral db),merck index (14),mdl number,SMILES_by_PubChem,id,grade,related cas rn,colour index,enzyme commission number
0,Abietic Acid,514-10-3,2221451.0,87561707.0,1471.0,7.0,mfcd03423567,CC(C)C1=CC2=CCC3C(C2CC1)(CCCC3(C)C(=O)O)C,A0001,,,,
1,Ethyl Abietate,631-71-0,,87561708.0,,,mfcd00028860,CCOC(=O)C1(CCCC2(C1CC=C3C2CCC(=C3)C(C)C)C)C,A0002,,,,
2,Acenaphthene,83-32-9,386081.0,87561709.0,863.0,28.0,mfcd00003807,C1CC2=CC=CC3=C2C1=CC=C3,A0003,,,,
3,Acenaphthene,83-32-9,386081.0,87561826.0,863.0,28.0,mfcd00003807,C1CC2=CC=CC3=C2C1=CC=C3,A0135,up,,,
4,Acenaphthenequinone,82-86-0,879172.0,87561710.0,3313.0,,mfcd00003805,C1=CC2=C3C(=C1)C(=O)C(=O)C3=CC=C2,A0004,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29558,Zinc Acetate,557-34-6,3563830.0,,,10128.0,mfcd00012454,CC(=O)[O-].CC(=O)[O-].[Zn+2],Z0044,,,,
29559,Zirconium(IV) Chloride,10026-11-6,3903213.0,,,,mfcd00011306,Cl[Zr](Cl)(Cl)Cl,Z0045,,,,
29560,Zinc Oxide,1314-13-2,8128140.0,,,10147.0,mfcd00011300,O=[Zn],Z0046,,,,
29561,Zirconium(IV) Oxide,1314-23-4,11322257.0,,,10180.0,mfcd00011310,,Z0049,,,,


### Convert smiles with RDKit and attach to the data

In [23]:
df['rdkit_smiles'] = df.SMILES_by_PubChem.apply(get_cansmi) # Chem.MolToSmiles(Chem.MolFromSmiles(smi), canonical=True)

[13:46:32] Explicit valence for atom # 8 Br, 3, is greater than permitted
[13:46:36] Explicit valence for atom # 11 Br, 3, is greater than permitted
[13:46:37] Explicit valence for atom # 18 Br, 3, is greater than permitted
[13:46:37] Explicit valence for atom # 18 Br, 3, is greater than permitted
[13:46:37] Explicit valence for atom # 12 Br, 3, is greater than permitted
[13:46:37] Explicit valence for atom # 23 Sn, 6, is greater than permitted


In [24]:
df[['SMILES_by_PubChem', 'rdkit_smiles']].sample(10)

Unnamed: 0,SMILES_by_PubChem,rdkit_smiles
2703,C1=CC2=NNN=C2C=C1,c1ccc2n[nH]nc2c1
4034,C1=CC2=C(C=CN2)C(=C1)Br,Brc1cccc2[nH]ccc12
1233,CC(C(C(=O)O)N)O,CC(O)C(N)C(=O)O
1360,CCC(COC(=O)C)OC,CCC(COC(C)=O)OC
9907,CCCCCCCCCCOC1=CC=C(C=C1)C2=CC=C(C=C2)C#N,CCCCCCCCCCOc1ccc(-c2ccc(C#N)cc2)cc1
3041,CC(=O)CBr,CC(=O)CBr
9420,CC1=NC(=NC=C1)Cl,Cc1ccnc(Cl)n1
27262,CCCCCN(CCCCC)CCCCC,CCCCCN(CCCCC)CCCCC
11929,C(CCI)CCI,ICCCCCI
25651,CC1(C2CCC(C1C2)CCOCC[N+]3(CCOCC3)CC4=CC(=C(C=C...,COc1cc(Br)c(C[N+]2(CCOCCC3CCC4CC3C4(C)C)CCOCC2...


In [20]:
#drop duplicates
df.drop_duplicates(inplace=True)

In [21]:
df

Unnamed: 0,name,CAS,reaxys registry number,pubchem substance id,sdbs (aist spectral db),merck index (14),mdl number,SMILES_by_PubChem,id,grade,related cas rn,colour index,enzyme commission number
0,Abietic Acid,514-10-3,2221451.0,87561707.0,1471.0,7.0,mfcd03423567,CC(C)C1=CC2=CCC3C(C2CC1)(CCCC3(C)C(=O)O)C,A0001,,,,
1,Ethyl Abietate,631-71-0,,87561708.0,,,mfcd00028860,CCOC(=O)C1(CCCC2(C1CC=C3C2CCC(=C3)C(C)C)C)C,A0002,,,,
2,Acenaphthene,83-32-9,386081.0,87561709.0,863.0,28.0,mfcd00003807,C1CC2=CC=CC3=C2C1=CC=C3,A0003,,,,
3,Acenaphthene,83-32-9,386081.0,87561826.0,863.0,28.0,mfcd00003807,C1CC2=CC=CC3=C2C1=CC=C3,A0135,up,,,
4,Acenaphthenequinone,82-86-0,879172.0,87561710.0,3313.0,,mfcd00003805,C1=CC2=C3C(=C1)C(=O)C(=O)C3=CC=C2,A0004,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29558,Zinc Acetate,557-34-6,3563830.0,,,10128.0,mfcd00012454,CC(=O)[O-].CC(=O)[O-].[Zn+2],Z0044,,,,
29559,Zirconium(IV) Chloride,10026-11-6,3903213.0,,,,mfcd00011306,Cl[Zr](Cl)(Cl)Cl,Z0045,,,,
29560,Zinc Oxide,1314-13-2,8128140.0,,,10147.0,mfcd00011300,O=[Zn],Z0046,,,,
29561,Zirconium(IV) Oxide,1314-23-4,11322257.0,,,10180.0,mfcd00011310,,Z0049,,,,


In [25]:
df_rdkit = df[['id','rdkit_smiles']]
df_rdkit.values[0]

array(['A0001', 'CC(C)C1=CC2=CCC3C(C)(C(=O)O)CCCC3(C)C2CC1'], dtype=object)

In [26]:
# to create the cartridge in psql save the smiles and the id
np.savetxt(r'TCI.txt', df_rdkit.values, fmt='%s')

In [8]:
df.columns = ['name', 'cas', 'reaxys_registry_number', 'pubchem_id',
       'sdbs_aist_spectral_db', 'merck_index', 'mdl_number',
       'pubchem_smiles', 'id', 'grade', 'cas_rn', 'colour_index',
       'enzyme_commission_number', 'rdkit_smiles']

In [9]:
df.sample(5)

Unnamed: 0,name,cas,reaxys_registry_number,pubchem_id,sdbs_aist_spectral_db,merck_index,mdl_number,pubchem_smiles,id,grade,cas_rn,colour_index,enzyme_commission_number,rdkit_smiles
1036,(S)-(-)-2-Aminomethyl-1-ethylpyrrolidine,22795-99-9,3587377.0,87562761.0,50882.0,,mfcd00191371,CCN1CCCC1CN,A1301,,,,,CCN1CCCC1CN
24287,Pentafluorobenzoic Acid,602-94-8,2054395.0,87574937.0,10537.0,,mfcd00002406,C1(=C(C(=C(C(=C1F)F)F)F)F)C(=O)O,P0806,,,,,O=C(O)c1c(F)c(F)c(F)c(F)c1F
383,Amylopectin,9037-22-3,,87562080.0,,481.0,mfcd00130510,C(C1C(C(C(C(O1)OC2C(OC(C(C2O)O)OCC3C(C(C(C(O3)...,A0456,,,,,OCC1OC(OC2C(CO)OC(OCC3OC(OC4C(CO)OC(O)C(O)C4O)...
16045,N-[(9H-Fluoren-9-ylmethoxy)carbonyl]-L-methionine,71989-28-1,4300266.0,87570070.0,11737.0,,mfcd00037134,CSCCC(C(=O)O)NC(=O)OCC1C2=CC=CC=C2C3=CC=CC=C13,F0296,,,,,CSCCC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O
13315,",4-Dihydroxy-3-nitropyridine",89282-12-2,149558.0,125307681.0,52886.0,,mfcd01075671,C1=CNC(=O)C(=C1O)[N+](=O)[O-],D4002,,,,,O=c1[nH]ccc(O)c1[N+](=O)[O-]


### Columns of the SQL table

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29563 entries, 0 to 29562
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name                      29563 non-null  object 
 1   cas                       29563 non-null  object 
 2   reaxys_registry_number    27941 non-null  float64
 3   pubchem_id                26311 non-null  float64
 4   sdbs_aist_spectral_db     10085 non-null  float64
 5   merck_index               4140 non-null   float64
 6   mdl_number                26971 non-null  object 
 7   pubchem_smiles            29008 non-null  object 
 8   id                        29563 non-null  object 
 9   grade                     709 non-null    object 
 10  cas_rn                    710 non-null    object 
 11  colour_index              295 non-null    float64
 12  enzyme_commission_number  30 non-null     object 
 13  rdkit_smiles              29002 non-null  object 
dtypes: flo

In [12]:
columns = \
[('name', 'TEXT', '', 'Name of the compound'),
 ('cas', 'TEXT', '', 'CAS identification'),
 ('reaxys_registry_number', 'REAL', '', 'Identifier for chemical substance in Reaxys (Beilstein Registry Number)'),
 ('pubchem_id', 'REAL', '', 'Identifier from database of chemical molecules and their activities in biological assays PubChem ID'),
 ('sdbs_aist_spectral_db', 'REAL', '', 'Spectral Database for Organic Compounds'),
 ('merck_index', 'REAL', '', ' Merck Index for authoritative information on chemicals, drugs and biologicals'), 
 ('mdl_number', 'TEXT', '', 'MLD unique identification number for each reaction and variation.'),
 ('pubchem_smiles', 'TEXT', '', 'SMILES identifier from Pubchem'),
 ('id', 'TEXT', 'PRIMARY KEY', 'TCI unique id'),
 ('grade', 'TEXT', '', 'Grade refering to the purity of the chemical'),
 ('cas_rn', 'TEXT', '', 'CAS RN unique identifier that provides an unambiguous means to distinguish chemical substances or molecular structures'), 
 ('colour_index', 'REAL', '', 'Colour Index Generic Name describes a commercial product by its recognised usage class, its hue and a serial number'),
 ('enzyme_commission_number', 'TEXT', '', 'Enzyme Commission numerical classification scheme for enzymes, based on the chemical reactions they catalyze'),
 ('rdkit_smiles', 'MOL', '', 'SMILES identifier from RDKit')
]

### Details

| column name | data type | table constraint | description |
| ---- | ---- | ---- | ---- |
|name | TEXT | - | Name of the compound |
|CAS|TEXT|-|CAS identification|
|reaxys registry number|REAL|-|Identifier for chemical substance in Reaxys (Beilstein Registry Number)|
|pubchem substance id|REAL|-|Identifier from database of chemical molecules and their activities in biological assays PubChem ID|
|sdbs (aist spectral db)|REAL|-|Spectral Database for Organic Compounds|
|merck index (14)|REAL|-| Merck Index for authoritative information on chemicals, drugs and biologicals| 
|mdl number|TEXT|-|MLD unique identification number for each reaction and variation.|
|SMILES_by_PubChem|TEXT|-|SMILES identifier from Pubchem|
|id|TEXT|PRIMARY KEY|TCI unique id|
|grade|TEXT|-|Grade refering to the purity of the chemical|
|related cas rn|TEXT|-|CAS RN unique identifier that provides an unambiguous means to distinguish chemical substances or molecular structures| 
|colour index|REAL|-|Colour Index Generic Name describes a commercial product by its recognised usage class, its hue and a serial number|
|enzyme commission number|TEXT|-|Enzyme Commission numerical classification scheme for enzymes, based on the chemical reactions they catalyze|
|rdkit_smiles | MOL | - | SMILES identifier from RDKit |

### Create table and upload to server

In [15]:
host = 'ec2-3-234-131-8.compute-1.amazonaws.com'
port = 5432
database = 'd901m4b078shfu'
user = 'qelzehrmtxlsia'
password = '58b8629bc71e03ed06fa0ce08e470a75b37f99d376e96c4cb29abef00375f7ed'

In [19]:
from sqlalchemy import create_engine
engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{database}')
#df.to_sql('compound_identification', engine,  index=False)

In [20]:
sql_create_table = \
'''
CREATE TABLE tci_data(
    id text primary key,
    name text,
    cas text,
    reaxys_registry_number real,
    pubchem_id real,
    sdbs_aist_spectral_db real,
    merck_index real, 
    mdl_number text,
    pubchem_smiles text,
    grade text,
    cas_rn text,
    colour_index real,
    enzyme_commission_number text,
    rdkit_smiles mol)
'''

In [21]:
with engine.connect() as con:
    con.execute(sql_create_table)

ProgrammingError: (psycopg2.errors.UndefinedObject) type "mol" does not exist
LINE 16:     rdkit_smiles mol)
                          ^

[SQL: 
CREATE TABLE tci_data(
    id text primary key,
    name text,
    cas text,
    reaxys_registry_number real,
    pubchem_id real,
    sdbs_aist_spectral_db real,
    merck_index real, 
    mdl_number text,
    pubchem_smiles text,
    grade text,
    cas_rn text,
    colour_index real,
    enzyme_commission_number text,
    rdkit_smiles mol)
]
(Background on this error at: https://sqlalche.me/e/14/f405)

In [17]:
# define a primary Key
with engine.connect() as con:
    con.execute('ALTER TABLE compound_identification ADD PRIMARY KEY ("id");')

IntegrityError: (psycopg2.errors.UniqueViolation) could not create unique index "compound_identification_pkey"
DETAIL:  Key (id)=(S0943) is duplicated.

[SQL: ALTER TABLE compound_identification ADD PRIMARY KEY ("id");]
(Background on this error at: https://sqlalche.me/e/14/gkpj)

In [18]:
#create an index
with engine.connect() as con:
    con.execute(f"CREATE INDEX compound_identification_index ON compound_identification({','.join([col[0] for col in columns])});")

### Test the connection

In [27]:
%%time
table_retrieved = engine.execute("SELECT * FROM compound_identification").fetchall()

CPU times: user 29.2 ms, sys: 3.39 ms, total: 32.6 ms
Wall time: 26 s


In [25]:
%%time
df_retrieved = pd.read_sql("SELECT * FROM compound_identification", engine)

CPU times: user 33.3 ms, sys: 11.8 ms, total: 45.1 ms
Wall time: 19.6 s


The *CPU or execution time*, measures how much time a CPU spent on executing a program.

  - The *user time* corresponds to the time spent executing the process.

  - The *System time* corresponds to the time that the CPU is answering to system calls from the process. The system time can change depending on the operating system.

*Wall time or wall-clock time*, measures the total time to execute a program in a computer. 

If *(CPU time)/(wall clock time) < 1* means the program was just waiting and not executing the program. It can be affected by 
* Processes running on the machine, if other processes are keeping the CPU busy, there might be the need to wait for some free CPU.
* Unstable or slow network.
* Waiting for the server response.

More information on `%%time` [here](https://ipython.readthedocs.io/en/stable/interactive/magics.html?highlight=%25time#magic-time). And [here](https://pythonspeed.com/articles/blocking-cpu-or-io/) is some discussion regarding time bottle necks.

In [26]:
df_retrieved.sample(10)

Unnamed: 0,name,cas,reaxys_registry_number,pubchem_id,sdbs_aist_spectral_db,merck_index,mdl_number,pubchem_smiles,id,grade,cas_rn,colour_index,enzyme_commission_number,rdkit_smiles
1794,Amino-5-chloro-3-methylbenzoic Acid,20776-67-4,3530413.0,160871357.0,,,mfcd02358895,CC1=CC(=CC(=C1N)C(=O)O)Cl,A2399,,,,,Cc1cc(Cl)cc(C(=O)O)c1N
296,"Amino-1,5-naphthalenedisulfonic Acid Disodium ...",14170-43-5,4117152.0,87561990.0,,,mfcd00070487,C1=CC2=C(C=C(C=C2S(=O)(=O)[O-])N)C(=C1)S(=O)(=...,A0339,,,,,Nc1cc(S(=O)(=O)[O-])c2cccc(S(=O)(=O)[O-])c2c1....
1559,"2-Amino-4,5-dimethylthiazole Hydrobromide",7170-76-5,3690705.0,87559841.0,52113.0,,mfcd00035212,CC1=C(SC(=N1)N)C.Br,A2062,,,,,Br.Cc1nc(N)sc1C
1096,"2-Amino-4,4,4-trifluorobutyric Acid Hydrochloride",15959-93-0,19545620.0,87562822.0,,,mfcd00671488,C(C(C(=O)O)N)C(F)(F)F,A1367,,,,,NC(CC(F)(F)F)C(=O)O
827,2-Amino-9-fluorenone,3096-57-9,2804184.0,87562532.0,12164.0,,mfcd00001157,C1=CC=C2C(=C1)C3=C(C2=O)C=C(C=C3)N,A1040,,,,,Nc1ccc2c(c1)C(=O)c1ccccc1-2
1502,Amino-2-chlorophenol,6358-06-1,2081092.0,87559209.0,51618.0,,mfcd01707858,C1=CC(=C(C=C1N)O)Cl,A1976,,,,,Nc1ccc(Cl)c(O)c1
1908,4-Amino-2-methoxypyridine,20265-39-8,115200.0,172089020.0,,,mfcd06738657,COC1=NC=CC(=C1)N,A2543,,,,,COc1cc(N)ccn1
1897,Amiodarone Hydrochloride,19774-82-4,4776323.0,172088928.0,,482.0,mfcd00069204,CCCCC1=C(C2=CC=CC=C2O1)C(=O)C3=CC(=C(C(=C3)I)O...,A2530,,1951-25-3,,,CCCCc1oc2ccccc2c1C(=O)c1cc(I)c(OCCN(CC)CC)c(I)...
502,2-Aminothiazole,96-50-4,105738.0,87562204.0,3370.0,479.0,mfcd00005325,C1=CSC(=N1)N,A0633,,,,,Nc1nccs1
1153,Sodium 4-Acetylbenzenesulfonate,61827-67-6,4218686.0,87562884.0,22400.0,,mfcd00007510,CC(=O)C1=CC=C(C=C1)S(=O)(=O)[O-].[Na+],A1430,,,,,CC(=O)c1ccc(S(=O)(=O)[O-])cc1.[Na+]


### Test different queries

In [31]:
test_smiles = engine.execute("SELECT * FROM compound_identification WHERE rdkit_smiles>'[n,o,s]1cccc1' ").fetchall()

In [32]:
df_test_smiles = pd.DataFrame(test_smiles)

In [95]:
df_test_smiles.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
121,9-Anthraceneboronic Acid,100622-34-2,3301031.0,135727333.0,,,mfcd03425925,B(C1=C2C=CC=CC2=CC3=CC=CC=C13)(O)O,A2328,,,,,OB(O)c1c2ccccc2cc2ccccc12
60,Dibenzyl Adipate,2451-84-5,1892058.0,87562191.0,7556.0,,mfcd00059667,C1=CC=C(C=C1)COC(=O)CCCCC(=O)OCC2=CC=CC=C2,A0618,,,,,O=C(CCCCC(=O)OCc1ccccc1)OCc1ccccc1
91,4-(1-Adamantyl)phenol,29799-07-3,2054509.0,87562847.0,30018.0,,mfcd00168143,C1C2CC3CC1CC(C2)(C3)C4=CC=C(C=C4)O,A1392,,,,,Oc1ccc(C23CC4CC(CC(C4)C2)C3)cc1
119,Ammonium Thioglycolate,5421-46-5,11213443.0,87561582.0,,,mfcd00137451,C(C(=O)[O-])S.[NH4+],A2217,,,,,O=C([O-])CS.[NH4+]
110,",3-Anthracenedicarboxylic Anhydride",6812-14-2,206364.0,87558132.0,,,,C1=CC=C2C=C3C=C4C(=CC3=CC2=C1)C(=O)OC4=O,A1834,,,,,O=C1OC(=O)c2cc3cc4ccccc4cc3cc21


## Available Stock

In [34]:
df_2 = pd.read_json('TCI_available stock.json', orient ='records', compression = 'infer')

In [57]:
df_2.columns = ['id','Stock25G_price','Stock25G_Saitama_Kawaguchi','Stock25G_Hyogo_Amagasaki',
 'Stock25G_Stock_in_other_WH','Stock500G_price','Stock500G_Saitama_Kawaguchi',
 'Stock500G_Hyogo_Amagasaki','Stock500G_Stock_in_other_WH','Stock100G_price',
 'Stock100G_Saitama_Kawaguchi','Stock100G_Hyogo_Amagasaki',
 'Stock100G_Stock_in_other_WH','Stock1G_price','Stock1G_Saitama_Kawaguchi',
 'Stock1G_Hyogo_Amagasaki','Stock1G_Stock_in_other_WH','Stock5G_price',
 'Stock5G_Saitama_Kawaguchi','Stock5G_Hyogo_Amagasaki',
 'Stock5G_Stock_in_other_WH','Stock250G_price','Stock250G_Saitama_Kawaguchi',
 'Stock250G_Hyogo_Amagasaki','Stock250G_Stock_in_other_WH','Stock10G_price',
 'Stock10G_Saitama_Kawaguchi','Stock10G_Hyogo_Amagasaki',
 'Stock10G_Stock_in_other_WH','Stockprepared_after_order_item','Stock200G_price',
 'Stock200G_Saitama_Kawaguchi','Stock200G_Hyogo_Amagasaki',
 'Stock200G_Stock_in_other_WH','Stock300G_price','Stock300G_Saitama_Kawaguchi',
 'Stock300G_Hyogo_Amagasaki','Stock300G_Stock_in_other_WH','Stock20G_price',
 'Stock20G_Saitama_Kawaguchi','Stock20G_Hyogo_Amagasaki',
 'Stock20G_Stock_in_other_WH','Stock50G_price','Stock50G_Saitama_Kawaguchi',
 'Stock50G_Hyogo_Amagasaki','Stock50G_Stock_in_other_WH']

In [58]:
df_2.sample(5)

Unnamed: 0,id,Stock25G_price,Stock25G_Saitama_Kawaguchi,Stock25G_Hyogo_Amagasaki,Stock25G_Stock_in_other_WH,Stock500G_price,Stock500G_Saitama_Kawaguchi,Stock500G_Hyogo_Amagasaki,Stock500G_Stock_in_other_WH,Stock100G_price,...,Stock300G_Hyogo_Amagasaki,Stock300G_Stock_in_other_WH,Stock20G_price,Stock20G_Saitama_Kawaguchi,Stock20G_Hyogo_Amagasaki,Stock20G_Stock_in_other_WH,Stock50G_price,Stock50G_Saitama_Kawaguchi,Stock50G_Hyogo_Amagasaki,Stock50G_Stock_in_other_WH
1501,A1975,,,,,,,,,,...,,,,,,,,,,
1966,A2647,,,,,,,,,,...,,,,,,,,,,
1036,A1301,,,,,,,,,,...,,,,,,,,,,
915,A1153,,,,,,,,,3100.0,...,,,,,,,,,,
1670,A2220,,,,,,,,,,...,,,,,,,,,,


In [59]:
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 46 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              2000 non-null   object 
 1   Stock25G_price                  1236 non-null   object 
 2   Stock25G_Saitama_Kawaguchi      1236 non-null   object 
 3   Stock25G_Hyogo_Amagasaki        1236 non-null   object 
 4   Stock25G_Stock_in_other_WH      1236 non-null   object 
 5   Stock500G_price                 369 non-null    object 
 6   Stock500G_Saitama_Kawaguchi     369 non-null    object 
 7   Stock500G_Hyogo_Amagasaki       369 non-null    object 
 8   Stock500G_Stock_in_other_WH     369 non-null    object 
 9   Stock100G_price                 217 non-null    object 
 10  Stock100G_Saitama_Kawaguchi     217 non-null    object 
 11  Stock100G_Hyogo_Amagasaki       217 non-null    object 
 12  Stock100G_Stock_in_other_WH     21

In [60]:
columns_2 = \
[('id', 'TEXT', 'PRIMARY KEY', 'TCI unique id'),
 ('Stock25G_price', 'TEXT', '', 'Price for the specified quatity of product'),
 ('Stock25G_Saitama_Kawaguchi','TEXT', '', 'Number of goods available in Saitama_Kawaguchi'),
 ('Stock25G_Hyogo_Amagasaki','TEXT', '', 'Number of goods available in Hyogo_Amagasaki'),
 ('Stock25G_Stock_in_other_WH', 'TEXT', ' Number of goods in other Warehouses'),
 ('Stock500G_price', 'TEXT', '', 'Price for the specified quatity of product'),
 ('Stock500G_Saitama_Kawaguchi','TEXT', '', 'Number of goods available in Saitama_Kawaguchi'),
 ('Stock500G_Hyogo_Amagasaki','TEXT', '', 'Number of goods available in Hyogo_Amagasaki'),
 ('Stock500G_Stock_in_other_WH', 'TEXT', ' Number of goods in other Warehouses'),
 ('Stock100G_price', 'TEXT', '', 'Price for the specified quatity of product'),
 ('Stock100G_Saitama_Kawaguchi','TEXT', '', 'Number of goods available in Saitama_Kawaguchi'),
 ('Stock100G_Hyogo_Amagasaki','TEXT', '', 'Number of goods available in Hyogo_Amagasaki'),
 ('Stock100G_Stock_in_other_WH', 'TEXT', ' Number of goods in other Warehouses'),
 ('Stock1G_price', 'TEXT', '', 'Price for the specified quatity of product'),
 ('Stock1G_Saitama_Kawaguchi','TEXT', '', 'Number of goods available in Saitama_Kawaguchi'),
 ('Stock1G_Hyogo_Amagasaki','TEXT', '', 'Number of goods available in Hyogo_Amagasaki'),
 ('Stock1G_Stock_in_other_WH', 'TEXT', ' Number of goods in other Warehouses'),
 ('Stock5G_price', 'TEXT', '', 'Price for the specified quatity of product'),
 ('Stock5G_Saitama_Kawaguchi','TEXT', '', 'Number of goods available in Saitama_Kawaguchi'),
 ('Stock5G_Hyogo_Amagasaki','TEXT', '', 'Number of goods available in Hyogo_Amagasaki'),
 ('Stock5G_Stock_in_other_WH', 'TEXT', ' Number of goods in other Warehouses'),
 ('Stock250G_price', 'TEXT', '', 'Price for the specified quatity of product'),
 ('Stock250G_Saitama_Kawaguchi','TEXT', '', 'Number of goods available in Saitama_Kawaguchi'),
 ('Stock250G_Hyogo_Amagasaki','TEXT', '', 'Number of goods available in Hyogo_Amagasaki'),
 ('Stock250G_Stock_in_other_WH', 'TEXT', ' Number of goods in other Warehouses'),
 ('Stock10G_price', 'TEXT', '', 'Price for the specified quatity of product'),
 ('Stock10G_Saitama_Kawaguchi','TEXT', '', 'Number of goods available in Saitama_Kawaguchi'),
 ('Stock10G_Hyogo_Amagasaki','TEXT', '', 'Number of goods available in Hyogo_Amagasaki'),
 ('Stock10G_Stock_in_other_WH', 'TEXT', ' Number of goods in other Warehouses'),
 ('Stockprepared_after_order_item', 'TEXT', '',  'Prepared after item'),
 ('Stock200G_price', 'TEXT', '', 'Price for the specified quatity of product'),
 ('Stock200G_Saitama_Kawaguchi','TEXT', '', 'Number of goods available in Saitama_Kawaguchi'),
 ('Stock200G_Hyogo_Amagasaki','TEXT', '', 'Number of goods available in Hyogo_Amagasaki'),
 ('Stock200G_Stock_in_other_WH', 'TEXT', ' Number of goods in other Warehouses'),
 ('Stock300G_price', 'TEXT', '', 'Price for the specified quatity of product'),
 ('Stock300G_Saitama_Kawaguchi','TEXT', '', 'Number of goods available in Saitama_Kawaguchi'),
 ('Stock300G_Hyogo_Amagasaki','TEXT', '', 'Number of goods available in Hyogo_Amagasaki'),
 ('Stock300G_Stock_in_other_WH', 'TEXT', ' Number of goods in other Warehouses'),
 ('Stock20G_price', 'TEXT', '', 'Price for the specified quatity of product'),
 ('Stock20G_Saitama_Kawaguchi','TEXT', '', 'Number of goods available in Saitama_Kawaguchi'),
 ('Stock20G_Hyogo_Amagasaki','TEXT', '', 'Number of goods available in Hyogo_Amagasaki'),
 ('Stock20G_Stock_in_other_WH', 'TEXT', ' Number of goods in other Warehouses'),
 ('Stock50G_price', 'TEXT', '', 'Price for the specified quatity of product'),
 ('Stock50G_Saitama_Kawaguchi','TEXT', '', 'Number of goods available in Saitama_Kawaguchi'),
 ('Stock50G_Hyogo_Amagasaki','TEXT', '', 'Number of goods available in Hyogo_Amagasaki'),
 ('Stock50G_Stock_in_other_WH', 'TEXT', ' Number of goods in other Warehouses')
]

### Details

| column name | data type | table constraint | description |
| ---- | ---- | ---- | ---- |
|id|TEXT|PRIMARY KEY|TCI unique id|
|Stock25G_price|TEXT|-|Price for the specified quatity of product|
|Stock25G_Saitama_Kawaguchi|TEXT|-|Number of goods available in Saitama_Kawaguchi|
|Stock25G_Hyogo_Amagasaki|TEXT|-|Number of goods available in Hyogo_Amagasaki|
|Stock25G_Stock_in_other_WH|TEXT|-| Number of goods in other Warehouses|

In [61]:
df_2.to_sql('available_stock', engine,  index=False)

In [62]:
# define a primary Key
with engine.connect() as con:
    con.execute('ALTER TABLE available_stock ADD PRIMARY KEY ("id");')

In [64]:
%%time
table_retrieved_2 = engine.execute("SELECT * FROM available_stock").fetchall()

CPU times: user 226 ms, sys: 2.43 ms, total: 229 ms
Wall time: 7.2 s


In [1]:
sql = "SELECT *\
       FROM compound_identification\
       INNER JOIN available_stock ON compound_identification.id=available_stock.id\
       WHERE rdkit_smiles>'[n,o,s]1cccc1';"

In [2]:
%%time
df_retrieved = pd.read_sql(sql, engine)

NameError: name 'pd' is not defined

In [93]:
df_sample = df_retrieved.sample()
df_sample.dropna(axis=1)

Unnamed: 0,name,cas,reaxys_registry_number,pubchem_id,mdl_number,pubchem_smiles,id,rdkit_smiles,id.1,Stock25G_price,Stock25G_Saitama_Kawaguchi,Stock25G_Hyogo_Amagasaki,Stock25G_Stock_in_other_WH
129,Angelicin,523-50-2,153970.0,172089135.0,mfcd00064930,C1=CC2=C(C=CO2)C3=C1C=CC(=O)O3,A2573,O=c1ccc2ccc3occc3c2o1,A2573,47200,Contact Company,8,Contact Company
