# Create a database of organic molecules based on TCI scraped data

So far:
* An AWS Aurora server has already been created. See https://mi-6.docbase.io/posts/2582931
* A RDKit cartridge has already been setup in the database following https://mi-6.docbase.io/posts/2547303
* TCI data was cleaned, preprocessed and the naming of properties was taken care of (avoid using SQL special characters). Jupyter notebook https://github.com/stephanyvargas/Learning_DataScience/blob/master/WebScrapping_data/test_data_TCI.ipynb

## Imports

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

# Load the data

In [2]:
%cd data
!ls 

/home/stephy/Learning_DataScience/Chemical_db/TCI_database_app/data
TCIAvailableStock.json		    TCI.smi
TCICompoundIdentifications.json     TCISpecifications.json
TCIGeneralInformation.json	    TCISpecificProperties.json
TCIGHSprecautionaryStatements.json  TCITransportationInformation.json
TCIRealtedLaws.json		    TCI.txt


## Compound Identifications Data

In [3]:
df_identifications = pd.read_json('TCICompoundIdentifications.json', orient ='split', compression = 'infer')

In [4]:
df_identifications.sample(3)

Unnamed: 0,name,CAS,code,grade,ProductNumber,CasRN,reaxysRegistryNumber,pubchemSubstanceId,SMILESPubChem,merckIndex14,mdlNumber,sdbsAistSpectralDB,relatedCasRN,colourIndex,enzymeCommissionNumber
M1356,Methyl-beta-cyclodextrin,128446-36-6,M1356,,m1356,128446-36-6,,87573311.0,,,mfcd00074980,,,,
B5371,sec-Butyl Methacrylate,2998-18-7,B5371,,b5371,2998-18-7,,,CCC(C)OC(=O)C(=C)C,,mfcd00048637,,,,
D4283,Dipropylene Glycol Dimethyl Ether,111109-77-4,D4283,,d4283,111109-77-4,,172088909.0,,,mfcd00210047,,,,


In [5]:
df_identifications.drop(['ProductNumber'], axis=1, inplace=False)

Unnamed: 0,name,CAS,code,grade,CasRN,reaxysRegistryNumber,pubchemSubstanceId,SMILESPubChem,merckIndex14,mdlNumber,sdbsAistSpectralDB,relatedCasRN,colourIndex,enzymeCommissionNumber
A0001,Abietic Acid,514-10-3,A0001,,514-10-3,2221451.0,87561707.0,CC(C)C1=CC2=CCC3C(C2CC1)(CCCC3(C)C(=O)O)C,7.0,mfcd03423567,1471.0,,,
A0002,Ethyl Abietate,631-71-0,A0002,,631-71-0,,87561708.0,CCOC(=O)C1(CCCC2(C1CC=C3C2CCC(=C3)C(C)C)C)C,,mfcd00028860,,,,
A0003,Acenaphthene,83-32-9,A0003,,83-32-9,386081.0,87561709.0,C1CC2=CC=CC3=C2C1=CC=C3,28.0,mfcd00003807,863.0,,,
A0004,Acenaphthenequinone,82-86-0,A0004,,82-86-0,879172.0,87561710.0,C1=CC2=C3C(=C1)C(=O)C(=O)C3=CC=C2,,mfcd00003805,3313.0,,,
A0005,Acenaphthylene,208-96-8,A0005,,208-96-8,774092.0,87561711.0,C1=CC2=C3C(=C1)C=CC3=CC=C2,,mfcd00003806,1349.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C2157,Carbon Nanotube Multi-walled,308068-56-6,C2157,,308068-56-6,,87559343.0,[C],,,,,,
C2158,Carbon Nanotube Multi-walled,308068-56-6,C2158,,308068-56-6,,87559344.0,[C],,,,,,
C3133,Carbon Nanotube Single-walled,308068-56-6,C3133,,308068-56-6,,253660889.0,[C],,,,,,
D6015,5-Iodo-2'-deoxytubercidin,166247-63-8,D6015,,166247-63-8,7626194.0,,,,mfcd07778650,,,,


In [6]:
df_identifications.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30669 entries, A0001 to I1156
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    30669 non-null  object 
 1   CAS                     29903 non-null  object 
 2   code                    30669 non-null  object 
 3   grade                   732 non-null    object 
 4   ProductNumber           30669 non-null  object 
 5   CasRN                   29894 non-null  object 
 6   reaxysRegistryNumber    28269 non-null  float64
 7   pubchemSubstanceId      26483 non-null  float64
 8   SMILESPubChem           28854 non-null  object 
 9   merckIndex14            4145 non-null   float64
 10  mdlNumber               27392 non-null  object 
 11  sdbsAistSpectralDB      10056 non-null  float64
 12  relatedCasRN            744 non-null    object 
 13  colourIndex             290 non-null    float64
 14  enzymeCommissionNumber  30 non-null    

### Details

| column name | data type | table constraint | description |
| ---- | ---- | ---- | ---- |
|name | TEXT | - | Name of the compound |
|CAS|TEXT|-|CAS identification|
|reaxysRegistryNumber|TEXT|-|Identifier for chemical substance in Reaxys (Beilstein Registry Number)|
|pubchemSubstanceId|TEXT|-|Identifier from database of chemical molecules and their activities in biological assays PubChem ID|
|sdbsAistSpectralDB|TEXT|-|Spectral Database for Organic Compounds|
|merckIndex14|TEXT|-| Merck Index for authoritative information on chemicals, drugs and biologicals| 
|mdlNumber|TEXT|-|MLD unique identification number for each reaction and variation.|
|SMILESPubChem|TEXT|-|SMILES identifier from Pubchem|
|code|TEXT|PRIMARY KEY|TCI unique id|
|grade|TEXT|-|Grade refering to the purity of the chemical|
|CasRn|TEXT|-|CAS RN unique identifier that provides an unambiguous means to distinguish chemical substances or molecular structures| 
|colourIndex|TEXT|-|Colour Index Generic Name describes a commercial product by its recognised usage class, its hue and a serial number|
|relatedCasRn|TEXT|-|CAS RN unique identifier that provides an unambiguous means to distinguish chemical substances or molecular structures| 
|enzymeCommissionNumber|TEXT|-|Enzyme Commission numerical classification scheme for enzymes, based on the chemical reactions they catalyze|


### Create table and upload to server

In [7]:
host ='molecule-db-instance-1.czixbih3kolx.us-west-2.rds.amazonaws.com'
port = 5432
database = 'smallmoleculedb'
user = 'MoleculeMaster'
password = 'UXT7nljK3!R791Tlz!KAgHu'

In [8]:
from sqlalchemy import create_engine
engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{database}')
engine

Engine(postgresql://MoleculeMaster:***@molecule-db-instance-1.czixbih3kolx.us-west-2.rds.amazonaws.com:5432/smallmoleculedb)

In [9]:
# Upload the dataframe -> changed everything to kumoji!!!
df_identifications.to_sql('tci_compound_identifications', engine, index=False)

In [10]:
# define a primary Key
with engine.connect() as con:
    con.execute('ALTER TABLE tci_compound_identifications ADD PRIMARY KEY ("code");')

### Test the connection

In [11]:
%%time
table_retrieved = engine.execute("SELECT * FROM tci_compound_identifications;").fetchall()

CPU times: user 151 ms, sys: 63 ms, total: 214 ms
Wall time: 5.36 s


In [12]:
%%time
df_retrieved = pd.read_sql("SELECT * FROM tci_compound_identifications;", engine)

CPU times: user 239 ms, sys: 38.6 ms, total: 277 ms
Wall time: 3.21 s


The *CPU or execution time*, measures how much time a CPU spent on executing a program.

  - The *user time* corresponds to the time spent executing the process.

  - The *System time* corresponds to the time that the CPU is answering to system calls from the process. The system time can change depending on the operating system.

*Wall time or wall-clock time*, measures the total time to execute a program in a computer. 

If *(CPU time)/(wall clock time) < 1* means the program was just waiting and not executing the program. It can be affected by 
* Processes running on the machine, if other processes are keeping the CPU busy, there might be the need to wait for some free CPU.
* Unstable or slow network.
* Waiting for the server response.

More information on `%%time` [here](https://ipython.readthedocs.io/en/stable/interactive/magics.html?highlight=%25time#magic-time). And [here](https://pythonspeed.com/articles/blocking-cpu-or-io/) is some discussion regarding time bottle necks.

In [13]:
table_retrieved[:3]

[('Abietic Acid', '514-10-3', 'A0001', None, 'a0001', '514-10-3', 2221451.0, 87561707.0, 'CC(C)C1=CC2=CCC3C(C2CC1)(CCCC3(C)C(=O)O)C', 7.0, 'mfcd03423567', 1471.0, None, None, None),
 ('Ethyl Abietate', '631-71-0', 'A0002', None, 'a0002', '631-71-0', None, 87561708.0, 'CCOC(=O)C1(CCCC2(C1CC=C3C2CCC(=C3)C(C)C)C)C', None, 'mfcd00028860', None, None, None, None),
 ('Acenaphthene', '83-32-9', 'A0003', None, 'a0003', '83-32-9', 386081.0, 87561709.0, 'C1CC2=CC=CC3=C2C1=CC=C3', 28.0, 'mfcd00003807', 863.0, None, None, None)]

In [14]:
df_retrieved.sample(3)

Unnamed: 0,name,CAS,code,grade,ProductNumber,CasRN,reaxysRegistryNumber,pubchemSubstanceId,SMILESPubChem,merckIndex14,mdlNumber,sdbsAistSpectralDB,relatedCasRN,colourIndex,enzymeCommissionNumber
21190,Methyl-2-nitrosopropane Dimer,31107-20-7,M1164,,m1164,31107-20-7,,,CC(C)(C)N=O.CC(C)(C)N=O,,mfcd00002065,23542.0,6841-96-9,,
13587,"Dibenzothiophene 5,5-Dioxide",1016-05-3,D4153,,d4153,1016-05-3,146515.0,160870915.0,C1=CC=C2C(=C1)C3=CC=CC=C3S2(=O)=O,,mfcd00004970,,,,
28692,Tetrakis(dimethylsilyl)silane,2003-85-2,T1753,,t1753,2003-85-2,2074444.0,87577487.0,C[Si](C)[Si]([Si](C)C)([Si](C)C)[Si](C)C,,mfcd01631315,,,,


## Available Stock

In [15]:
df_stock = pd.read_json('TCIAvailableStock.json', orient ='split', compression = 'infer')

In [16]:
df_stock.sample(3).dropna(axis=1, how='all')

Unnamed: 0,code,OtherWH1G,price1G,Hyogo1G,Saitama1G,OtherWH5G,price5G,Hyogo5G,Saitama5G,lowestPriceOption
14285,D5147,Contact Company,3900.0,Contact Company,4,Contact Company,13200.0,2,10,3900.0
5780,B4143,Contact Company,4200.0,1,1,Contact Company,14400.0,2,Contact Company,4200.0
21413,M1626,Contact Company,21900.0,2,3,Contact Company,67600.0,Contact Company,1,21900.0


### Details

| column name | data type | table constraint | description |
| ---- | ---- | ---- | ---- |
|code|TEXT|PRIMARY KEY|TCI unique id|
|lowestPriceOption|REAL|-|Lowest avaliable price for a given compound|
|price[Amount][Units]|TEXT|-|Price for the specified amount of a product|
|Saitama[Amount][Units]|TEXT|-|Number of goods available in Saitama_Kawaguchi for a given amount|
|Hyogo[Amount][Units]|TEXT|-|Number of goods available in Hyogo_Amagasaki for a given amount|
|OtherWH[Amount][Units]|TEXT|-| Number of goods in other Warehouses for a given amount|

In [17]:
# Upload the dataframe -> changed everything to kumoji!!!
df_stock.to_sql('tci_available_stock', engine,  index=True)

In [18]:
# define a primary Key
with engine.connect() as con:
    con.execute('ALTER TABLE tci_available_stock ADD PRIMARY KEY ("code");')

In [19]:
%%time
df_retrieved_2 = pd.read_sql("SELECT * FROM tci_available_stock;", engine)

CPU times: user 2.03 s, sys: 104 ms, total: 2.13 s
Wall time: 12.4 s


In [20]:
df_retrieved_2.sample(2).dropna(axis=1, how='all')

Unnamed: 0,index,code,OtherWH1G,price1G,Hyogo1G,Saitama1G,OtherWH25G,price25G,Hyogo25G,Saitama25G,OtherWH5G,price5G,Hyogo5G,Saitama5G,lowestPriceOption
2123,2123,A2845,,,,,18,52000.0,1,3,Contact Company,14600.0,1,4,14600.0
25403,25403,P1625,20.0,5600.0,19.0,20.0,20,53700.0,1,10,20,16300.0,20,1,5600.0


## General Information

In [21]:
df_general = pd.read_json('TCIGeneralInformation.json', orient ='split', compression = 'infer')

In [22]:
df_general.sample(3).dropna(axis=1, how='all')

Unnamed: 0,code,MolecularFormula,Molecular Weight,purityAnalysisMethod,appearance,solubilityWater,storeUnderInertGas,purity
T1916,T1916,C__2__1H__2__2Si,302.49,>98.0%(gc),white to light yellow powder to crystal,practically insoluble,store under inert gas,98.0
T0241,T0241,C__1__5H__1__5NO__4,273.29,>98.0%(t),white to almost white powder to crystal,,,98.0
M1983,M1983,C__7H__1__2O,112.17,>95.0%(gc),colorless to light orange to yellow clear liquid,,,95.0


### Details

| column name | data type | table constraint | description | Non-Null Count | Example output |
| ---- | ---- | ---- | ---- | ---- |  ---- |
|code|TEXT|PRIMARY KEY|TCI unique id| - |  'A0001' |
| MolecularWeight |  NUMERIC | - | - | 29682 | 302.46 |
| purity |  NUMERIC | - | - | 27969 | 80. |
| MolecularFormula | TEXT | - | - | 29819 | 'C__2__0H__3__0O__2' |
| purityAnalysisMethod | TEXT | - | - | 27969 | '>80.0%(gc)' | 
| appearance | TEXT | - | - | 29773 | 'white to light yellow powder to crystal' |
| solubilityWater | TEXT | - | - | 5378 | 'decomposes in contact with water,practically insoluble' |
| rangeMolecularWeight | TEXT | - | - | 19 | '5000 to 150000(calcd.on dried substance)' |
| sensitiveness* | TEXT | - | - | 82 | 'colorimetric test : turn red with 1ppm aluminium' |
| averageActiveOxygen | TEXT | - | - | 1 | 'min. 98.0 %' |
| averageN | TEXT | - | - | 42 | '40.0 to 50.0' |
| averageMN | TEXT | - | - | 1 | '3.5 to 4.5' |
| averageNM | TEXT | - | - | 3 | '2.5 to 3.5' |
| contentDryingSubstance | TEXT | - | - | 2 | '6.0 to 8.5 %' |
| etherificationValueDryingSubstance | TEXT | - | - | 2 |  '0.5 to 0.8' |
| cw | TEXT | - | - | 34 | 'first-class designated chemicals (precursor)' |
| AssayMonoEster | TEXT | - | - | 6 | '35.0 to 47.0 %' | 
| AssayDiester | TEXT | - | - | 6 | '53.0 to 65.0 %' |
| suitabilityAbsorptiometry | TEXT | - | - | 1 | 'to pass test' |
| suitabilityAldehydeAnalysis | TEXT | - | - | 2 | 'abs min 0.450(near 635nm) in the presence of formaldehyde(0.2ppm)' |
| suitabilityAminoAcidAnalysis | TEXT | - | - | 3 | 'to pass test' |
| suitabilityArsenicAnalysis | TEXT | - | - | 1 | 'to pass test' |
| suitabilityBeryliumAnalysis | TEXT | - | - | 1 | 'to pass test' |
| suitabilityCalciumAnalysis | TEXT | - | - | 1 | 'to pass test' |
| suitabilityChromeAnalysis | TEXT | - | - | 1 | 'abs min.0.42(near 540nm)in the presence of dichromate(1 ppm)' |
| suitabilityCobaltAnalysis | TEXT | - | - | 2 | 'to pass test' |
| suitabilityCyanAnalysis | TEXT | - | - | 2 | 'to pass test' |
| suitabilityElectrophoresis | TEXT | - | - | 1 | 'to pass test' |
| suitabilityFormaldehydeAnalysys | TEXT | - | - | 1 | 'abs min. 0.300(near 580nm) in the presence of formaldehyde(0.6ppm)' |
| suitabilityGCAnalysis | TEXT | - | - | 2 | 'to pass test' |
| suitabilityIronAnalysis | TEXT | - | - | 6 | 'min. 0.50(535 nm)' |
| suitabilityLcMsAnalysis | TEXT | - | - | 2 | 'to pass test' |
| suitabilityMagnesiumAnalysis | TEXT | - | - | 1 | 'abs min. 0.30(540nn) in the presence of mg(10ppm)' |
| suitabilityMassAnalysisCalibratio | TEXT | - | - | 3 | 'to pass test' |
| suitabilityMelamineAnalysis | TEXT | - | - | 1 | 'to pass test' |
| suitabilityNitrateAnalysis | TEXT | - | - | 2 | 'abs min. 0.08(near 410nm) in the presence of nitrate ion(6 ppm)' |
| suitabilityNMRAnalysis | TEXT | - | - | 6 | 'effective as chiral shift reagent for 1-phenylethylamine' |
| suitabilitypCresolAnalysis | TEXT | - | - | 1 | 'to pass test' |
| suitabilityProteinAnalysisEcoli | TEXT | - | - | 1 | 'to pass test' |
| suitabilityProteinAnalysisYeast | TEXT | - | - | 1 | 'to pass test' |
| suitabilityRedoxReagent | TEXT | - | - | 1 | 'to pass test' | 
| suitabilitySilverAnalysis | TEXT | - | - | 1 | 'to pass test' |
| suitabilitySulfateAnalysis | TEXT | - | - | 1 | 'abs min. 0.40(near 530nm) in the presence of sulfate ion(200 ppm)' |
| suitabilitySulfideAnalysis | TEXT | - | - | 2 | 'abs min. 0.37(near 668nm) in the presence of sulfide' |
| suitabilitySulfiteAnalysis | TEXT | - | - | 1 | 'to pass test' |
| suitabilityTitaniumAnalysis | TEXT | - | - | 2 | 'abs min.0.28(near 390nm) in the presence of ti(1 ppm)' |
| suitabilityElisaTests | TEXT | - | - | 7 | 'min. 1.0 (human lactoferrin, 10 micro g/ml, od450)' |
| suitabilityVanadiumAnalysis | TEXT | - | - | 2 | 'to pass test' |
| suitabilityVCMAnalysis | TEXT | - | - | 1 | 'to pass test' |
| suitabilityTest | TEXT | - | - | 5 | 'to pass test(detection of primary and secondary amines)' |
| suitabilityTestProteinAnalysis | TEXT | - | - | 1 | 'to pass test' |
| elementalAnalysisNitrogen | TEXT | - | - | 340 | '24.00% to 25.50% (calcd.on anh.substance)' |
| elementalAnalysisCarbon | TEXT | - | - | 88 | '39.80 to 43.00 %' |
| elementalAnalysisOxygen | TEXT | - | - | 2 | '45 to 55 %(in 50deg-c, after drying in a vacuum)' |
| elementalAnalysisSulfuride | TEXT | - | - | 13 | '65.00 to 69.00 %' |
| elementalAnalysisHydrogen | TEXT | - | - | 3 | '6.80 to 7.50 %' |
| storeUnderInertGas | TEXT | - | - | 11072 | 'store under inert gas' |
| storageTemperature | TEXT | - | - | 4855 | '0-10°c' |
| suitabilityProteinAnalysis | TEXT | - | - | 37 | 'Noneto pass test' |

(*) May have mixed unicode characters in the output 

In [34]:
df_general.to_sql('tci_general_information', engine,  index=False)

In [35]:
# define a primary Key
with engine.connect() as con:
    con.execute('ALTER TABLE tci_general_information ADD PRIMARY KEY ("code");')

In [36]:
%%time
df_retrieved_3 = pd.read_sql("SELECT * FROM tci_general_information;", engine)

CPU times: user 633 ms, sys: 88.9 ms, total: 722 ms
Wall time: 17.2 s


In [38]:
df_retrieved_3.sample(3).dropna(axis=1, how='all')

Unnamed: 0,code,MolecularFormula,Molecular Weight,purityAnalysisMethod,appearance,solubilityWater,storeUnderInertGas,storageTemperature,purity
11511,D1408,C__5H__1__0Br__2,229.94,>95.0%(gc),colorless to light orange to yellow clear liquid,,,0-10°c,95.0
170,A0205,,,,white to light yellow powder to crystal,soluble,,,
7455,B6269,C__8H__1__8O__2Si,174.32,>95.0%(gc),colorless to light yellow clear liquid,,store under inert gas,0-10°c,95.0


##  GHS precautionary Statements

In [40]:
df_ghs = pd.read_json('TCIGHSprecautionaryStatements.json', orient ='split', compression = 'infer')

In [63]:
df_ghs.dropna(axis=0, how='any').sample(3)

Unnamed: 0,code,conditionsToAvoid,signalWord,poisonousOrDeleterious
C2580,C2580,heat sensitive,warning,deleterious substance
C1410,C1410,air sensitive,danger,deleterious substance
T2023,T2023,hygroscopic,danger,poisonous substance


### Details

| column name | data type | table constraint | description | Non-Null Count | Example output |
| ---- | ---- | ---- | ---- | ---- |  ---- |
|code|TEXT|PRIMARY KEY|TCI unique id| 30669 |  'A0001' |
| conditionsToAvoid | TEXT | - | - | 13784 | 'heat sensitive' |
| signalWord | TEXT | - | - | 21106 | 'warning' |
| poisonousOrDeleterious | TEXT | - | - | 1494 | '	poisonous substance' |

In [65]:
df_ghs.to_sql('tci_ghs', engine,  index=False)

In [66]:
# define a primary Key
with engine.connect() as con:
    con.execute('ALTER TABLE tci_ghs ADD PRIMARY KEY ("code");')

In [67]:
%%time
df_retrieved_4 = pd.read_sql("SELECT * FROM tci_ghs;", engine)

CPU times: user 53 ms, sys: 1.25 ms, total: 54.3 ms
Wall time: 3.4 s


In [71]:
df_retrieved_4.sample(3)

Unnamed: 0,code,conditionsToAvoid,signalWord,poisonousOrDeleterious
18553,H1395,,warning,
16431,F0532,,warning,deleterious substance
28579,T1612,moisture sensitive,danger,


## Realted Laws

In [73]:
df_laws = pd.read_json('TCIRealtedLaws.json', orient ='split', compression = 'infer')

In [94]:
df_laws.drop('code', axis=1).dropna(axis=0, how = 'all').sample(5).dropna(axis=1, how = 'all')

Unnamed: 0,code,ChemicalSubstanceLawNumber,RTECS,fireDefenseLaw,prtrLawNewSpecificChemical,ishl
D3813,D3813,,,group-4-3-iii,,
N1202,N1202,1-417,qr7040000,,specified class 1 designated chemical substances,
G0409,G0409,"3-559, 3-594",,group-4-3-iii,class 1 designated chemical substances,mutagens
M2422,M2422,,,,,
D1955,D1955,3-1250,am7700000,,,


### Details

| column name | data type | table constraint | description | Non-Null Count | Example output |
| ---- | ---- | ---- | ---- | ---- |  ---- |
|code|TEXT|PRIMARY KEY|TCI unique id| 30669 |  'A0001' |
| ChemicalSubstanceLawNumber | TEXT | - | - | 8258 | '4-675' |
| RTECS | TEXT | - | - | 8231 | 'tp8580000' |
| fireDefenseLaw | TEXT | - | - | 8951 | 'group-4-3-iii' |
| prtrLawNewSpecificChemical | TEXT | - | - | 1959 | 'class 1 designated chemical substances' |
| narcoticsAndPsychotropicsControlLaw | TEXT | - | - | 72 | 'narcotic or psychotropic raw material' |
| ishl | TEXT | - | - | 389 | 'mutagens' |
| chemicalSubstanceLawEncs | TEXT | - | - | 49 | 'priority assessment chemical substance' |
| pharmaceuticalAffairsLawScheduled | TEXT | - | - | 12 | 'designated substance' |
| protectionOfTheOzoneLayerLawTypeSpecifiedMaterial | TEXT | - | - | 7 | appendix 1-1' |

In [98]:
df_laws.to_sql('tci_laws', engine,  index=False)

In [99]:
# define a primary Key
with engine.connect() as con:
    con.execute('ALTER TABLE tci_laws ADD PRIMARY KEY ("code");')

In [100]:
%%time
df_retrieved_5 = pd.read_sql("SELECT * FROM tci_laws;", engine)

CPU times: user 56.8 ms, sys: 9.11 ms, total: 65.9 ms
Wall time: 2.74 s


In [102]:
df_retrieved_5.dropna(axis=0, how = 'all').sample(5).dropna(axis=1, how = 'all')

Unnamed: 0,code,ChemicalSubstanceLawNumber,RTECS,fireDefenseLaw,prtrLawNewSpecificChemical
25235,P1319,,,group-4-1-ii,
10358,D0093,,,,
20124,L0403,9-2424,oj6360000,,
7508,C0110,3-194,bx0350000,group-4-3-iii,class 1 designated chemical substances
10841,D0696,3-2227,,group-4-1-ii,


## Transportation Information

In [106]:
df_transportation = pd.read_json('TCITransportationInformation.json', orient ='split', compression = 'infer')

In [127]:
df_transportation.drop('code', axis=1).dropna(axis=0, how = 'all').sample(5)

Unnamed: 0,unNumber,PackagingAndContainer,packingGroup,class,airTransportation
T2426,un1993,,iii,3,
B5015,,1G-Glass Bottle with Plastic Insert,,,
C1541,un2734,,ii,8 / 3,
T0078,un3439,,iii,6.1,
D2408,,100MG-Glass Bottle with Plastic Insert,,,


In [134]:
for value in df_transportation.keys():
    print(value)
    print(df_transportation[value].unique()[:3])

code
['A0001' 'A0002' 'A0003']
unNumber
['un3077' None 'un2332']
PackagingAndContainer
[None '(View image), \xa025G-Ampule' '1G-Glass Bottle with Plastic Insert']
packingGroup
['iii' None 'ii']
class
[9.0 None 3.0]
airTransportation
[None 'airmail banned']


### Details

| column name | data type | table constraint | description | Non-Null Count | Example output |
| ---- | ---- | ---- | ---- | ---- |  ---- |
|code|TEXT|PRIMARY KEY|TCI unique id| 30669 |  'A0001' |
| unNumber |TEXT| - | - | 7432 | 'un3077' |
| PackagingAndContainer* |TEXT| - | - | 2627 | '1G-Glass Bottle with Plastic Insert' |
| packingGroup |TEXT| - | - | 7263 | 'ii' |
| class |TEXT| - | - | 7439 | 3.0 |
| airTransportation |TEXT| - | - | 186 | 'airmail banned' |

In [135]:
df_transportation.to_sql('tci_transportation', engine,  index=False)

In [136]:
# define a primary Key
with engine.connect() as con:
    con.execute('ALTER TABLE tci_transportation ADD PRIMARY KEY ("code");')

In [137]:
%%time
df_retrieved_6 = pd.read_sql("SELECT * FROM tci_transportation;", engine)

CPU times: user 70.3 ms, sys: 28.7 ms, total: 99 ms
Wall time: 2.52 s


In [138]:
df_retrieved_6.drop('code', axis=1).dropna(axis=0, how = 'all').sample(5)

Unnamed: 0,unNumber,PackagingAndContainer,packingGroup,class,airTransportation
7372,,1G-Glass Bottle with Plastic Insert,,,
12770,un3077,,iii,9,
20692,un1224,,iii,3,
28236,un2924,,ii,3 / 8,
28980,un3077,,iii,9,


## Specific Properties

In [139]:
df_properties = pd.read_json('TCISpecificProperties.json', orient ='split', compression = 'infer')

In [152]:
df_properties.drop('code', axis=1).dropna(axis=0, how = 'all').sample(6).dropna(axis=1, how = 'all')

Unnamed: 0,maximumAbsorptionWavelength,solubilitySolubleIn,solubilityInHotToluene,solubilityInHotDmf,boilingPoint,meltingPoint
B5739,313(meoh) nm,toluene,,,192.0,124.0
D5207,,acetone,,,,118.0
T0167,,,almost transparency,,,225.0
D2842,,methanol,,,,
N0603,,,,almost transparency,,
B1385,,methanol,,,231.0,74.0


In [154]:
for value in df_properties.keys():
    print(value)
    print(df_properties[value].unique()[:3])

code
['A0001' 'A0002' 'A0003']
specificGravity
[None 1.03 0.97]
density20degC
[None '0.8460 to 0.8560 g/ml' '0.8080 to 0.8120 g/ml']
transitionIntervalPH
[None '(pale yellow)ph10\u3000to\u3000ph12(deep yellow)'
 '(purplish red)1.2 to 3.0(yellow)']
esterValue
[None '430.0 to 470.0' '246 to 254']
viscosity
[None '500.0 to 900.0 mpa-s(2 %, h2o, 25 deg-c)'
 '900 to 1400 mpa-s(1 %, h2o, 25 deg-c)']
averageMolecularWeight
[None '285 to 310' '287 to 292']
freezingPoint
[None '36.0 to 42.0  °c' '22.0 to 26.0  °c']
bindingCapacity
[None 'to pass test(min. 5 mg/ml gel, human lactoferrin)' 'to pass test']
biotinylationRatio
[None 'to pass test' 'to pass test (3 to 20)']
lumoLevel
[None '-4.0\xa0ev' '-3.9\xa0ev']
holeMobilityMuFet
[None 'min. 0.10 cm2/vs(hmds si/sio2 substrate)'
 'min. 0.1 cm2/vs(bare si/sio2 substrate)']
exchangeCapacity
[None '0.50 to 0.90 meq/g(calcd.on anh.substance)']
maximumAbsorptionWavelength
[None '493(h2o)\xa0nm' '492(h2o)\xa0nm']
absorbance275nm
[None 'max. 0.25']
absor

In [157]:
properties_list = list(df_properties.keys())
len(properties_list)

134

### Details

| column name | data type | table constraint | description | Non-Null Count | Example output |
| ---- | ---- | ---- | ---- | ---- |  ---- |

In [159]:
df_properties.to_sql('tci_specificproperties', engine,  index=False)

In [160]:
# define a primary Key
with engine.connect() as con:
    con.execute('ALTER TABLE tci_specificproperties ADD PRIMARY KEY ("code");')

In [161]:
%%time
df_retrieved_7 = pd.read_sql("SELECT * FROM tci_specificproperties;", engine)

CPU times: user 965 ms, sys: 59.7 ms, total: 1.02 s
Wall time: 10.7 s


In [163]:
df_retrieved_7.drop('code', axis=1).dropna(axis=0, how = 'all').sample(5).dropna(axis=1, how = 'all')

Unnamed: 0,specificGravity,refractiveIndex,degreeOfSolubilityInWater,solubilitySolubleIn,flashPoint,boilingPoint,meltingPoint
5070,1.22,1.45,,,,,
1434,,,,,,,238.0
7825,0.95,1.45,87 g/l 20 °c,"alcohol,ether,acetone",44.0,157.0,
4331,,,,,,,115.0
20585,1.1,,,,129.0,243.0,14.0


## Specifications

In [164]:
df_specifications = pd.read_json('TCISpecifications.json', orient ='split', compression = 'infer')

In [165]:
df_specifications.drop('code', axis=1).dropna(axis=0, how = 'all').sample(6).dropna(axis=1, how = 'all')

Unnamed: 0,purityNonaqueousTitration,purityHPLC,purityGC,purityNeutralizationTitration,physicalState20degC
D1134,,,,,liquid
M2283,,,min. 98.0 %,,solid
B4356,,min. 98.0 area%,,min. 98.0 %,solid
T3702,min. 98.0 %,,min. 98.0 %,,solid
S0966,,,,,liquid
T2512,,,min. 98.0 %,,liquid


In [166]:
df_specifications.to_sql('tci_productspecifications', engine,  index=False)

In [167]:
# define a primary Key
with engine.connect() as con:
    con.execute('ALTER TABLE tci_productspecifications ADD PRIMARY KEY ("code");')

In [168]:
%%time
df_retrieved_8 = pd.read_sql("SELECT * FROM tci_productspecifications;", engine)

CPU times: user 2.73 s, sys: 142 ms, total: 2.87 s
Wall time: 14.1 s


In [169]:
df_retrieved_8.drop('code', axis=1).dropna(axis=0, how = 'all').sample(5).dropna(axis=1, how = 'all')

Unnamed: 0,purityArgentometricTitration,purityHPLC,purityGC,purityNMR,nmr,physicalState20degC
26734,,,min. 99.0 %,,,liquid
23398,,min. 98.0 area%,,,,solid
25164,,,min. 95.0 %,,,liquid
6067,min. 95.0 %,min. 97.0 area%,,,,solid
15977,,,,min. 95.0 atom%,confirm to structure,solid
