In [21]:
from pathlib import Path
import pandas as pd
from hydra import compose, initialize
from ergochemics.draw import draw_reaction, draw_molecule
from IPython.display import SVG
from rdkit import Chem
import yaml

# Compose the "filepaths" Hydra config from ../configs/filepaths; `cfg` is the
# object later cells read directory locations from (cfg.interim_data, cfg.configs).
with initialize(config_path="../configs/filepaths", version_base=None):
    cfg = compose(config_name="filepaths")

In [22]:
# Read the compound table from the interim data directory and preview the
# first rows.
compounds_csv = Path(cfg.interim_data) / "compounds.csv"
cpds = pd.read_csv(compounds_csv)
cpds.head()

Unnamed: 0,id,smiles,name,rxn_count,n_atoms
0,0,*C#N,a nitrile,6,3
1,1,*C(*)(N)C(=O)O,"2,2-dialkylglycine",2,7
2,2,*C(*)(O)C#N,a disubstituted aliphatic (S)-hydroxynitrile,2,6
3,3,*C(*)(O)C(*)(*)O,an ethanediol,2,8
4,4,*C(*)=O,dialkyl ketone,14,4


In [23]:
# Order compounds by descending rxn_count and preview the top of the ranking.
# `cpds` is rebound to the sorted frame so later cells see the same ordering.
cpds = cpds.sort_values("rxn_count", ascending=False)
cpds.head(20)

Unnamed: 0,id,smiles,name,rxn_count,n_atoms
5990,5990,O,H2O,4599,1
1910,1910,CC(C)(COP(=O)(O)OP(=O)(O)OCC1OC(n2cnc3c(N)ncnc...,CoA,1870,48
5972,5972,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)OP(=O)(O...,ATP,1423,31
6612,6612,O=O,O2,1280,2
5468,5468,NC(=O)c1ccc[n+](C2OC(COP(=O)(O)OP(=O)(O)OCC3OC...,NADP(+),1215,48
5426,5426,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c...,NADPH,1205,48
5467,5467,NC(=O)c1ccc[n+](C2OC(COP(=O)(O)OP(=O)(O)OCC3OC...,NAD(+),1137,44
6672,6672,O=P(O)(O)OP(=O)(O)O,diphosphate,1122,9
5423,5423,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c...,NADH,1064,44
6615,6615,O=P(O)(O)O,phosphate,996,5


In [24]:
# Build the default source list from the compounds with the highest rxn_count.
#
# nlargest() selects by value rather than by current row position, so this cell
# gives the same answer even if `cpds` has since been re-sorted by another
# column (a later cell sorts it by n_atoms).
N_TOP_SOURCES = 20
EXCLUDED_SOURCES = {"2-oxoglutarate"}

top_names = cpds.nlargest(N_TOP_SOURCES, "rxn_count")["name"].tolist()

# Filter instead of list.remove(): remove() raises ValueError when the name is
# not among the selected compounds, which would break the notebook on any data
# refresh that moves it out of the top N.
default_sources = [name for name in top_names if name not in EXCLUDED_SOURCES]

In [25]:
# Recompute n_atoms from the SMILES column with RDKit, then order the table
# from smallest to largest compound and preview it.
mols = cpds["smiles"].map(lambda smi: Chem.MolFromSmiles(smi))

# Chem.MolFromSmiles returns None (rather than raising) when it cannot parse a
# SMILES string; surface the offending entry instead of failing later with an
# opaque AttributeError on None.
unparsed = cpds.loc[mols.isna(), "smiles"]
if not unparsed.empty:
    raise ValueError(
        f"RDKit could not parse {len(unparsed)} SMILES string(s); "
        f"first failure: {unparsed.iloc[0]!r} (index {unparsed.index[0]!r})"
    )

# GetNumAtoms() counts the atoms present in the parsed molecule graph
# (e.g. "O" for H2O yields 1 in the output below).
cpds = cpds.assign(n_atoms=mols.map(lambda mol: mol.GetNumAtoms())).sort_values(
    "n_atoms", ascending=True
)
cpds.head(10)

Unnamed: 0,id,smiles,name,rxn_count,n_atoms
5990,5990,O,H2O,4599,1
6917,6917,S,hydrogen sulfide,18,1
1262,1262,C,methane,4,1
5327,5327,N,NH4(+),442,1
5102,5102,CS,methanethiol,8,2
4634,4634,CN,methylamine,18,2
585,585,*N,a primary amine,10,2
6612,6612,O=O,O2,1280,2
1510,1510,C=O,formaldehyde,74,2
1263,1263,C#C,acetylene,2,2


In [26]:
# Add two extra compounds by name (both appear with n_atoms == 1 in the
# smallest-compounds table). Append only when missing so that re-running this
# cell, or a name already being among the top compounds, cannot introduce
# duplicate entries into the list that is written to YAML.
for extra_source in ("hydrogen sulfide", "methane"):
    if extra_source not in default_sources:
        default_sources.append(extra_source)

In [27]:
# Persist the chosen source names as <cfg.configs>/sources/default.yaml under
# the "source_names" key.
default_yaml = Path(cfg.configs) / "sources" / "default.yaml"
with default_yaml.open("w") as handle:
    yaml.dump({"source_names": default_sources}, handle)