In [3]:
import os
import pandas as pd
from tqdm.auto import tqdm
from openbabel import openbabel as ob
from openbabel import pybel as pb
from pymatgen import Molecule

In [4]:
# Load the cleaned train and test tables; the first CSV column becomes the index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("Clean/train.csv", "Clean/test.csv")
)

In [5]:
def _pybel_to_pymatgen(pybel_mol):
    """Copy atomic numbers and x/y/z coordinates of a pybel molecule into a pymatgen Molecule."""
    atomic_numbers = []
    positions = []
    for ob_atom in ob.OBMolAtomIter(pybel_mol.OBMol):
        atomic_numbers.append(ob_atom.GetAtomicNum())
        positions.append([ob_atom.GetX(), ob_atom.GetY(), ob_atom.GetZ()])
    return Molecule(atomic_numbers, positions)


def xyz_from_smiles(smiles, forcefield="gaff", steps=1000, folder=None):
    """Build 3D structures for a collection of SMILES strings.

    Each string is read with pybel, given 3D coordinates via ``make3D``,
    refined with ``localopt`` and finally converted to a pymatgen
    ``Molecule``.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings, one per molecule.
    forcefield : str, default "gaff"
        Force field name forwarded to both ``make3D`` and ``localopt``.
    steps : int, default 1000
        Step count forwarded to both ``make3D`` and ``localopt``.
    folder : str, optional
        When truthy, the directory is created if it does not exist and every
        structure is written to ``<folder>/<position>.xyz``, where
        ``position`` is the zero-based position of the molecule in ``smiles``
        (not a DataFrame index label).

    Returns
    -------
    list
        One pymatgen ``Molecule`` per input string, in input order.
    """
    # All strings are parsed up front, before any geometry optimisation starts.
    parsed = [pb.readstring('smi', smi) for smi in smiles]

    if folder:
        os.makedirs(folder, exist_ok=True)

    structures = []
    progress = tqdm(enumerate(parsed), total=len(parsed))
    for position, pybel_mol in progress:
        # Generate 3D coordinates, then run a local optimisation on them.
        pybel_mol.make3D(forcefield=forcefield, steps=steps)
        pybel_mol.localopt(forcefield=forcefield, steps=steps)

        structure = _pybel_to_pymatgen(pybel_mol)
        if folder:
            structure.to(filename=f"{folder}/{position}.xyz")
        structures.append(structure)

    return structures

In [None]:
# Build structures for both data splits from their "Smiles" column and
# write them as .xyz files under structures/train and structures/test.
train_xyz = xyz_from_smiles(smiles=train["Smiles"], folder="structures/train")
test_xyz = xyz_from_smiles(smiles=test["Smiles"], folder="structures/test")