In [1]:
import os
import pandas as pd
import numpy as np
import pylab as plt
from openbabel import openbabel as ob
from openbabel import pybel as pb
from tqdm.auto import tqdm
from pymatgen import Molecule

In [2]:
# Load the training table; the first CSV column becomes the index.
train = pd.read_csv("Task/train.csv", index_col=0)
# Rewrite every "se" in the SMILES column as "Se" (regex replacement;
# presumably to normalize how selenium is written -- confirm against the data).
# The result is assigned back rather than using replace(inplace=True) on the
# column selection: that chained in-place form emits a FutureWarning in
# pandas 2.2 and leaves `train` untouched once Copy-on-Write is enabled.
train["Smiles"] = train["Smiles"].replace("se", "Se", regex=True)
# Load the test table the same way. NOTE(review): the "se" -> "Se" rewrite is
# applied to the training SMILES only -- confirm the test set does not need it.
test = pd.read_csv("Task/test.csv", index_col=0)

In [3]:
# Display the force-field names that pybel reports as available.
pb.forcefields

['gaff', 'ghemical', 'mmff94', 'mmff94s', 'uff']

In [5]:
def xyz_from_smiles(smiles, forcefield="mmff94", steps=50, folder=None):
    """Build 3D structures for a collection of SMILES strings.

    Each SMILES string is parsed with pybel, given 3D coordinates with
    ``make3D``, refined with ``localopt`` and finally converted into a
    pymatgen ``Molecule`` built from the atomic numbers and Cartesian
    coordinates of its atoms.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings to convert. All of them are parsed before any
        geometry is generated.
    forcefield : str
        Force-field name handed to both ``make3D`` and ``localopt``.
    steps : int
        Step count handed to both ``make3D`` and ``localopt``.
    folder : str, optional
        When truthy, the directory is created if it does not exist and
        every structure is written to ``{folder}/{idx}.xyz``, where
        ``idx`` is the zero-based position of the SMILES in ``smiles``.

    Returns
    -------
    list
        One ``Molecule`` per input SMILES, in input order.
    """
    parsed = [pb.readstring('smi', smi) for smi in smiles]

    if folder:
        os.makedirs(folder, exist_ok=True)

    structures = []
    progress = tqdm(enumerate(parsed), total=len(parsed))
    for position, pybel_mol in progress:
        # Generate 3D coordinates, then run a further local optimization.
        pybel_mol.make3D(forcefield=forcefield, steps=steps)
        pybel_mol.localopt(forcefield=forcefield, steps=steps)

        # Read atomic number and position of every atom in a single pass.
        atom_records = [
            (atom.GetAtomicNum(), [atom.GetX(), atom.GetY(), atom.GetZ()])
            for atom in ob.OBMolAtomIter(pybel_mol.OBMol)
        ]
        species = [number for number, _ in atom_records]
        coords = [xyz for _, xyz in atom_records]

        # Hand the geometry over to pymatgen and optionally write it to disk.
        structure = Molecule(species, coords)
        if folder:
            structure.to(filename=f"{folder}/{position}.xyz")
        structures.append(structure)

    return structures

In [7]:
# Generate 3D structures for the training and test SMILES with the default
# force field and step count of xyz_from_smiles, writing one .xyz file per
# molecule under structures/train and structures/test respectively.
train_xyz = xyz_from_smiles(train["Smiles"], folder="structures/train")
test_xyz = xyz_from_smiles(test["Smiles"], folder="structures/test")

  0%|          | 0/5557 [00:00<?, ?it/s]

  0%|          | 0/1614 [00:00<?, ?it/s]