In [1]:
"""Module for lightning datamodules"""

from rdkit.Chem.rdmolfiles import MolFromSmarts
from rdkit.Chem import Descriptors
import pandas as pd
import dask.dataframe as dd
from deepchem.splits.splitters import RandomStratifiedSplitter, ScaffoldSplitter
import deepchem.molnet as dcm
from torch.utils.data import DataLoader, Subset
from pytorch_lightning import LightningDataModule
from tokenizers import Tokenizer
from tokenizers.models import BPE
from utils import util_funcs
from typing import Callable, Dict, List
from rdkit import RDLogger
import numpy as np
from deepchem.data.datasets import DiskDataset
from tokenizers import Tokenizer
from torch.utils.data import Dataset


class FGRPretrainDataset(Dataset):
    """Pytorch dataset for pretraining autoencoder"""

    def __init__(self, smiles: List[str], fgroups_list: List[str], tokenizer: Tokenizer) -> None:
        """Initialize dataset with arguments

        Args:
            smiles (List[str]): List of SMILES strings
            fgroups_list (List[str]): List of functional groups
            tokenizer (Tokenizer): Pretrained Tokenizer
        """
        self.smiles = smiles
        self.fgroups_list = fgroups_list
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.smiles)

    def __getitem__(self, idx):
        smile = self.smiles[idx]
        print(str(smile))
        f_g, mfg = util_funcs.smiles2vector_fgr(smile, self.tokenizer, self.fgroups_list)
        return f_g, mfg



class FGRPretrainDataModule(LightningDataModule):
    """Lightning Datamodule for pretraining"""

    def __init__(
        self,
        root: str,
        batch_size: int,
        num_workers: int,
        pin_memory: bool,
    ) -> None:
        """Initialize lightning datamodule for pretraining

        Args:
            root (str): Root data folder
            batch_size (int): Batch size
            num_workers (int): Number of workers for data loading
            pin_memory (bool): Save data in memory
        """
        super().__init__()

        self.root = root
        self.num_workers = num_workers
        self.pin_memory = pin_memory
        self.batch_size = batch_size
        self.tokenizer = Tokenizer(BPE(unk_token="[UNK]")).from_file("./data/tokenizer_bpe.json")

    def prepare_data(self) -> None:
        df = dd.read_csv(self.root + "pubchem/pubchem_data_*.csv")
        print("Reading SMILES")
        self.train, self.valid = df["SMILES"].random_split((0.9, 0.1), random_state=123)
        print("Splitting")
        self.train.compute()
        self.valid.compute()
        fgroups = pd.read_csv(self.root + "fg.csv")["SMARTS"].tolist()
        self.fgroups_list = [MolFromSmarts(x) for x in fgroups]

    def setup(self, stage=None):
        self.train_fold = FGRPretrainDataset(self.train, self.fgroups_list, self.tokenizer)
        self.val_fold = FGRPretrainDataset(self.valid, self.fgroups_list, self.tokenizer)

    def train_dataloader(self):
        loader = DataLoader(
            self.train_fold,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
            drop_last=True,
        )
        return loader

    def val_dataloader(self):
        loader = DataLoader(
            self.val_fold,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
        )
        return loader


In [2]:
dat = FGRPretrainDataModule('../datasets/processed/', batch_size=8, num_workers=4, pin_memory=True)

In [3]:
dat.prepare_data()
dat.setup()

Reading SMILES
Splitting


In [4]:
lo = dat.train_dataloader()

In [None]:
next(iter(lo))

In [9]:
from dask.diagnostics import ProgressBar
ProgressBar().register()
from rdkit.Chem.rdmolfiles import MolFromSmarts, MolFromSmiles

def mol(x):
    try:
        mols = MolFromSmiles(x)
    except:
        mols = None
    return mols

df = dd.read_csv("../datasets/processed/"+ "pubchem/pubchem_data_*.csv")
print("Reading SMILES")
smiles_mol = df["SMILES"].map(lambda x: mol(x))
smiles_mol.compute()

Reading SMILES


In [12]:
from rdkit.Chem.rdmolfiles import MolFromSmarts, MolFromSmiles
try:
    mol = MolFromSmiles('CC#')
except:
    print('error')

[10:39:06] SMILES Parse Error: syntax error while parsing: CC#
[10:39:06] SMILES Parse Error: Failed parsing SMILES 'CC#' for input: 'CC#'


In [31]:
data = {'SMILES': ['CC#','CC#','CC','CC#']}

In [32]:
import pandas as pd

In [33]:
df = pd.DataFrame(data)['SMILES'].map(lambda x: MolFromSmiles(x))

[10:42:11] SMILES Parse Error: syntax error while parsing: CC#
[10:42:11] SMILES Parse Error: Failed parsing SMILES 'CC#' for input: 'CC#'
[10:42:11] SMILES Parse Error: syntax error while parsing: CC#
[10:42:11] SMILES Parse Error: Failed parsing SMILES 'CC#' for input: 'CC#'
[10:42:11] SMILES Parse Error: syntax error while parsing: CC#
[10:42:11] SMILES Parse Error: Failed parsing SMILES 'CC#' for input: 'CC#'


In [39]:
from dask.diagnostics import ProgressBar
from rdkit.Chem.rdmolfiles import MolFromSmiles
import dask.dataframe as dd
df = dd.read_csv("../datasets/processed/" + "pubchem/pubchem_data_*.csv")
print("Reading SMILES")
smiles_mol = df["SMILES"].map(lambda x: MolFromSmiles(x))
df['mol'] = smiles_mol
df = df.dropna()

Reading SMILES


[10:43:58] SMILES Parse Error: syntax error while parsing: foo
[10:43:58] SMILES Parse Error: Failed parsing SMILES 'foo' for input: 'foo'
[10:43:58] SMILES Parse Error: syntax error while parsing: foo
[10:43:58] SMILES Parse Error: Failed parsing SMILES 'foo' for input: 'foo'
