In [9]:
import pickle
import random

class moleculeRNN:
    """Prepare character-level training data from a table of encoded SMILES.

    Typical call order, as implied by the state each method reads:
    ``readFile`` -> ``postprocessData`` -> ``sampleTheData`` ->
    ``prepareInputForTraining`` -> ``getPredictors`` / ``getResponses``.

    The loaded data is expected to be a pandas ``DataFrame`` with an
    ``'encodedSmiles'`` column (and a ``'canonicalSmiles'`` column when
    ``createDescriptor`` is used).
    """

    def __init__(self, maxlen, step):
        """Store the windowing parameters and reset all derived state.

        Args:
            maxlen: Number of characters in every input window.
            step: Stride between the start positions of consecutive windows.

        Note:
            Seeds the *global* ``random`` module with 1234 so that
            ``sampleTheData`` is reproducible; this also affects any other
            code that uses the module-level ``random`` functions.
        """
        self._data = None
        self._charset = None
        self._predictors = None
        self._response = None
        self._maxlen = maxlen
        self._step = step
        self.char2indices = None
        self.indices2char = None
        self._corpus = None
        random.seed(1234)

    def readFile(self, file):
        """Load the pickled dataset stored at path ``file`` into the instance.

        WARNING: ``pickle.load`` can execute arbitrary code contained in the
        file. Only call this on files that come from a trusted source.
        """
        with open(file, 'rb') as f:
            self._data = pickle.load(f)

    def postprocessData(self):
        """Add column ``'encodedSmilesPostprocessed'``: each encoded string + 'Q'.

        The trailing 'Q' presumably acts as an end-of-molecule marker once the
        strings are concatenated into one corpus -- TODO confirm.
        """
        self._data.loc[:, 'encodedSmilesPostprocessed'] = (
            self._data.loc[:, 'encodedSmiles'].map(lambda x: x + 'Q')
        )

    def __provideDescriptor(self, smiles):
        """Return the quantities computed by ``SmilesOB`` for one SMILES string.

        Returns ``None`` when ``SmilesOB`` reports that the conversion failed.
        """
        smilesOB = SmilesOB(smiles)
        if not smilesOB.getConversionflag():
            return None
        smilesOB.addH()
        smilesOB.calculateQuantities()
        return smilesOB.getQuantities()

    def createDescriptor(self):
        """Add column ``'molDescriptor'`` computed from ``'canonicalSmiles'``."""
        self._data.loc[:, 'molDescriptor'] = (
            self._data.loc[:, 'canonicalSmiles'].apply(self.__provideDescriptor)
        )

    def sampleTheData(self, nsample = 10000, writeToFile=False, corpusname=None):
        """Draw a random sample of rows and build the training corpus from it.

        Args:
            nsample: Number of rows to draw without replacement.
            writeToFile: When true, also write the sampled rows to
                ``<corpusname>_sample.csv``.
            corpusname: File-name prefix for the CSV; required when
                ``writeToFile`` is true.

        Raises:
            TypeError: If ``writeToFile`` is true but ``corpusname`` is None.
            ValueError: If ``nsample`` exceeds the number of rows (raised by
                ``random.sample``).
        """
        if writeToFile and corpusname is None:
            raise TypeError("corpusname must be a string when writeToFile is True")

        # The sample consists of index *labels*, so every lookup below must be
        # label-based (.loc); positional .iloc would select the wrong rows or
        # fail whenever the index is not 0..n-1.
        sample = random.sample(list(self._data.index), nsample)
        if writeToFile:
            self._data.loc[sample, :].to_csv(corpusname + "_sample" + ".csv", index=False)

        self._corpus = self._data.loc[sample, 'encodedSmilesPostprocessed'].str.cat(sep='')
        self._charset = sorted(set(self._corpus))

    def prepareInputForTraining(self):
        """Build one-hot predictor/response arrays from the sampled corpus.

        Slides a window of ``maxlen`` characters over the corpus with stride
        ``step``. Each window becomes one predictor of shape
        ``(maxlen, n_chars)`` and the character following the window becomes
        its response of shape ``(n_chars,)``. Also (re)builds the
        ``char2indices`` / ``indices2char`` lookup dictionaries.

        Requires ``sampleTheData`` to have been called first.
        """
        chars = sorted(set(self._corpus))
        print('total chars:', len(chars))
        self._charset = chars
        self.char2indices = {c: i for i, c in enumerate(chars)}
        self.indices2char = {i: c for i, c in enumerate(chars)}

        # Stop early enough that every window still has a following character.
        starts = range(0, len(self._corpus) - self._maxlen, self._step)
        sentences = [self._corpus[s: s + self._maxlen] for s in starts]
        nextchars = [self._corpus[s + self._maxlen] for s in starts]
        print('number of sequences:', len(sentences))

        print('Vectorization...')
        # Builtin ``bool`` instead of ``np.bool``: the alias was deprecated in
        # NumPy 1.20 and removed in 1.24.
        self._predictors = np.zeros((len(sentences), self._maxlen, len(chars)), dtype=bool)
        self._response = np.zeros((len(sentences), len(chars)), dtype=bool)
        for row, (sentence, nextchar) in enumerate(zip(sentences, nextchars)):
            for pos, char in enumerate(sentence):
                self._predictors[row, pos, self.char2indices[char]] = True
            self._response[row, self.char2indices[nextchar]] = True

    def pickleDictionaries(self, corpusname):
        """Pickle ``[char2indices, indices2char]`` to ``<corpusname>.pckl``."""
        filename = corpusname + '.pckl'

        with open(filename, 'wb') as f:
            pickle.dump([self.char2indices, self.indices2char], f)

    def getPredictors(self):
        """Return the one-hot predictor array (None before preparation)."""
        return self._predictors

    def getResponses(self):
        """Return the one-hot response array (None before preparation)."""
        return self._response

In [10]:
# Build the data-preparation object with a window length (maxlen) of 40
# characters and a stride (step) of 2, then load the pickled dataset.
# NOTE(review): the assignment below rebinds the name `moleculeRNN` from the
# class to the new instance. After it runs, the class is no longer reachable
# by that name, and executing this cell a second time raises TypeError because
# the instance is not callable. The name is kept as-is because later cells may
# refer to it -- consider renaming the instance once all usages are known.
moleculeRNN = moleculeRNN(40, 2)
pcklFile = 'molDataGrouped.pckl'
# readFile unpickles the file; only use pickle files from a trusted source.
moleculeRNN.readFile(pcklFile)