# Import

In [2]:
from os import listdir
from os.path import isfile, join

import os
import re

import pandas as pd

import numpy as np
import datetime

In [3]:
class resultSet:
    '''
    Post-process a directory of generation results into valuable, novel,
    unique polymers.

    The pipeline, driven by save_vnu_polymers, is:
      1. parse every results file and keep (SMILES, property value) pairs
         whose property value exceeds prop_delta;
      2. write them to smiles_df.csv and run the external 'fp' command to
         fingerprint them;
      3. drop polymers whose fingerprints are within `tolerance` of an
         earlier polymer (unique), then those within `tolerance` of any
         polymer in the initial dataset (novel);
      4. save what is left to a timestamped CSV.

    NOTE: the constructor changes the process working directory to
    results_dir. The intermediate files (smiles_df.csv, fp_input, the
    fingerprint CSV) and the final output are read/written relative to the
    working directory.
    '''
    def __init__(self, results_dir, prop_delta, head=None, tolerance=.001, only_check_SMILES=True, n_core=18,
                 init_data_path='~/CS6250_project/raw_data/fp_all_data.csv'):
        '''
        results_dir: directory holding the results files (names matching
            'results' + one character + digits, e.g. results.12)
        prop_delta: only results with a property value strictly greater than
            this are kept
        head: if not None, only the first `head` candidate rows are fingerprinted
        tolerance: maximum absolute per-column difference for two fingerprints
            to be treated as the same polymer
        only_check_SMILES: if true, duplicate SMILES are dropped before
            fingerprinting
        n_core: written as 'ncore' in the input file of the fp command
        init_data_path: CSV of the initial dataset; must contain a 'smiles'
            column alongside its fingerprint columns
        '''
        # Resolve to an absolute path *before* chdir: otherwise a relative
        # results_dir would be looked up again from inside itself below.
        self.parent_dir = os.path.abspath(os.path.expanduser(results_dir))
        os.chdir(self.parent_dir)
        self.prop_delta = prop_delta
        self.all_files = [f for f in listdir(self.parent_dir) if isfile(join(self.parent_dir, f))]
        # NOTE: the '.' in the pattern is unescaped, so any single character is
        # accepted between 'results' and the digits.
        self.results_files = [f for f in self.all_files if re.match('^results.[0-9]+$', f) is not None]
        self.only_check_SMILES = only_check_SMILES
        self.tolerance = tolerance
        self.fp_df_path = 'fp_df.csv'
        self.init_data_path = init_data_path
        self.n_core = n_core
        self.head = head

    def resultsToList(self, path_to_results):
        '''
        Load results from path_to_results as a set of unique
        (SMILES, property value) tuples.

        Lines are ignored until one containing 'Done' has been seen. Every
        later non-blank line must split into exactly five whitespace-separated
        fields; the third is the SMILES and the fifth the property value.
        Only values strictly greater than self.prop_delta are kept.

        Raises ValueError if a non-blank data line does not have five fields
        or its last field is not a number.
        '''
        start_append = False
        data = set()
        with open(path_to_results, 'r') as f:
            for line in f:
                if 'Done' in line:
                    start_append = True
                elif start_append:
                    fields = line.split()
                    if not fields:
                        # Blank line (e.g. trailing newline at end of file).
                        continue
                    _, _, target, _, bg = fields
                    bg = float(bg)
                    if bg > self.prop_delta:
                        data.add((target, bg))
        return data

    def create_fp_input(self):
        '''
        Create the input file ('fp_input') for the 'fp' command
        '''
        contents = ('file_dataset = ./smiles_df.csv\n'
                    'col_smiles = SMILES\n'
                    'col_X = aT bT m e\n'
                    'col_id = ID\n'
                    'file_fingerprint = %s\n'
                    'polymer_fp_version = 2\n'
                    'ismolecule = 0\n'
                    'drop_failed_rows = 0\n'
                    'ncore = %s\n' % (self.fp_df_path, self.n_core))
        with open('fp_input', "w") as text_file:
            text_file.write(contents)

    def _run_fp(self):
        '''
        Run fp command. Should only be called after create_fp_input
        '''
        status = os.system('fp fp_input')
        if status != 0:
            # Do not fail silently: a stale fingerprint file from an earlier
            # run could otherwise be picked up by vuPolymers.
            print("Warning: 'fp fp_input' exited with non-zero status %s" % status)

    def collectUniqueResults(self):
        '''
        Return the set of unique (SMILES, property value) tuples with superior
        property values, gathered over every results file
        '''
        inter_epoch_data = set()
        for f in self.results_files:
            inter_epoch_data |= self.resultsToList(join(self.parent_dir, f))
        return inter_epoch_data

    def prep_pre_fp_df(self):
        '''
        Prepare dataframe for fingerprinting and write it to smiles_df.csv.

        The dataframe is also stored on self.pre_fp_df.
        '''
        # Sort so that row order, the generated IDs and the rows selected by
        # `head` are reproducible (set iteration order is not).
        inter_epoch_data = sorted(self.collectUniqueResults())
        pre_fp_df = pd.DataFrame(inter_epoch_data, columns=['SMILES', 'Property Value']) #create df to use for fingerprinting
        if self.head is not None:
            pre_fp_df = pre_fp_df.head(self.head)
        pre_fp_df['ID'] = ['ID_%s' %ind for ind in pre_fp_df.index] #create ID for polymers
        if self.only_check_SMILES:
            pre_fp_df = pre_fp_df.drop_duplicates(subset=['SMILES']) #drop duplicate SMILES
        self.pre_fp_df = pre_fp_df
        self.pre_fp_df.to_csv('smiles_df.csv')

    def runFP(self):
        '''
        Run fingerprinting
        '''
        self.prep_pre_fp_df()
        self.create_fp_input()
        print("Starting to Fingerprint Valuable Polymers from Results Set")
        self._run_fp() #run fingerprinting
        print("Finished Fingerprinting Valuable Polymers from Results Set")

    def dropWithTolerance(self, df, cols_to_consider):
        '''
        Drop near-duplicate rows from df.

        Rows are visited in order. A row is kept unless an earlier kept row
        lies within self.tolerance of it (absolute difference) on every
        column of cols_to_consider. Returns the kept rows of df, with their
        original index labels.
        '''
        np_df = df[cols_to_consider].to_numpy()
        keep_rows = []
        duplicate_rows = set()  # set: O(1) membership test inside the loop
        for ind, row in enumerate(np_df):
            if ind in duplicate_rows:
                continue
            keep_rows.append(ind)
            diffs = np.abs(row[None, :] - np_df)
            matching_inds = np.nonzero((diffs <= self.tolerance).all(axis=1))[0]
            # A row normally matches itself; it must not be marked as a duplicate.
            duplicate_rows.update(int(i) for i in matching_inds if i != ind)
        return df.iloc[keep_rows, :]

    def dropWithToleranceFromReference(self, df, ref):
        '''
        Return the polymers of df that are not within tolerance of any row of ref.

        Both frames are compared on the columns in self.fp_intersect. A row of
        df is dropped when some row of ref differs from it by at most
        self.tolerance on every one of those columns.

        df must also contain 'SMILES' and 'Property Value' columns. The result
        has the columns 'SMILES' and 'Band gap' (the property value).
        '''
        cols = list(self.fp_intersect)  # fix one column order for both frames
        np_df = df[cols].to_numpy()
        np_reduced = ref[cols].to_numpy()

        is_novel = np.array(
            [not (np.abs(row[None, :] - np_reduced) <= self.tolerance).all(axis=1).any()
             for row in np_df],
            dtype=bool)

        new_polymers_df = pd.DataFrame({'SMILES': df['SMILES'].to_numpy()[is_novel],
                                        "Band gap": df['Property Value'].to_numpy()[is_novel]})
        print("%s Valid, Novel, Unique (in fingerprint space) Polymers have been generated" %len(new_polymers_df))
        return new_polymers_df

    def vuPolymers(self):
        '''
        Return all valuable, unique polymers in resultSet
        '''
        self.runFP()
        df = pd.read_csv(self.fp_df_path) #load df containing fingerprint for each SMILES
        df = df.dropna() #drop NA
        self.fp_cols = [col for col in df.keys() if col != 'ID' and 'Unnamed' not in col] #get columns which contain fingerprint
        df = self.dropWithTolerance(df, self.fp_cols)
        print("%s Valid, Unique (in fingerprint space) Polymers have been generated" %len(df))
        return df

    def vnuPolymers(self):
        '''
        Return all valuable, novel, unique polymers in resultSet
        '''
        df = self.vuPolymers()
        df = df.merge(self.pre_fp_df, on='ID') #bring SMILES and property value back in
        initial_df = pd.read_csv(self.init_data_path) #load initial dataset
        initial_fp_cols = [col for col in initial_df.keys() if col != 'id' and 'Unnamed' not in col and 'bandgap' not in col]

        # Only fingerprint columns present in both datasets can be compared.
        self.fp_intersect = set(self.fp_cols).intersection(initial_fp_cols)

        reduced_df = initial_df[list(self.fp_intersect) + ['smiles']]

        new_polymers_df = self.dropWithToleranceFromReference(df, reduced_df)
        return new_polymers_df

    def save_vnu_polymers(self):
        '''
        Run the whole pipeline and save the valuable, novel, unique polymers
        to new_polymers_<timestamp>.csv
        '''
        new_polymers_df = self.vnuPolymers()
        self.now = datetime.datetime.now().strftime("%I_%M%p_on_%B_%d_%Y")
        save_file = 'new_polymers_%s.csv' %self.now
        new_polymers_df.to_csv(save_file)

        print("New Polymers saved to %s" %save_file)

In [4]:
# Build a resultSet over one training run's results directory. Only results with
# a property value greater than 6 are kept; n_core=5 is forwarded to the fp
# command's input file. Note that constructing it changes the working directory
# to the results directory.
rs = resultSet('/home/rgur/g2g/improved/lr_0.001_bs_32_depthT_6_depthG_8/results/', 6, tolerance=.001, n_core=5)

In [5]:
# Run the full pipeline (fingerprint, de-duplicate, filter against the initial
# dataset) and write the remaining polymers to a timestamped CSV.
rs.save_vnu_polymers()

Starting to Fingerprint Valuable Polymers from Results Set
Finished Fingerprinting Valuable Polymers from Results Set
5053 Valid, Unique (in fingerprint space) Polymers have been generated
4975 Valid, Novel, Unique (in fingerprint space) Polymers have been generated
New Polymers saved to new_polymers_12_35PM_on_April_28_2020.csv


In [1]:
import numpy as np

In [2]:
# Scratch: a small 1-D array used to try out DataFrame construction.
a = np.array([1,2,3])

In [6]:
# Scratch: a list holding the same array twice.
b = [a, a]

In [7]:
import pandas as pd

In [11]:
# Each array in the list becomes one row; its three entries fill columns a, b, c.
pd.DataFrame(data=b, columns=['a', 'b', 'c'])

Unnamed: 0,a,b,c
0,1,2,3
1,1,2,3
