In [11]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from rdkit import Chem
import seaborn as sns

import tmap, os

from chembench import dataset

from sklearn.model_selection import KFold

import matplotlib.pyplot as plt
%matplotlib inline
# Enable tqdm's pandas integration, rendering progress bars with ASCII characters only.
tqdm.pandas(ascii=True)
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(123)



In [12]:
# Fold count for the K-fold splitting done in this notebook; the number of
# splits passed to KFold in the split-generation cell should match this value.
n_fold = 5

In [13]:
# Load each benchmark dataset through chembench's `dataset` loaders.
# The recorded cell output shows one "total samples: N" line per load,
# in the same order as the calls below.
esol = dataset.load_ESOL()
lipop = dataset.load_Lipop()
malaria = dataset.load_Malaria()
pdbf = dataset.load_PDBF()
hiv = dataset.load_HIV()
bace = dataset.load_BACE()
bbbp = dataset.load_BBBP()

total samples: 1128
total samples: 4200
total samples: 9999
total samples: 9880
total samples: 41127
total samples: 1513
total samples: 2039


In [14]:
# One shuffled K-fold split is generated per seed; each seed becomes one
# column ('rd_<seed>') in the per-dataset output CSV.
random_seeds = [2, 32, 128, 512, 1024]
cols = ['rd_%s' % i for i in random_seeds]

data_save_folder = './rand_split_results/'
# exist_ok=True replaces the exists()/makedirs() pair: no check-then-create
# race, and the cell can be re-run safely.
os.makedirs(data_save_folder, exist_ok=True)


def _assign_folds(n_samples, n_splits, seed):
    """Assign every sample position a 1-based fold id from a shuffled K-fold split.

    Parameters
    ----------
    n_samples : int
        Number of samples to partition.
    n_splits : int
        Number of folds.
    seed : int
        ``random_state`` passed to ``KFold`` so the split is reproducible.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``n_samples``; entry ``k`` is the number
        (1..n_splits) of the fold in which sample ``k`` is in the test set.
    """
    fold_ids = np.zeros(n_samples, dtype=np.int64)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    # Each position lands in exactly one test fold, so every entry of
    # fold_ids is overwritten exactly once.
    for fold, (_, test_idx) in enumerate(kf.split(np.arange(n_samples)), start=1):
        fold_ids[test_idx] = fold
    return fold_ids


for data in [esol, lipop, malaria, hiv, bace, bbbp, pdbf]:

    task_name = data.task_name
    # Only the row count of this frame is needed for splitting; the frame is
    # still built so `df` stays available in the notebook namespace as before.
    df = pd.DataFrame(data.x, columns=['smiles'])
    df[task_name] = data.y

    # Use the notebook-level `n_fold` setting (defined in an earlier cell)
    # instead of a hard-coded fold count, so the two cannot drift apart.
    fold_columns = [_assign_folds(len(df), n_fold, seed) for seed in random_seeds]

    # Rows are sample positions 0..n-1 (written as the CSV index column);
    # columns are the per-seed fold ids.
    dfc = pd.DataFrame(np.column_stack(fold_columns), columns=cols)
    dfc.to_csv(os.path.join(data_save_folder, 'rd_split_%s.csv' % task_name))