In [1]:
# Imports
import os
import tempfile
import shutil
import abc
import pandas as pd
import numpy as np
import math
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn import metrics
from scipy import stats as stats
from sklearn.model_selection import KFold
# import chemprop
from sklearn.ensemble import RandomForestRegressor as RF
#from lightgbm import LGBMRegressor as lgb
from models import ACANetOPTPoC, ACANetOPT 

def _select_rows(data, positions):
    """Return the entries of ``data`` at the given integer positions.

    pandas objects are sliced with ``.iloc`` so the selection is positional
    whatever the index labels are; anything else is converted with
    ``np.asarray`` and fancy-indexed.
    """
    if hasattr(data, "iloc"):
        return data.iloc[positions]
    return np.asarray(data)[positions]


def cross_validation(x, y, prop, model, k=10, seed=1):
    """Run shuffled k-fold cross-validation from in-memory inputs and targets.

    For every fold the model is fitted on the training rows and asked to
    predict on the held-out rows. The true delta values of a fold are all
    ordered pairwise differences of the held-out targets (second member minus
    first member), laid out exactly like a cross merge of the held-out targets
    with themselves.

    Every fold is evaluated for every seed. The previous version advanced its
    fold counter only when ``seed == 1`` and skipped all folds after the first
    in that case, so that seed silently produced single-fold results.

    Parameters
    ----------
    x : pandas.Series or array-like
        Model inputs, one entry per sample.
    y : pandas.Series or array-like
        Numeric targets aligned with ``x``. The Series name is irrelevant.
    prop : str
        Property name. Currently unused (the per-fold CSV export that used it
        is disabled); kept so existing callers keep working.
    model : object
        Estimator exposing ``fit(x, y)`` and ``predict(x)``.
    k : int, default 10
        Number of folds passed to ``KFold``.
    seed : int, default 1
        ``random_state`` passed to ``KFold``.

    Returns
    -------
    list
        ``[vals, preds]``: ``vals`` holds the true pairwise deltas and
        ``preds`` the values returned by ``model.predict``, each concatenated
        over the folds in fold order.
        NOTE(review): comparing the two element-wise presumably requires
        ``model.predict`` to return one value per ordered pair in the same
        order -- confirm against the model implementation.
    """
    kf = KFold(n_splits=k, random_state=seed, shuffle=True)
    preds = []
    vals = []

    for train, test in kf.split(x):
        # Fit on the training rows, then predict on the held-out rows.
        model.fit(_select_rows(x, train), _select_rows(y, train))
        preds = np.append(preds, model.predict(_select_rows(x, test)))

        # pairwise[i, j] = y_test[j] - y_test[i]; row-major flattening matches
        # the row order of pd.merge(..., how='cross') and its Y_y - Y_x value.
        y_test = np.asarray(_select_rows(y, test), dtype=float).ravel()
        pairwise = y_test[np.newaxis, :] - y_test[:, np.newaxis]
        vals = np.append(vals, pairwise.ravel())

    return [vals, preds]  # True delta values and predicted values


def cross_validation_file(data_path, prop, model, k=10, seed=1):
    """Cross-validate a model on data read from a CSV file.

    The file is loaded with ``pd.read_csv``; its first column is used as the
    model inputs and its second column as the targets. Any further columns
    are ignored. All remaining arguments are forwarded unchanged to
    ``cross_validation``, whose return value is returned as-is.
    """
    table = pd.read_csv(data_path)
    input_column = table.columns[0]
    target_column = table.columns[1]
    return cross_validation(
        table[input_column], table[target_column], prop, model, k, seed
    )


In [None]:

###################
####  5x10 CV  ####
###################
# Driver cell: for each property and each model, call cross_validation_file
# on the property's benchmark CSV. The repetition loop is written for five
# seeds, but only the first one (i == 0) is actually executed.

properties = ['HalfLife']
for prop in properties:
    # Per-property directory handed to the model constructor
    dirpath = os.path.join('./plot/acanetopt_poc', prop)

    models = [ACANetOPTPoC(dirpath=dirpath, gpuid=0)]
    for model in models:
        # Table meant to store results; it is not filled anywhere in this cell
        delta = pd.DataFrame(columns=['Pearson\'s r', 'MAE', 'RMSE'])
        for i in range(5):  # One pass per repetition of the 10-fold CV
            if i != 0:
                continue  # Repetitions after the first are skipped
            dataset = '../Datasets/Benchmarks/{}.csv'.format(prop)  # Training dataset
            results = cross_validation_file(
                data_path=dataset,
                prop=prop,
                model=model,
                k=10,
                seed=i,
            )

epoch: 100%|##########| 50/50 [00:15<00:00,  3.21it/s]
epoch: 100%|##########| 50/50 [00:15<00:00,  3.32it/s]
epoch: 100%|##########| 50/50 [00:15<00:00,  3.31it/s]
epoch: 100%|##########| 50/50 [00:15<00:00,  3.19it/s]
epoch: 100%|##########| 50/50 [00:14<00:00,  3.37it/s]
epoch: 100%|##########| 50/50 [00:14<00:00,  3.35it/s]
epoch: 100%|##########| 50/50 [00:15<00:00,  3.29it/s]
epoch: 100%|##########| 50/50 [00:15<00:00,  3.30it/s]
epoch: 100%|##########| 50/50 [00:14<00:00,  3.39it/s]
epoch: 100%|##########| 50/50 [00:15<00:00,  3.22it/s]
epoch: 100%|##########| 50/50 [00:16<00:00,  3.08it/s]
epoch: 100%|##########| 50/50 [00:15<00:00,  3.31it/s]
epoch: 100%|##########| 50/50 [00:15<00:00,  3.33it/s]
epoch: 100%|##########| 50/50 [00:14<00:00,  3.36it/s]
epoch: 100%|##########| 50/50 [00:14<00:00,  3.36it/s]
epoch: 100%|##########| 50/50 [00:15<00:00,  3.13it/s]
epoch: 100%|##########| 50/50 [00:14<00:00,  3.38it/s]
epoch: 100%|##########| 50/50 [00:15<00:00,  3.17it/s]
epoch: 100

Best cliff_lower and cliff_upper parameter is: 0.1 and 0.5, respectively.


epoch: 100%|##########| 50/50 [00:14<00:00,  3.41it/s]
epoch: 100%|##########| 50/50 [00:15<00:00,  3.23it/s]
epoch: 100%|##########| 50/50 [00:15<00:00,  3.25it/s]
epoch: 100%|##########| 50/50 [00:17<00:00,  2.93it/s]
epoch: 100%|##########| 50/50 [00:15<00:00,  3.24it/s]
epoch: 100%|##########| 50/50 [00:15<00:00,  3.30it/s]
epoch: 100%|##########| 50/50 [00:15<00:00,  3.31it/s]
epoch: 100%|##########| 50/50 [00:15<00:00,  3.14it/s]
epoch: 100%|##########| 50/50 [00:15<00:00,  3.14it/s]
epoch: 100%|##########| 50/50 [00:14<00:00,  3.36it/s]
epoch: 100%|##########| 50/50 [00:15<00:00,  3.20it/s]
epoch: 100%|##########| 50/50 [00:15<00:00,  3.20it/s]
epoch: 100%|##########| 50/50 [00:16<00:00,  3.12it/s]
epoch: 100%|##########| 50/50 [00:15<00:00,  3.20it/s]
epoch: 100%|##########| 50/50 [00:15<00:00,  3.15it/s]
epoch: 100%|##########| 50/50 [00:16<00:00,  2.96it/s]
epoch: 100%|##########| 50/50 [00:14<00:00,  3.34it/s]
epoch: 100%|##########| 50/50 [00:14<00:00,  3.36it/s]
epoch: 100

Best cliff-awareness factor alpha by cross-validation is: 0.01




Epoch: 001, Loss: 0.7817 Test: 0.9236
Epoch: 002, Loss: 0.6830 Test: 0.9200
Epoch: 003, Loss: 0.6569 Test: 0.9023
Epoch: 004, Loss: 0.6410 Test: 0.8691
Epoch: 005, Loss: 0.6217 Test: 0.8110
Epoch: 006, Loss: 0.6050 Test: 0.7094
Epoch: 007, Loss: 0.5987 Test: 0.6195
Epoch: 008, Loss: 0.5830 Test: 0.5977
Epoch: 009, Loss: 0.5784 Test: 0.5728
Epoch: 010, Loss: 0.5688 Test: 0.5665
Epoch: 011, Loss: 0.5411 Test: 0.5742
Epoch: 012, Loss: 0.5239 Test: 0.5715
Epoch: 013, Loss: 0.5268 Test: 0.5586
Epoch: 014, Loss: 0.5111 Test: 0.5905
Epoch: 015, Loss: 0.5051 Test: 0.5647
Epoch: 016, Loss: 0.4979 Test: 0.5661
Epoch: 017, Loss: 0.4911 Test: 0.5812
Epoch: 018, Loss: 0.4836 Test: 0.5601
Epoch: 019, Loss: 0.4720 Test: 0.5617
Epoch: 020, Loss: 0.4752 Test: 0.5773
Epoch: 021, Loss: 0.4643 Test: 0.5634
Epoch: 022, Loss: 0.4650 Test: 0.5686
Epoch: 023, Loss: 0.4487 Test: 0.5857
Epoch: 024, Loss: 0.4550 Test: 0.5807
Epoch: 025, Loss: 0.4407 Test: 0.5581
Epoch: 026, Loss: 0.4311 Test: 0.5569
Epoch: 027, 