In [23]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import chemprop
from sklearn.utils import shuffle
from chemprop.train.evaluate import evaluate_predictions

In [5]:
# Load the raw TYK2 table, drop the `target`, `top_2p` and `top_5p` columns,
# and relabel the columns that remain as `smiles` and `target`.
column_names = ['smiles', 'target']
data = pd.read_csv("TYK2_final.csv", index_col=False).drop(
    columns=['target', 'top_2p', 'top_5p']
)
data.columns = column_names

# Shuffle the rows with a fixed seed, then keep the first 10% for training
# and everything after that for testing.
data = shuffle(np.array(data), random_state=1)
size = int(0.1 * len(data))
train = data[:size]
test = data[size:]

# Write both partitions to CSV (without the index column); the chemprop
# cells below read these files.
train_df = pd.DataFrame(train, columns=column_names)
test_df = pd.DataFrame(test, columns=column_names)
train_df.to_csv("train_dataset.csv", index=False)
test_df.to_csv("test_dataset.csv", index=False)


In [16]:
def run_chemprop(batchsize, train_path="train_dataset.csv", test_path="test_dataset.csv",
                 save_dir=".", preds_path="preds.csv"):
    """Train a chemprop regression model and score it on the held-out file.

    The function runs ``chemprop.train.cross_validate`` on ``train_path``,
    predicts every row of ``test_path`` with the checkpoint(s) found under
    ``save_dir``, writes the predictions to ``preds_path`` and finally scores
    them against the ``target`` column of ``test_path``.

    All paths default to files in the current working directory, which is
    where the rest of this notebook writes ``train_dataset.csv`` and
    ``test_dataset.csv``. Previously the training/prediction inputs were
    machine-specific absolute paths while the targets used for scoring were
    read relative to the working directory, so the two could silently refer
    to different files.

    Parameters
    ----------
    batchsize : int
        Value forwarded to chemprop's ``--batch_size`` training option.
    train_path : str, optional
        CSV used for training (``--data_path``).
    test_path : str, optional
        CSV that is predicted (``--test_path``) and whose ``target`` column
        provides the ground truth for scoring.
    save_dir : str, optional
        Directory passed as ``--save_dir`` for training and as
        ``--checkpoint_dir`` for prediction.
    preds_path : str, optional
        CSV the predictions are written to and read back from.

    Returns
    -------
    tuple
        ``(results['rmse'], results['r2'])`` exactly as returned by
        ``evaluate_predictions`` (one value per task; this setup has a
        single task).

    Raises
    ------
    ValueError
        If the prediction file and the test file do not have the same number
        of rows, in which case a row-by-row comparison would be meaningless.
    """
    train_arguments = [
        '--data_path', str(train_path),
        '--dataset_type', 'regression',
        '--save_dir', str(save_dir),
        '--batch_size', str(batchsize),
    ]
    train_args = chemprop.args.TrainArgs().parse_args(train_arguments)
    # Only the side effect (checkpoints written under save_dir) is needed
    # here; the cross-validation scores themselves are not used.
    chemprop.train.cross_validate(args=train_args, train_func=chemprop.train.run_training)

    test_arguments = [
        '--test_path', str(test_path),
        '--preds_path', str(preds_path),
        '--checkpoint_dir', str(save_dir),
    ]
    test_args = chemprop.args.PredictArgs().parse_args(test_arguments)
    # The predictions are read back from preds_path below, so the in-memory
    # return value is intentionally not kept.
    chemprop.train.make_predictions(args=test_args)

    preds = pd.read_csv(preds_path)
    test = pd.read_csv(test_path)
    if len(preds) != len(test):
        raise ValueError(
            f"{preds_path} has {len(preds)} rows but {test_path} has {len(test)} rows"
        )

    # evaluate_predictions is given one [value] entry per row (single task).
    y_preds = [[value] for value in preds["target"].values]
    y_test = [[value] for value in test["target"].values]
    results = evaluate_predictions(
        preds=y_preds,
        targets=y_test,
        num_tasks=1,
        metrics=['rmse', 'r2'],
        dataset_type='test',
    )

    return results['rmse'], results['r2']

Command line
python /Users/sinhanushka_/opt/anaconda3/envs/chemprop/lib/python3.8/site-packages/ipykernel_launcher.py -f /Users/sinhanushka_/Library/Jupyter/runtime/kernel-737c4653-547e-4f1f-b17b-4a9f04f83bb4.json
Args
{'activation': 'ReLU',
 'adding_bond_types': True,
 'adding_h': False,
 'aggregation': 'mean',
 'aggregation_norm': 100,
 'atom_constraints': [],
 'atom_descriptor_scaling': True,
 'atom_descriptors': None,
 'atom_descriptors_path': None,
 'atom_descriptors_size': 0,
 'atom_features_size': 0,
 'atom_messages': False,
 'atom_targets': [],
 'batch_size': 25,
 'bias': False,
 'bias_solvent': False,
 'bond_constraints': [],
 'bond_descriptor_scaling': True,
 'bond_descriptors': None,
 'bond_descriptors_path': None,
 'bond_descriptors_size': 0,
 'bond_features_size': 0,
 'bond_targets': [],
 'cache_cutoff': 10000,
 'checkpoint_dir': None,
 'checkpoint_frzn': None,
 'checkpoint_path': None,
 'checkpoint_paths': None,
 'class_balance': False,
 'config_path': None,
 'constraints

In [17]:
# Prediction-only run: score the test CSV with the checkpoint(s) stored under
# the project directory and write the result to preds.csv.
test_arguments = [
    token
    for option in (
        ('--test_path', '/Users/sinhanushka_/Documents/GitHub/02-750-Final-Project/test_dataset.csv'),
        ('--preds_path', 'preds.csv'),
        ('--checkpoint_dir', '/Users/sinhanushka_/Documents/GitHub/02-750-Final-Project/'),
        ('--batch_size', '25'),
    )
    for token in option
]

test_args = chemprop.args.PredictArgs().parse_args(test_arguments)
preds = chemprop.train.make_predictions(args=test_args)

Loading training args
Setting molecule featurization parameters to default.
Loading data


8998it [00:00, 375040.72it/s]
100%|██████████| 8998/8998 [00:00<00:00, 248180.73it/s]

Validating SMILES





Test size = 8,998


  0%|          | 0/1 [00:00<?, ?it/s]

Loading pretrained parameter "encoder.encoder.0.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.0.W_i.weight".
Loading pretrained parameter "encoder.encoder.0.W_h.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.bias".
Loading pretrained parameter "readout.1.weight".
Loading pretrained parameter "readout.1.bias".
Loading pretrained parameter "readout.4.weight".
Loading pretrained parameter "readout.4.bias".


100%|██████████| 1/1 [01:10<00:00, 70.05s/it]

Saving predictions to preds.csv
Elapsed time = 0:01:22





In [40]:
# Score the saved predictions against the held-out targets.
preds = pd.read_csv("preds.csv")
test = pd.read_csv("test_dataset.csv")

# Wrap every value in its own list: one [value] entry per CSV row.
y_preds = [[value] for value in preds["target"].to_numpy()]
y_test = [[value] for value in test["target"].to_numpy()]

dataset_type = 'test'
num_tasks = len(y_preds[0])
results = evaluate_predictions(
    preds=y_preds,
    targets=y_test,
    num_tasks=num_tasks,
    metrics=['rmse', 'r2'],
    dataset_type=dataset_type,
)
print(results)

{'rmse': [0.760796544846458], 'r2': [0.6866690741579313]}


In [None]:
# Run for three different batches
    # Run for three different seeds 
        # A while loop that runs until we have 20% 
            # train_args 
                # get the metrics 
            # pred args
                # predictions and evaluation 
            # randomly select 10 samples 
            # remove them from the test
            # update train_args 
            # update pred_args
            # return 
def random_sampling_chemprop(batchsize):
    """Random-acquisition baseline: grow the training set and re-score chemprop.

    For each of three shuffle seeds the TYK2 table is shuffled, the first 10%
    of the rows become the training set and the rest the pool/test set. Then,
    until the training set exceeds 20% of the data (rounded to the nearest
    multiple of 500), the next ``batchsize`` rows of the shuffled pool are
    moved into the training set. Because the rows are already shuffled,
    taking them in order is a random draw. ``run_chemprop`` is called at the
    start and again whenever the number of added samples is a multiple of 10.

    ``train_dataset.csv`` and ``test_dataset.csv`` in the working directory
    are rewritten after every move, since ``run_chemprop`` reads its data from
    those files.

    Parameters
    ----------
    batchsize : int
        Number of pool rows moved into the training set per step. The same
        value is forwarded to ``run_chemprop``.

    Returns
    -------
    tuple of (list, list)
        ``(rmse_vals, r2_vals)``; each holds one list per seed containing the
        values returned by ``run_chemprop`` at every evaluation.

    Raises
    ------
    ValueError
        If ``batchsize`` is smaller than 1 (the training set could never grow).
    """
    if batchsize < 1:
        raise ValueError("batchsize must be a positive integer")

    def write_split(frame, n_train_rows):
        # Rows before the split point are the training set, the rest the pool.
        frame.iloc[:n_train_rows].to_csv("train_dataset.csv", index=False)
        frame.iloc[n_train_rows:].to_csv("test_dataset.csv", index=False)

    seeds = [1, 2, 3]
    column_names = ['smiles', 'target']

    rmse_vals = []
    r2_vals = []

    data = pd.read_csv("TYK2_final.csv", index_col=False)
    data = data.drop(['target', 'top_2p', 'top_5p'], axis=1)
    data.columns = column_names
    # Convert once, outside the seed loop; every seed shuffles the same array.
    data = np.array(data)

    for seed in seeds:
        rmse_instance = []
        r2_instance = []

        shuffled_data = shuffle(data, random_state=seed)
        shuffled_df = pd.DataFrame(shuffled_data, columns=column_names)
        n_total = len(shuffled_df)
        n_train = int(n_total * 0.1)
        # Stop once the training set is larger than ~20% of the data,
        # rounded to the nearest multiple of 500.
        target_size = round((n_total * 0.2) / 500) * 500

        write_split(shuffled_df, n_train)

        counter = 0
        # The second condition prevents an endless loop if the pool runs dry.
        while n_train <= target_size and n_train < n_total:
            if counter % 10 == 0:
                rmse, r2 = run_chemprop(batchsize)
                rmse_instance.append(rmse)
                r2_instance.append(r2)

            # Moving the first `batchsize` pool rows into the training set is
            # the same as advancing the split point over the shuffled frame.
            n_train = min(n_train + batchsize, n_total)
            write_split(shuffled_df, n_train)

            counter += batchsize

        rmse_vals.append(rmse_instance)
        r2_vals.append(r2_instance)

    return rmse_vals, r2_vals


In [None]:
# Run the random-sampling baseline for every batch size and collect, per
# evaluation step, the mean and standard deviation across seeds.
batch_sizes = [1]
rand_rmse_dict = {}
rand_r2_dict = {}

# Column name -> 1-D Series. The summary frame is built from Series so that
# batch sizes with a different number of evaluations can share one table
# (shorter columns are padded with NaN).
summary_columns = {}

for batch in batch_sizes:

    rand_rmse, rand_r2 = random_sampling_chemprop(batch)

    # Each score is a one-element list, so the per-step statistics come out
    # with a trailing axis of length 1; np.ravel flattens them into a plain
    # column before they go into the summary table.
    rand_rmse_mean = np.mean(rand_rmse, axis=0)
    rand_rmse_stdev = np.std(rand_rmse, axis=0)
    print("mean:", rand_rmse_mean)
    print("standard deviation:", rand_rmse_stdev)
    rand_rmse_dict[batch] = (rand_rmse, rand_rmse_mean, rand_rmse_stdev)
    summary_columns[f"{batch}_rmse_mean"] = pd.Series(np.ravel(rand_rmse_mean))
    summary_columns[f"{batch}_rmse_stdev"] = pd.Series(np.ravel(rand_rmse_stdev))

    rand_r2_mean = np.mean(rand_r2, axis=0)
    rand_r2_stdev = np.std(rand_r2, axis=0)
    print("mean:", rand_r2_mean)
    print("standard deviation:", rand_r2_stdev)
    rand_r2_dict[batch] = (rand_r2, rand_r2_mean, rand_r2_stdev)
    summary_columns[f"{batch}_r2_mean"] = pd.Series(np.ravel(rand_r2_mean))
    summary_columns[f"{batch}_r2_stdev"] = pd.Series(np.ravel(rand_r2_stdev))


final_rand_data_chemprop = pd.DataFrame(summary_columns)
final_rand_data_chemprop.to_csv("final_data_random_chemprop.csv")

In [None]:



# Run Monte Carlo dropout
# Choose the sample with the highest dropout
# Add it to the training set
# Remove it from the overall dataset