## Deep Learning Model: Chemprop 

In [41]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import chemprop
from sklearn.utils import shuffle
from chemprop.train.evaluate import evaluate_predictions


## Random Sampling 

In [43]:
def run_chemprop(batchsize):
    """Train Chemprop on the random-sampling train split and score the test split.

    The function drives Chemprop through its Python argument parsers:

    1. ``rand_train_dataset.csv`` (inside the hard-coded project directory)
       is used to fit a regression model for 5 epochs with ``batchsize`` as
       the training batch size; checkpoints go to the project directory.
    2. ``rand_test_dataset.csv`` is predicted with those checkpoints and the
       predictions are requested at ``rand_preds.csv``.
    3. The ``target`` column of the predictions file is compared against the
       ``target`` column of the test file using ``evaluate_predictions``.

    NOTE(review): the train/test/checkpoint paths are absolute, whereas the
    prediction file and the ground-truth file are read relative to the
    current working directory; the two only agree when the notebook runs
    from the project directory.

    Args:
        batchsize: Value forwarded to Chemprop's ``--batch_size`` option.

    Returns:
        A ``(rmse, r2)`` tuple holding the ``'rmse'`` and ``'r2'`` entries of
        the dict returned by ``evaluate_predictions`` (presumably one value
        per task -- confirm against the installed chemprop version).
    """
    project_dir = '/Users/sinhanushka_/Documents/GitHub/02-750-Final-Project/'
    predictions_file = 'rand_preds.csv'

    # --- training -----------------------------------------------------------
    fit_cli = [
        '--data_path', project_dir + 'rand_train_dataset.csv',
        '--dataset_type', 'regression',
        '--save_dir', project_dir,
        '--batch_size', str(batchsize),
        '--epochs', str(5),
    ]
    fit_args = chemprop.args.TrainArgs().parse_args(fit_cli)
    fit_args.quiet = True
    # The cross-validation scores are not used; only the saved checkpoints matter.
    _cv_mean, _cv_std = chemprop.train.cross_validate(
        args=fit_args,
        train_func=chemprop.train.run_training,
    )

    # --- prediction ---------------------------------------------------------
    predict_cli = [
        '--test_path', project_dir + 'rand_test_dataset.csv',
        '--preds_path', predictions_file,
        '--checkpoint_dir', project_dir,
    ]
    predict_args = chemprop.args.PredictArgs().parse_args(predict_cli)
    predict_args.quiet = True
    # The in-memory return value is ignored; predictions are re-read from disk.
    chemprop.train.make_predictions(args=predict_args)

    # --- scoring ------------------------------------------------------------
    predicted_frame = pd.read_csv(predictions_file)
    predicted = [[value] for value in predicted_frame["target"].values]
    truth_frame = pd.read_csv("rand_test_dataset.csv")
    actual = [[value] for value in truth_frame["target"].values]

    scores = evaluate_predictions(
        preds=predicted,
        targets=actual,
        num_tasks=len(predicted[0]),
        metrics=['rmse', 'r2'],
        dataset_type='test',
    )
    return scores['rmse'], scores['r2']

In [44]:
def random_sampling_chemprop(batchsize):
    """Run the random-acquisition baseline for three shuffling seeds.

    ``TYK2_final.csv`` is loaded, the ``target``, ``top_2p`` and ``top_5p``
    columns are dropped and the two remaining columns are renamed to
    ``smiles`` and ``target``.  For every seed in ``(1, 2, 3)`` the rows are
    shuffled, the first 10% become the training set and the rest the pool.
    The first ``batchsize`` pool rows are then moved into the training set
    repeatedly until the training set exceeds 20% of the data (rounded to
    the nearest multiple of 500).  Both splits are re-written to
    ``rand_train_dataset.csv`` / ``rand_test_dataset.csv`` after every move.

    ``run_chemprop`` is invoked whenever the number of moved rows is a
    multiple of 100 (including 0), so with a ``batchsize`` that does not
    divide 100 evaluations happen less often.

    Args:
        batchsize: Number of pool rows moved per step; also forwarded to
            ``run_chemprop``.

    Returns:
        ``(rmse_vals, r2_vals)``: two lists with one entry per seed, each
        entry being the list of values returned by ``run_chemprop`` at the
        evaluation points.
    """
    column_names = ['smiles', 'target']

    frame = pd.read_csv("TYK2_final.csv", index_col=False)
    frame = frame.drop(['target', 'top_2p', 'top_5p'], axis=1)
    frame.columns = column_names
    # Converted once; shuffling below returns a new array for every seed.
    records = np.array(frame)

    rmse_vals, r2_vals = [], []
    for seed in (1, 2, 3):
        shuffled = shuffle(records, random_state=seed)
        initial_size = int(len(shuffled) * 0.1)

        train_df = pd.DataFrame(shuffled[:initial_size], columns=column_names)
        test_df = pd.DataFrame(shuffled[initial_size:], columns=column_names)
        train_df.to_csv("rand_train_dataset.csv", index=False)
        test_df.to_csv("rand_test_dataset.csv", index=False)

        # Stop once the training set is larger than ~20% of the data.
        size_limit = round((len(shuffled) * 0.2) / 500) * 500

        rmse_curve, r2_curve = [], []
        moved_so_far = 0
        while len(train_df) <= size_limit:
            if moved_so_far % 100 == 0:
                print("running chemprop")
                rmse, r2 = run_chemprop(batchsize)
                rmse_curve.append(rmse)
                r2_curve.append(r2)

            # Random acquisition: the pool is already shuffled, so take its head.
            head = test_df.iloc[:batchsize]
            train_df = pd.concat([train_df, head])
            test_df = test_df.drop(head.index)

            # Keep the files Chemprop reads in sync with the in-memory splits.
            train_df.to_csv("rand_train_dataset.csv", index=False)
            test_df.to_csv("rand_test_dataset.csv", index=False)

            moved_so_far += batchsize

        rmse_vals.append(rmse_curve)
        r2_vals.append(r2_curve)
        print("finished seed")

    return rmse_vals, r2_vals


In [None]:
# Run the random-sampling baseline for each acquisition batch size, aggregate
# the RMSE / R^2 learning curves over the seeds, and export the per-step
# mean and standard deviation to "final_data_random_chemprop.csv".
batch_sizes = [1, 25, 50]
rand_rmse_dict = {}
rand_r2_dict = {}

final_rand_data_chemprop = pd.DataFrame()

for batch in batch_sizes:
    print(f"{batch} number")
    rand_rmse, rand_r2 = random_sampling_chemprop(batch)

    # axis=0 aggregates across seeds, leaving one value per evaluation step.
    rand_rmse_mean = np.mean(rand_rmse, axis=0)
    rand_rmse_stdev = np.std(rand_rmse, axis=0)
    print("mean:", rand_rmse_mean)
    print("standard deviation:", rand_rmse_stdev)
    rand_rmse_dict[batch] = (rand_rmse, rand_rmse_mean, rand_rmse_stdev)
    # The metric values may carry a trailing per-task axis (shape
    # (n_evals, 1)); a 2-D array cannot be assigned as a column of an empty
    # DataFrame, so flatten before storing.  np.ravel is a no-op on 1-D input.
    final_rand_data_chemprop[f"{batch}_rmse_mean"] = np.ravel(rand_rmse_mean)
    final_rand_data_chemprop[f"{batch}_rmse_stdev"] = np.ravel(rand_rmse_stdev)

    rand_r2_mean = np.mean(rand_r2, axis=0)
    rand_r2_stdev = np.std(rand_r2, axis=0)
    print("mean:", rand_r2_mean)
    print("standard deviation:", rand_r2_stdev)
    rand_r2_dict[batch] = (rand_r2, rand_r2_mean, rand_r2_stdev)
    final_rand_data_chemprop[f"{batch}_r2_mean"] = np.ravel(rand_r2_mean)
    final_rand_data_chemprop[f"{batch}_r2_stdev"] = np.ravel(rand_r2_stdev)


final_rand_data_chemprop.to_csv("final_data_random_chemprop.csv", index=False)

## Monte Carlo Dropout

In [None]:
def chemprop_monte_carlo(batchsize, eval):
    """Train Chemprop with the MVE loss and pick the most uncertain pool rows.

    A regression model is fitted on ``mc_train_dataset.csv`` (5 epochs,
    ``--loss_function mve``) and used to predict ``mc_test_dataset.csv`` with
    uncertainty output written to ``mc_preds.csv``.  The rows with the
    largest ``target_mve_uncal_var`` values are selected for acquisition.

    NOTE(review): prediction requests ``--uncertainty_method mve``, so the
    ``--uncertainty_dropout_p`` option is presumably not used by Chemprop
    here; confirm whether ``--uncertainty_method dropout`` was intended for
    a Monte Carlo dropout experiment.

    Args:
        batchsize: Forwarded to Chemprop's ``--batch_size`` and also the
            number of highest-uncertainty rows to select.
        eval: When truthy, additionally score the predictions against the
            ``target`` column of ``mc_test_dataset.csv``.  (The name shadows
            the ``eval`` builtin; it is kept so existing callers still work.)

    Returns:
        If ``eval`` is falsy: the index (row positions in ``mc_preds.csv``)
        of the ``batchsize`` rows with the largest uncertainty.
        If ``eval`` is truthy: a ``(index, rmse, r2)`` tuple where ``rmse``
        and ``r2`` are the corresponding entries of the dict returned by
        ``evaluate_predictions``.
    """
    train_arguments = [
        '--data_path', '/Users/sinhanushka_/Documents/GitHub/02-750-Final-Project/mc_train_dataset.csv',
        '--dataset_type', 'regression',
        '--save_dir', '/Users/sinhanushka_/Documents/GitHub/02-750-Final-Project/',
        '--batch_size', str(batchsize),
        '--loss_function', 'mve',
        '--epochs', str(5),
    ]

    train_args = chemprop.args.TrainArgs().parse_args(train_arguments)
    train_args.quiet = True
    # Only the checkpoints written to --save_dir are needed afterwards.
    chemprop.train.cross_validate(args=train_args, train_func=chemprop.train.run_training)

    test_arguments = [
        '--test_path', '/Users/sinhanushka_/Documents/GitHub/02-750-Final-Project/mc_test_dataset.csv',
        '--preds_path', 'mc_preds.csv',
        '--checkpoint_dir', '/Users/sinhanushka_/Documents/GitHub/02-750-Final-Project/',
        '--uncertainty_dropout_p', str(0.1),
        '--uncertainty_method', 'mve',
    ]

    test_args = chemprop.args.PredictArgs().parse_args(test_arguments)
    test_args.quiet = True
    # Predictions and uncertainties are read back from the CSV below, so the
    # in-memory return value is not kept.
    chemprop.train.make_predictions(args=test_args, return_uncertainty=True)

    preds = pd.read_csv("mc_preds.csv")
    uncertainty = preds['target_mve_uncal_var']
    # Row labels of the freshly read CSV are 0..n-1, i.e. positions in the pool.
    instance_to_add = uncertainty.nlargest(batchsize).index

    if not eval:
        return instance_to_add

    y_preds = [[target] for target in preds["target"].values]
    test = pd.read_csv("mc_test_dataset.csv")
    y_test = [[target] for target in test["target"].values]
    results = evaluate_predictions(
        preds=y_preds,
        targets=y_test,
        num_tasks=len(y_preds[0]),
        metrics=['rmse', 'r2'],
        dataset_type='test',
    )
    return instance_to_add, results['rmse'], results['r2']
  
    

In [None]:
def monte_carlo_dropout(batchsize):
    """Run the uncertainty-driven acquisition loop for three shuffling seeds.

    ``TYK2_final.csv`` is loaded, the ``target``, ``top_2p`` and ``top_5p``
    columns are dropped and the two remaining columns are renamed to
    ``smiles`` and ``target``.  For every seed in ``(1, 2, 3)`` the rows are
    shuffled, the first 10% become the training set and the rest the pool.
    On every step ``chemprop_monte_carlo`` retrains the model and returns the
    pool positions to acquire; those rows are moved from the pool into the
    training set and both splits are re-written to ``mc_train_dataset.csv``
    / ``mc_test_dataset.csv``.  The loop ends once the training set exceeds
    20% of the data (rounded to the nearest multiple of 500).

    Metrics are requested from ``chemprop_monte_carlo`` only when the number
    of acquired rows is a multiple of 100 (including 0).

    Args:
        batchsize: Number of rows acquired per step; forwarded to
            ``chemprop_monte_carlo``.

    Returns:
        ``(rmse_vals, r2_vals)``: two lists with one entry per seed, each
        entry being the list of metric values collected at the evaluation
        points.
    """
    column_names = ['smiles', 'target']

    frame = pd.read_csv("TYK2_final.csv", index_col=False)
    frame = frame.drop(['target', 'top_2p', 'top_5p'], axis=1)
    frame.columns = column_names
    # Converted once; shuffling below returns a new array for every seed.
    records = np.array(frame)

    rmse_vals, r2_vals = [], []
    for seed in (1, 2, 3):
        shuffled = shuffle(records, random_state=seed)
        initial_size = int(len(shuffled) * 0.1)

        train_df = pd.DataFrame(shuffled[:initial_size], columns=column_names)
        test_df = pd.DataFrame(shuffled[initial_size:], columns=column_names)
        train_df.to_csv("mc_train_dataset.csv", index=False)
        test_df.to_csv("mc_test_dataset.csv", index=False)

        # Stop once the training set is larger than ~20% of the data.
        size_limit = round((len(shuffled) * 0.2) / 500) * 500

        rmse_curve, r2_curve = [], []
        acquired = 0
        while len(train_df) <= size_limit:
            print("running chemprop")
            if acquired % 100 == 0:
                print("in the counter")
                picked, rmse, r2 = chemprop_monte_carlo(batchsize, True)
                rmse_curve.append(rmse)
                r2_curve.append(r2)
            else:
                picked = chemprop_monte_carlo(batchsize, False)

            # `picked` holds positions in the pool file, hence positional lookup.
            chosen_rows = test_df.iloc[picked]
            train_df = pd.concat([train_df, chosen_rows])
            test_df = test_df.drop(chosen_rows.index)

            # Keep the files Chemprop reads in sync with the in-memory splits.
            train_df.to_csv("mc_train_dataset.csv", index=False)
            test_df.to_csv("mc_test_dataset.csv", index=False)
            acquired += batchsize

        rmse_vals.append(rmse_curve)
        r2_vals.append(r2_curve)
        print("finished seed")

    return rmse_vals, r2_vals

In [None]:
# Run the uncertainty-driven acquisition experiment for each batch size,
# aggregate the RMSE / R^2 learning curves over the seeds, and export the
# per-step mean and standard deviation to "final_data_mc_chemprop.csv".
batch_sizes = [1, 25, 50]
mc_rmse_dict = {}
mc_r2_dict = {}

final_mc_data_chemprop = pd.DataFrame()

for batch in batch_sizes:
    print(f"{batch} number")
    mc_rmse, mc_r2 = monte_carlo_dropout(batch)

    # axis=0 aggregates across seeds, leaving one value per evaluation step.
    mc_rmse_mean = np.mean(mc_rmse, axis=0)
    mc_rmse_stdev = np.std(mc_rmse, axis=0)
    print("mean:", mc_rmse_mean)
    print("standard deviation:", mc_rmse_stdev)
    mc_rmse_dict[batch] = (mc_rmse, mc_rmse_mean, mc_rmse_stdev)
    # The metric values may carry a trailing per-task axis (shape
    # (n_evals, 1)); a 2-D array cannot be assigned as a column of an empty
    # DataFrame, so flatten before storing.  np.ravel is a no-op on 1-D input.
    final_mc_data_chemprop[f"{batch}_rmse_mean"] = np.ravel(mc_rmse_mean)
    final_mc_data_chemprop[f"{batch}_rmse_stdev"] = np.ravel(mc_rmse_stdev)

    mc_r2_mean = np.mean(mc_r2, axis=0)
    mc_r2_stdev = np.std(mc_r2, axis=0)
    print("mean:", mc_r2_mean)
    print("standard deviation:", mc_r2_stdev)
    mc_r2_dict[batch] = (mc_r2, mc_r2_mean, mc_r2_stdev)
    final_mc_data_chemprop[f"{batch}_r2_mean"] = np.ravel(mc_r2_mean)
    final_mc_data_chemprop[f"{batch}_r2_stdev"] = np.ravel(mc_r2_stdev)


final_mc_data_chemprop.to_csv("final_data_mc_chemprop.csv", index=False)