In [6]:
from models.run_models import *

import json

In [None]:
# Output folder for this analysis, and the DFT descriptor columns that are
# dropped from the predictions table before it is saved.
results_folder = '../../results/boltzmann_modeling'
dft_colnames = [
    'Mapped_Ketone',
    'Carbon1_Idx', 'Carbonyl_Idx', 'Carbon2_Idx',
    'MoleculeFeatures',
    'Carbon1_AtomFeatures', 'Carbonyl_AtomFeatures', 'Carbon2_AtomFeatures',
    'CC1_Sterimol', 'CC2_Sterimol',
]

# Original dataset (used for training) and the additional cyclic dataset (used as the held-out set).
# NOTE(review): `pd` and `converters` are not defined in this notebook; presumably they come
# from the star import of models.run_models -- confirm.
master_df = pd.read_csv(
    '../../dataset_files/transaminase_dataset_dft_descriptors_boltzmann_final.csv',
    converters=converters,
)
heldout_df = pd.read_csv(
    '../../dataset_files/additional_cyclic_dataset_dft_descriptors_boltzmann_final.csv',
    converters=converters,
)

# Sorted list of the distinct enzyme labels found in the original dataset.
enzymes = sorted(set(master_df['Enzyme'].to_list()))

### Train per-enzyme RF conversion models on original dataset, test on heldout

In [8]:
# Settings passed to Model_Trainer for the per-enzyme conversion models.
target_type = 'conversion'
model_type = 'rf'
split_type = 'heldout' # doesn't matter
task_type = 'bin'
feature_type = 'dft'

# One held-out frame per enzyme, each with a 'Predicted_conversion' column added.
results_dfs = []

for enzyme in enzymes:
    print(enzyme)
    # Rows of the original and held-out datasets that belong to this enzyme.
    enzyme_df_master = master_df[master_df['Enzyme'] == enzyme].reset_index(drop=True)
    enzyme_df_heldout = heldout_df[heldout_df['Enzyme'] == enzyme].reset_index(drop=True)
    # NOTE(review): presumably Model_Trainer fits on the first frame and predicts on the
    # second, exposing the predictions as `y_pred` -- confirm in models.run_models.
    model = Model_Trainer(model_type, split_type, task_type, feature_type, target_type, enzyme_df_master, enzyme_df_heldout)
    model.train_test_model()
    enzyme_df_heldout['Predicted_conversion'] = list(model.y_pred)
    results_dfs.append(enzyme_df_heldout)

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it every
# per-enzyme frame contributes its own 0..k-1 labels, leaving duplicate index labels
# that break any later label-aligned operation on the result.
res_df_conv = pd.concat(results_dfs, ignore_index=True)

Enz1
Enz2
Enz3
Enz4
Enz6
Enz7
Enz8
Enz9


### Train per-enzyme MVLR models on original dataset, test on heldout

In [10]:
# Keep only the reactions that have an experimental ee measurement.
# NOTE: this rebinds master_df, so any cell executed after this one sees the filtered frame.
master_df = master_df[master_df['ee'].notna()].reset_index(drop=True)
master_df.shape

(233, 29)

In [11]:
# Load the selected features for the ddg models from the results folder.
# The encoding is given explicitly: JSON files are UTF-8, while open() would otherwise
# fall back to the platform's locale encoding.
with open(f'{results_folder}/selected_features_ddg.json', 'r', encoding='utf-8') as file:
    selected_features_ddg = json.load(file)

In [12]:
# Settings passed to Model_Trainer for the per-enzyme linear regression models.
model_type = 'lin'
split_type = 'heldout' # doesn't matter
task_type = 'reg'
target_types = ['ddg']

# One held-out frame per enzyme, each with a 'Predicted_<target>' column per target.
results_dfs = []

for enzyme in enzymes:
    print(enzyme)
    # Rows of the (ee-filtered) original dataset and of the held-out dataset for this enzyme.
    enzyme_df_master = master_df[master_df['Enzyme'] == enzyme].reset_index(drop=True)
    enzyme_df_heldout = heldout_df[heldout_df['Enzyme'] == enzyme].reset_index(drop=True)

    for target_type in target_types:
        # NOTE: the feature set is always the ddg-selected one for this enzyme,
        # whatever target_type is; revisit if target_types is ever extended.
        feature_type = selected_features_ddg[enzyme]

        model = Model_Trainer(model_type, split_type, task_type, feature_type, target_type, enzyme_df_master, enzyme_df_heldout)
        model.train_test_model()
        enzyme_df_heldout[f'Predicted_{target_type}'] = list(model.y_pred)
    results_dfs.append(enzyme_df_heldout)

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it every
# per-enzyme frame contributes its own 0..k-1 labels, leaving duplicate index labels
# that break any later label-aligned operation on the result.
res_df_sel = pd.concat(results_dfs, ignore_index=True)

Enz1
Enz2
Enz3
Enz4
Enz6
Enz7
Enz8
Enz9


### Combine results and save

In [None]:
# Merge the RF conversion predictions with the MVLR ddg predictions and save them.
# Both result frames must list the same reactions in the same order, because the
# predicted column is copied across as a plain list (by position, not by index label).
assert all(
    res_df_conv[colname].to_list() == res_df_sel[colname].to_list()
    for colname in ['Enzyme', 'Ketone_Smiles', 'conversion']
)

for target_type in ['ddg']:
    predicted_col = f'Predicted_{target_type}'
    res_df_conv[predicted_col] = res_df_sel[predicted_col].to_list()

# The DFT descriptor columns are left out of the saved predictions file.
output_path = f'{results_folder}/cyclic_heldout_predictions.csv'
res_df_conv = res_df_conv.drop(columns=dft_colnames)
res_df_conv.to_csv(output_path, index=False)