In [1]:
from model_utils import *
from morfeus_utils import *

os.remove("crest_conformers.xyz")
os.remove("cre_members")
os.remove("crest.energies")

In [2]:
# Load all computational data
ensemble = load_ensemble("47_50")
energies = gen_crest(ensemble)
boltz_weights = get_boltz_weights(".")
conf_track_dict = {"47_50": []}
conf_track_dict, energy_files, rmsd_files = prune(ensemble, energies, "47_50", conf_track_dict)

	Energies more than 3.0 kcal/mol away from lowest energy conformer will be removed ...
	Pruning RMSD, structures with pair-wise RMSD less than 1.5 A will be considered geometrically degenerate
	Pruning RMSD, structures with pair-wise RMSD less than 2.0 A will be considered geometrically degenerate
	Pruning RMSD, structures with pair-wise RMSD less than 2.5 A will be considered geometrically degenerate


In [3]:
# initialize the conformer tracker
monomers = np.arange(50)+1
combs = []
for comb in itertools.combinations(monomers, 2):
    combs.append(list(comb))
conf_tracker = dict(("_".join(map(str, el)), {"exp_file": "", "comp_file": []}) for el in combs)

# identify matches with experimental data
matches, conf_tracker = identify_matches(conf_tracker=conf_tracker)
avg_confs = np.mean(np.array([len(conf_tracker[key]) for key in list(conf_tracker.keys())]))
print(f"on average each complex has {round(avg_confs, 3)} conformers")

264 matching comp and exp spectra
on average each complex has 2.0 conformers


In [6]:
master = compare(conf_tracker, boltz_weight=True)

	Working on 1_2...
	Working on 1_3...
		Boltzmann weighting the spectral data spectrum ...
			weights = 0.92, 0.08
	Working on 1_4...
	Working on 1_5...
	Working on 1_6...
	Working on 1_7...
		Boltzmann weighting the spectral data spectrum ...
			weights = 0.96, 0.03, 0.01
	Working on 1_8...
	Working on 1_9...
		Boltzmann weighting the spectral data spectrum ...
			weights = 0.8, 0.2
	Working on 1_10...
	Working on 1_11...
	Working on 1_12...
	Working on 1_13...
		Boltzmann weighting the spectral data spectrum ...
			weights = 0.62, 0.38
	Working on 1_14...
	Working on 1_15...
	Working on 1_16...
	Working on 1_17...
	Working on 1_18...
	Working on 1_19...
	Working on 1_20...
		Boltzmann weighting the spectral data spectrum ...
			weights = 0.9, 0.1
	Working on 1_21...
		Boltzmann weighting the spectral data spectrum ...
			weights = 0.51, 0.49
	Working on 1_22...
	Working on 1_23...
		Boltzmann weighting the spectral data spectrum ...
			weights = 1.0
	Working on 1_24...
	Working on 1_

In [8]:
master.shape

(1168, 7)

In [7]:
master.to_pickle("complexes_1168_boltz_weighted.pkl")

In [None]:
##### PREDICTING DIMER COMPUTATIONAL SPECTRUM FROM MONOMER SPECTRA AND FINGERPRINTS #####

# First load all complexes
master = pd.read_pickle("complexes_1168.pkl")
samples = master

# Load monomer data
data = load_monomer("comp_spectral_data/excited_state_monomer_spectra_csvs/", n_pts=32)

# Split into train and test sets 
training, testing = split_data(samples, test_size=0.25)

# Generate features from monomer spectra and fingerprints for both train and testing sets
training = gen_features(training, "comp_spectral_data/excited_state_monomer_spectra_csvs/", 
                        data, monomer_features=True, add_fps=True, fp_bits=128)
testing = gen_features(testing, "comp_spectral_data/excited_state_monomer_spectra_csvs/",
                       data, monomer_features=True, add_fps=True, fp_bits=128)

feature_size = len(training["training_features"].tolist()[0])
training, testing = shrink_spectrum(training, cut=10), shrink_spectrum(testing, cut=10)
target_size = len(training["comp_y"].tolist()[0])

print(f"training examples: {training.shape[0]}\ntesting examples: {testing.shape[0]}\nfeature size: {feature_size}\ntarget size: {target_size}")

In [None]:
# train a multitask GP to predict COMPUTATIONAL DIMER SPECTRUM from features
comp_model, comp_likelihood = train_multitask_GP(training, target="comp_y", n_iterations=250)
os.system("rm -rf after_training/")

# Run prediction and replace COMP spectra with prediction from MODEL
mean_train_preds, training = predict(comp_model, comp_likelihood, training, target="comp_y", train=True, interval=100)
mean_test_preds, testing = predict(comp_model, comp_likelihood, testing, target="comp_y", interval=50)

training["comp_y"] = mean_train_preds
testing["comp_y"] = mean_test_preds

In [None]:
# Rejoin training and testing DataFrames 
exp_samples = pd.concat([training, testing]).dropna()

# Split into train and test sets 
training, testing = split_data(exp_samples, test_size=0.25)

# Generate features from dimer spectra, monomer spectra, and fingerprints for both train and testing sets
training = gen_features(training, "comp_spectral_data/excited_state_monomer_spectra_csvs/", 
                        data, monomer_features=True, add_fps=True, fp_bits=128, comp_features=True)
testing = gen_features(testing, "comp_spectral_data/excited_state_monomer_spectra_csvs/",
                       data, monomer_features=True, add_fps=True, fp_bits=128, comp_features=True)
                       
feature_size = len(training["training_features"].tolist()[0])
target_size = len(training["exp_y"].tolist()[0])

print(f"training examples: {training.shape[0]}\ntesting examples: {testing.shape[0]}\nfeature size: {feature_size}\ntarget size: {target_size}")

In [None]:
##### predicting EXPERIMENTAL DIMER SPECTRUM from PREDICTED COMP SPECTRA, MONOMER SPECTRA, and FPs #####
exp_model, exp_likelihood = train_multitask_GP(training, target="exp_y", n_iterations=250)

# Run predictions
mean_train_preds, training = predict(exp_model, exp_likelihood, training, target="exp_y", train=True, plot=True, compare=True)
mean_test_preds, testing = predict(exp_model, exp_likelihood, testing, target="exp_y", plot=True, compare=True)

In [None]:
r2_after = pd.concat([testing["R2_after"], training["R2_after"]])
mae_after = pd.concat([testing["MAE_after"], training["MAE_after"]])