In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import sklearn.metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.utils import resample

In [2]:
# Merge our DFT result data with the Kraken ML data for the ligands.
dft_df = pd.read_csv("dft_data/lambda_max_v_bond_len_H_only.csv")
kraken_df = pd.read_csv("kraken_data/ml_8_210.csv")

# NOTE: this is an INNER join (the pd.merge default, spelled out here on purpose):
# only molecule_ids present in both tables are kept. It is equivalent to
# left-joining kraken_df onto dft_df only if every DFT molecule_id has a Kraken
# row; any DFT ligand without Kraken data is dropped from merged_df.
merged_df = pd.merge(dft_df, kraken_df, on="molecule_id", how="inner")

In [3]:
# Earlier analysis showed that the model predicts "symmetric methyl straddle"
# ligands poorly, possibly because the overall agostic interaction is stronger
# there (Pd interacting with two Hs). This notebook looks at the model with
# those ligands left out, so remove them from the merged table here.
print(merged_df.shape)

symm_straddle_ids = [182847, 183055, 20097, 20103]

# Build one boolean mask for all excluded ids and keep every other row.
is_symm_straddle = merged_df.molecule_id.isin(symm_straddle_ids)
merged_df = merged_df[~is_symm_straddle]

print(merged_df.shape)

(42, 195)
(38, 195)


In [4]:
# Leave-one-out CV
from sklearn.model_selection import LeaveOneOut
def loocv(X, y, model, verbose=False):
    """Run leave-one-out cross-validation and collect the held-out predictions.

    Parameters
    ----------
    X : array-like, one row per sample
        Feature matrix. It is converted with ``np.asarray`` so that folds are
        always selected by row position; this makes DataFrames (and frames
        whose index is no longer 0..n-1 after filtering) safe to pass in.
    y : array-like
        Targets aligned row-for-row with ``X``; converted the same way.
    model : object with ``fit(X, y)`` and ``predict(X)``
        Refit in place on every fold (it is not cloned), so on return it holds
        the fit from the last fold.
    verbose : bool, default False
        If True, print ``model.coef_`` and ``model.intercept_`` after each
        fold; the model must expose both attributes in that case.

    Returns
    -------
    list
        One prediction per sample (the first element of each fold's
        ``predict`` output), in the order the folds are produced.
    """
    # Positional indexing below is only correct on plain arrays: a DataFrame
    # would treat the fold indices as column labels and a Series as index labels.
    X = np.asarray(X)
    y = np.asarray(y)

    y_preds = []
    for i, (train_indices, test_index) in enumerate(LeaveOneOut().split(X)):
        print(f"Fold {i}: ", end="")
        model.fit(X[train_indices], y[train_indices])
        # test_index holds exactly one row, so take the single prediction.
        y_preds.append(model.predict(X[test_index])[0])
        print("complete.  ", end="")

        if verbose:
            print()
            print(model.coef_)
            print(model.intercept_)

    print()
    return y_preds