In [None]:
import pandas as pd
import glob
from sklearn.metrics import (max_error, mean_absolute_error,mean_squared_error, r2_score)
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# from PIL import Image
from IPython.display import Image

from figp import Symbolic_Reg

# Raw column names, presumably the layout of the MMS .tsv tables
# (this list is not referenced by any of the visible cells -- confirm usage).
MMS_COLUMNS = [
    'chembl-id',
    'pot.(log,Ki)',
    'pot.(nMol,Ki)',
    'aromatic_smiles',
    'non_stereo_aromatic_smieles',
    'all-chembl-ids',
    'no.-meas.',
    'pref_name',
    'accession',
    'natoms',
    'core',
    'sub',
    'sub_carbon_replacement',
    'arorings',
    'a_acc',
    'a_don',
    'a_heavy',
    'logP(o/w)',
    'RBC',
    'rings',
    'TPSA',
    'vdw_vol',
    'Weight',
]

# Raw column name -> short name used throughout the notebook.
MMS_COLRENAME = {
    "arorings": "arings",
    "a_acc": "acc",
    "a_don": "don",
    "logP(o/w)": "logp",
    "RBC": "rbc",
    "TPSA": "tpsa",
    "Weight": "mw",
    "pot.(log,Ki)": "pot",
}

# Feature columns (short names) used as model inputs.
MMS_FEATLIST = [
    "arings",
    "acc",
    "don",
    "logp",
    "rbc",  # Rotatable Bond Counts
    "tpsa",
    "mw",
]

# Target column (short name of "pot.(log,Ki)").
MMS_PROPERTY = "pot"


In [None]:
# Collect the MMS tables. glob returns matches in filesystem-dependent order,
# so sort them: `fno` (which also names the result directory below) must
# refer to the same file on every machine and every run.
files = sorted(glob.glob("data/MMS/*.tsv"))
print("nfiles:", len(files))
if not files:
    # Fail with an explicit message instead of a bare IndexError below.
    raise FileNotFoundError("no .tsv files found under data/MMS/")

# Index of the dataset to analyse.
fno = 0
file = files[fno]

# Load the table (first column becomes the index) and switch to short names.
df = pd.read_table(file, index_col=0)
df = df.rename(columns=MMS_COLRENAME)
print(df.columns)
print(file, df["core"].iloc[0])

# 80 % of the rows (rounded down) go to the training set.
ndata = len(df.index)
ntrain = int(0.8*ndata)
print(f"ndata: {ndata}, ntrain: {ntrain}")

# Feature matrix / target vector, then a seeded (reproducible) split.
X = df.loc[:, MMS_FEATLIST]
y = df.loc[:, MMS_PROPERTY]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=ntrain, random_state=0)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# Target range over the whole table (train and test rows together).
ydomain = y.min(), y.max()

# Output directory name is derived from the dataset index.
result_dir = f'./result_MMS{fno:02}_FIGP-FVD'
print("output_dir", result_dir)

# Train one symbolic-regression model per seed and keep
# (training R^2, fitted estimator) for each of them.
res = dict()
for random_state in range(5):
    print("RANDOM STATE:", random_state)
    
    # NOTE(review): the meaning of the individual options is defined by
    # figp.Symbolic_Reg and is not visible here. Note that x_domain/y_domain
    # are built from the full table (train and test rows), not from the
    # training split only.
    est = Symbolic_Reg( population_size=200,
                        generations=100,
                        tournament_size=5,
                        num_elite_select = 1,
                        max_depth=4,
                        function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'square', 'cube', 'ln', 'exp'),
                        metric='rmse', 
                        p_crossover=0.7, 
                        p_mutation=0.2, 
                        random_state=random_state,
                        x_domain=X,
                        y_domain=ydomain,
                        var_max_trial=5000,
                        function_filter = True, 
                        variable_filter = True, 
                        xydomain_filter = True,
                        constonly_filter= True,
                        domain_equal    = (True, True),
                        results_dir=result_dir)

    # training
    est.fit(X_train, y_train)
    y_train_pred = est.predict(X_train)
    r2_train = r2_score(y_true=y_train, y_pred=y_train_pred)
    res[random_state] = (r2_train, est)
    
# save the training results
print(res.values())
# Select the best run by score only. Sorting the (r2, estimator) tuples
# directly would fall back to comparing estimator objects whenever two seeds
# reach the same R^2. On a tie the earliest seed wins.
best_r2_train, best_est = max(res.values(), key=lambda run: run[0])
best_est.save_all()  # presumably writes into results_dir -- confirm in figp


In [None]:
# Best training R^2 over all seeds. Only the scores are compared: sorting the
# (r2, estimator) tuples would end up comparing estimator objects on a tie.
max(r2 for r2, _ in res.values())

In [None]:
# Evaluate the run with the highest training R^2 (the one whose results were
# saved), not `est`, which after the training loop is just the last seed's
# model. On a tie the earliest seed wins.
_, best_est = max(res.values(), key=lambda run: run[0])
y_train_pred = best_est.predict(X_train)
y_test_pred  = best_est.predict(X_test)

# Evaluation
r2_train = r2_score(y_true=y_train, y_pred=y_train_pred)
r2_test  = r2_score(y_true=y_test,  y_pred=y_test_pred)

# YY plot: predicted vs. true values, with the y = x diagonal as reference.
fig, ax = plt.subplots(figsize=(7, 7))
ax.set_xlabel('true'); ax.set_ylabel('predict')
ax.plot([y.min(), y.max()], [y.min(), y.max()], c='k')
# Raw f-strings: "\ " is a mathtext space, but in a normal string literal it
# is an invalid escape sequence that newer Python versions warn about.
ax.scatter(y_train, y_train_pred,  label=rf'$Training\ R^2 = {r2_train:.2f}$')
ax.scatter(y_test, y_test_pred,    label=rf'$Test\ R^2 = {r2_test:.2f}$', zorder=-1)
# Legend is placed outside the axes; trailing ';' hides the Legend repr.
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0.);

In [None]:
# Display the image named 001_GP_log_min_pl.png from the result directory
# (presumably a log plot written by save_all -- confirm in figp).
gp_log_png = f'{result_dir}/001_GP_log_min_pl.png'
Image(gp_log_png)