# Graphs for stability


In [None]:
import matplotlib.pyplot as plt
# import random
import numpy as np
from scipy import interpolate
from sklearn.metrics import mean_squared_error

"""Generate Pink Noise"""
def gen_pink_noise_fs48k(length=2**17, rndseed=0):
    """Generate a peak-normalised, low-pass shaped noise sequence.

    Uniform white noise in [-1, 1) is shaped in the frequency domain by
    scaling the k-th rFFT bin by 1 / (k + 1), transformed back to the time
    domain and divided by its maximum absolute value.

    Args:
        length: Number of samples to generate.
        rndseed: Seed passed to ``np.random.seed``. Note that this reseeds
            NumPy's global random generator.

    Returns:
        1-D float array with ``length`` samples and a peak magnitude of 1.
    """
    np.random.seed(rndseed)
    white = np.random.random(size=length) * 2 - 1
    spectrum = np.fft.rfft(white)
    shaping = 1 / (np.arange(len(spectrum)) + 1)
    # Pass n explicitly: irfft otherwise assumes an even output length
    # (2 * (bins - 1)) and would drop one sample when `length` is odd.
    shaped = np.fft.irfft(spectrum * shaping, n=length)
    shaped /= np.max(np.abs(shaped))

    return shaped

# Base noise signal shared by the figures in this cell.
t_noise = gen_pink_noise_fs48k()

# Keep every 2**13-th noise sample (a stride of 2**12 would keep twice as many).
xs_noise = t_noise[::2**13]
xs = np.arange(len(xs_noise))
a1 = 0.2
a2 = 1.5

# Target values: polynomial trend in a1*xs plus the scaled noise samples.
_scaled_x = a1*xs
ys = 0.3*_scaled_x**4 - _scaled_x**3 + _scaled_x + a2*xs_noise
# Replace both end points by a linear extrapolation of their two neighbours.
ys[0] = 2*ys[1] - ys[2]
ys[-1] = 2*ys[-2] - ys[-3]

fig, axes = plt.subplots(1, 2, figsize=(24, 8))
axes[0].scatter(xs, xs_noise)
axes[1].scatter(xs, ys)

# Dense x grid used to draw the fitted curves.
x_latent = np.linspace(0, len(xs)-1, 1000)


def _polyfit_of_degree(degree):
    """Return a callable (x, y) -> least-squares polynomial coefficients."""
    return lambda x, y: np.polyfit(x, y, degree)


# Each entry is [label, fitter]; the labels read
# "least squares (degree-N polynomial)" in Japanese.
cf1 = ["最小2乗法（1次式）", _polyfit_of_degree(1)]
cf2 = ["最小2乗法（2次式）", _polyfit_of_degree(2)]
cf3 = ["最小2乗法（3次式）", _polyfit_of_degree(3)]
cf6 = ["最小2乗法（6次式）", _polyfit_of_degree(6)]
cf10 = ["最小2乗法（10次式）", _polyfit_of_degree(10)]
cf11 = ["最小2乗法（11次式）", _polyfit_of_degree(11)]
cf12 = ["最小2乗法（12次式）", _polyfit_of_degree(12)]
cf20 = ["最小2乗法（20次式）", _polyfit_of_degree(20)]

# Show the coefficients of the three lowest-degree fits.
for _label, _fitter in (cf1, cf2, cf3):
    print(_fitter(xs, ys))

# Coefficients of every fit that is drawn, then the curves on the dense grid.
f1, f2, f3, f6, f10, f11, f12 = (
    fitter(xs, ys) for _, fitter in (cf1, cf2, cf3, cf6, cf10, cf11, cf12)
)
fx1, fx2, fx3, fx6, fx10, fx11, fx12 = (
    np.poly1d(coeffs)(x_latent) for coeffs in (f1, f2, f3, f6, f10, f11, f12)
)

_curves = (fx1, fx2, fx3, fx6, fx10, fx11, fx12)
for _degree, _curve in zip(("1", "2", "3", "6", "10", "11", "12"), _curves):
    axes[1].plot(x_latent, _curve, label=_degree)

axes[1].set_xticklabels([])
axes[1].set_yticklabels([])
axes[1].legend()
plt.show()


# Figure: effect of shifting the input by _noise on a cubic fit (left)
# and on a degree-11 fit (right). Uses xs, ys, x_latent, f3, f11, fx3 and
# fx11 computed earlier in this cell.
_fontsize = 18
_noise = 0.33
fig, axes = plt.subplots(1, 2, figsize=(24, 8))

# ---- Left panel: cubic fit ----
axes[0].scatter(xs, ys, color="black")

fxs3 = np.poly1d(f3)(xs)
fxs3_p = np.poly1d(f3)(xs + _noise)

# Residuals are drawn as error bars centred midway between fit and data,
# with half the fit-to-data distance as their half-length.
fxs_m3 = (fxs3 + ys) / 2
fxs_e3 = np.abs(fxs3 - ys) / 2

fxs_m3_p = (fxs3_p + ys) / 2
fxs_e3_p = np.abs(fxs3_p - ys) / 2

fx3_p = np.poly1d(f3)(x_latent + _noise)
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
# Labels containing LaTeX backslashes are raw strings because "\e" is not a
# valid Python escape sequence.
axes[0].plot(x_latent, fx3, label=f"$Fit_1(x)$: RMSE={np.sqrt(mean_squared_error(fxs3, ys)):.2}", color="black", linestyle="-")
axes[0].plot(x_latent, fx3_p, label=rf"$Fit_1(x+\epsilon)$: RMSE$_\epsilon$$_x$={np.sqrt(mean_squared_error(fxs3_p, ys)):.2}", color="black", linestyle=":")
axes[0].errorbar(xs, fxs_m3, yerr=fxs_e3, fmt="none", capsize=0, capthick=0, ecolor="black", lw=1)
# The perturbed bars are offset by 0.1 in x so they do not overlap the others.
axes[0].errorbar(xs+0.1, fxs_m3_p, yerr=fxs_e3_p, fmt="none", capsize=5, capthick=.8, ecolor="black")

axes[0].legend(fontsize=_fontsize, loc="lower center")


# ---- Right panel: degree-11 fit ----
axes[1].scatter(xs, ys, color="black")

fxs11 = np.poly1d(f11)(xs)
fxs11_p = np.poly1d(f11)(xs + _noise)

fxs_m11 = (fxs11 + ys) / 2
fxs_e11 = np.abs(fxs11 - ys) / 2

fxs_m11_p = (fxs11_p + ys) / 2
fxs_e11_p = np.abs(fxs11_p - ys) / 2

fx11_p = np.poly1d(f11)(x_latent + _noise)
axes[1].plot(x_latent, fx11, label=f"$Fit_2(x)$: RMSE={np.sqrt(mean_squared_error(fxs11, ys)):.2}", color="black", linestyle="-")
axes[1].plot(x_latent, fx11_p, label=rf"$Fit_2(x+\epsilon)$: RMSE$_\epsilon$$_x$={np.sqrt(mean_squared_error(fxs11_p, ys)):.2}", color="black", linestyle=":")
axes[1].errorbar(xs, fxs_m11, yerr=fxs_e11, fmt="none", capsize=0, capthick=0, ecolor="black", lw=1)
axes[1].errorbar(xs+0.1, fxs_m11_p, yerr=fxs_e11_p, fmt="none", capsize=5, capthick=.8, ecolor="black")

axes[1].legend(fontsize=_fontsize, loc="lower center")

plt.show()



# Figure: effect of perturbing the fitted coefficients (each coefficient c
# becomes c + _noise_c*c) on a cubic fit (left) and a degree-11 fit (right).
# Reuses fxs3/fxs11 and their error-bar arrays from the previous figure.
_noise_c = 0.4
fig, axes = plt.subplots(1, 2, figsize=(24, 8))

# ---- Left panel: cubic fit ----
axes[0].scatter(xs, ys, color="black")

# Midpoint / half-span arrays describe error bars between fit and data.
fxs3_cp = np.poly1d(f3+_noise_c*f3)(xs)
fxs_m3_cp = (fxs3_cp + ys) / 2
fxs_e3_cp = np.abs(fxs3_cp - ys) / 2

fx3_cp = np.poly1d(f3+_noise_c*f3)(x_latent)
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
# Labels containing LaTeX backslashes are raw strings because "\e" is not a
# valid Python escape sequence.
axes[0].plot(x_latent, fx3, label=f"$Fit_1(x)$: RMSE={np.sqrt(mean_squared_error(fxs3, ys)):.2}", color="black", linestyle="-")
axes[0].plot(x_latent, fx3_cp, label=rf"$Fit_1(x,c+\epsilon)$: RMSE$_\epsilon$$_c$={np.sqrt(mean_squared_error(fxs3_cp, ys)):.2}", color="black", linestyle=":")
axes[0].errorbar(xs, fxs_m3, yerr=fxs_e3, fmt="none", capsize=0, capthick=0, ecolor="black", lw=1)
# The perturbed bars are offset by 0.1 in x so they do not overlap the others.
axes[0].errorbar(xs+0.1, fxs_m3_cp, yerr=fxs_e3_cp, fmt="none", capsize=5, capthick=.8, ecolor="black")

axes[0].legend(fontsize=_fontsize, loc="lower center")


# ---- Right panel: degree-11 fit ----
axes[1].scatter(xs, ys, color="black")

fxs11_cp = np.poly1d(f11+_noise_c*f11)(xs)

fxs_m11_cp = (fxs11_cp + ys) / 2
fxs_e11_cp = np.abs(fxs11_cp - ys) / 2

fx11_cp = np.poly1d(f11+_noise_c*f11)(x_latent)
axes[1].plot(x_latent, fx11, label=f"$Fit_2(x,c)$ : RMSE={np.sqrt(mean_squared_error(fxs11, ys)):.2}", color="black", linestyle="-")
axes[1].plot(x_latent, fx11_cp, label=rf"$Fit_2(x,c+\epsilon)$ : RMSE$_\epsilon$$_c$={np.sqrt(mean_squared_error(fxs11_cp, ys)):.2}", color="black", linestyle=":")
axes[1].errorbar(xs, fxs_m11, yerr=fxs_e11, fmt="none", capsize=0, capthick=0, ecolor="black", lw=1)
axes[1].errorbar(xs+0.1, fxs_m11_cp, yerr=fxs_e11_cp, fmt="none", capsize=5, capthick=.8, ecolor="black")

axes[1].legend(fontsize=_fontsize, loc="lower center")


plt.show()




In [None]:
# Refit the cubic and degree-11 polynomials and redraw the input-perturbation
# figure. Relies on xs, ys, x_latent, cf3, cf11, _noise and _fontsize from
# earlier cells.
ys = ys.copy()

# NOTE(review): the same series is plotted twice; the commented-out edits in
# the original suggest the second call once plotted a modified copy - confirm.
plt.plot(ys)
plt.plot(ys)


f3_1 = cf3[1](xs, ys)
fx3_1 = np.poly1d(f3_1)(x_latent)
f11_1 = cf11[1](xs, ys)
fx11_1 = np.poly1d(f11_1)(x_latent)

_noise_c = 0.4
fig, axes = plt.subplots(1, 2, figsize=(24, 8))

# ---- Left panel: cubic fit ----
axes[0].scatter(xs, ys, color="black")

fxs3_1 = np.poly1d(f3_1)(xs)
fxs3_1_p = np.poly1d(f3_1)(xs + _noise)

# Midpoint / half-span arrays describe error bars between fit and data.
fxs_m3_1 = (fxs3_1 + ys) / 2
fxs_e3_1 = np.abs(fxs3_1 - ys) / 2

fxs_m3_1_p = (fxs3_1_p + ys) / 2
fxs_e3_1_p = np.abs(fxs3_1_p - ys) / 2

fx3_1_p = np.poly1d(f3_1)(x_latent + _noise)
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
# Labels containing LaTeX backslashes are raw strings because "\e" is not a
# valid Python escape sequence.
axes[0].plot(x_latent, fx3_1, label=f"$Fit_1(x)$: RMSE={np.sqrt(mean_squared_error(fxs3_1, ys)):.2}", color="black", linestyle="-")
axes[0].plot(x_latent, fx3_1_p, label=rf"$Fit_1(x+\epsilon)$: RMSE$_\epsilon$$_x$={np.sqrt(mean_squared_error(fxs3_1_p, ys)):.2}", color="black", linestyle=":")
axes[0].errorbar(xs, fxs_m3_1, yerr=fxs_e3_1, fmt="none", capsize=0, capthick=0, ecolor="black", lw=1)
axes[0].errorbar(xs+0.1, fxs_m3_1_p, yerr=fxs_e3_1_p, fmt="none", capsize=5, capthick=.8, ecolor="black")

axes[0].legend(fontsize=_fontsize, loc="lower center")


# ---- Right panel: degree-11 fit ----
axes[1].scatter(xs, ys, color="black")

fxs11_1 = np.poly1d(f11_1)(xs)
fxs11_1_p = np.poly1d(f11_1)(xs + _noise)

fxs_m11_1 = (fxs11_1 + ys) / 2
fxs_e11_1 = np.abs(fxs11_1 - ys) / 2

fxs_m11_1_p = (fxs11_1_p + ys) / 2
fxs_e11_1_p = np.abs(fxs11_1_p - ys) / 2

fx11_1_p = np.poly1d(f11_1)(x_latent + _noise)
axes[1].plot(x_latent, fx11_1, label=f"$Fit_2(x)$: RMSE={np.sqrt(mean_squared_error(fxs11_1, ys)):.2}", color="black", linestyle="-")
axes[1].plot(x_latent, fx11_1_p, label=rf"$Fit_2(x+\epsilon)$: RMSE$_\epsilon$$_x$={np.sqrt(mean_squared_error(fxs11_1_p, ys)):.2}", color="black", linestyle=":")
axes[1].errorbar(xs, fxs_m11_1, yerr=fxs_e11_1, fmt="none", capsize=0, capthick=0, ecolor="black", lw=1)
axes[1].errorbar(xs+0.1, fxs_m11_1_p, yerr=fxs_e11_1_p, fmt="none", capsize=5, capthick=.8, ecolor="black")

axes[1].legend(fontsize=_fontsize, loc="lower center")


plt.show()




In [None]:
import random
import matplotlib.pyplot as plt
import pandas as pd
import math

# Hard-coded sample series; it is replaced by the generated series below.
a = [
    0.56514978, 0.50583713, 0.5983144, 0.52175241,
    0.45634347, 0.22114882, 0.2559339, 0.10449878,
    0.08629392, 0.13410682, 0.13520334, 0.08693079,
    0.13227931, -0.0634571, -0.18911029, -0.21415424,
]


def func1(x):
    """Return sin(pi * x / 5)."""
    return math.sin((x/5)*math.pi)


def func2(x):
    """Return (0.05*x)**3 - (0.06*x)**2."""
    return (0.05*x)**3-(0.06*x)**2


# Synthetic series: uniform noise + randomly weighted sine + polynomial trend.
a = [
    random.random() + 0.5*random.random()*func1(step) + func2(step)
    for step in range(30)
]

# 5-point rolling mean, with the leading NaN entries dropped and re-indexed from 0.
ma = pd.Series(a).rolling(window=5).mean()
ma2 = pd.Series(ma.dropna().to_numpy())

plt.plot(ma2)
plt.plot(a)

In [None]:
# Plot ma2 (the rolling-mean series built in an earlier cell) on its own.
ma2.plot()

In [None]:
import numpy as np
import matplotlib.pyplot as plt


"""Min-max filter : Diverging Function"""
# Min-max filter figure. Left: a function that diverges inside the x range.
# Right: a degree-6 polynomial fit to the same data. Relies on cf1..cf12 and
# _fontsize defined in earlier cells.
npoints = 20
xs = np.arange(npoints)
x_latent = np.linspace(0, len(xs), 1050)

# Has a pole at x = 5.5, which lies between the integer sample points.
func1 = lambda x: np.abs(x / (x - 5.5)) + .5
np.random.seed(0)
ys = func1(xs)
ys_r = (np.random.rand(npoints) - 0.5)*1.2 + ys
ys_r2 = ys_r.copy()
ys_r2[5] = ys_r2[5]*0.7

y_latent = func1(x_latent)
# Indices into x_latent: the curve is drawn as [:r1] and [r2:], leaving a gap.
r1 = 266
r2 = 318
range_max = 10
range_min = -1

# Only every second sample (odd indices) is shown and fitted.
_x_obs = xs[1::2]
_y_obs = ys_r2[1::2]

fig, axes = plt.subplots(1, 2, figsize=(18, 6))

# ---- Left panel: diverging function ----
ax = axes[0]
ax.plot(_x_obs, _y_obs, "o", color="black", label="Data")
# Raw strings: "\ " is LaTeX spacing, not a valid Python escape sequence.
ax.plot(x_latent[:r1], y_latent[:r1], "black", label=r"$f_{filtered}\ (x)$")
ax.plot(x_latent[r2:], y_latent[r2:], "black")
ax.legend(loc="center right", fontsize=_fontsize)
ax.hlines(0, -1, 21, color="black", linestyles="--", lw=0.8)
ax.hlines(10, -1, 21, color="black", linestyles="--", lw=0.8)
ax.vlines(0, -1, 11.5, color="black", linestyles="--", lw=0.8)
ax.vlines(20, -1, 11.5, color="black", linestyles="--", lw=0.8)
ax.set_xlabel("Variable property ($x$)", fontsize=_fontsize)
ax.set_ylabel("Target property ($y$)", fontsize=_fontsize)
ax.set_xticklabels([])
ax.set_yticklabels([])

# Shade the regions below y=0 and above y=10 (scalar bounds are broadcast).
ax.fill_between(x_latent, 0, -20, facecolor="gray", alpha=0.2, label="fill")
ax.fill_between(x_latent, 10, 20, facecolor="gray", alpha=0.2, label="fill")
ax.set_xlim(-1, 21)
ax.set_ylim(-1, 11)

# ---- Right panel: polynomial fit ----
ax = axes[1]
ax.plot(_x_obs, _y_obs, "o", color="black", label="Data")

# Only f6 is drawn; the other fits are kept as module-level names.
f1 = cf1[1](_x_obs, _y_obs)
f2 = cf2[1](_x_obs, _y_obs)
f3 = cf3[1](_x_obs, _y_obs)
f6 = cf6[1](_x_obs, _y_obs)
f10 = cf10[1](_x_obs, _y_obs)
f11 = cf11[1](_x_obs, _y_obs)
f12 = cf12[1](_x_obs, _y_obs)
fx = np.poly1d(f6)(x_latent)
ax.plot(x_latent, fx, "black", label=r"$f_{ok}\ (x)$")
ax.legend(loc="center right", fontsize=_fontsize)
ax.hlines(0, -1, 21, color="black", linestyles="--", lw=0.8)
ax.hlines(10, -1, 21, color="black", linestyles="--", lw=0.8)
ax.vlines(0, -1, 11.5, color="black", linestyles="--", lw=0.8)
ax.vlines(20, -1, 11.5, color="black", linestyles="--", lw=0.8)
ax.set_xlabel("Variable property ($x$)", fontsize=_fontsize)
ax.set_xticklabels([])
ax.set_yticklabels([])

ax.fill_between(x_latent, 0, -20, facecolor="gray", alpha=0.2, label="fill")
ax.fill_between(x_latent, 10, 20, facecolor="gray", alpha=0.2, label="fill")
ax.set_xlim(-1, 21)
ax.set_ylim(-1, 11)

plt.subplots_adjust(wspace=0.05)

plt.savefig("d2_filter.png")
plt.show()

In [None]:

"""Overfit function"""
if False:
    # Disabled experiment: this branch never runs. The target is a sum of
    # peaks of the form c / (pi * (1 + ((x - x0) / w)**2)) plus sqrt(x + 5).
    npoints = 40
    xs = np.arange(npoints)
    x_latent = np.linspace(0, len(xs), 1025)

    func2 = lambda x: -20.0 / (np.pi*(1.0 + ((x-10)/1.5)**2)) + 8.0 / (np.pi*(1.0 + ((x-11.5)/1.0)**2)) + \
                    4.0 / (np.pi*(1.0 + ((x-21)/1.5)**2)) + 1.0 / (np.pi*(1.0 + ((x-22.5)/1.0)**2)) + \
                    2.0 / (np.pi*(1.0 + ((x-30)/1.0)**2)) + 8.0 / (np.pi*(1.0 + ((x-31)/1.0)**2))+ np.sqrt(x+5) + 0

    np.random.seed(2)

    ys = func2(xs)
    ys_r = (np.random.rand(npoints) - 0.5)*.4 + ys
    ys_r2 = ys_r.copy()

    y_latent = func2(x_latent)
    # Same curve evaluated at x - 1, drawn dashed below.
    y_latent_x = func2(x_latent-1)
    r1 = 266
    r2 = 266
    range_max = 10
    range_min = -1

    if True:
        plt.plot(xs[1::3], ys_r2[1::3], "o", color="black", label="Data points")
        # Legend typo ("Overfittingg") corrected to match the label used below.
        plt.plot(x_latent[:r1], y_latent[:r1], "black", label="Overfitting Function")
        plt.plot(x_latent[r2:], y_latent[r2:], "black")
        plt.legend()
        plt.show()

    fig, ax = plt.subplots(1, 1, figsize=(9, 6))
    ax.plot(xs[1::3], ys_r2[1::3], "o", color="black")
    ax.plot(x_latent, y_latent, "black", label="Overfitting Function")
    ax.plot(x_latent, y_latent_x, "black", label="", linestyle="--")
    ax.legend(loc="center right", fontsize=14)
    ax.hlines(0, -1, 41, color="black", linestyles="--", lw=0.8)
    ax.hlines(10, -1, 41, color="black", linestyles="--", lw=0.8)
    ax.vlines(0, -1, 11.5, color="black", linestyles="--", lw=0.8)
    ax.vlines(40, -1, 11.5, color="black", linestyles="--", lw=0.8)

    # Shade the regions below y=0 and above y=10 (scalar bounds are broadcast).
    ax.fill_between(x_latent, 0, -10, facecolor="gray", alpha=0.2, label="fill")
    ax.fill_between(x_latent, 10, 20, facecolor="gray", alpha=0.2, label="fill")
    ax.set_xlim(-1, 41)
    ax.set_ylim(-1, 11)

    plt.show()



"""Stability : Overfit function"""
# Stability against input shifts of +/-1. Left: an oscillating "overfit"
# function. Right: a cubic fit to the same data. Relies on cf3 and _fontsize
# defined in earlier cells.
npoints = 40
xs = np.arange(npoints)
x_latent = np.linspace(0, len(xs), 1025)

func2 = lambda x: 0.1*(0.1*x+10)*np.sin(np.pi*((0.333*(x-20)))) + 1.6*(0.1*(x-20))**2 + 1.0

# func2 written out with every expanded coefficient multiplied by _mc / _pc.
_mc = 0.9
func2_n = lambda x: (_mc*0.01*x+_mc*1)*np.sin(np.pi*((_mc*0.333*x-_mc*0.333*20))) + _mc*1.6*((_mc*0.1*x-_mc*2))**2 + _mc*1.0
_pc = 1.1
func2pn = lambda x: (_pc*0.01*x+_pc*1)*np.sin(np.pi*((_pc*0.333*x-_pc*0.333*20))) + _pc*1.6*((_pc*0.1*x-_pc*2))**2 + _pc*1.0

np.random.seed(2)

ys = func2(xs)
ys_r = (np.random.rand(npoints) - 0.5)*.8 + ys
ys_r2 = ys_r.copy()

y_latent = func2(x_latent)
y_latent_x = func2(x_latent-1)
y_latentpx = func2(x_latent+1)
# Not used by the figure below; kept as module-level names.
r1 = 266
r2 = 266
range_max = 10
range_min = -1

# Only every third sample is shown and fitted.
_x_obs = xs[0::3]
_y_obs = ys_r2[0::3]

fig, axes = plt.subplots(1, 2, figsize=(18, 6))

# ---- Left panel: overfit function ----
ax = axes[0]
ax.plot(_x_obs, _y_obs, "o", color="black", label="Data", alpha=0.7)
# Raw strings: "\ " and "\d" are LaTeX, not valid Python escape sequences.
ax.plot(x_latent, y_latent, "black", label=r"$f_{overfit}\ (x;\ c)$", linewidth=2)
ax.plot(x_latent, y_latent_x, "black", label=r"$f_{overfit}\ (x-\delta;\ c)$", linestyle="--", alpha=0.7)
ax.plot(x_latent, y_latentpx, "black", label=r"$f_{overfit}\ (x+\delta;\ c)$", linestyle="-.", alpha=0.7)

_noise = 1
fxp = func2(_x_obs)
fxp_n = func2(_x_obs-_noise)
fxppn = func2(_x_obs+_noise)

# Fit-to-data midpoint / half-span; computed but not drawn.
fxp_m = (fxp + _y_obs) / 2
fxp_e = np.abs(fxp - _y_obs) / 2

# Bars spanning f(x) and f(x -/+ _noise), as midpoint and half-span.
fxp_m_n = (fxp_n + fxp) / 2
fxp_e_n = np.abs(fxp_n - fxp) / 2

fxp_mpn = (fxppn + fxp) / 2
fxp_epn = np.abs(fxppn - fxp) / 2

# The two bar sets are offset by +/-0.2 in x so they sit side by side.
ax.errorbar(_x_obs+0.2, fxp_m_n, yerr=fxp_e_n, fmt="none", capsize=5, capthick=2, ecolor="tab:blue", lw=2, alpha=0.7)
ax.errorbar(_x_obs-0.2, fxp_mpn, yerr=fxp_epn, fmt="none", capsize=5, capthick=2, ecolor="tab:orange", lw=2, alpha=0.7)
ax.legend(fontsize=_fontsize, loc="upper center")

ax.set_xlabel("Variable property ($x$)", fontsize=_fontsize)
ax.set_ylabel("Target property ($y$)", fontsize=_fontsize)
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_ylim(-1, 11)
ax.grid(linestyle="--", alpha=0.7)


# ---- Right panel: cubic fit ----
ax = axes[1]
ax.plot(_x_obs, _y_obs, "o", color="black", label="Data", alpha=0.7)

f3 = cf3[1](_x_obs, _y_obs)
fx = np.poly1d(f3)(x_latent)
fx_x = np.poly1d(f3)(x_latent-1)
fxpx = np.poly1d(f3)(x_latent+1)
ax.plot(x_latent, fx, "black", label=r"$f_{fit}\ (x;\ c)$", linewidth=2)
ax.plot(x_latent, fx_x, "black", label=r"$f_{fit}\ (x-\delta;\ c)$", linestyle="--", alpha=0.7)
ax.plot(x_latent, fxpx, "black", label=r"$f_{fit}\ (x+\delta;\ c)$", linestyle="-.", alpha=0.7)

fxp = np.poly1d(f3)(_x_obs)
fxp_n = np.poly1d(f3)(_x_obs-_noise)
fxppn = np.poly1d(f3)(_x_obs+_noise)

fxp_m = (fxp + _y_obs) / 2
fxp_e = np.abs(fxp - _y_obs) / 2

fxp_m_n = (fxp_n + fxp) / 2
fxp_e_n = np.abs(fxp_n - fxp) / 2
fxp_mpn = (fxppn + fxp) / 2
fxp_epn = np.abs(fxppn - fxp) / 2
ax.errorbar(_x_obs+0.2, fxp_m_n, yerr=fxp_e_n, fmt="none", capsize=5, capthick=2, ecolor="tab:blue", lw=2, alpha=0.7)
ax.errorbar(_x_obs-0.2, fxp_mpn, yerr=fxp_epn, fmt="none", capsize=5, capthick=2, ecolor="tab:orange", lw=2, alpha=0.7)

ax.legend(fontsize=_fontsize, loc="upper center")
ax.set_xlabel("Variable property ($x$)", fontsize=_fontsize)
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_ylim(-1, 11)
ax.grid(linestyle="--", alpha=0.7)

plt.subplots_adjust(wspace=0.05)

plt.savefig("noise_func_x.png", dpi=300)
plt.show()


"""Stability for Coefficients """
# Stability against coefficient perturbations. Left: func2 versus its
# coefficient-scaled variants func2_n / func2pn. Right: a cubic fit with its
# coefficients scaled by (1 -/+ _noise_c). Relies on xs, ys_r2, x_latent,
# func2, func2_n, func2pn, cf3 and _fontsize defined earlier.
y_latent = func2(x_latent)
y_latent_c = func2_n(x_latent)
y_latentpc = func2pn(x_latent)
# Not used by the figure below; kept as module-level names.
r1 = 266
r2 = 266
range_max = 10
range_min = -1

# Only every third sample is shown and fitted.
_x_obs = xs[0::3]
_y_obs = ys_r2[0::3]

fig, axes = plt.subplots(1, 2, figsize=(18, 6))

# ---- Left panel: overfit function ----
ax = axes[0]
ax.plot(_x_obs, _y_obs, "o", color="black", label="Data", alpha=0.7)
# Raw strings: "\ " and "\d" are LaTeX, not valid Python escape sequences.
ax.plot(x_latent, y_latent, "black", label=r"$f_{overfit}\ (x;\ c)$", linewidth=2)
ax.plot(x_latent, y_latent_c, "black", label=r"$f_{overfit}\ (x;\ c-\delta)$", linestyle="--", alpha=0.7)
ax.plot(x_latent, y_latentpc, "black", label=r"$f_{overfit}\ (x;\ c+\delta)$", linestyle="-.", alpha=0.7)

_noise = 1
fxp = func2(_x_obs)
fxp_n = func2_n(_x_obs)
fxppn = func2pn(_x_obs)

# Fit-to-data midpoint / half-span; computed but not drawn.
fxp_m = (fxp + _y_obs) / 2
fxp_e = np.abs(fxp - _y_obs) / 2

# Bars spanning the unperturbed and perturbed values, as midpoint and half-span.
fxp_m_n = (fxp_n + fxp) / 2
fxp_e_n = np.abs(fxp_n - fxp) / 2

fxp_mpn = (fxppn + fxp) / 2
fxp_epn = np.abs(fxppn - fxp) / 2

# The two bar sets are offset by +/-0.2 in x so they sit side by side.
ax.errorbar(_x_obs+0.2, fxp_m_n, yerr=fxp_e_n, fmt="none", capsize=5, capthick=2, ecolor="tab:blue", lw=2, alpha=0.7)
ax.errorbar(_x_obs-0.2, fxp_mpn, yerr=fxp_epn, fmt="none", capsize=5, capthick=2, ecolor="tab:orange", lw=2, alpha=0.7)
ax.legend(fontsize=_fontsize, loc="upper center")

ax.set_xlabel("Variable property ($x$)", fontsize=_fontsize)
ax.set_ylabel("Target property ($y$)", fontsize=_fontsize)
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_ylim(-1, 11)
ax.grid(linestyle="--", alpha=0.7)


# ---- Right panel: cubic fit ----
ax = axes[1]
ax.plot(_x_obs, _y_obs, "o", color="black", label="Data", alpha=0.7)

f3 = cf3[1](_x_obs, _y_obs)

fx = np.poly1d(f3)(x_latent)
_noise_c = 0.15
fx_x = np.poly1d(f3*(1-_noise_c))(x_latent)
fxpx = np.poly1d(f3*(1+_noise_c))(x_latent)
ax.plot(x_latent, fx, "black", label=r"$f_{fit}\ (x;\ c)$", linewidth=2)
ax.plot(x_latent, fx_x, "black", label=r"$f_{fit}\ (x;\ c-\delta)$", linestyle="--", alpha=0.7)
ax.plot(x_latent, fxpx, "black", label=r"$f_{fit}\ (x;\ c+\delta)$", linestyle="-.", alpha=0.7)

fxp = np.poly1d(f3)(_x_obs)
fxp_n = np.poly1d(f3*(1-_noise_c))(_x_obs)
fxppn = np.poly1d(f3*(1+_noise_c))(_x_obs)

fxp_m = (fxp + _y_obs) / 2
fxp_e = np.abs(fxp - _y_obs) / 2

fxp_m_n = (fxp_n + fxp) / 2
fxp_e_n = np.abs(fxp_n - fxp) / 2
fxp_mpn = (fxppn + fxp) / 2
fxp_epn = np.abs(fxppn - fxp) / 2
ax.errorbar(_x_obs+0.2, fxp_m_n, yerr=fxp_e_n, fmt="none", capsize=5, capthick=2, ecolor="tab:blue", lw=2, alpha=0.7)
ax.errorbar(_x_obs-0.2, fxp_mpn, yerr=fxp_epn, fmt="none", capsize=5, capthick=2, ecolor="tab:orange", lw=2, alpha=0.7)

ax.legend(fontsize=_fontsize, loc="upper center")
ax.set_xlabel("Variable property ($x$)", fontsize=_fontsize)
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_ylim(-1, 11)
ax.grid(linestyle="--", alpha=0.7)

plt.subplots_adjust(wspace=0.05)

plt.savefig("noise_func_c.png", dpi=300)
plt.show()


# RESULTS TABLE

Analyze MMS data

In [None]:
import pandas as pd

# The twelve MMS descriptor tables, keyed "01".."12" in this order.
_MMS_DIR = "../scripts/data/MMS/"
_MMS_BASENAMES = [
    "tid-10142-actives_100mms_sub-moedescriptors.tsv",
    "tid-10142-actives_154mms_sub-moedescriptors.tsv",
    "tid-10280-actives_90mms_sub-moedescriptors.tsv",
    "tid-10627-actives_106mms_sub-moedescriptors.tsv",
    "tid-129-actives_462mms_sub-moedescriptors.tsv",
    "tid-137-actives_49mms_sub-moedescriptors.tsv",
    "tid-194-actives_232mms_sub-moedescriptors.tsv",
    "tid-20174-actives_26mms_sub-moedescriptors.tsv",
    "tid-278-actives_20mms_sub-moedescriptors.tsv",
    "tid-280-actives_417mms_sub-moedescriptors.tsv",
    "tid-8-actives_44mms_sub-moedescriptors.tsv",
    "tid-8-actives_49mms_sub-moedescriptors.tsv",
]
mms_files = {
    f"{idx:02d}": _MMS_DIR + basename
    for idx, basename in enumerate(_MMS_BASENAMES, start=1)
}

MMS_COLUMNS = [
    'chembl-id', 'pot.(log,Ki)', 'pot.(nMol,Ki)', 'aromatic_smiles',
    'non_stereo_aromatic_smieles', 'all-chembl-ids', 'no.-meas.',
    'pref_name', 'accession', 'natoms', 'core', 'sub',
    'sub_carbon_replacement', 'arorings', 'a_acc', 'a_don', 'a_heavy',
    'logP(o/w)', 'RBC', 'rings', 'TPSA', 'vdw_vol', 'Weight',
]
# Short names applied to the raw column headers (RBC: Rotatable Bond Counts).
MMS_COLRENAME = {
    "arorings": "arings",
    "a_acc": "acc",
    "a_don": "don",
    "logP(o/w)": "logp",
    "RBC": "rbc",
    "TPSA": "tpsa",
    "Weight": "mw",
    "pot.(log,Ki)": "pot",
}
# Feature subsets, keyed by the number of features they contain.
MMS_FEATLIST = {
    '10': ["arings", "acc", "don", "a_heavy", "logp", "rbc", "rings", "tpsa", "vdw_vol", "mw"],
    '7': ["arings", "acc", "don", "logp", "rbc", "tpsa", "mw"],
    '4': ["logp", "rbc", "tpsa", "mw"],
}
MMS_PROPERTY = "pot"

# Per-table feature maxima / minima and the standard deviation of the target.
mms_std = {}
df_mms_max = pd.DataFrame(columns=MMS_FEATLIST['10'])
df_mms_min = pd.DataFrame(columns=MMS_FEATLIST['10'])
for no, file in mms_files.items():
    df = pd.read_csv(file, sep="\t", index_col=0).rename(columns=MMS_COLRENAME)
    ndata = len(df)

    x = df[MMS_FEATLIST['10']]
    y = df[MMS_PROPERTY]

    _row_label = f"MMS{no}"
    df_mms_max.loc[_row_label, :] = x.max()
    df_mms_min.loc[_row_label, :] = x.min()
    mms_std[_row_label] = y.std()

# Overall extremes across all tables, then the per-table target spread.
print("MAX")
display(df_mms_max.max())
print("MIN")
display(df_mms_min.min())
print(mms_std)

In [None]:
def max_bold_min_italic(val):
    """Return one CSS string per element: bold for the max, italic for the min.

    An element equal to both the maximum and the minimum is styled bold;
    every other element gets an empty string.
    """
    largest = val.max()
    smallest = val.min()
    styles = []
    for cell in val:
        if cell == largest:
            styles.append('font-weight: bold')
        elif cell == smallest:
            styles.append('font-style: italic')
        else:
            styles.append('')
    return styles

def min_bold_max_italic(val):
    """Return one CSS string per element: bold for the min, italic for the max.

    An element equal to both the maximum and the minimum is styled italic;
    every other element gets an empty string.
    """
    largest = val.max()
    smallest = val.min()
    styles = []
    for cell in val:
        if cell == largest:
            styles.append('font-style: italic')
        elif cell == smallest:
            styles.append('font-weight: bold')
        else:
            styles.append('')
    return styles

Load all results

In [None]:
import glob
import re
import pandas as pd

# res_files = sorted(glob.glob("../scripts/RESULTS_230307/*/res.txt"))
res_files = sorted(glob.glob("../scripts/RESULTS_230329/*/res.txt"))
print("n results:", len(res_files))

# Result directories are named 2303??_D<ratio>_F<n features>_<method>, where
# <method> is a bare name optionally followed by one or two "_"-separated
# noise tokens.
PATH_PTN = re.compile(r"^.*/2303.._D([^_]*)_F([^_]*)_(([^_]*)|([^_]*)_([^_]*)|([^_]*)_([^_]*)_([^_]*))/res.txt")
CMET_PTN = re.compile(r"(MLR|SVR)")


def _noise_term(spec):
    """Format a noise token such as "X.5:.1" as " +.5*X(d=.1)"."""
    parts = spec[1:].split(":")
    return f" +{parts[0]}*{spec[0]}(d={parts[1]})"


def parse_result_path(path):
    """Decode method, feature count and train ratio from a result path.

    Args:
        path: Path of a ``res.txt`` file inside a result directory.

    Returns:
        Dict with the keys "METHOD", "N FEATURES", "TRAIN RATIO" and "PATH".

    Raises:
        ValueError: If ``path`` does not follow the expected naming scheme.
    """
    match = PATH_PTN.match(path)
    if match is None:
        raise ValueError("no match:", path)

    if match.group(4) is not None:
        method = match.group(4)
    elif match.group(5) is not None:
        method = match.group(5) + _noise_term(match.group(6))
    elif match.group(7) is not None:
        method = match.group(7) + _noise_term(match.group(8)) + _noise_term(match.group(9))
    else:
        raise ValueError("no match", match.groups())

    return {
        "METHOD": method,
        "N FEATURES": int(match.group(2)),
        # The digits after "D" are read as the fractional part ("80" -> 0.80).
        "TRAIN RATIO": float(f"0.{match.group(1)}"),
        "PATH": path,
    }


res_df = pd.DataFrame(
    [parse_result_path(_path) for _path in res_files],
    columns=["METHOD", "N FEATURES", "TRAIN RATIO", "PATH"],
)

# Rows are collected in a list and turned into a frame once; growing a
# DataFrame one row at a time copies it on every insertion.
_score_rows = []
for _idx, _vals in res_df.iterrows():
    # Label-based access: positional indexing of a labelled Series is deprecated.
    _method = _vals["METHOD"]
    _n_feats = _vals["N FEATURES"]
    _train_r = _vals["TRAIN RATIO"]
    _path = _vals["PATH"]
    _is_baseline = CMET_PTN.match(_method) is not None

    with open(_path) as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines
            ds = line.split("\t")

            # NOTE(review): eval() executes whatever the result file contains;
            # only run this on files you produced yourself. Consider
            # ast.literal_eval once the file format is confirmed to be plain
            # literals.
            # At most 10 occurrences of "inf" are replaced by the finite 10.0.
            _val = ds[1].replace("inf", "10.0", 10)
            vals = eval(_val)

            # The first field splits on "/" or "_" into MMS number, random
            # seed and data-set name.
            dss = re.split(r"[/_]", ds[0])
            _mmsno = dss[0]
            _rndseed = int(dss[1])
            _dataset = dss[2]

            if _is_baseline:
                _rmse, _r2 = vals[0], vals[1]
            elif "train_all" in ds[0]:
                _rmse, _r2 = vals[0][1], vals[0][2]
            elif "test" in ds[0]:
                _rmse, _r2 = vals[0], vals[1]
            else:
                continue

            _score_rows.append({
                "MMS": f"MMS{_mmsno}", "METHOD": _method, "N FEATURES": _n_feats,
                "TRAIN RATIO": _train_r, "DATASET": _dataset, "RSEED": _rndseed,
                "RMSE": _rmse, "R2": _r2,
            })

df = pd.DataFrame(
    _score_rows,
    columns=["MMS", "METHOD", "N FEATURES", "TRAIN RATIO", "DATASET", "RSEED", "RMSE", "R2"],
)

pd.options.display.float_format = "{:.2e}".format
df

Colored table

In [None]:
import seaborn as sns

# df_res holds, per (MMS, method), the eight DataFrame.describe() statistics of
# RMSE and R2 on the train rows followed by the same statistics on the test rows.
_STATS = ["COUNT", "MEAN", "STD", "MIN", "25%", "50%", "75%", "MAX"]
_stat_columns = [
    f"{_metric} {_split} {_stat}"
    for _split in ("TRAIN", "TEST")
    for _metric in ("RMSE", "R2")
    for _stat in _STATS
]
df_res = pd.DataFrame(columns=["MMS", "METHOD"] + _stat_columns)

mmss = df["MMS"].drop_duplicates().values

# Methods shown in the summary tables, in column order.
methods = [
    'FVD',
    'FVD2',
    'FVD2 +.5*X(d=.1) +.5*C(d=.1)',
    'MLR',
    'SVR',
]

# Raw method identifier -> column header used in the displayed tables.
meth_rep = {
    'FV': 'FV',
    'FVD': 'FVD',
    'FVD2': 'FVD2',
    'FVD2 +.5*X(d=.1) +.5*C(d=.1)': 'FVD2 +NOISE(X, C)',
}

print(mmss)
print(methods)

for _mms in mmss:
    for _method in methods:
        # Rows of this (MMS, method) pair, then split by dataset.
        _picked = df[(df["MMS"] == _mms) & (df["METHOD"] == _method)]
        _train = _picked[_picked["DATASET"] == "train"].loc[:, ["RMSE", "R2"]].describe()
        _test = _picked[_picked["DATASET"] == "test"].loc[:, ["RMSE", "R2"]].describe()

        new_row = [
            _mms,
            _method,
            *_train["RMSE"].values,
            *_train["R2"],
            *_test["RMSE"].values,
            *_test["R2"],
        ]
        df_res.loc[len(df_res)] = new_row

# display(df_res)

# Colormaps available for the styled tables (only cm_gr and cm_bl are used below).
cm_gr = sns.light_palette('green', reverse=True, as_cmap=True)
cm_br = sns.light_palette('blue', reverse=True, as_cmap=True)
cm_b  = sns.light_palette('blue', reverse=False, as_cmap=True)
cm_cw = sns.color_palette("coolwarm", as_cmap=True)
cm_ryb = sns.color_palette("RdYlBu", as_cmap=True)
cm_ryg = sns.color_palette("RdYlGn_r", as_cmap=True)
cm_bl = sns.color_palette("blend:#B3D5E7,#F8C1A6", as_cmap=True)


def _rmse_summary_tables(split):
    """Build the MMS x method tables of mean RMSE, median RMSE and run count.

    split is "TRAIN" or "TEST" and selects the matching df_res columns.
    Mean and median are divided by mms_std[mms] (defined elsewhere in the
    notebook; presumably the per-MMS target standard deviation -- TODO confirm).
    Rows are filled positionally, so df_res must list the methods of each MMS
    in the same order as `methods`.
    Returns (mean_df, median_df, count_df).
    """
    def _blank():
        return pd.DataFrame(index=mmss, columns=methods).rename(columns=meth_rep)

    mean_df, med_df, count_df = _blank(), _blank(), _blank()
    for mms in mmss:
        rows = df_res[df_res["MMS"] == mms]
        mean_df.loc[mms] = rows[f"RMSE {split} MEAN"].values / mms_std[mms]
        med_df.loc[mms] = rows[f"RMSE {split} 50%"].values / mms_std[mms]
        count_df.loc[mms] = rows[f"RMSE {split} COUNT"].values
    return mean_df, med_df, count_df


def _style_rmse_tables(mean_df, med_df, count_df):
    """Return Stylers for the three tables: row-wise gradient plus extreme-value markup.

    min_bold_max_italic / max_bold_min_italic are defined elsewhere in the
    notebook and are applied row by row (axis=1).
    """
    s_mean = mean_df.astype("float64").style.background_gradient(cmap=cm_gr, axis=1)
    s_med = med_df.astype("float64").style.background_gradient(cmap=cm_bl, axis=1)
    s_count = count_df.astype("int").style
    s_mean.format("{:.2f}")
    s_med.format("{:.2f}")

    # Styler.apply registers the function on the Styler and returns it, so one
    # call per table is enough.
    s_mean = s_mean.apply(min_bold_max_italic, axis=1)
    s_med = s_med.apply(min_bold_max_italic, axis=1)
    s_count = s_count.apply(max_bold_min_italic, axis=1)
    return s_mean, s_med, s_count


print("======= RMSE TRAIN =======")
df_rmse_train, df_rmse_train_med, df_rmse_train_count = _rmse_summary_tables("TRAIN")
# The styled TRAIN tables are bound to the *_train names (they were previously
# assigned to the *_test names by a copy-paste slip).
s_df_rmse_train, s_df_rmse_train_med, s_df_rmse_train_count = _style_rmse_tables(
    df_rmse_train, df_rmse_train_med, df_rmse_train_count)
display(s_df_rmse_train_med)


print("======= RMSE TEST =======")
df_rmse_test, df_rmse_test_med, df_rmse_test_count = _rmse_summary_tables("TEST")
s_df_rmse_test, s_df_rmse_test_med, s_df_rmse_test_count = _style_rmse_tables(
    df_rmse_test, df_rmse_test_med, df_rmse_test_count)
display(s_df_rmse_test_med)



In [None]:
from ipywidgets import interact, FloatSlider

# Launch seaborn's interactive ColorBrewer palette picker for diverging palettes
# (exploration aid only; the result is not stored).
sns.choose_colorbrewer_palette('diverging')

In [None]:
# _temp = df[df["MMS"] == "MMS03"]["R2"].mean()
# print(_temp)
# _temp = df[df["MMS"] == "MMS04"]["R2"].mean()
# print(_temp)

show boxplot

In [None]:
"""BOX PLOT of RMSE"""
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math

"""filter df by MMS01 and train_all"""
# Hand-tuned y-axis limits, keyed by number of features and then by MMS id.
_ylim_rmse = {
    4: {'MMS01': (0,10), 'MMS02': (0,1.5), 'MMS03': (0,3), 'MMS04': (0,5), 'MMS05': (0,10), 'MMS06': (0,8),
        'MMS07': (0,9), 'MMS08': (0,12), 'MMS09': (0,12), 'MMS10': (0,4), 'MMS11': (0,2), 'MMS12': (0,6)},
    7: {'MMS01': (0,6), 'MMS02': (0,2), 'MMS03': (0,6), 'MMS04': (0,8), 'MMS05': (0,4), 'MMS06': (0,2),
        'MMS07': (0,9), 'MMS08': (0,4), 'MMS09': (0,8), 'MMS10': (0,3), 'MMS11': (0,2), 'MMS12': (0,4)},
    10: {'MMS01': (0,20), 'MMS02': (0,7), 'MMS03': (0,4), 'MMS04': (0,4), 'MMS05': (0,4), 'MMS06': (0,2),
         'MMS07': (0,5), 'MMS08': (0,8), 'MMS09': (0,4), 'MMS10': (0,3), 'MMS11': (0,9), 'MMS12': (0,5)},
}

_ylim_r2 = {
    4: {'MMS01': (-60,1), 'MMS02': (-2,1), 'MMS03': (-18,1), 'MMS04': (-30,1), 'MMS05': (-100,1), 'MMS06': (-39,1),
        'MMS07': (-34,1), 'MMS08': (-200,1), 'MMS09': (-300,1), 'MMS10': (-13,1), 'MMS11': (-3,1), 'MMS12': (-30,1)},
    7: {'MMS01': (-34,1), 'MMS02': (-4,1), 'MMS03': (-37,1), 'MMS04': (-65,1), 'MMS05': (-10,1), 'MMS06': (-3,1),
        'MMS07': (-35,1), 'MMS08': (-12,1), 'MMS09': (-80,1), 'MMS10': (-10,1), 'MMS11': (-3,1), 'MMS12': (-15,1)},
    10: {'MMS01': (-300,1), 'MMS02': (-85,1), 'MMS03': (-14,1), 'MMS04': (-23,1), 'MMS05': (-7,1), 'MMS06': (-4,1),
         'MMS07': (-28,1), 'MMS08': (-28,1), 'MMS09': (-28,1), 'MMS10': (-8,1), 'MMS11': (-90,1), 'MMS12': (-10,1)},
}

# Switches selecting which figures this cell produces.
_PLOT_RMSE = True
_PLOT_R2 = False


def _plot_metric_boxplot(data, metric, ylim, ylabel, legend_loc, title, out_path):
    """Draw, save and show one boxplot of `metric` per METHOD, split by DATASET.

    data: frame with METHOD, DATASET and `metric` columns.
    ylim: (low, high) y-axis limits, set before drawing.
    out_path: PNG file written at 300 dpi.
    """
    plt.figure(figsize=(10, 6))
    plt.ylim(ylim)
    sns.set(style="ticks", palette="pastel")

    sns.boxplot(x="METHOD", y=metric,
                palette=["r", "g", "b"],
                hue="DATASET",
                dodge=True,
                data=data)
    sns.despine(offset=10, trim=True)
    plt.grid(axis="y", linestyle="--")
    plt.legend(loc=legend_loc)
    # Method names are long, so the tick labels are drawn vertically.
    plt.xticks(rotation=90)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.savefig(out_path, dpi=300, bbox_inches="tight")
    plt.show()


# Subset of MMS ids shown here; the limit tables above cover MMS01..MMS12.
for _mms in ['MMS01', 'MMS02', 'MMS06', 'MMS09', 'MMS11']:
    for _n_feats in [4, 7, 10]:
        print(f"============================== {_mms} nfeats: {_n_feats} ==============================")
        # Explicit copy: the METHOD column is rewritten below, and assigning
        # into a filtered slice of df raises SettingWithCopyWarning.
        _df = df[(df["MMS"] == _mms) & (df["N FEATURES"] == _n_feats)].copy()

        # Turn raw method ids into plot labels: drop "*", "+1" -> "+",
        # ".5" -> "0.5", drop "d=", and typeset X / C in bold math.
        _df["METHOD"] = [
            _method.replace("*", "").replace("+1", "+").replace(".", "0.").replace("d=", "")
                   .replace("X", r"$\bf{X}$").replace("C", r"$\bf{C}$")
            for _method in _df["METHOD"]
        ]

        if _PLOT_RMSE:
            _plot_metric_boxplot(_df, "RMSE", _ylim_rmse[_n_feats][_mms],
                                 ylabel="RMSE [log,Ki]", legend_loc="upper right",
                                 title=f"{_mms} (NFEAT={_n_feats})",
                                 out_path=f"boxplot_{_mms}_f{_n_feats}_rmse.png")

        if _PLOT_R2:
            _plot_metric_boxplot(_df, "R2", _ylim_r2[_n_feats][_mms],
                                 ylabel="R2", legend_loc="lower right",
                                 title=f"{_mms} (NFEAT={_n_feats})",
                                 out_path=f"boxplot_{_mms}_f{_n_feats}_r2.png")



show boxplot (by train ratio)

In [None]:
"""BOX PLOT of RMSE"""
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math

"""filter df by MMS01 and train_all"""
# Hand-tuned y-axis limits, keyed by number of features and then by MMS id.
# (_ylim_rmse is kept for reference; the RMSE plot derives its limit from the data.)
_ylim_rmse = {
    4: {'MMS01': (0,10), 'MMS02': (0,1.5), 'MMS03': (0,3), 'MMS04': (0,5), 'MMS05': (0,10), 'MMS06': (0,8),
        'MMS07': (0,9), 'MMS08': (0,12), 'MMS09': (0,12), 'MMS10': (0,4), 'MMS11': (0,2), 'MMS12': (0,6)},
    7: {'MMS01': (0,6), 'MMS02': (0,2), 'MMS03': (0,6), 'MMS04': (0,8), 'MMS05': (0,4), 'MMS06': (0,2),
        'MMS07': (0,9), 'MMS08': (0,4), 'MMS09': (0,8), 'MMS10': (0,3), 'MMS11': (0,2), 'MMS12': (0,4)},
    10: {'MMS01': (0,20), 'MMS02': (0,7), 'MMS03': (0,4), 'MMS04': (0,4), 'MMS05': (0,4), 'MMS06': (0,2),
         'MMS07': (0,5), 'MMS08': (0,8), 'MMS09': (0,4), 'MMS10': (0,3), 'MMS11': (0,9), 'MMS12': (0,5)},
}

_ylim_r2 = {
    4: {'MMS01': (-60,1), 'MMS02': (-2,1), 'MMS03': (-18,1), 'MMS04': (-30,1), 'MMS05': (-100,1), 'MMS06': (-39,1),
        'MMS07': (-34,1), 'MMS08': (-200,1), 'MMS09': (-300,1), 'MMS10': (-13,1), 'MMS11': (-3,1), 'MMS12': (-30,1)},
    7: {'MMS01': (-34,1), 'MMS02': (-4,1), 'MMS03': (-37,1), 'MMS04': (-65,1), 'MMS05': (-10,1), 'MMS06': (-3,1),
        'MMS07': (-35,1), 'MMS08': (-12,1), 'MMS09': (-80,1), 'MMS10': (-10,1), 'MMS11': (-3,1), 'MMS12': (-15,1)},
    10: {'MMS01': (-300,1), 'MMS02': (-85,1), 'MMS03': (-14,1), 'MMS04': (-23,1), 'MMS05': (-7,1), 'MMS06': (-4,1),
         'MMS07': (-28,1), 'MMS08': (-28,1), 'MMS09': (-28,1), 'MMS10': (-8,1), 'MMS11': (-90,1), 'MMS12': (-10,1)},
}

# Switches selecting which figures this cell produces (both off: the cell only
# prints the number of rows per setting).
_PLOT_RMSE = False
_PLOT_R2 = False


def _upper_whisker_ceiling(data):
    """Return the y-axis upper limit for the RMSE plot of `data`.

    For every method, take the test-set RMSE values, compute the fence
    Q3 + 1.5 * IQR, and keep the largest value strictly below that fence.
    The result is the maximum of those per-method values, rounded up to the
    next integer. Raises ValueError (from math.ceil) when no method yields a
    finite value.
    """
    test_rows = data[data["DATASET"].str.contains("test")]
    tops = []
    for _, rmse in test_rows.groupby("METHOD")["RMSE"]:
        q25, q75 = rmse.quantile(0.25), rmse.quantile(0.75)
        fence = (q75 - q25) * 1.5 + q75
        tops.append(rmse[rmse < fence].max())
    return math.ceil(pd.Series(tops, dtype="float64").max())


def _plot_ratio_boxplot(data, metric, ylim, ylabel, legend_loc, title, out_path):
    """Draw, save and show one boxplot of `metric` per METHOD, split by DATASET.

    data: frame with METHOD, DATASET and `metric` columns.
    ylim: (low, high) y-axis limits, set before drawing.
    out_path: PNG file written at 300 dpi.
    """
    plt.figure(figsize=(10, 6))
    plt.ylim(ylim)
    sns.set(style="ticks", palette="pastel")

    sns.boxplot(x="METHOD", y=metric,
                palette=["r", "g", "b"],
                hue="DATASET",
                dodge=True,
                data=data)
    sns.despine(offset=10, trim=True)
    plt.grid(axis="y", linestyle="--")
    plt.legend(loc=legend_loc)
    # Method names are long, so the tick labels are drawn vertically.
    plt.xticks(rotation=90)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.savefig(out_path, dpi=300, bbox_inches="tight")
    plt.show()


# Subset of MMS ids shown here; the limit tables above cover MMS01..MMS12.
for _mms in ['MMS01', 'MMS02', 'MMS06', 'MMS09', 'MMS11']:
    for _n_feats in [4, 7, 10]:
        for _train_r in [0.2, 0.5, 0.8]:
            print(f"============================== {_mms} nfeats: {_n_feats}, train_r: {_train_r} ==============================")
            # Explicit copy: the METHOD column is rewritten below, and assigning
            # into a filtered slice of df raises SettingWithCopyWarning.
            _df = df[(df["MMS"] == _mms)
                     & (df["N FEATURES"] == _n_feats)
                     & (df["TRAIN RATIO"] == _train_r)].copy()
            print(len(_df))

            # Turn raw method ids into plot labels: drop "*", "+1" -> "+",
            # ".5" -> "0.5", drop "d=", and typeset X / C in bold math.
            _df["METHOD"] = [
                _method.replace("*", "").replace("+1", "+").replace(".", "0.").replace("d=", "")
                       .replace("X", r"$\bf{X}$").replace("C", r"$\bf{C}$")
                for _method in _df["METHOD"]
            ]

            # The train ratio is part of every file name and title; without it
            # the three ratios overwrite each other (and the per-NFEAT figures
            # of the previous cell).
            if _PLOT_RMSE:
                _max = _upper_whisker_ceiling(_df)
                print(_max)
                _plot_ratio_boxplot(_df, "RMSE", (0, _max),
                                    ylabel="RMSE [log,Ki]", legend_loc="upper right",
                                    title=f"{_mms} (NFEAT={_n_feats}, TRAIN_R={_train_r:.1f})",
                                    out_path=f"boxplot_{_mms}_f{_n_feats}_r{_train_r}_rmse.png")

            if _PLOT_R2:
                _plot_ratio_boxplot(_df, "R2", _ylim_r2[_n_feats][_mms],
                                    ylabel="R2", legend_loc="lower right",
                                    title=f"{_mms} (NFEAT={_n_feats}, TRAIN_R={_train_r:.1f})",
                                    out_path=f"boxplot_{_mms}_f{_n_feats}_r{_train_r}_r2.png")


boxplot compact (each setting)

In [None]:
"""BOX PLOT of RMSE"""
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import re

# Hand-tuned RMSE y-axis limits, keyed by number of features and then by MMS id.
_ylim_rmse = {
    4: {'MMS01': (0,3.5), 'MMS02': (0,1.5), 'MMS03': (0,3), 'MMS04': (0,2), 'MMS05': (0,2), 'MMS06': (0,3),
        'MMS07': (0,2.5), 'MMS08': (0,2.5), 'MMS09': (0,3.5), 'MMS10': (0,2.5), 'MMS11': (0,1.5), 'MMS12': (0,3)},
    7: {'MMS01': (0,4), 'MMS02': (0,2), 'MMS03': (0,2), 'MMS04': (0,2.5), 'MMS05': (0,3), 'MMS06': (0,2),
        'MMS07': (0,3.5), 'MMS08': (0,2), 'MMS09': (0,8), 'MMS10': (0,1.5), 'MMS11': (0,2), 'MMS12': (0,2.5)},
    10: {'MMS01': (0,20), 'MMS02': (0,7), 'MMS03': (0,4), 'MMS04': (0,1.5), 'MMS05': (0,3), 'MMS06': (0,2),
         'MMS07': (0,5), 'MMS08': (0,2), 'MMS09': (0,3.5), 'MMS10': (0,2), 'MMS11': (0,2), 'MMS12': (0,4)},
}

# Methods included in the figures. Every entry needs a label in _m_rename.
_methods = [
    'FV', 'FVD', 'FVD2',
    'FV +.5*X(d=.1) +.5*C(d=.1)',
    'FVD2 +.5*X(d=.1) +.5*C(d=.1)',
    'N +.5*X(d=.1) +.5*C(d=.1)',
    'MLR',
    'SVR',
]

# Raw method id -> display label. The numeric prefix fixes the left-to-right
# order of the boxes, because the frame is sorted by label before plotting.
_m_rename = {
    'FV': '01. FIGP FV',
    'FVD': '02. FIGP FVD',
    'FVD2': '03. FIGP FVD2',
    'N +.5*X(d=.1) +.5*C(d=.1)': r'04. GPSR+NOISE_XC',
    'FV +.5*X(d=.1) +.5*C(d=.1)': r'05. FIGP FV+NOISE_XC',
    'FVD2 +.5*X(d=.1) +.5*C(d=.1)': r'06. FIGP FVD2+NOISE_XC',
    'MLR': '07. MLR',
    'SVR': '08. SVR',
}

_selected_data = {

}

# Switch for the RMSE figures of this cell.
_PLOT_RMSE = True

for _mms in ['MMS01', 'MMS02', 'MMS03', 'MMS04', 'MMS05', 'MMS06', 'MMS07', 'MMS08', 'MMS09', 'MMS10', 'MMS11', 'MMS12']:
    for _n_feats in [4, 7, 10]:
        print(f"============================== {_mms} nfeats: {_n_feats} ==============================")
        _df = df[df["MMS"] == _mms]
        _df = _df[_df["N FEATURES"] == _n_feats]
        # Row count of this setting before the method filter.
        print(len(_df))

        # Keep only the selected methods. Explicit copy: the METHOD column is
        # rewritten below, and assigning into a filtered slice of df raises
        # SettingWithCopyWarning.
        _df = _df[_df["METHOD"].isin(_methods)].copy()

        # Replace raw ids by numbered labels and sort so the boxes follow the numbering.
        _df["METHOD"] = [_m_rename[_method] for _method in _df["METHOD"]]
        _df = _df.sort_values(by=["METHOD"])

        # Final tick labels: drop the leading zero, break the line before "+",
        # and pad the two short labels with spaces.
        _df["METHOD"] = [
            re.sub("^0", "", _method.replace("+", "\n        +"))
              .replace("7. MLR", "      7. MLR      ")
              .replace("8. SVR", "      8. SVR      ")
            for _method in _df["METHOD"]
        ]

        if _PLOT_RMSE:
            plt.figure(figsize=(10, 6))
            plt.ylim(_ylim_rmse[_n_feats][_mms])
            sns.set(style="ticks", palette="pastel")

            # One box per method and dataset (hue).
            _bplot = sns.boxplot(x="METHOD", y="RMSE",
                                 palette=["r", "g", "b"],
                                 hue="DATASET",
                                 dodge=True,
                                 data=_df)
            sns.despine(offset=10, trim=True)
            plt.grid(axis="y", linestyle="--")
            plt.legend(loc="upper right", fontsize=16)
            plt.xticks(rotation=45, fontsize=14)
            plt.yticks(fontsize=14)
            plt.ylabel("RMSE [log,Ki]", fontsize=20)
            plt.xlabel("")
            plt.title(_mms+f" (NFEAT={_n_feats})", fontsize=20)
            plt.savefig(f"boxplot_cpt_{_mms}_f{_n_feats}_rmse.png", dpi=300, bbox_inches="tight")
            plt.show()


boxplot comparison (accumulated over all NFEATs)

In [None]:
"""BOX PLOT of RMSE"""
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import re

# Hand-tuned RMSE y-axis limits per MMS id.
_ylim_rmse = {
    'MMS01': (0,3.5), 'MMS02': (0,1.2), 'MMS03': (0,2), 'MMS04': (0,1.5), 'MMS05': (0,3), 'MMS06': (0,2.5),
    'MMS07': (0,3), 'MMS08': (0,2.5), 'MMS09': (0,3.5), 'MMS10': (0,2), 'MMS11': (0,2), 'MMS12': (0,3),
}

# Methods included in the figures. Every entry needs a label in _m_rename.
_methods = [
    'FV', 'FVD', 'FVD2',
    'FV +.5*X(d=.1) +.5*C(d=.1)',
    'FVD2 +.5*X(d=.1) +.5*C(d=.1)',
    'N +.5*X(d=.1) +.5*C(d=.1)',
    'MLR',
    'SVR',
]

# Raw method id -> display label. The numeric prefix fixes the left-to-right
# order of the boxes, because the frame is sorted by label before plotting.
_m_rename = {
    'FV': '01. FIGP FV',
    'FVD': '02. FIGP FVD',
    'FVD2': '03. FIGP FVD2',
    'N +.5*X(d=.1) +.5*C(d=.1)': r'04. GPSR+STBL_XC',
    'FV +.5*X(d=.1) +.5*C(d=.1)': r'05. FIGP FV+STBL_XC',
    'FVD2 +.5*X(d=.1) +.5*C(d=.1)': r'06. FIGP FVD2+STBL_XC',
    'MLR': '07. MLR',
    'SVR': '08. SVR',
}

_selected_data = {

}

# Switch for the RMSE figures of this cell.
_PLOT_RMSE = True

for _mms in ['MMS01', 'MMS02', 'MMS03', 'MMS04', 'MMS05', 'MMS06', 'MMS07', 'MMS08', 'MMS09', 'MMS10', 'MMS11', 'MMS12']:
    print(f"============================== {_mms} ==============================")
    # All feature counts are pooled: only the MMS id is filtered here.
    _df = df[df["MMS"] == _mms]
    # Row count of this MMS before the method filter.
    print(len(_df))

    # Keep only the selected methods. Explicit copy: the METHOD column is
    # rewritten below, and assigning into a filtered slice of df raises
    # SettingWithCopyWarning.
    _df = _df[_df["METHOD"].isin(_methods)].copy()

    # Replace raw ids by numbered labels and sort so the boxes follow the numbering.
    _df["METHOD"] = [_m_rename[_method] for _method in _df["METHOD"]]
    _df = _df.sort_values(by=["METHOD"])

    # Final tick labels: drop the leading zero, break the line before "+",
    # and pad the two short labels with spaces.
    _df["METHOD"] = [
        re.sub("^0", "", _method.replace("+", "\n        +"))
          .replace("7. MLR", "      7. MLR      ")
          .replace("8. SVR", "      8. SVR      ")
        for _method in _df["METHOD"]
    ]

    if _PLOT_RMSE:
        plt.figure(figsize=(10, 6))
        plt.ylim(_ylim_rmse[_mms])
        sns.set(style="ticks", palette="pastel")

        # One box per method and dataset (hue).
        _bplot = sns.boxplot(x="METHOD", y="RMSE",
                             palette=["r", "g", "b"],
                             hue="DATASET",
                             dodge=True,
                             data=_df)
        sns.despine(offset=10, trim=True)
        plt.grid(axis="y", linestyle="--")
        plt.legend(loc="upper right", fontsize=16)
        plt.xticks(rotation=45, fontsize=14)
        plt.yticks(fontsize=14)
        plt.ylabel("RMSE [log,Ki]", fontsize=20)
        plt.xlabel("")
        plt.title(_mms, fontsize=20)
        plt.savefig(f"boxplot_cpt_{_mms}_rmse.png", dpi=300, bbox_inches="tight")
        plt.show()


comparison NFEAT (absolute value)

In [None]:
"""BOX PLOT of RMSE"""
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import re

# Methods drawn in the figure; the position in this list selects the marker,
# the colour and the horizontal offset of each method.
_methods = [
    'FVD',
    'FVD2',
    'FVD2 +.5*X(d=.1) +.5*C(d=.1)',
    'SVR',
    'MEAN',
]

# Raw method id -> legend label.
_m_rename = {
    'FV': 'FIGP FV',
    'FVD': 'FIGP FVD',
    'FVD2': 'FIGP FVD2',
    'N +.5*X(d=.1) +.5*C(d=.1)': r'GPSR+STBL_XC',
    'FV +.5*X(d=.1) +.5*C(d=.1)': r'FIGP FV+STBL_XC',
    'FVD2 +.5*X(d=.1) +.5*C(d=.1)': r'FIGP FVD2+STBL_XC',
    'SVR': 'SVR',
    'MEAN': r'MEAN of $\it{training}$ $\it{set}$',
}

_selected_data = {

}

markers = ["o", "s", "D", "v", "^", ">", "<", "p", "P", "*", "X", "d"]
cmap = plt.get_cmap("tab10")

_MMS_IDS = ['MMS01', 'MMS02', 'MMS03', 'MMS04', 'MMS05', 'MMS06',
            'MMS07', 'MMS08', 'MMS09', 'MMS10', 'MMS11', 'MMS12']
_N_FEATS = [4, 7, 10]
_Y_TICKS = [0.5, 0.7, 0.9, 1.1]

# 2 x 7 grid for 12 panels; the last two cells of the bottom row stay empty
# (the legend is placed there).
fig, axes = plt.subplots(2, 7, figsize=(16, 10))
axes[-1, -1].axis('off')
axes[-1, -2].axis('off')

for _jdx, _mms in enumerate(_MMS_IDS):
    ax = axes[_jdx // 7, _jdx % 7]

    # Test-set rows of this MMS, shared by all methods.
    _df_mms_test = df[(df["MMS"] == _mms) & (df["DATASET"] == "test")]

    for idx, _method in enumerate(_methods):
        _df = _df_mms_test[_df_mms_test["METHOD"] == _method]
        _method_name = _m_rename[_method]

        # RMSE summary statistics per feature count (rows: 4, 7, 10).
        _df_q = pd.DataFrame(index=_N_FEATS,
                             columns=["mean", "median", "25%", "40%", "50%", "60%", "75%", "std"],
                             dtype="float64")
        for _n_feats in _N_FEATS:
            _rmse = _df.loc[_df["N FEATURES"] == _n_feats, "RMSE"]
            _df_q.loc[_n_feats, :] = [_rmse.mean(),
                                      _rmse.median(),
                                      _rmse.quantile(0.25),
                                      _rmse.quantile(0.4),
                                      _rmse.quantile(0.5),
                                      _rmse.quantile(0.6),
                                      _rmse.quantile(0.75),
                                      _rmse.std()]

        # Methods are shifted horizontally so their markers do not overlap.
        _x = _df_q.index + 0.2*idx - 0.1*len(_methods)

        # Median line, with error bars spanning the 40% to 60% quantiles.
        ax.plot(_x, _df_q.loc[:, "median"], color=cmap(idx), marker=markers[idx],
                linestyle="solid", label=_method_name)
        ax.errorbar(_x, _df_q.loc[:, "median"],
                    yerr=[_df_q.loc[:, "median"] - _df_q.loc[:, "40%"],
                          _df_q.loc[:, "60%"] - _df_q.loc[:, "median"]],
                    fmt='none', ecolor=cmap(idx), capsize=3, linewidth=0.5, alpha=0.7)

    # Axis cosmetics are applied once per panel, after all methods are drawn
    # (repeating them per method stacked identical title texts on top of each other).

    # Text properties such as fontsize may only be passed to set_yticks together
    # with labels; the font size is set through set_yticklabels below instead.
    ax.set_yticks(_Y_TICKS)

    # Put the legend outside of the last panel, in the empty grid cells.
    if _jdx == len(_MMS_IDS) - 1:
        ax.legend(bbox_to_anchor=(1.1, 0.7), loc='upper left', borderaxespad=0, fontsize=14)

    # Only the first panel of each row shows y tick labels and the y label.
    if _jdx % 7 != 0:
        ax.set_yticklabels([])
    else:
        ax.set_yticklabels(["0.5", "0.7", "0.9", "1.1"], fontsize=14)
        ax.set_ylabel("RMSE [log,Ki]", fontsize=16)

    ax.set_xticks([4, 7, 10])
    ax.set_xticklabels(["4", "7", "10"], fontsize=14)

    # One minor tick between each pair of major y ticks.
    ax.yaxis.set_minor_locator(ticker.AutoMinorLocator(2))

    ax.set_xlabel("N FEAT")
    ax.set_xlim(3, 11)
    ax.set_ylim(0.4, 1.2)
    ax.grid(axis="y", linestyle="dashed", alpha=0.5)
    ax.grid(axis="y", linestyle="dashed", alpha=0.5, which="minor")

    # Panel title drawn inside the axes.
    ax.text(.5, .90, _mms, fontsize=18,
            horizontalalignment='center',
            transform=ax.transAxes)


plt.subplots_adjust(wspace=0.1)
plt.savefig("plot_rmse_nfeat.png", dpi=300, bbox_inches="tight")
plt.show()



comparison NFEAT (relative value)

In [None]:
"""BOX PLOT of RMSE"""
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import re

# Methods to draw, in legend order. Comment entries in or out to change the
# comparison; every active entry needs a display name in `_m_rename`.
_methods = [
    # 'FV',
    'FVD',
    'FVD2',
    # 'FV +.5*X(d=.1) +.5*C(d=.1)',
    'FVD2 +.5*X(d=.1) +.5*C(d=.1)',
    # 'N +.5*X(d=.1) +.5*C(d=.1)',
    # 'MLR',
    'SVR',
    # 'FV +1*X(d=.1)',
    # 'FV +1*C(d=.1)',
    'MEAN',
    # 'FV +1*X(d=.01)', 'FV +1*X(d=.02)', 'FV +1*C(d=.01)',
    # 'FV +1*C(d=.2)', 'FV +1*X(d=.5)',
    # 'FV +1*C(d=.5)', 'N', 'FV +1*C(d=.05)'
]

# Raw METHOD value -> label shown in the legend.
_m_rename = {
    'FV': 'FIGP FV',
    'FVD': 'FIGP FVD',
    'FVD2': 'FIGP FVD2',
    'N +.5*X(d=.1) +.5*C(d=.1)': r'GPSR+STBL_XC',
    'FV +.5*X(d=.1) +.5*C(d=.1)': r'FIGP FV+STBL_XC',
    'FVD2 +.5*X(d=.1) +.5*C(d=.1)': r'FIGP FVD2+STBL_XC',
    # 'MLR': 'MLR',
    'SVR': 'SVR',
    'MEAN': r'MEAN of $\it{training}$ $\it{set}$',
}

_selected_data = {}

markers = ["o", "s", "D", "v", "^", ">", "<", "p", "P", "*", "X", "d"]
cmap = plt.get_cmap("tab10")

# 12 panels on a 2x7 grid; the two unused cells of the second row are hidden.
fig, axes = plt.subplots(2, 7, figsize=(16, 10))
axes[-1, -1].axis('off')
axes[-1, -2].axis('off')

for _jdx, _mms in enumerate(['MMS01', 'MMS02', 'MMS03', 'MMS04', 'MMS05', 'MMS06', 'MMS07', 'MMS08', 'MMS09', 'MMS10', 'MMS11', 'MMS12']):

    _row, _col = divmod(_jdx, 7)
    ax = axes[_row, _col]

    # Test-set rows of this target. The MEAN baseline is the same for every
    # plotted method, so it is selected once per panel.
    _df_test = df[(df["MMS"] == _mms) & (df["DATASET"] == "test")]
    _df_mean = _df_test[_df_test["METHOD"] == "MEAN"]

    for idx, _method in enumerate(_methods):

        _method_name = _m_rename[_method]
        _df = _df_test[_df_test["METHOD"] == _method]

        # RMSE statistics per feature count, each divided by the median RMSE
        # of the MEAN method at the same feature count.
        _rows = []
        for _n_feats in [4, 7, 10]:
            _se_mean_q = _df_mean[_df_mean["N FEATURES"] == _n_feats]["RMSE"].median()
            _rmse = _df[_df["N FEATURES"] == _n_feats]["RMSE"]
            _rows.append([_rmse.mean()/_se_mean_q,
                          _rmse.median()/_se_mean_q,
                          _rmse.quantile(0.25)/_se_mean_q,
                          _rmse.quantile(0.4)/_se_mean_q,
                          _rmse.quantile(0.5)/_se_mean_q,
                          _rmse.quantile(0.6)/_se_mean_q,
                          _rmse.quantile(0.75)/_se_mean_q,
                          _rmse.std()/_se_mean_q])
        _df_q = pd.DataFrame(_rows, index=[4, 7, 10],
                             columns=["mean", "median", "25%", "40%", "50%", "60%", "75%", "std"],
                             dtype=float)

        # Shift each method sideways so markers and error bars do not overlap.
        _x = _df_q.index + 0.2*idx - 0.1*len(_methods)

        # Median as a solid line; error bars span the 40% to 60% quantiles.
        ax.plot(_x, _df_q.loc[:, "median"], color=cmap(idx), marker=markers[idx], linestyle="solid", label=_method_name)
        ax.errorbar(_x, _df_q.loc[:, "median"],
                    yerr=[_df_q.loc[:, "median"] - _df_q.loc[:, "40%"], _df_q.loc[:, "60%"] - _df_q.loc[:, "median"]],
                    fmt='none', ecolor=cmap(idx), capsize=3, linewidth=0.5, alpha=0.7)

    # Panel decoration is done once per panel, after all methods are drawn.
    # Doing it inside the method loop stacked the title text len(_methods) times.
    ax.set_xlabel("N FEAT")
    ax.set_xlim(3, 11)
    ax.set_ylim(0.5, 1.5)

    # The legend is attached to the last panel and placed outside of it.
    if _jdx == 11:
        ax.legend(bbox_to_anchor=(1.1, 0.7), loc='upper left', borderaxespad=0, fontsize=14)

    # Only the first column carries y tick labels and the y axis label.
    if _col != 0:
        ax.tick_params(axis="y", labelleft=False)
    else:
        # tick_params changes the font size without freezing the label texts.
        ax.tick_params(axis="y", labelsize=14)
        ax.set_ylabel("Relative RMSE", fontsize=16)

    ax.set_xticks([4, 7, 10])
    ax.set_xticklabels(["4", "7", "10"], fontsize=14)

    ax.grid(axis="y", linestyle="dashed", alpha=0.5)
    ax.grid(axis="y", linestyle="dashed", alpha=0.5, which="minor")

    # Panel title inside the axes.
    ax.text(.5, .90, _mms, fontsize=18,
            horizontalalignment='center',
            transform=ax.transAxes)


plt.subplots_adjust(wspace=0.1)
plt.savefig("plot_rmse_nfeat_r.png", dpi=300, bbox_inches="tight")
plt.show()



comparison TRAIN RATIO

In [None]:
"""BOX PLOT of RMSE"""
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import re

# Methods to draw, in legend order. Comment entries in or out to change the
# comparison; every active entry needs a display name in `_m_rename`.
_methods = [
    # 'FV',
    'FVD',
    'FVD2',
    # 'FV +.5*X(d=.1) +.5*C(d=.1)',
    'FVD2 +.5*X(d=.1) +.5*C(d=.1)',
    # 'N +.5*X(d=.1) +.5*C(d=.1)',
    # 'MLR',
    'SVR',
    # 'FV +1*X(d=.1)',
    # 'FV +1*C(d=.1)',
    'MEAN',
    # 'FV +1*X(d=.01)', 'FV +1*X(d=.02)', 'FV +1*C(d=.01)',
    # 'FV +1*C(d=.2)', 'FV +1*X(d=.5)',
    # 'FV +1*C(d=.5)', 'N', 'FV +1*C(d=.05)'
]

# Raw METHOD value -> label shown in the legend.
_m_rename = {
    'FV': 'FIGP FV',
    'FVD': 'FIGP FVD',
    'FVD2': 'FIGP FVD2',
    'N +.5*X(d=.1) +.5*C(d=.1)': r'GPSR+STBL_XC',
    'FV +.5*X(d=.1) +.5*C(d=.1)': r'FIGP FV+STBL_XC',
    'FVD2 +.5*X(d=.1) +.5*C(d=.1)': r'FIGP FVD2+STBL_XC',
    # 'MLR': 'MLR',
    'SVR': 'SVR',
    'MEAN': r'MEAN of $\it{training}$ $\it{set}$',
}

_selected_data = {}

markers = ["o", "s", "D", "v", "^", ">", "<", "p", "P", "*", "X", "d"]
cmap = plt.get_cmap("tab10")

# 12 panels on a 2x7 grid; the two unused cells of the second row are hidden.
fig, axes = plt.subplots(2, 7, figsize=(16, 10))
axes[-1, -1].axis('off')
axes[-1, -2].axis('off')

for _jdx, _mms in enumerate(['MMS01', 'MMS02', 'MMS03', 'MMS04', 'MMS05', 'MMS06', 'MMS07', 'MMS08', 'MMS09', 'MMS10', 'MMS11', 'MMS12']):

    _row, _col = divmod(_jdx, 7)
    ax = axes[_row, _col]

    # Test-set rows of this target, selected once per panel.
    _df_test = df[(df["MMS"] == _mms) & (df["DATASET"] == "test")]

    for idx, _method in enumerate(_methods):

        _method_name = _m_rename[_method]
        _df = _df_test[_df_test["METHOD"] == _method]

        # RMSE statistics per train ratio.
        _rows = []
        for _train_r in [0.2, 0.5, 0.8]:
            _rmse = _df[_df["TRAIN RATIO"] == _train_r]["RMSE"]
            _rows.append([_rmse.mean(), _rmse.median(), _rmse.quantile(0.25),
                          _rmse.quantile(0.4), _rmse.quantile(0.5), _rmse.quantile(0.6),
                          _rmse.quantile(0.75), _rmse.std()])
        _df_q = pd.DataFrame(_rows, index=[0.2, 0.5, 0.8],
                             columns=["mean", "median", "25%", "40%", "50%", "60%", "75%", "std"],
                             dtype=float)

        # Shift each method sideways so markers and error bars do not overlap.
        _x = _df_q.index + 0.02*idx - 0.01*len(_methods)

        # Median as a solid line; error bars span the 40% to 60% quantiles.
        ax.plot(_x, _df_q.loc[:, "median"], color=cmap(idx), marker=markers[idx], linestyle="solid", label=_method_name)
        ax.errorbar(_x, _df_q.loc[:, "median"],
                    yerr=[_df_q.loc[:, "median"] - _df_q.loc[:, "40%"], _df_q.loc[:, "60%"] - _df_q.loc[:, "median"]],
                    fmt='none', ecolor=cmap(idx), capsize=3, linewidth=0.5, alpha=0.7)

    # Panel decoration is done once per panel, after all methods are drawn.
    # Doing it inside the method loop stacked the title text len(_methods) times.

    # The legend is attached to the last panel and placed outside of it.
    if _jdx == 11:
        ax.legend(bbox_to_anchor=(1.1, 0.7), loc='upper left', borderaxespad=0, fontsize=14)

    # NOTE(review): unlike the relative-RMSE figure, no x axis label is set
    # here; confirm that this is intended.
    ax.set_xlim(0, 1)
    ax.set_ylim(0.4, 2.1)
    ax.grid(axis="y", linestyle="dashed", alpha=0.5)
    ax.grid(axis="y", linestyle="dashed", alpha=0.5, which="minor")

    # Only the first column carries y tick labels and the y axis label.
    if _col != 0:
        ax.tick_params(axis="y", labelleft=False)
    else:
        ax.set_ylabel("RMSE [log,Ki]", fontsize=16)

    ax.set_xticks([0.2, 0.5, 0.8])
    ax.set_xticklabels(["0.2", "0.5", "0.8"], fontsize=14)

    # Panel title inside the axes.
    ax.text(.5, .90, _mms, fontsize=18,
            horizontalalignment='center',
            transform=ax.transAxes)


plt.subplots_adjust(wspace=0.1)
plt.savefig("plot_rmse_trainr.png", dpi=300, bbox_inches="tight")
plt.show()



comparison TRAIN RATIO (relative value)

In [None]:
"""BOX PLOT of RMSE"""
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import re

# Methods to draw, in legend order. Comment entries in or out to change the
# comparison; every active entry needs a display name in `_m_rename`.
_methods = [
    # 'FV',
    'FVD',
    'FVD2',
    # 'FV +.5*X(d=.1) +.5*C(d=.1)',
    'FVD2 +.5*X(d=.1) +.5*C(d=.1)',
    # 'N +.5*X(d=.1) +.5*C(d=.1)',
    # 'MLR',
    'SVR',
    # 'FV +1*X(d=.1)',
    # 'FV +1*C(d=.1)',
    'MEAN',
    # 'FV +1*X(d=.01)', 'FV +1*X(d=.02)', 'FV +1*C(d=.01)',
    # 'FV +1*C(d=.2)', 'FV +1*X(d=.5)',
    # 'FV +1*C(d=.5)', 'N', 'FV +1*C(d=.05)'
]

# Raw METHOD value -> label shown in the legend.
_m_rename = {
    'FV': 'FIGP FV',
    'FVD': 'FIGP FVD',
    'FVD2': 'FIGP FVD2',
    'N +.5*X(d=.1) +.5*C(d=.1)': r'GPSR+STBL_XC',
    'FV +.5*X(d=.1) +.5*C(d=.1)': r'FIGP FV+STBL_XC',
    'FVD2 +.5*X(d=.1) +.5*C(d=.1)': r'FIGP FVD2+STBL_XC',
    # 'MLR': 'MLR',
    'SVR': 'SVR',
    'MEAN': r'MEAN of $\it{training}$ $\it{set}$',
}

_selected_data = {}

markers = ["o", "s", "D", "v", "^", ">", "<", "p", "P", "*", "X", "d"]
cmap = plt.get_cmap("tab10")

# 12 panels on a 2x7 grid; the two unused cells of the second row are hidden.
fig, axes = plt.subplots(2, 7, figsize=(16, 10))
axes[-1, -1].axis('off')
axes[-1, -2].axis('off')

for _jdx, _mms in enumerate(['MMS01', 'MMS02', 'MMS03', 'MMS04', 'MMS05', 'MMS06', 'MMS07', 'MMS08', 'MMS09', 'MMS10', 'MMS11', 'MMS12']):

    _row, _col = divmod(_jdx, 7)
    ax = axes[_row, _col]

    # Test-set rows of this target. The MEAN baseline is the same for every
    # plotted method, so it is selected once per panel.
    _df_test = df[(df["MMS"] == _mms) & (df["DATASET"] == "test")]
    _df_mean = _df_test[_df_test["METHOD"] == "MEAN"]

    for idx, _method in enumerate(_methods):

        _method_name = _m_rename[_method]
        _df = _df_test[_df_test["METHOD"] == _method]

        # RMSE statistics per train ratio, each divided by the median RMSE of
        # the MEAN method at the same train ratio.
        _rows = []
        for _train_r in [0.2, 0.5, 0.8]:
            _se_mean_q = _df_mean[_df_mean["TRAIN RATIO"] == _train_r]["RMSE"].median()
            _rmse = _df[_df["TRAIN RATIO"] == _train_r]["RMSE"]
            _rows.append([_rmse.mean()/_se_mean_q, _rmse.median()/_se_mean_q, _rmse.quantile(0.25)/_se_mean_q,
                          _rmse.quantile(0.4)/_se_mean_q, _rmse.quantile(0.5)/_se_mean_q, _rmse.quantile(0.6)/_se_mean_q,
                          _rmse.quantile(0.75)/_se_mean_q, _rmse.std()/_se_mean_q])
        _df_q = pd.DataFrame(_rows, index=[0.2, 0.5, 0.8],
                             columns=["mean", "median", "25%", "40%", "50%", "60%", "75%", "std"],
                             dtype=float)

        # Shift each method sideways so markers and error bars do not overlap.
        _x = _df_q.index + 0.02*idx - 0.01*len(_methods)

        # Median as a solid line; error bars span the 40% to 60% quantiles.
        ax.plot(_x, _df_q.loc[:, "median"], color=cmap(idx), marker=markers[idx], linestyle="solid", label=_method_name)
        ax.errorbar(_x, _df_q.loc[:, "median"],
                    yerr=[_df_q.loc[:, "median"] - _df_q.loc[:, "40%"], _df_q.loc[:, "60%"] - _df_q.loc[:, "median"]],
                    fmt='none', ecolor=cmap(idx), capsize=3, linewidth=0.5, alpha=0.7)

    # Panel decoration is done once per panel, after all methods are drawn.
    # Doing it inside the method loop stacked the title text len(_methods) times.

    # The legend is attached to the last panel and placed outside of it.
    if _jdx == 11:
        ax.legend(bbox_to_anchor=(1.1, 0.7), loc='upper left', borderaxespad=0, fontsize=14)

    ax.set_xlabel("TRAIN RATIO", fontsize=14)
    ax.set_xlim(0, 1)
    ax.set_ylim(0.5, 2.0)
    ax.grid(axis="y", linestyle="dashed", alpha=0.5)
    ax.grid(axis="y", linestyle="dashed", alpha=0.5, which="minor")

    # Only the first column carries y tick labels and the y axis label.
    if _col != 0:
        ax.tick_params(axis="y", labelleft=False)
    else:
        # tick_params changes the font size without freezing the label texts.
        ax.tick_params(axis="y", labelsize=14)
        ax.set_ylabel("Relative RMSE", fontsize=16)

    ax.set_xticks([0.2, 0.5, 0.8])
    ax.set_xticklabels(["0.2", "0.5", "0.8"], fontsize=14)

    # Panel title inside the axes.
    ax.text(.5, .90, _mms, fontsize=18,
            horizontalalignment='center',
            transform=ax.transAxes)


plt.subplots_adjust(wspace=0.1)
plt.savefig("plot_rmse_trainr_r.png", dpi=300, bbox_inches="tight")
plt.show()



best expression for each MMS, rendered from TeX

In [None]:
import sympy as sp
import pandas as pd
import re
from IPython.display import Math

def float_precision(float_str, _precision=3):
    """Shorten a decimal literal to about ``_precision`` significant digits.

    The input is handled as text so that its leading zeros can be counted:

    * no decimal point, or no non-zero digit at all: returned unchanged;
    * first character is a non-zero digit ("12.345"): rounded to
      ``_precision`` significant digits, or to an integer when the integer
      part alone already has more digits than ``_precision``;
    * "0.00123..." style: rounded so that ``_precision`` digits remain after
      the leading zeros of the fraction;
    * literals that already have fewer digits than requested ("1.2", "0.12")
      are returned unchanged.

    As a side effect the input followed by " -> " is printed without a
    newline, so ``print(float_precision(s))`` shows "input -> output".

    Args:
        float_str: Unsigned decimal literal as a string.
        _precision: Number of significant digits to keep.

    Returns:
        The shortened literal as a string.

    Raises:
        RuntimeError: If the first non-zero digit is at index 1
            (for example "01.5", ".5" or a signed literal).
    """
    if "." not in float_str:
        _ret = float_str
    else:
        _match = re.search("[1-9]", float_str)
        _idx = float_str.index(".")
        _n_frac = len(float_str) - (_idx + 1)  # digits after the point
        if _match is None:
            # Only zeros ("0.0", "0.000"): there is nothing to round.
            _ret = float_str
        elif _match.start() == 0:  # 1.234, 12.345, ...
            _decimals = _precision - _idx
            if _decimals < 0:
                # The integer part alone exceeds the precision.
                _ret = f"{round(float(float_str))}"
            elif _decimals <= _n_frac:
                _ret = f"{float(float_str):.{_decimals}f}"
            else:
                # Fewer digits than requested: keep the literal as written.
                _ret = float_str
        elif _match.start() > 1:  # 0.123, 0.01234, ...
            _decimals = _precision + _match.start() - 2
            if _decimals < _n_frac:
                _ret = f"{float(float_str):.{_decimals}f}"
            else:
                _ret = float_str
        else:
            raise RuntimeError("unexpected float_str", float_str)

    print(float_str, end=" -> ")
    return _ret

# Result table, one row per expression; the CSV file has no header row.
res_tex = pd.read_csv("../scripts/res_tex_all.csv", header=None)
res_tex.columns = ["PATH", "MMS", "SUBPATH", "RMSE_TRAIN", "RMSE_TEST", "TEX"]

# PATH has the form 2303??_D<train ratio digits>_F<n features>_<method> with
# zero, one or two extra "_<tag><weight>:<d>" fields after the method.
PATH_PTN = re.compile(r"^2303.._D([^_]*)_F([^_]*)_(([^_]*)|([^_]*)_([^_]*)|([^_]*)_([^_]*)_([^_]*))$")
# Two descriptor names separated by a single space.
XX_PTN = re.compile(r"(arings|acc|don|a_heavy|logp|rbc|rings|tpsa|vdw_vol|mw) (arings|acc|don|a_heavy|logp|rbc|rings|tpsa|vdw_vol|mw)")
res_tex.loc[:, ["N FEATURES", "TRAIN RATIO", "METHOD"]] = None


for _idx in res_tex.index:

    _match = PATH_PTN.match(res_tex.loc[_idx, "PATH"])
    if _match is None:
        # Rows whose PATH does not follow the naming scheme keep None values.
        continue

    _train_r = float(f"0.{_match.group(1)}")
    _n_feats = int(_match.group(2))
    _method = None

    if _match.group(4) is not None:
        # Method name only.
        _method = _match.group(4)
    elif _match.group(5) is not None:
        # Method plus one extra field.
        _lmd = _match.group(6)[1:].split(":")
        _method = f"{_match.group(5)} +{_lmd[0]}*{_match.group(6)[0]}(d={_lmd[1]})"
    elif _match.group(7) is not None:
        # Method plus two extra fields.
        _xlmd = _match.group(8)[1:].split(":")
        _clmd = _match.group(9)[1:].split(":")
        _method = f"{_match.group(7)} +{_xlmd[0]}*{_match.group(8)[0]}(d={_xlmd[1]}) +{_clmd[0]}*{_match.group(9)[0]}(d={_clmd[1]})"
    else:
        raise Exception("no match", _match.groups())

    res_tex.loc[_idx, ["N FEATURES", "TRAIN RATIO", "METHOD"]] = [_n_feats, _train_r, _method]

# Put the parsed settings first; same order as the former positional selection.
res_tex = res_tex[["METHOD", "TRAIN RATIO", "N FEATURES", "MMS", "RMSE_TRAIN", "RMSE_TEST", "TEX", "SUBPATH", "PATH"]]

for _idx in range(1, 13):
    _mms = f"MMS{_idx:02d}"

    # Row with the lowest test RMSE for this target.
    _best = res_tex[res_tex["MMS"] == _mms].sort_values(by="RMSE_TEST").iloc[:1, :]
    if _best.empty:
        print(f"{_mms}: no rows in res_tex, skipped")
        continue

    _tex = r"$$\mathrm{" + _best.iloc[0, :].loc['TEX'] + "}$$"
    display(_best.loc[:, ["MMS", "METHOD", "N FEATURES", "TRAIN RATIO", "SUBPATH", "RMSE_TRAIN", "RMSE_TEST"]])

    # Shorten every decimal literal in the expression.
    _tex = re.sub(r"(?<![0-9])[0-9]+\.[0-9]+(?![0-9])",
                  lambda _m: float_precision(_m.group(0)), _tex)
    print(_tex)

    # Insert an explicit product sign between two adjacent descriptor names.
    if XX_PTN.search(_tex):
        print("match!!")
        _tex = XX_PTN.sub("\\1\\\\cdot \\2", _tex)
        print(_tex)

    display(Math(_tex[2:-2].replace("$", "", 4)))
    sp.preview(_tex, viewer='file', filename=f"{_mms}_expr.png", euler=False)

best expression by FV, FVD, FVD2, FVD2+NOISE

In [None]:
# Methods to report and the short tag used in the output file names.
_methods = ["FV", "FVD", "FVD2", "FVD2 +.5*X(d=.1) +.5*C(d=.1)"]
_meth_rep = {"FV": "fv", "FVD": "fvd", "FVD2": "fvd2", "FVD2 +.5*X(d=.1) +.5*C(d=.1)": "fvd2xc"}

# Two descriptor names separated by a single space.
XX_PTN = re.compile(r"(arings|acc|don|a_heavy|logp|rbc|rings|tpsa|vdw_vol|mw) (arings|acc|don|a_heavy|logp|rbc|rings|tpsa|vdw_vol|mw)")

for _idx in range(1, 13):
    _mms = f"MMS{_idx:02d}"

    for _method in _methods:
        # Row with the lowest test RMSE for this target and method.
        _best = res_tex[res_tex["MMS"] == _mms]
        _best = _best[_best["METHOD"] == _method].sort_values(by="RMSE_TEST").iloc[:1, :]
        if _best.empty:
            print(f"{_mms} / {_method}: no rows in res_tex, skipped")
            continue

        _tex = r"$$\mathrm{" + _best.iloc[0, :].loc['TEX'] + "}$$"
        display(_best.loc[:, ["MMS", "METHOD", "N FEATURES", "TRAIN RATIO", "SUBPATH", "RMSE_TRAIN", "RMSE_TEST"]])

        # Shorten every decimal literal; the unshortened expression is printed
        # before the shortened one replaces it.
        _rounded = re.sub(r"(?<![0-9])[0-9]+\.[0-9]+(?![0-9])",
                          lambda _m: float_precision(_m.group(0)), _tex)
        print(_tex)
        _tex = _rounded

        # Insert an explicit product sign between two adjacent descriptor names.
        if XX_PTN.search(_tex):
            print("match!!")
            _tex = XX_PTN.sub("\\1\\\\cdot \\2", _tex)
            print(_tex)

        display(Math(_tex[2:-2].replace("$", "", 4)))
        sp.preview(_tex, viewer='file', filename=f"{_mms}_{_meth_rep[_method]}_expr.png", euler=False)

In [None]:
# Scratch check: render one hand-written expression to see how it looks.
# The literal is split at the top-level fractions for readability.
_tex = (
    r"\mathrm{"
    r"\frac{acc - 122.571990506088}"
    r"{don - 32.0349830495832 + \frac{788.515122239802}{a_{heavy}}}"
    r" + "
    r"\frac{\sqrt{rbc} + 38.0215962513931}"
    r"{-0.151016396418077 + \frac{834.685645085884}{vdw_{vol}}}"
    r"}"
)

display(Math(_tex))

In [None]:
import numpy as np
import pandas as pd
# Scratch demo with random data: per-column summary statistics as line plots.
# The summary is kept in its own variable. It used to be stored in `df`, which
# replaced the results table that the following cells read (`df["MMS"]`, ...).
rd = pd.DataFrame(np.random.randn(100, 10))
# Calculate all the desired values
df_demo = pd.DataFrame({'mean': rd.mean(), 'median': rd.median(),
                        '25%': rd.quantile(0.25), '50%': rd.quantile(0.5),
                        '75%': rd.quantile(0.75)})
# And plot it
df_demo.plot()
display(rd)

In [None]:
import re

def float_precision(float_str, _precision=3):
    """Shorten a decimal literal to about ``_precision`` significant digits.

    The input is handled as text so that its leading zeros can be counted:

    * no decimal point, or no non-zero digit at all: returned unchanged;
    * first character is a non-zero digit ("12.345"): rounded to
      ``_precision`` significant digits, or to an integer when the integer
      part alone already has more digits than ``_precision``;
    * "0.00123..." style: rounded so that ``_precision`` digits remain after
      the leading zeros of the fraction;
    * literals that already have fewer digits than requested ("1.2", "0.12")
      are returned unchanged.

    As a side effect the input followed by " -> " is printed without a
    newline, so ``print(float_precision(s))`` shows "input -> output".

    Args:
        float_str: Unsigned decimal literal as a string.
        _precision: Number of significant digits to keep.

    Returns:
        The shortened literal as a string.

    Raises:
        RuntimeError: If the first non-zero digit is at index 1
            (for example "01.5", ".5" or a signed literal).
    """
    if "." not in float_str:
        _ret = float_str
    else:
        _match = re.search("[1-9]", float_str)
        _idx = float_str.index(".")
        _n_frac = len(float_str) - (_idx + 1)  # digits after the point
        if _match is None:
            # Only zeros ("0.0", "0.000"): there is nothing to round.
            _ret = float_str
        elif _match.start() == 0:  # 1.234, 12.345, ...
            _decimals = _precision - _idx
            if _decimals < 0:
                # The integer part alone exceeds the precision.
                _ret = f"{round(float(float_str))}"
            elif _decimals <= _n_frac:
                _ret = f"{float(float_str):.{_decimals}f}"
            else:
                # Fewer digits than requested: keep the literal as written.
                _ret = float_str
        elif _match.start() > 1:  # 0.123, 0.01234, ...
            _decimals = _precision + _match.start() - 2
            if _decimals < _n_frac:
                _ret = f"{float(float_str):.{_decimals}f}"
            else:
                _ret = float_str
        else:
            raise RuntimeError("unexpected float_str", float_str)

    print(float_str, end=" -> ")
    return _ret

# Manual check: each line reads "input -> output", because float_precision
# itself prints the input and the arrow before its result is printed.
for _sample in (
    "0.000123456789",
    "0.0123456789",
    "1.23456789",
    "0.123456789",
    "1.2345",
    "12.345",
    "123.45",
    "1234.5",
    "1234.56789",
    "1.2",
    "0.12",
    "0.012",
    "0.01",
    "0.00012345",
    "0.00012",
    "9.5",
):
    print(float_precision(_sample))


In [None]:
# Summary table: one row per (MMS, METHOD, N FEATURES, TRAIN RATIO) with the
# describe() statistics of RMSE and R2 on the train and on the test rows.
_stat_names = ["COUNT", "MEAN", "STD", "MIN", "25%", "50%", "75%", "MAX"]
_res_columns = ["MMS", "METHOD", "N FEATURES", "TRAIN RATIO"] + [
    f"{_metric} {_split} {_stat}"
    for _split in ("TRAIN", "TEST")
    for _metric in ("RMSE", "R2")
    for _stat in _stat_names
]

mmss = df["MMS"].drop_duplicates().values
methods = df["METHOD"].drop_duplicates().values
nfeatss = df["N FEATURES"].drop_duplicates().values
trainrs = df["TRAIN RATIO"].drop_duplicates().values

print(mmss)
print(methods)
print(nfeatss)
print(trainrs)

# Every combination gets a row, including combinations without data (their
# COUNT is 0); later cells rely on one row per method. Rows are collected in
# a list and the frame is built once instead of growing it row by row.
_res_rows = []
for _mms in mmss:
    for _method in methods:
        for _nfeats in nfeatss:
            for _trainr in trainrs:
                _sel = df[(df["MMS"] == _mms) & (df["METHOD"] == _method) & (df["N FEATURES"] == _nfeats) & (df["TRAIN RATIO"] == _trainr)]
                _train = _sel[_sel["DATASET"] == "train"].loc[:, ["RMSE", "R2"]].describe()
                _test = _sel[_sel["DATASET"] == "test"].loc[:, ["RMSE", "R2"]].describe()
                _res_rows.append([_mms, _method, _nfeats, _trainr,
                                  *_train["RMSE"], *_train["R2"],
                                  *_test["RMSE"], *_test["R2"]])

df_res = pd.DataFrame(_res_rows, columns=_res_columns)

df_res


In [None]:
# import matplotlib.pyplot as plt
import seaborn as sns

# Colour maps used for the background gradients of the styled tables.
cm_gr = sns.light_palette('green', reverse=True, as_cmap=True)
cm_br = sns.light_palette('blue', reverse=True, as_cmap=True)
cm_b  = sns.light_palette('blue', reverse=False, as_cmap=True)


# For every (N FEATURES, TRAIN RATIO) pair, display three MMS x METHOD tables
# taken from `df_res`: mean test RMSE, median test RMSE and the test sample count.
for _nfeats in nfeatss:
    for _trainr in trainrs:
        print(f"======= RMSE TEST (N FEATS: {_nfeats}, TRAIN RATIO: {_trainr}) =======")
        _df = df_res[(df_res["N FEATURES"] == _nfeats) & (df_res["TRAIN RATIO"] == _trainr)]

        df_rmse_test = pd.DataFrame(index=mmss, columns=methods)
        df_rmse_test_med = pd.DataFrame(index=mmss, columns=methods)
        df_rmse_test_count = pd.DataFrame(index=mmss, columns=methods)
        for _mms in mmss:
            # Values are assigned by position, so the rows of `_df` for this MMS
            # must be in the same order as `methods`.
            _mms_rows = _df[_df["MMS"] == _mms]
            df_rmse_test.loc[_mms] = _mms_rows["RMSE TEST MEAN"].values
            df_rmse_test_med.loc[_mms] = _mms_rows["RMSE TEST 50%"].values
            df_rmse_test_count.loc[_mms] = _mms_rows["RMSE TEST COUNT"].values

        # Each style function is applied exactly once per table; every Styler
        # method used here returns the Styler, so the calls can be chained.
        s_df_rmse_test = (
            df_rmse_test.astype("float64")
            .style.background_gradient(cmap=cm_gr, axis=1)
            .format("{:.2e}")
            .apply(min_bold_max_italic, axis=1)
        )
        s_df_rmse_test_med = (
            df_rmse_test_med.astype("float64")
            .style.background_gradient(cmap=cm_br, axis=1)
            .format("{:.2e}")
            .apply(min_bold_max_italic, axis=1)
        )
        s_df_rmse_test_count = (
            df_rmse_test_count.astype("int")
            .style.apply(max_bold_min_italic, axis=1)
        )

        display(s_df_rmse_test)
        display(s_df_rmse_test_med)
        display(s_df_rmse_test_count)
