In [1]:
import numpy as np
import pandas as pd
import random as rnd
import torch
import matplotlib
import matplotlib.pyplot as plt

# Render figures through the PGF backend so they can be embedded in LaTeX documents.
matplotlib.use("pgf")

# PGF/LaTeX text rendering options, set one key at a time.
matplotlib.rcParams["pgf.texsystem"] = "pdflatex"
matplotlib.rcParams['font.family'] = 'serif'
matplotlib.rcParams['text.usetex'] = True
matplotlib.rcParams['pgf.rcfonts'] = False

# Default font sizes and line/marker sizes for every figure in this notebook.
parameters = {
    "axes.labelsize": 20,
    "legend.fontsize": 16,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "lines.linewidth": 2,
    "lines.markersize": 10,
}
plt.rcParams.update(parameters)



import lcpfn.lcpfn as lcpfn

# Learning curves held out for testing.
path_test = '/mnt/c/Users/prath/PycharmProjects/rp/LCDB_localised/test_curves.pkl'
df_test = pd.read_pickle(path_test)

# All preprocessed learning curves; this is the frame the evaluation cells index into.
path_all = '/mnt/c/Users/prath/PycharmProjects/rp/LCDB_localised/all_curves_preprocessed.pkl'
df_all = pd.read_pickle(path_all)

# Set model_name to a checkpoint path to load custom weights; None uses the LCPFN default.
# model_name = '/mnt/c/Users/prath/PycharmProjects/rp/Data/model_lcdb_2.pt'
model_name = None
model = lcpfn.LCPFN() if model_name is None else lcpfn.LCPFN(model_name=model_name)
print(model_name)

# -------------------------------------------------

# Directory holding the fitting results (df_total.gz) used for the last1 / mmf4 baselines.
# path_mmf4_last1 = "/mnt/c/Users/prath/Documents/Uni(Tudelft)/Bsc_cs/Year_3/RP/lcdb/publications/2022-ecml/analysis/"
path_mmf4_last1 = "/mnt/c/Users/prath/Downloads/fitting_results_new/"
try:
    df_total = pd.read_pickle(path_mmf4_last1 + "df_total.gz")
except FileNotFoundError:
    # Baselines are optional: downstream code checks for None and reports LCPFN only.
    print("df_total not found")
    df_total = None


None


In [3]:
# Shared configuration and bookkeeping used by the evaluation cells.
CUT_OFF = 10  # cutoff on the 0-100 normalised anchor scale; values used: 10, 20, 40, 80
# (openmlid, learner) pairs for which no last1 / mmf4 baseline row was found.
NO_LAST1, NO_MMF4 = [], []

In [4]:
def get_curve(i: int):
    """Fetch row ``i`` of ``df_all`` and unpack its learning-curve fields.

    Returns a tuple ``(anchors, means, std, openmlid, learner)`` taken from
    the identically named columns of that row.
    """
    record = df_all.iloc[i]
    wanted = ('anchors', 'means', 'std', 'openmlid', 'learner')
    return tuple(record[column] for column in wanted)

def get_closest_index(lst, target):
    """Return the index of the element of ``lst`` closest to ``target``.

    Ties are resolved in favour of the earliest index. An empty ``lst``
    raises ``ValueError``.
    """
    position, _ = min(enumerate(lst), key=lambda pair: abs(pair[1] - target))
    return position

def get_common_data(cutoff, extend, anchors, means, ):
    """Prepare model inputs shared by the MSE and MAE evaluations.

    The anchors are rescaled linearly so the smallest maps to 0 and the
    largest to 100. The cutoff index is the position of the rescaled anchor
    closest to ``cutoff``; the means strictly before that index form the
    observed part of the curve.

    Returns ``(x, y, anchors, means, cutoff_index)`` where
    - ``x`` is a column tensor of the integer-truncated anchors followed by
      ``np.arange(last_anchor, extend, 10)`` (this repeats the last anchor
      whenever ``extend`` is larger than it),
    - ``y`` is a float column tensor of the observed means,
    - ``anchors`` / ``means`` are the integer anchors and the means as arrays.
    """
    values = np.array(means)
    raw_anchors = np.array(anchors)
    lowest = np.min(raw_anchors)
    span = np.max(raw_anchors) - lowest
    scaled = ((raw_anchors - lowest) / span) * 100

    # Locate the split on the float scale, before the anchors are truncated to ints.
    split = get_closest_index(scaled, cutoff)
    observed = np.array(values[:split])
    int_anchors = scaled.astype(int)

    extra_anchors = np.arange(int_anchors[-1], extend, 10)
    grid = np.concatenate((int_anchors, extra_anchors))

    x = torch.from_numpy(grid).unsqueeze(1)
    y = torch.from_numpy(observed).float().unsqueeze(1)
    return x, y, int_anchors, values, split

def _lookup_baseline_mse(openlid, learner, cutoff, curve_model, missing_log):
    """Return the stored ``MSE_tst`` of ``curve_model`` for one curve, or None.

    Rows of ``df_total`` are selected by dataset id, learner, curve model and
    ``percentage_bucket == cutoff / 100``; among them, the row whose
    ``percentage`` is closest to ``cutoff / 100`` is used. When the lookup
    fails (for example no matching row), the pair is appended to
    ``missing_log``, a notice is printed and None is returned.
    """
    fraction = cutoff / 100
    try:
        rows = df_total.query(
            f'openmlid == {openlid} & learner == "{learner}" & percentage_bucket == {fraction} & curve_model == "{curve_model}"')
        distances = abs(fraction - rows['percentage'].values)
        nearest = min(range(len(distances)), key=distances.__getitem__)
        return rows.iloc[nearest]['MSE_tst']
    except Exception:
        # Deliberately best-effort, but `Exception` (not a bare except) so that
        # KeyboardInterrupt / SystemExit still stop a long evaluation loop.
        print(f"{curve_model} not found")
        print(openlid, learner)
        missing_log.append((openlid, learner))
        print("---------------")
        return None


def get_mse(model, anchors, means, openlid, learner, cutoff=CUT_OFF, extend=100, plot=False):
    """Compute the extrapolation MSE of LCPFN and, if available, of the baselines.

    The curve is split at ``cutoff`` (see ``get_common_data``). The model is
    conditioned on the part before the split and asked for the 0.05 / 0.5 /
    0.95 quantiles at the remaining anchors. Column 1 of the prediction is
    compared against the true means (assumed to be the 0.5 quantile, i.e.
    columns follow the order of ``qs``).

    Parameters:
        model: object exposing ``predict_quantiles(x_train, y_train, x_test, qs)``.
        anchors, means: the learning curve (sample sizes and scores).
        openlid, learner: identify the curve in ``df_total`` for the baselines.
        cutoff: split position on the 0-100 normalised anchor scale. The
            baseline lookup uses this same value, so LCPFN and the baselines
            are always compared at the same cutoff.
        extend: upper bound for extra anchors appended after the curve.
        plot: when True, draw the extrapolation via ``plot_it``.

    Returns:
        ``(mse_lcpfn,)`` when ``df_total`` is None, otherwise
        ``(mse_lcpfn, mse_last1, mse_mmf4)`` where a missing baseline is None.
    """
    x, y, anchrs, mns, cutoff_index = get_common_data(cutoff, extend, anchors, means, )

    predictions = model.predict_quantiles(x_train=x[:cutoff_index], y_train=y, x_test=x[cutoff_index:],
                                          qs=[0.05, 0.5, 0.95])

    if plot:
        plot_it(anchrs, mns, predictions, cutoff_index, x)

    predictions = predictions.detach().numpy()
    # Only the predictions that line up with real anchors are scored; any
    # anchors appended beyond the curve have no ground truth.
    mse_lcfpn = np.mean((predictions[:len(anchrs) - cutoff_index, 1] - mns[cutoff_index:]) ** 2)

    if df_total is None:
        return (mse_lcfpn,)

    mse_last1 = _lookup_baseline_mse(openlid, learner, cutoff, "last1", NO_LAST1)
    mse_mmf4 = _lookup_baseline_mse(openlid, learner, cutoff, "mmf4", NO_MMF4)
    return (mse_lcfpn, mse_last1, mse_mmf4)

def get_mae(model, anchors, means, cutoff=CUT_OFF, extend=100, plot=False):
    """Compute the mean absolute error of the LCPFN extrapolation.

    The curve is split at ``cutoff`` by ``get_common_data``. The model is
    conditioned on the part before the split, and column 1 of its quantile
    prediction (requested with ``qs=[0.05, 0.5, 0.95]``) is compared with
    the true means after the split. With ``plot=True`` the extrapolation is
    also drawn via ``plot_it``.
    """
    grid, observed, norm_anchors, curve, split = get_common_data(cutoff, extend, anchors, means, )

    quantiles = model.predict_quantiles(
        x_train=grid[:split],
        y_train=observed,
        x_test=grid[split:],
        qs=[0.05, 0.5, 0.95],
    )

    if plot:
        plot_it(norm_anchors, curve, quantiles, split, grid)

    # Keep only the predictions that correspond to real anchors of the curve.
    n_scored = len(norm_anchors) - split
    forecast = quantiles.detach().numpy()[:n_scored, 1]
    abs_errors = np.abs(forecast - curve[split:])
    return np.mean(abs_errors)

def plot_it(anchors, means, predictions, cutoff_index, x):
    """Plot the true curve together with the model's extrapolation.

    Draws the target curve (markers and line), column 1 of ``predictions``
    as red stars, the band between columns 0 and 2, and a vertical line at
    the cutoff anchor. The figure is written to a fixed PNG path and shown.
    """
    axes = plt.gca()
    future_x = x[cutoff_index:]

    axes.plot(anchors, means, "*", label="target")
    axes.plot(anchors, means, label="target")
    axes.plot(future_x, predictions[:, 1], "r*", label="Extrapolation by PFN")
    axes.fill_between(
        future_x.flatten(),
        predictions[:, 0],
        predictions[:, 2],
        color="blue",
        alpha=0.2,
        label="CI of 90%",
    )
    axes.vlines(x[cutoff_index], 0, 1, linewidth=0.5, color="k", label="cutoff")
    axes.set_ylim(0, 1)
    axes.legend(loc="lower right")

    plt.savefig("/mnt/c/Users/prath/PycharmProjects/rp/Data/extrapolation.png")
    plt.show()

In [5]:
# Sanity check: evaluate (and plot) the first curve only.
anchors, means, std, openlid, learner = get_curve(0)
print(openlid, learner)
first_curve_scores = get_mse(model, anchors, means, openlid, learner, cutoff=CUT_OFF, plot=True)
print(first_curve_scores)

44 SVC_linear
(0.0014973715581602148, 0.0004788485706655665, 0.00014325503509190595)


  plt.show()


In [6]:
# Score every curve in df_all; one row of MSE values and one (openmlid, learner) label per curve.
performance, performance_lable = [], []

n_curves = len(df_all)
for i in range(n_curves):
    anchors, means, std, openlid, learner = get_curve(i)
    # Progress counter, overwritten in place thanks to the carriage return.
    print(i + 1, end='\r')
    mse_row = get_mse(model, anchors, means, openlid, learner)
    performance.append(mse_row)
    performance_lable.append((openlid, learner))

performance = np.array(performance)

last1 not found
1083 SVC_linear
---------------
mmf4 not found
1083 SVC_linear
---------------
last1 not found
1083 SVC_poly
---------------
mmf4 not found
1083 SVC_poly
---------------
last1 not found
1083 SVC_rbf
---------------
mmf4 not found
1083 SVC_rbf
---------------
last1 not found
1083 SVC_sigmoid
---------------
mmf4 not found
1083 SVC_sigmoid
---------------
last1 not found
1083 sklearn.discriminant_analysis.LinearDiscriminantAnalysis
---------------
mmf4 not found
1083 sklearn.discriminant_analysis.LinearDiscriminantAnalysis
---------------
last1 not found
1083 sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis
---------------
mmf4 not found
1083 sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis
---------------
last1 not found
1083 sklearn.ensemble.ExtraTreesClassifier
---------------
mmf4 not found
1083 sklearn.ensemble.ExtraTreesClassifier
---------------
last1 not found
1083 sklearn.ensemble.GradientBoostingClassifier
---------------
mmf4 not found
108


KeyboardInterrupt



In [None]:
# Collect the per-curve results in a DataFrame indexed by (openmlid, learner).
# The number of metric columns is read from the array shape rather than by
# probing row 1, so this also works when only a single curve was evaluated.
n_metrics = performance.shape[1]
if n_metrics == 3:
    # Baselines were available: LCPFN plus last1 and mmf4.
    metric_columns = ['mse_lcpfn', 'mse_last1', 'mse_mmf4']
else:
    # No baseline table was loaded: LCPFN only.
    metric_columns = ['mse_lcpfn']
df = pd.DataFrame(performance, columns=metric_columns, index=performance_lable)

In [None]:
# Persist the per-curve MSE table for this cutoff.
df.to_pickle(f'/mnt/c/Users/prath/PycharmProjects/rp/Data/df_mse_{CUT_OFF}.pkl')

# Curves for which a baseline was missing, plus how many per dataset.
df_last1 = pd.DataFrame(NO_LAST1, columns=['openmlid', 'learner'])
df_mmf4 = pd.DataFrame(NO_MMF4, columns=['openmlid', 'learner'])
counts_last1 = df_last1['openmlid'].value_counts()
counts_mmf4 = df_mmf4['openmlid'].value_counts()

# Written in the same order as before: counts first, then the full lists.
for artefact, destination in (
    (counts_last1, f'/mnt/c/Users/prath/PycharmProjects/rp/Data/counts_last1_{CUT_OFF}.pkl'),
    (counts_mmf4, f'/mnt/c/Users/prath/PycharmProjects/rp/Data/counts_mmf4_{CUT_OFF}.pkl'),
    (df_last1, f'/mnt/c/Users/prath/PycharmProjects/rp/Data/df_last1_{CUT_OFF}.pkl'),
    (df_mmf4, f'/mnt/c/Users/prath/PycharmProjects/rp/Data/df_mmf4_{CUT_OFF}.pkl'),
):
    artefact.to_pickle(destination)

# Plot the results


In [None]:
# Pick the cutoff whose saved results should be plotted.
CUT_OFF = 80
df = pd.read_pickle(f'/mnt/c/Users/prath/PycharmProjects/rp/Data/df_mse_{CUT_OFF}.pkl')

# Drop curves without a baseline first, so the comparisons below never see None.
df = df.dropna(subset=['mse_last1', 'mse_mmf4'])

# Treat any MSE of 1 or more as an outlier and discard the row.
within_range = (df['mse_last1'] < 1) & (df['mse_mmf4'] < 1) & (df['mse_lcpfn'] < 1)
df = df[within_range]

# Largest remaining MSE per method.
for metric in ('mse_last1', 'mse_mmf4', 'mse_lcpfn'):
    print(max(df[metric]))



In [None]:
# Scatter plots of the LCPFN MSE against each baseline MSE, using the filtered `df`.

def _draw_mse_panel(axis, column, legend_label, x_label, axis_limit, diagonal, marker_size):
    """Scatter one baseline column against LCPFN with a dashed identity line."""
    axis.scatter(df[column], df['mse_lcpfn'], label=legend_label, alpha=0.5, s=marker_size)
    axis.plot(diagonal, diagonal, color='black', linestyle='--')
    axis.set_xlabel(x_label)
    axis.set_ylabel('MSE of LCPFN')
    axis.set_ylim(0, axis_limit)
    axis.set_xlim(0, axis_limit)


fig, ax = plt.subplots(nrows=2, figsize=(6, 10))
fig.tight_layout(pad=3.0)
fig.subplots_adjust(top=0.95)

s = 40
x = np.linspace(0, 1, 100)

# Top panel: last1 baseline.
_draw_mse_panel(ax[0], 'mse_last1', 'last1', 'MSE of last1', 0.5, x, s)

# Points above the diagonal are curves where the baseline has the lower MSE.
above_line_last1 = len(df[df['mse_last1'] < df['mse_lcpfn']])
below_line_last1 = len(df[df['mse_last1'] >= df['mse_lcpfn']])

ax[0].annotate(f'Below Line: {below_line_last1}', xy=(0.35, 0.32), fontsize=16, color='blue', rotation=40)
ax[0].annotate(f'Above Line: {above_line_last1}', xy=(0.32, 0.35), fontsize=16, color='red', rotation=40)

# Bottom panel: mmf4 baseline.
_draw_mse_panel(ax[1], 'mse_mmf4', 'mmf4', 'MSE of mmf4', 1, x, s)

above_line_mmf4 = len(df[df['mse_mmf4'] < df['mse_lcpfn']])
below_line_mmf4 = len(df[df['mse_mmf4'] >= df['mse_lcpfn']])

ax[1].annotate(f'Below Line: {below_line_mmf4}', xy=(0.7, 0.64), fontsize=16, color='blue', rotation=40)
ax[1].annotate(f'Above Line: {above_line_mmf4}', xy=(0.64, 0.7), fontsize=16, color='red', rotation=40)

fig.suptitle(f'MSE of LCPFN vs MSE of last1 and mmf4 for cutoff {CUT_OFF}', fontsize=16)

plt.savefig(f'/mnt/c/Users/prath/PycharmProjects/rp/Data/mse_{CUT_OFF}.png', dpi=400)

In [None]:
# Box plot of the MSE distribution per method, with an inset showing the outliers.
fig_2, ax = plt.subplots(figsize=(8, 10))
# Lay out the figure created here; previously this call targeted `fig`, the
# scatter figure of another cell (a NameError on a fresh kernel).
fig_2.tight_layout()

mse_columns = ['mse_last1', 'mse_mmf4', 'mse_lcpfn']
mse_series = [df[column] for column in mse_columns]

# Main plot: outliers hidden so the boxes stay readable; means are marked.
ax.boxplot(mse_series, labels=['last1', 'mmf4', 'lcpfn'], showfliers=False, showmeans=True)
ax.set_yticks([0, 0.0025, 0.005, 0.0075, 0.01])
ax.set_ylabel('MSE')

# annotate the median line
for column, x_pos in (('mse_last1', 1.15), ('mse_mmf4', 2.15), ('mse_lcpfn', 3.15)):
    median = df[column].median()
    ax.annotate(f'{round(median, 5)}', xy=(x_pos, round(median, 4)), fontsize=16, color='black', rotation=40)

# annotate the mean line (the mmf4 label uses its own position and rotation, as before)
for column, x_pos, angle in (('mse_last1', 1.05, 40), ('mse_mmf4', 1.65, -40), ('mse_lcpfn', 3.05, 40)):
    mean = df[column].mean()
    ax.annotate(f'{round(mean, 5)}', xy=(x_pos, round(mean, 4)), fontsize=16, color='black', rotation=angle)

# Inset: the same data over the full 0-1 range, with outliers drawn as '+'.
axins = ax.inset_axes([0.6, 0.55, 0.38, 0.4])
axins.boxplot(mse_series, labels=['last1', 'mmf4', 'lcpfn'], sym='+')
axins.set_ylim(0, 1)

# annotate the outliers: number of curves with an MSE above 0.01 per method
for column, x_pos in (('mse_last1', 1.1), ('mse_mmf4', 2.1), ('mse_lcpfn', 3.1)):
    axins.annotate(str(df[df[column] > 0.01].shape[0]), xy=(x_pos, 0.4), fontsize=16, color='red', rotation=45)

plt.title(f'MSE of LCPFN vs last1 vs mmf4 for cutoff {CUT_OFF}', fontsize=20)

# ax.indicate_inset_zoom(axins)
plt.savefig(f'/mnt/c/Users/prath/PycharmProjects/rp/Data/mse_box_plot_{CUT_OFF}.png', dpi=400)
plt.close('all')

In [None]:
# Create a table with the average MSE per method for cutoffs 10, 20, 40 and 80.

def _load_filtered_mse(cutoff):
    """Load the saved MSE table for ``cutoff`` and apply the shared filtering.

    Rows without a last1 or mmf4 baseline are dropped first, so the later
    comparisons never see None. Rows where any of the three MSE values is
    1 or more are then removed as outliers.
    """
    frame = pd.read_pickle(f'/mnt/c/Users/prath/PycharmProjects/rp/Data/df_mse_{cutoff}.pkl')
    frame = frame[frame['mse_last1'].notna()]
    frame = frame[frame['mse_mmf4'].notna()]
    for column in ('mse_last1', 'mse_mmf4', 'mse_lcpfn'):
        frame = frame[frame[column] < 1]
    return frame


df_10 = _load_filtered_mse(10)
df_20 = _load_filtered_mse(20)
df_40 = _load_filtered_mse(40)
df_80 = _load_filtered_mse(80)

# Create a DataFrame with the average MSE for each model, one row per cutoff.
df_avg = pd.DataFrame(columns=['mse_last1', 'mse_mmf4', 'mse_lcpfn'])
for row_label, filtered in (('10', df_10), ('20', df_20), ('40', df_40), ('80', df_80)):
    df_avg.loc[row_label] = filtered.mean()

# Get the LaTeX table.
print(df_avg.to_latex(float_format="%.4f"))