In [66]:
import numpy as np
import os
import torch
import torch.nn as nn
import time
import pandas as pd
from scipy.stats import pearsonr

In [67]:
from model.util import Normalizer
from model.database_util import get_hist_file, get_job_table_sample, collator
from model.model import QueryFormer
from model.database_util import Encoding
from model.dataset import PlanTreeDataset

In [68]:
data_path = './data/imdb/'

In [69]:
class Args:
    pass

In [70]:
hist_file = get_hist_file(data_path + 'histogram_string.csv')
cost_norm = Normalizer(-3.61192, 12.290855)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  hist_file['freq'][i] = freq_np
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, 

In [71]:
encoding_ckpt = torch.load('checkpoints/encoding.pt')
encoding = encoding_ckpt['encoding']
checkpoint = torch.load('checkpoints/cost_model.pt', map_location='cpu')

  encoding_ckpt = torch.load('checkpoints/encoding.pt')
  checkpoint = torch.load('checkpoints/cost_model.pt', map_location='cpu')


In [72]:
from model.util import seed_everything
seed_everything()

In [73]:
args = checkpoint['args']

In [74]:
model = QueryFormer(emb_size = args.embed_size ,ffn_dim = args.ffn_dim, head_size = args.head_size, \
                 dropout = args.dropout, n_layers = args.n_layers, \
                 use_sample = True, use_hist = True, \
                 pred_hid = args.pred_hid
                )

In [75]:
model.load_state_dict(checkpoint['model'])

<All keys matched successfully>

In [76]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
_ = model.to(device).eval()

In [77]:
to_predict = 'cost'

In [78]:
methods = {
    'get_sample' : get_job_table_sample,
    'encoding': encoding,
    'cost_norm': cost_norm,
    'hist_file': hist_file,
    'model': model,
    'device': device,
    'bs': 512,
}

In [87]:
import json

def print_plan(plan):
    # pretty print the json object

    # the following code parses the json string into a dictionary
    json_parsed = json.loads(plan)
    json_pretty = json.dumps(json_parsed, indent=4)
    print(json_pretty)

    with open('output.json', 'w') as f:
        f.write(json_pretty)



In [79]:
def print_qerror(preds_unnorm, labels_unnorm):
    qerror = []
    for i in range(len(preds_unnorm)):
        print("Predicted: {}, Actual: {}".format(preds_unnorm[i], labels_unnorm[i]))
        if preds_unnorm[i] > float(labels_unnorm[i]):
            qerror.append(preds_unnorm[i] / float(labels_unnorm[i]))
        else:
            qerror.append(float(labels_unnorm[i]) / float(preds_unnorm[i]))

    e_50, e_90 = np.median(qerror), np.percentile(qerror,90)    
    e_mean = np.mean(qerror)
    print("Median: {}".format(e_50))
    print("90th percentile: {}".format(e_90))
    print("Mean: {}".format(e_mean))
    return 

def get_corr(ps, ls): # unnormalised
    ps = np.array(ps)
    ls = np.array(ls)
    corr, _ = pearsonr(np.log(ps), np.log(ls))
    
    return corr

In [119]:
def evaluate(model, ds, bs, norm, device):
    model.eval()
    cost_predss = np.empty(0)

    with torch.no_grad():
        for i in range(0, len(ds), bs):
            batch, batch_labels = collator(list(zip(*[ds[j] for j in range(i,min(i+bs, len(ds)) ) ])))

            batch = batch.to(device)

            cost_preds, _ = model(batch)

            cost_preds = cost_preds.squeeze()

            cost_predss = np.append(cost_predss, cost_preds.cpu().detach().numpy())

            
    print_qerror(norm.unnormalize_labels(cost_predss), ds.costs)
    corr = get_corr(norm.unnormalize_labels(cost_predss), ds.costs)
    print('Corr: ', corr)

    return 

In [121]:
def eval_workload(workload, methods):

    get_table_sample = methods['get_sample']

    workload_file_name = './data/imdb/workloads/' + workload
    output_file_name = './data/imdb/workloads/{}_output.csv'.format(workload)

    table_sample = get_table_sample(workload_file_name)

    plan_df = pd.read_csv('./data/imdb/{}_plan.csv'.format(workload))
    print_plan(plan_df['json'][0])
    workload_csv = pd.read_csv('./data/imdb/workloads/{}.csv'.format(workload),sep='#',header=None)
    workload_csv.columns = ['table','join','predicate','cardinality']

    workload_csv.to_csv(output_file_name, index=False)

    
    ds = PlanTreeDataset(plan_df, workload_csv, \
        methods['encoding'], methods['hist_file'], methods['cost_norm'], \
        methods['cost_norm'], 'cost', table_sample)
    

    evaluate(methods['model'], ds, methods['bs'], methods['cost_norm'], methods['device'])
    return 

In [123]:
eval_workload('job-light', methods)

Loaded queries with len  70
Loaded bitmaps
{
    "Plan": {
        "Node Type": "Nested Loop",
        "Parallel Aware": false,
        "Join Type": "Inner",
        "Startup Cost": 8.85,
        "Total Cost": 5583.26,
        "Plan Rows": 243,
        "Plan Width": 159,
        "Actual Startup Time": 0.099,
        "Actual Total Time": 7.592,
        "Actual Rows": 715,
        "Actual Loops": 1,
        "Inner Unique": false,
        "Plans": [
            {
                "Node Type": "Nested Loop",
                "Parent Relationship": "Outer",
                "Parallel Aware": false,
                "Join Type": "Inner",
                "Startup Cost": 8.42,
                "Total Cost": 5308.24,
                "Plan Rows": 460,
                "Plan Width": 119,
                "Actual Startup Time": 0.062,
                "Actual Total Time": 1.772,
                "Actual Rows": 250,
                "Actual Loops": 1,
                "Inner Unique": true,
                "Pl

In [113]:
eval_workload('synthetic', methods)

Loaded queries with len  5000
Loaded bitmaps
{
    "Plan": {
        "Node Type": "Gather",
        "Parallel Aware": false,
        "Startup Cost": 49877.66,
        "Total Cost": 321335.74,
        "Plan Rows": 183094,
        "Plan Width": 168,
        "Actual Startup Time": 1470.494,
        "Actual Total Time": 7566.75,
        "Actual Rows": 297013,
        "Actual Loops": 1,
        "Workers Planned": 2,
        "Workers Launched": 2,
        "Single Copy": false,
        "Plans": [
            {
                "Node Type": "Hash Join",
                "Parent Relationship": "Outer",
                "Parallel Aware": true,
                "Join Type": "Inner",
                "Startup Cost": 48877.66,
                "Total Cost": 302026.34,
                "Plan Rows": 76289,
                "Plan Width": 168,
                "Actual Startup Time": 1454.579,
                "Actual Total Time": 7497.073,
                "Actual Rows": 99004,
                "Actual Loops": 3,
