# Importing Python Functions

In [1]:
import numpy as np
import os
import torch
import torch.nn as nn
import time
import pandas as pd
from scipy.stats import pearsonr

In [2]:
from model.util import Normalizer
from model.database_util import get_hist_file, get_job_table_sample, collator
from model.model import QueryFormer
from model.database_util import Encoding
from model.dataset import PlanTreeDataset
from model.trainer import eval_workload, train, train_single

In [3]:
data_path = './data/imdb/'

# DL Model's Hyper parameters definition

In [4]:
class Args:
    # bs = 1024
    # SQ: smaller batch size
    bs = 1
    lr = 0.001
    # epochs = 200
    epochs = 1
    clip_size = 50
    embed_size = 64
    pred_hid = 128
    ffn_dim = 128
    head_size = 12
    n_layers = 8
    dropout = 0.1
    sch_decay = 0.6
    # device = 'cuda:0'
    device = 'cpu'
    newpath = './results/full/cost/'
    to_predict = 'card'
args = Args()

import os
if not os.path.exists(args.newpath):
    os.makedirs(args.newpath)

# Defining Normalizing functions for card
1. Is Normalizer a function defined by the authors of QueryFormer?
Yes, it's a class. 

2. Where do the following values for the normalizer come from?

In [5]:

cost_norm = Normalizer(-3.61192, 12.290855)
card_norm = Normalizer(1,100)

# Loading Encoding file
1. Who created this encoding file in the first place?

In [6]:
encoding_ckpt = torch.load('checkpoints/encoding.pt')
type(encoding_ckpt)

dict

In [7]:
encoding_ckpt.keys()

dict_keys(['encoding'])

In [8]:
encoding = encoding_ckpt['encoding']
type(encoding)

model.database_util.Encoding

## Exploring Encoding object

In [9]:
print(encoding.column_min_max_vals)
# column_min_max_vals is a dictionary. It has the min and max value for each numeric column in the dataset. 
# Q. how is column min and max used?
# Q. All these keys, for which min and max are provided, are categorical features. What's the use of these columns' min and max? (if they were numeric columns, gathering their min and max would have made sense.)

{'t.id': [1.0, 2528312.0], 't.kind_id': [1.0, 7.0], 't.production_year': [1880.0, 2019.0], 'mc.id': [1.0, 2609129.0], 'mc.company_id': [1.0, 234997.0], 'mc.movie_id': [2.0, 2525745.0], 'mc.company_type_id': [1.0, 2.0], 'ci.id': [1.0, 36244344.0], 'ci.movie_id': [1.0, 2525975.0], 'ci.person_id': [1.0, 4061926.0], 'ci.role_id': [1.0, 11.0], 'mi.id': [1.0, 14835720.0], 'mi.movie_id': [1.0, 2526430.0], 'mi.info_type_id': [1.0, 110.0], 'mi_idx.id': [1.0, 1380035.0], 'mi_idx.movie_id': [2.0, 2525793.0], 'mi_idx.info_type_id': [99.0, 113.0], 'mk.id': [1.0, 4523930.0], 'mk.movie_id': [2.0, 2525971.0], 'mk.keyword_id': [1.0, 134170.0]}


In [10]:
print(encoding.col2idx)
# the label encoding of each unique column in the dataset

{'t.id': 0, 't.kind_id': 1, 't.production_year': 2, 'mc.id': 3, 'mc.company_id': 4, 'mc.movie_id': 5, 'mc.company_type_id': 6, 'ci.id': 7, 'ci.movie_id': 8, 'ci.person_id': 9, 'ci.role_id': 10, 'mi.id': 11, 'mi.movie_id': 12, 'mi.info_type_id': 13, 'mi_idx.id': 14, 'mi_idx.movie_id': 15, 'mi_idx.info_type_id': 16, 'mk.id': 17, 'mk.movie_id': 18, 'mk.keyword_id': 19, 'NA': 20}


In [11]:
print(encoding.op2idx)

{'>': 0, '=': 1, '<': 2, 'NA': 3}


In [12]:
print(encoding.idx2col)

{0: 't.id', 1: 't.kind_id', 2: 't.production_year', 3: 'mc.id', 4: 'mc.company_id', 5: 'mc.movie_id', 6: 'mc.company_type_id', 7: 'ci.id', 8: 'ci.movie_id', 9: 'ci.person_id', 10: 'ci.role_id', 11: 'mi.id', 12: 'mi.movie_id', 13: 'mi.info_type_id', 14: 'mi_idx.id', 15: 'mi_idx.movie_id', 16: 'mi_idx.info_type_id', 17: 'mk.id', 18: 'mk.movie_id', 19: 'mk.keyword_id', 20: 'NA'}


In [13]:
print(encoding.type2idx)

{'Gather': 0, 'Hash Join': 1, 'Seq Scan': 2, 'Hash': 3, 'Bitmap Heap Scan': 4, 'Bitmap Index Scan': 5, 'Nested Loop': 6, 'Index Scan': 7, 'Merge Join': 8, 'Gather Merge': 9, 'Materialize': 10, 'BitmapAnd': 11, 'Sort': 12}


In [14]:
checkpoint = torch.load('checkpoints/cost_model.pt', map_location='cpu')

In [15]:
from model.util import seed_everything
seed_everything()

In [16]:
model = QueryFormer(emb_size = args.embed_size ,ffn_dim = args.ffn_dim, head_size = args.head_size, \
                 dropout = args.dropout, n_layers = args.n_layers, \
                 use_sample = True, use_hist = True, \
                 pred_hid = args.pred_hid
                )

In [17]:
_ = model.to(args.device)

In [18]:
to_predict = 'cost'

In [19]:
imdb_path = './data/imdb/'
dfs = []  # list to hold DataFrames
# SQ: added
#for i in range(2):
for i in range(18):
    file = imdb_path + 'plan_and_cost/train_plan_part{}.csv'.format(i)
    df = pd.read_csv(file)
    dfs.append(df)

full_train_df = pd.concat(dfs)

val_dfs = []  # list to hold DataFrames
for i in range(18,20):
    file = imdb_path + 'plan_and_cost/train_plan_part{}.csv'.format(i)
    df = pd.read_csv(file)
    val_dfs.append(df)

val_df = pd.concat(val_dfs)

In [20]:
full_train_df.shape

(90000, 2)

In [21]:
val_df.shape

(10000, 2)

In [22]:
table_sample = get_job_table_sample(imdb_path+'train')

Loaded queries with len  100000
Loaded bitmaps


In [23]:
type(table_sample)

list

In [24]:
len(table_sample)

100000

In [25]:
type(table_sample[0])

dict

In [26]:
table_sample[0].keys()

dict_keys(['title', 'movie_info_idx'])

In [27]:
table_sample[0]['title']

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,

In [28]:
full_train_df.head(1)

Unnamed: 0,id,json
0,0,"{""Plan"": {""Node Type"": ""Gather"", ""Parallel Awa..."


In [29]:
val_df.head(1)

Unnamed: 0,id,json
0,90000,"{""Plan"": {""Node Type"": ""Nested Loop"", ""Paralle..."


## Loading and Exploring the Histogram (hist_file)

In [30]:
hist_file = get_hist_file(data_path + 'histogram_string.csv')

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  hist_file['freq'][i] = freq_np
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, 

In [31]:
type(hist_file)

pandas.core.frame.DataFrame

In [32]:
hist_file.shape

(9, 5)

In [33]:
hist_file.head(10)

Unnamed: 0,table,column,freq,bins,table_column
0,title,production_year,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1880, 1913, 1923, 1942, 1955, 1960, 1964, 196...",t.production_year
1,title,kind_id,"[0.0, 0.26216118191156074, 0.03593387047716835...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, ...",t.kind_id
2,movie_companies,company_id,"[0.0, 0.0004959511376981121, 0.000558807386989...","[1, 6, 19, 27, 68, 133, 160, 189, 292, 402, 47...",mc.company_id
3,movie_companies,company_type_id,"[0.0, 0.4883796425472418, 0.5116203574527581]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",mc.company_type_id
4,cast_info,role_id,"[0.0, 0.3495907485479872, 0.20560375449487386,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",ci.role_id
5,movie_keyword,keyword_id,"[0.0, 0.0031748950967179193, 1.989421142551088...","[1, 77, 132, 230, 331, 347, 384, 495, 643, 784...",mk.keyword_id
6,cast_info,person_id,"[0.0, 0.0, 5.518102507748589e-08, 2.7590512538...","[2, 77446, 145798, 212750, 281691, 347240, 419...",ci.person_id
7,movie_info_idx,info_type_id,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 9...",mi_idx.info_type_id
8,movie_info,info_type_id,"[0.0, 0.05406815807174563, 0.08688004942665738...","[1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, ...",mi.info_type_id


In [34]:
hist_file.dtypes

table           object
column          object
freq            object
bins            object
table_column    object
dtype: object

In [35]:
for column in ['table', 'column', 'freq', 'bins', 'table_column']:
    print(f"Column: {column}")
    print(hist_file[column].apply(type).value_counts())
    sample_value = hist_file[column].iloc[0]
    print(f"Sample Value: {sample_value}")
    if isinstance(sample_value, (list, tuple, set, dict, pd.Series, np.ndarray)):
        print(f"Length of Sample Value: {len(sample_value)}")
    print("\n")

Column: table
table
<class 'str'>    9
Name: count, dtype: int64
Sample Value: title


Column: column
column
<class 'str'>    9
Name: count, dtype: int64
Sample Value: production_year


Column: freq
freq
<class 'numpy.ndarray'>    9
Name: count, dtype: int64
Sample Value: [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 1.22139046e-06
 0.00000000e+00 8.14260304e-07]
Length of Sample Value: 2020


Column: bins
bins
<class 'list'>    9
Name: count, dtype: int64
Sample Value: [1880, 1913, 1923, 1942, 1955, 1960, 1964, 1968, 1971, 1975, 1978, 1982, 1985, 1987, 1990, 1992, 1994, 1995, 1996, 1998, 1999, 2000, 2001, 2001, 2002, 2003, 2004, 2004, 2005, 2005, 2006, 2006, 2007, 2007, 2007, 2008, 2008, 2009, 2009, 2009, 2010, 2010, 2010, 2011, 2011, 2011, 2012, 2012, 2012, 2013, 2019]
Length of Sample Value: 51


Column: table_column
table_column
<class 'str'>    9
Name: count, dtype: int64
Sample Value: t.production_year




In [36]:
# Looking into only the columns from the same table
hist_file_title = hist_file[hist_file['table'] == 'title']
hist_file_title.head()

Unnamed: 0,table,column,freq,bins,table_column
0,title,production_year,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1880, 1913, 1923, 1942, 1955, 1960, 1964, 196...",t.production_year
1,title,kind_id,"[0.0, 0.26216118191156074, 0.03593387047716835...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, ...",t.kind_id


In [37]:
# Checking the length of bins for the columns in the title table
hist_file_title.dtypes

table           object
column          object
freq            object
bins            object
table_column    object
dtype: object

In [38]:
# printing the type of object in the bins column
type(hist_file_title['bins'][0])

list

In [39]:
# printing the length of the list, represenging the bin, for the production year column
len(hist_file_title['bins'][0])

51

In [40]:
# printing the number of bins for the kind_id column. I want to check if the number of bins are same for all columns from a same table, e.g., title. 
len(hist_file_title['bins'][1])

51

In [41]:
# now print the length of bins from all rows, across multiple tables. I expect the number of bins to be equal for all columns within the same table, however, they may differ across tables. Let's see. 
for index, row in hist_file.iterrows():
    print(f"Table: {row['table']}, Column: {row['column']}, Length of freq: {len(row['freq'])}, Length of bins: {len(row['bins'])}")

Table: title, Column: production_year, Length of freq: 2020, Length of bins: 51
Table: title, Column: kind_id, Length of freq: 8, Length of bins: 51
Table: movie_companies, Column: company_id, Length of freq: 234998, Length of bins: 51
Table: movie_companies, Column: company_type_id, Length of freq: 3, Length of bins: 51
Table: cast_info, Column: role_id, Length of freq: 12, Length of bins: 51
Table: movie_keyword, Column: keyword_id, Length of freq: 134171, Length of bins: 51
Table: cast_info, Column: person_id, Length of freq: 4061927, Length of bins: 51
Table: movie_info_idx, Column: info_type_id, Length of freq: 114, Length of bins: 51
Table: movie_info, Column: info_type_id, Length of freq: 111, Length of bins: 51


In [42]:
# let's print the content of the freq and bins column for kind_id
kind_id_row = hist_file[hist_file['column'] == 'kind_id'].iloc[0]

print("Freq for kind_id:")
print(kind_id_row['freq'])

print("\nBins for kind_id:")
print(kind_id_row['bins'])

Freq for kind_id:
[0.         0.26216118 0.03593387 0.03976449 0.04676403 0.
 0.00498356 0.61039287]

Bins for kind_id:
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 4, 4, 4, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]


In [43]:
# I think the number of values in the freq column is equal to the number of origin bins in a column, not the re-calibrated bins to match the number of bins across columns in the same table. Let's check. 
kind_id_bins = hist_file[hist_file['column'] == 'kind_id'].iloc[0]['bins']

distinct_bins = set(kind_id_bins)

print("Distinct bins for kind_id:")
for bin in distinct_bins:
    print(bin)

Distinct bins for kind_id:
1
2
3
4
7


In [44]:
# let's print the content of the freq and bins column for production_year
production_year_row = hist_file[hist_file['column'] == 'production_year'].iloc[0]

print("Freq for production_year:")
print(production_year_row['freq'])

print("\nBins for production_year:")
print(production_year_row['bins'])

Freq for production_year:
[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 1.22139046e-06
 0.00000000e+00 8.14260304e-07]

Bins for production_year:
[1880, 1913, 1923, 1942, 1955, 1960, 1964, 1968, 1971, 1975, 1978, 1982, 1985, 1987, 1990, 1992, 1994, 1995, 1996, 1998, 1999, 2000, 2001, 2001, 2002, 2003, 2004, 2004, 2005, 2005, 2006, 2006, 2007, 2007, 2007, 2008, 2008, 2009, 2009, 2009, 2010, 2010, 2010, 2011, 2011, 2011, 2012, 2012, 2012, 2013, 2019]


# Step 1: Identifying the training dataset and its component

In [45]:
type(card_norm)

model.util.Normalizer

In [46]:
card_norm

<model.util.Normalizer at 0x2c0afa600>

In [47]:
cost_norm

<model.util.Normalizer at 0x10518c5f0>

In [48]:
type(to_predict)

str

In [49]:
to_predict

'cost'

In [50]:
type(table_sample)

list

In [51]:
len(full_train_df)

90000

In [52]:
len(table_sample)

100000

In [53]:
type(table_sample[0])

dict

In [54]:
table_sample[0]

{'title': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 

In [55]:
len(table_sample[0])

2

In [56]:
type(table_sample[0])

dict

In [57]:
table_sample[0].keys()

dict_keys(['title', 'movie_info_idx'])

In [58]:
table_sample[1].keys()

dict_keys(['title'])

In [59]:
full_train_df.shape

(90000, 2)

# exploring the execution of the __init__ function of the PlanTreeDataset for 1 training query

In [60]:
full_train_df.shape

(90000, 2)

In [61]:
train_df_single_query = full_train_df.iloc[[0]]

In [62]:
train_df_single_query.shape

(1, 2)

In [63]:
train_df_single_query

Unnamed: 0,id,json
0,0,"{""Plan"": {""Node Type"": ""Gather"", ""Parallel Awa..."


In [64]:
import json
# the following code parses the json string into a dictionary
json_parsed = json.loads(train_df_single_query['json'].iloc[0])
json_pretty = json.dumps(json_parsed, indent=4)
print(json_pretty)

with open('output.json', 'w') as f:
    f.write(json_pretty)

{
    "Plan": {
        "Node Type": "Gather",
        "Parallel Aware": false,
        "Startup Cost": 23540.58,
        "Total Cost": 154548.95,
        "Plan Rows": 567655,
        "Plan Width": 119,
        "Actual Startup Time": 386.847,
        "Actual Total Time": 646.972,
        "Actual Rows": 283812,
        "Actual Loops": 1,
        "Workers Planned": 2,
        "Workers Launched": 2,
        "Single Copy": false,
        "Plans": [
            {
                "Node Type": "Hash Join",
                "Parent Relationship": "Outer",
                "Parallel Aware": true,
                "Join Type": "Inner",
                "Startup Cost": 22540.58,
                "Total Cost": 96783.45,
                "Plan Rows": 236523,
                "Plan Width": 119,
                "Actual Startup Time": 369.985,
                "Actual Total Time": 518.487,
                "Actual Rows": 94604,
                "Actual Loops": 3,
                "Inner Unique": false,
         

In [65]:
import json

def print_plan(plan, indent=''):
    print(indent + '+--' + plan['Node Type'])
    if 'Plans' in plan:
        for i, subplan in enumerate(plan['Plans']):
            if i == len(plan['Plans']) - 1:
                new_indent = indent + '   '
            else:
                new_indent = indent + '|  '
            print_plan(subplan, new_indent)

# Load the execution plan from a string, file, etc.
execution_plan = json.loads(train_df_single_query['json'].iloc[0])

print_plan(execution_plan['Plan'])

+--Gather
   +--Hash Join
   |  +--Seq Scan
      +--Hash
         +--Seq Scan


The query is similar to the following: (generated by Github copilot)
```sql
SELECT *
FROM title AS t
JOIN movie_info_idx AS mi_idx ON t.id = mi_idx.movie_id
WHERE t.kind_id = 7 AND mi_idx.info_type_id > 99;
```

In [66]:
import logging
import sys

# import importlib

# importlib.reload(model.dataset)
# from model.dataset import PlanTreeDataset

In [67]:
to_predict

'cost'

In [68]:

logging.basicConfig(level=logging.INFO, stream=sys.stdout)


train_ds_single_query = PlanTreeDataset(train_df_single_query, None, encoding, hist_file, card_norm, cost_norm, to_predict, table_sample)


INFO:root:Initializing PlanTreeDataset
INFO:root:self.length = len(json_df): 1
INFO:root:nodes.type: <class 'list'>
INFO:root:number of nodes: 1
INFO:root:type of the first element in the list nodes: <class 'dict'>
INFO:root:keys in the first dictionary in the nodes list: dict_keys(['Node Type', 'Parallel Aware', 'Startup Cost', 'Total Cost', 'Plan Rows', 'Plan Width', 'Actual Startup Time', 'Actual Total Time', 'Actual Rows', 'Actual Loops', 'Workers Planned', 'Workers Launched', 'Single Copy', 'Plans'])
INFO:root:idxs: [0]
INFO:root:beginning js_node2dict(self, idx, node): returns a dictionary of 4 tensors per query plan
INFO:root:returns a collated_dict of 4 tensors: 'x', 'attn_bias', 'rel_pos', 'heights 
INFO:root:nodeType: Gather
INFO:root:typeId: 0
INFO:root:formatFilter - filters: []
INFO:root:formatFilter - alias: None
INFO:root:formatJoin - join: None
INFO:root:formatJoin - joinId: 0
INFO:root:printing node features
INFO:root:node.typeId: 0
INFO:root:node.join: 0
INFO:root:his

In [69]:
train_ds_single_query

<model.dataset.PlanTreeDataset at 0x2c0afb500>

In [70]:
type(train_ds_single_query[0])

tuple

In [71]:
len(train_ds_single_query[0])

2

In [72]:
type(train_ds_single_query[0][0])

dict

In [73]:
train_ds_single_query[0][0].keys()

dict_keys(['x', 'attn_bias', 'rel_pos', 'heights'])

In [74]:
type(train_ds_single_query[0][1])

tuple

In [75]:
len(train_ds_single_query[0][1])

2

In [76]:
type(train_ds_single_query[0][1][0])

torch.Tensor

In [77]:
train_ds_single_query[0][1][0]

tensor(0.6348, dtype=torch.float64)

In [78]:
type(train_ds_single_query[0][1][1])

torch.Tensor

In [79]:
train_ds_single_query[0][1][1]

tensor(0.1167, dtype=torch.float64)

In [80]:
train_ds_single_query.labels

tensor([0.6348], dtype=torch.float64)

# Exporling the Training Pipeline

In [81]:
crit = nn.MSELoss()

In [82]:
type(model)

model.model.QueryFormer

In [83]:
type(crit)

torch.nn.modules.loss.MSELoss

In [84]:
type(args)

__main__.Args

# Making a forward pass through QueryFormer model using 1 Training Example

In [85]:
crit = nn.MSELoss()
model = train_single(model, train_ds_single_query, train_ds_single_query, crit, cost_norm, args)

INFO:root:QuerfyFormer forward
INFO:root:x shape: torch.Size([1, 30, 1165])
INFO:root:node_feature shape: torch.Size([1, 30, 329])
INFO:root:super_token_feature shape: torch.Size([1, 1, 329])
INFO:root:super_token_feature values: tensor([[[-3.6411e-01,  4.5445e-01, -5.2727e-01,  2.3610e-01,  1.9473e+00,
          -1.2709e-01, -9.5878e-01, -1.6393e+00, -1.0220e+00, -6.3672e-01,
          -5.4190e-03,  1.1480e+00,  8.1637e-01,  1.8798e+00,  1.4550e+00,
          -8.9672e-01,  2.0786e+00,  4.2498e-01,  4.9432e-01,  1.2346e+00,
          -5.9200e-02, -3.0412e-01,  4.5852e-01, -9.5504e-01, -4.8362e-01,
           3.3751e-01, -8.2233e-01, -3.5166e-01, -1.9745e+00, -1.3450e+00,
          -4.3359e-01,  1.7492e-02,  1.6619e+00,  2.8184e-01, -6.4980e-02,
          -1.8063e+00, -1.5737e+00, -1.2378e+00, -7.8738e-01, -1.3226e+00,
           1.0962e+00,  5.0930e-01, -2.2963e-01,  6.6542e-01,  7.6330e-01,
          -7.5418e-01,  1.3894e+00,  4.3644e-01,  9.8246e-01, -2.6422e+00,
          -1.1799e+0

  return F.mse_loss(input, target, reduction=self.reduction)
