In [1]:
import pandas as pd
import os
import torch
from model.database_util import *

In [2]:
# Base directory name
base_dir = 'job_queries'

# Dec 1:
# query1_ts = '2024-10-08-04.50.20.045775' # - worked
query1_ts = '2024-12-03-20.44.07.528574'

#query1_ts = '2024-10-08-04.48.13.056756'
# 408
#query1_ts = '2024-10-08-10.06.01.048516' - working but not filtered
# query1_ts = '2024-10-08-04.48.39.202520'
#query1_ts = '2024-10-08-04.48.55.567842'
# query1_ts = '2024-10-08-04.51.06.406285' # not working
# query1_ts = '2024-10-08-10.06.01.048516' # worked in 0 notebook, also worked here


In [3]:


# File paths for the query explanation components
file_paths = {
    'operator': f'{base_dir}/EXPLAIN_OPERATOR.csv',
    'stream': f'{base_dir}/EXPLAIN_STREAM.csv',
    'predicate': f'{base_dir}/EXPLAIN_PREDICATE.csv'
}


In [4]:
max_node = 30
rel_pos_max = 20

# Extract the operator ids and types

In [5]:
import pandas as pd
import os

def extract_operator_ids_and_types(explain_operator_path, timestamp):
    """
    Extract rows from a CSV file where the EXPLAIN_TIME column matches a given timestamp.

    Parameters:
        explain_operator_path (str): Path to the CSV file.
        timestamp: The timestamp value to filter rows by.

    Returns:
        pd.DataFrame: A DataFrame containing the matching rows with EXPLAIN_TIME, OPERATOR_ID, and OPERATOR_TYPE.
    """
    # Check if the file exists
    if not os.path.exists(explain_operator_path):
        raise FileNotFoundError(f"The file '{explain_operator_path}' does not exist.")
    
    # Read the CSV file
    try:
        df = pd.read_csv(explain_operator_path)
    except Exception as e:
        raise ValueError(f"Error reading the file: {e}")

    # Check if the required columns exist
    required_columns = ['EXPLAIN_TIME', 'OPERATOR_ID', 'OPERATOR_TYPE']
    if not all(col in df.columns for col in required_columns):
        raise KeyError(f"The file must contain the columns: {required_columns}")

    # Filter rows where the EXPLAIN_TIME column matches the timestamp value
    df_ops = df[df['EXPLAIN_TIME'] == timestamp][required_columns]

    # Warn if no rows match the timestamp
    if df_ops.empty:
        print(f"No rows found for timestamp '{timestamp}'.")

    return df_ops


# try:
#     result = extract_operator_ids_and_types(explain_operator_path, query1_ts)
#     print(result)
# except Exception as e:
#     print(f"Error: {e}")


# Extract the Operators accessing objects

In [15]:
import pandas as pd

def extract_stream_table(explain_stream_path, timestamp):
    """
    Process the EXPLAIN_STREAM CSV file to extract relevant rows, concatenate schema and name, and prepare the output.

    Parameters:
        explain_stream_path (str): Path to the EXPLAIN_STREAM CSV file.
        timestamp (str): The timestamp value to filter rows by.

    Returns:
        pd.DataFrame: A DataFrame containing EXPLAIN_TIME, OPERATOR_ID, and TABLE columns.
    """
    # Read the CSV file
    df = pd.read_csv(explain_stream_path)

    # Filter for the given EXPLAIN_TIME value and non-NaN OBJECT_NAME
    filtered_df = df.loc[(df['EXPLAIN_TIME'] == timestamp) & (df['OBJECT_NAME'].notna())].copy()

    # Concatenate OBJECT_SCHEMA and OBJECT_NAME to create a new column, removing any extra spaces
    filtered_df['TABLE'] = (
        filtered_df['OBJECT_SCHEMA'].str.strip().fillna('') +
        '.' +
        filtered_df['OBJECT_NAME'].str.strip()
    )

    # Select and rename the TARGET_ID column to OPERATOR_ID
    df_table = filtered_df[['EXPLAIN_TIME', 'SOURCE_ID', 'TARGET_ID', 'TABLE']].copy()
    df_table.rename(columns={'TARGET_ID': 'OPERATOR_ID'}, inplace=True)

    return df_table


# Example usage:
explain_stream_path = f'{base_dir}/EXPLAIN_STREAM.csv'

query1_ts = '2024-12-03-20.44.07.528574'  # Replace with your timestamp
result = extract_stream_table(explain_stream_path, query1_ts)
print(result)


                    EXPLAIN_TIME  SOURCE_ID  OPERATOR_ID  \
6388  2024-12-03-20.44.07.528574         -1            4   
6390  2024-12-03-20.44.07.528574         -1            6   
6392  2024-12-03-20.44.07.528574         -1            8   
6394  2024-12-03-20.44.07.528574         -1           11   
6396  2024-12-03-20.44.07.528574         -1           14   
6398  2024-12-03-20.44.07.528574         -1           16   
6400  2024-12-03-20.44.07.528574         -1           17   
6404  2024-12-03-20.44.07.528574         -1           18   
6408  2024-12-03-20.44.07.528574         -1           19   

                      TABLE  
6388            AHNAF.TITLE  
6390       AHNAF.MOVIE_INFO  
6392     AHNAF.COMPANY_NAME  
6394  AHNAF.MOVIE_COMPANIES  
6396       AHNAF.MOVIE_LINK  
6398    AHNAF.MOVIE_KEYWORD  
6400          AHNAF.KEYWORD  
6404        AHNAF.LINK_TYPE  
6408     AHNAF.COMPANY_TYPE  


# Extract local predicates

In [8]:
import pandas as pd

def extract_local_predicates(explain_predicate_path, timestamp):
    """
    Extract and process local predicates (WHERE/HAVING conditions) from the EXPLAIN_PREDICATE CSV file.

    Parameters:
        explain_predicate_path (str): Path to the EXPLAIN_PREDICATE CSV file.
        timestamp (str): The timestamp value to filter rows by.

    Returns:
        pd.DataFrame: A DataFrame with grouped local predicates (PREDICATE1, PREDICATE2, PREDICATE3) 
                      and their corresponding FILTER_FACTOR columns (FILTER_FACTOR1, FILTER_FACTOR2, FILTER_FACTOR3) 
                      for each EXPLAIN_TIME and OPERATOR_ID.
    """
    # Define the columns to read from the file
    columns_to_read = ['EXPLAIN_TIME', 'OPERATOR_ID', 'HOW_APPLIED', 'PREDICATE_TEXT', 'FILTER_FACTOR']

    # Read the CSV file, selecting only relevant columns
    df = pd.read_csv(explain_predicate_path, usecols=columns_to_read)

    # Strip whitespace from EXPLAIN_TIME and HOW_APPLIED columns
    df['EXPLAIN_TIME'] = df['EXPLAIN_TIME'].str.strip()
    df['HOW_APPLIED'] = df['HOW_APPLIED'].str.strip()

    # Filter rows where EXPLAIN_TIME matches the timestamp and HOW_APPLIED is 'SARG'
    filtered_df = df.loc[(df['EXPLAIN_TIME'] == timestamp) & (df['HOW_APPLIED'] == 'SARG')].copy()

    # Remove only outer parentheses from the PREDICATE_TEXT column
    def remove_outer_parentheses(predicate_text):
        # Check if the text starts and ends with parentheses
        if isinstance(predicate_text, str) and predicate_text.startswith('(') and predicate_text.endswith(')'):
            # Remove only the outermost parentheses
            return predicate_text[1:-1]
        return predicate_text

    filtered_df['PREDICATE_TEXT'] = filtered_df['PREDICATE_TEXT'].apply(remove_outer_parentheses)

    # Select the relevant columns for the result
    df_predicates = filtered_df[['EXPLAIN_TIME', 'OPERATOR_ID', 'PREDICATE_TEXT', 'FILTER_FACTOR']]

    # Group by EXPLAIN_TIME and OPERATOR_ID and apply transformation
    df_local_predicates = df_predicates.groupby(['EXPLAIN_TIME', 'OPERATOR_ID']).agg(
        PREDICATE1=('PREDICATE_TEXT', lambda x: x.iloc[0] if len(x) > 0 else None),
        PREDICATE2=('PREDICATE_TEXT', lambda x: x.iloc[1] if len(x) > 1 else None),
        PREDICATE3=('PREDICATE_TEXT', lambda x: x.iloc[2] if len(x) > 2 else None),
        FILTER_FACTOR1=('FILTER_FACTOR', lambda x: x.iloc[0] if len(x) > 0 else None),
        FILTER_FACTOR2=('FILTER_FACTOR', lambda x: x.iloc[1] if len(x) > 1 else None),
        FILTER_FACTOR3=('FILTER_FACTOR', lambda x: x.iloc[2] if len(x) > 2 else None),
    ).reset_index()

    return df_local_predicates


# Example usage:
# explain_predicate_path = 'SimpleQueriesSQ12c/EXPLAIN_PREDICATE.csv'
# query1_ts = '2024-11-28 12:00:00'  # Replace with your timestamp
explain_predicate_path = f'{base_dir}/EXPLAIN_PREDICATE.csv'
result = extract_local_predicates(explain_predicate_path, query1_ts)
print(result)


                 EXPLAIN_TIME  OPERATOR_ID  \
0  2024-12-03-20.44.07.528574            4   
1  2024-12-03-20.44.07.528574            6   
2  2024-12-03-20.44.07.528574            8   
3  2024-12-03-20.44.07.528574           11   
4  2024-12-03-20.44.07.528574           17   
5  2024-12-03-20.44.07.528574           18   
6  2024-12-03-20.44.07.528574           19   

                                          PREDICATE1  \
0                         Q1.PRODUCTION_YEAR <= 2020   
1  Q4.INFO IN ('Sweden               ', 'Norway  ...   
2                           Q9.COUNTRY_CODE = '[us]'   
3                                   Q5.NOTE IS NULL    
4                          Q7.KEYWORD = 'helicopter'   
5                       Q6.LINK LIKE '%featured in%'   
6                           Q8.KIND = 'distributors'   

                   PREDICATE2 PREDICATE3  FILTER_FACTOR1  FILTER_FACTOR2  \
0  1950 <= Q1.PRODUCTION_YEAR       None        0.971485        0.894801   
1                        None 

In [10]:
result[['OPERATOR_ID', 'PREDICATE1', 'PREDICATE2', 
       'FILTER_FACTOR1', 'FILTER_FACTOR2']]

Unnamed: 0,OPERATOR_ID,PREDICATE1,PREDICATE2,FILTER_FACTOR1,FILTER_FACTOR2
0,4,Q1.PRODUCTION_YEAR <= 2020,1950 <= Q1.PRODUCTION_YEAR,0.971485,0.894801
1,6,"Q4.INFO IN ('Sweden ', 'Norway ...",,0.029517,
2,8,Q9.COUNTRY_CODE = '[us]',,0.359728,
3,11,Q5.NOTE IS NULL,,0.487515,
4,17,Q7.KEYWORD = 'helicopter',,7e-06,
5,18,Q6.LINK LIKE '%featured in%',,0.072222,
6,19,Q8.KIND = 'distributors',,0.25,


In [12]:
pd.set_option('display.max_colwidth', None)

# Extract Join Key

In [13]:
import pandas as pd

def extract_join_keys(explain_predicate_path, timestamp):
    """
    Extract join keys from the EXPLAIN_PREDICATE CSV file where HOW_APPLIED is 'JOIN'.

    Parameters:
        explain_predicate_path (str): Path to the EXPLAIN_PREDICATE CSV file.
        timestamp (str): The timestamp value to filter rows by.

    Returns:
        pd.DataFrame: A DataFrame containing EXPLAIN_TIME, OPERATOR_ID, and JOIN_KEY columns.
    """
    # Define the columns to read from the file
    columns_to_read = ['EXPLAIN_TIME', 'OPERATOR_ID', 'HOW_APPLIED', 'PREDICATE_TEXT']

    # Read the CSV file, selecting only relevant columns
    df = pd.read_csv(explain_predicate_path, usecols=columns_to_read)

    # Strip whitespace from EXPLAIN_TIME and HOW_APPLIED columns
    df['EXPLAIN_TIME'] = df['EXPLAIN_TIME'].str.strip()
    df['HOW_APPLIED'] = df['HOW_APPLIED'].str.strip()

    # Filter rows where EXPLAIN_TIME matches the timestamp and HOW_APPLIED is 'JOIN'
    filtered_df = df[(df['EXPLAIN_TIME'] == timestamp) & (df['HOW_APPLIED'] == 'JOIN')].copy()

    # Remove parentheses from the PREDICATE_TEXT column and assign to JOIN_KEY
    filtered_df.loc[:, 'JOIN_KEY'] = filtered_df['PREDICATE_TEXT'].str.replace(r'[\(\)]', '', regex=True)

    # Select the relevant columns for the result
    df_join = filtered_df[['EXPLAIN_TIME', 'OPERATOR_ID', 'JOIN_KEY']]

    # Check if there are matching rows
    if df_join.empty:
        print("No matching rows found.")
        return pd.DataFrame()  # Return an empty DataFrame

    return df_join

# Example usage:
# explain_predicate_path = 'SimpleQueriesSQ12c/EXPLAIN_PREDICATE.csv'
# query1_ts = '2024-11-28 12:00:00'  # Replace with your timestamp
# result = extract_join_keys(explain_predicate_path, query1_ts)
# print(result)


# Combining Node features

In [16]:
import pandas as pd

# Define the file paths and timestamp value
# File paths
explain_operator_path = f'{base_dir}/EXPLAIN_OPERATOR.csv'
explain_stream_path = f'{base_dir}/EXPLAIN_STREAM.csv'
explain_predicate_path = f'{base_dir}/EXPLAIN_PREDICATE.csv'
# query1_ts = '2024-10-08-04.51.06.406285' # not working

# Use the functions to extract features
df_ops = extract_operator_ids_and_types(explain_operator_path, query1_ts)
df_table = extract_stream_table(explain_stream_path, query1_ts)
df_pred = extract_local_predicates(explain_predicate_path, query1_ts)
df_join = extract_join_keys(explain_predicate_path, query1_ts)

# Define a list of column names
columns = ['EXPLAIN_TIME', 'OPERATOR_ID', 'OPERATOR_TYPE', 'PREDICATE1', 'PREDICATE2', 'PREDICATE3', 'FILTER_FACTOR1', 'FILTER_FACTOR2', 'FILTER_FACTOR3', 'JOIN_KEY', 'TABLE']

# Create an empty DataFrame with these column names
df = pd.DataFrame(columns=columns)

# Merging the dataframes based on 'EXPLAIN_TIME' and 'OPERATOR_ID'
df= (
    df_ops.merge(df_table, on=['EXPLAIN_TIME', 'OPERATOR_ID'], how='left')
    .merge(df_pred, on=['EXPLAIN_TIME', 'OPERATOR_ID'], how='left')
    .merge(df_join, on=['EXPLAIN_TIME', 'OPERATOR_ID'], how='left')
)

# Adjust display settings for better visualization
pd.set_option('display.max_columns', None)  # Display all columns
pd.set_option('display.width', 1000)        # Set display width for compact output

# Display the merged DataFrame
# print("Merged DataFrame:")
# print(df_merged)

# Apply the transformation row by row
# df = df_merged.apply(apply_transformation, axis=1)

# Display the transformed DataFrame
print(df)


                  EXPLAIN_TIME  OPERATOR_ID OPERATOR_TYPE  SOURCE_ID                  TABLE  \
0   2024-12-03-20.44.07.528574            1        RETURN        NaN                    NaN   
1   2024-12-03-20.44.07.528574            2        TQ            NaN                    NaN   
2   2024-12-03-20.44.07.528574            3        HSJOIN        NaN                    NaN   
3   2024-12-03-20.44.07.528574            4        TBSCAN       -1.0            AHNAF.TITLE   
4   2024-12-03-20.44.07.528574            5        HSJOIN        NaN                    NaN   
5   2024-12-03-20.44.07.528574            6        TBSCAN       -1.0       AHNAF.MOVIE_INFO   
6   2024-12-03-20.44.07.528574            7        HSJOIN        NaN                    NaN   
7   2024-12-03-20.44.07.528574            8        TBSCAN       -1.0     AHNAF.COMPANY_NAME   
8   2024-12-03-20.44.07.528574            9        HSJOIN        NaN                    NaN   
9   2024-12-03-20.44.07.528574           10       

In [17]:
import pandas as pd
import re

# Sample DataFrame provided
# df = pd.DataFrame({
#     'EXPLAIN_TIME': ['2024-10-08-04.48.13.056756', '2024-10-08-04.48.13.056756', '2024-10-08-04.48.13.056756', '2024-10-08-04.48.13.056756'],
#     'OPERATOR_ID': [1, 2, 3, 4],
#     'OPERATOR_TYPE': ['RETURN', 'HSJOIN', 'TBSCAN', 'TBSCAN'],
#     'TABLE': [None, None, 'TPCDS.CUSTOMER', 'TPCDS.DATE_DIM2'],
#     'PREDICATE1': [None, None, None, 'Q2.D_YEAR >= 1958'],
#     'PREDICATE2': [None, None, None, 'Q2.D_MOY = 12'],
#     'PREDICATE3': [None, None, None, None],
#     'JOIN_KEY': [None, 'Q2.D_DATE_SK = Q1.C_FIRST_SHIPTO_DATE_SK', None, None]
# })

# Function to extract the alias from the predicate (before the period)
def extract_alias(predicate):
    if isinstance(predicate, str):
        match = re.search(r'([A-Za-z0-9_]+)\.', predicate)
        if match:
            return match.group(1)  # Return the part before the period (alias)
    return None

# Filter rows where TABLE is not None and at least one PREDICATE column has a value
filtered_df = df[(df['TABLE'].notna()) & (df[['PREDICATE1', 'PREDICATE2', 'PREDICATE3']].notna().any(axis=1))]

# Create a new DataFrame with EXPLAIN_TIME, TABLE, and ALIAS
rows = []
for _, row in filtered_df.iterrows():
    predicates = [row['PREDICATE1'], row['PREDICATE2'], row['PREDICATE3']]
    alias = None
    for predicate in predicates:
        alias = extract_alias(predicate)
        if alias:
            break  # Once an alias is found, no need to check other predicates
    rows.append({'EXPLAIN_TIME': row['EXPLAIN_TIME'], 'TABLE': row['TABLE'], 'ALIAS': alias})

# Create the new dataframe
df_table_alias = pd.DataFrame(rows)

# Display the result
print(df_table_alias)


                 EXPLAIN_TIME                  TABLE ALIAS
0  2024-12-03-20.44.07.528574            AHNAF.TITLE    Q1
1  2024-12-03-20.44.07.528574       AHNAF.MOVIE_INFO    Q4
2  2024-12-03-20.44.07.528574     AHNAF.COMPANY_NAME    Q9
3  2024-12-03-20.44.07.528574  AHNAF.MOVIE_COMPANIES    Q5
4  2024-12-03-20.44.07.528574          AHNAF.KEYWORD    Q7
5  2024-12-03-20.44.07.528574        AHNAF.LINK_TYPE    Q6
6  2024-12-03-20.44.07.528574     AHNAF.COMPANY_TYPE    Q8


# extract optimized statement

In [18]:
if not os.path.isfile('db2.ipynb'):
    os.system('wget https://raw.githubusercontent.com/IBM/db2-jupyter/master/db2.ipynb')

%run db2.ipynb

  firstCommand = "(?:^\s*)([a-zA-Z]+)(?:\s+.*|$)"
  pattern = "\?\*[0-9]+"


         Install itables if you want to enable scrolling of result sets.
Db2 Extensions Loaded. Version: 2024-09-16


In [19]:
from dotenv import dotenv_values

In [20]:
db2creds = dotenv_values('db2con.env')
%sql CONNECT CREDENTIALS db2creds

Connection successful. TESTDB @ localhost 


In [22]:
%sql SET CURRENT SCHEMA AHNAF

Command completed.


In [23]:
explain_stmt = %sql select STATEMENT_TEXT from EXPLAIN_STATEMENT where EXPLAIN_TIME = '{query1_ts}' AND EXPLAIN_LEVEL = 'S'

In [25]:
explain_stmt['STATEMENT_TEXT'][0]

'SELECT Q9.NAME AS "COMPANY_NAME", Q6.LINK AS "LINK_TYPE", Q1.TITLE AS "WESTERN_FOLLOW_UP" FROM AHNAF.TITLE AS Q1, AHNAF.MOVIE_LINK AS Q2, AHNAF.MOVIE_KEYWORD AS Q3, AHNAF.MOVIE_INFO AS Q4, AHNAF.MOVIE_COMPANIES AS Q5, AHNAF.LINK_TYPE AS Q6, AHNAF.KEYWORD AS Q7, AHNAF.COMPANY_TYPE AS Q8, AHNAF.COMPANY_NAME AS Q9 WHERE (Q5.MOVIE_ID = Q4.MOVIE_ID) AND (Q3.MOVIE_ID = Q4.MOVIE_ID) AND (Q2.MOVIE_ID = Q4.MOVIE_ID) AND (Q4.MOVIE_ID = Q1.ID) AND (Q5.COMPANY_ID = Q9.ID) AND (Q5.COMPANY_TYPE_ID = Q8.ID) AND (Q3.KEYWORD_ID = Q7.ID) AND (Q6.ID = Q2.LINK_TYPE_ID) AND (Q1.PRODUCTION_YEAR <= 2020) AND (1950 <= Q1.PRODUCTION_YEAR) AND Q5.NOTE IS NULL AND (Q6.LINK LIKE \'%featured in%\') AND (Q7.KEYWORD = \'helicopter\') AND (Q8.KIND = \'distributors\') AND (Q9.COUNTRY_CODE = \'[us]\') AND Q4.INFO IN (\'Sweden               \', \'Norway               \', \'Germany              \', \'Denmark              \', \'Swedish              \', \'Denish               \', \'Norwegian            \', \'German       

In [26]:
import re

# Sample SQL text (multiline)
sql_text = explain_stmt['STATEMENT_TEXT'][0]

# Function to extract the FROM clause, handling multiline SQL
def extract_from_clause(sql_text):
    # Extract the FROM clause (up to WHERE, GROUP BY, etc.), handling newlines and extra spaces
    from_clause_match = re.search(r'\bFROM\b\s+(.+?)\s*(\bWHERE\b|\bGROUP BY\b|\bORDER BY\b|$)', sql_text, re.IGNORECASE | re.DOTALL)
    if from_clause_match:
        from_clause = from_clause_match.group(1).strip()
        return from_clause
    return None

# Function to extract table names and their aliases from the FROM clause
def extract_table_aliases(sql_text):
    # Extract the FROM clause
    from_clause = extract_from_clause(sql_text)
    
    if from_clause:
        print(f"Captured FROM clause: {from_clause}")  # Debugging step to print captured FROM clause

        # Find all table names and aliases in the form "table AS alias" or "table alias"
        table_alias_pairs = re.findall(r'([A-Za-z0-9_\.]+)\s+(?:AS\s+)?([A-Za-z0-9_]+)', from_clause, re.IGNORECASE)

        # Flip the dictionary: aliases as keys, table names as values
        alias_table_dict = {pair[1]: pair[0] for pair in table_alias_pairs}
        return alias_table_dict
    return {}

# Extract table aliases (flipped)
alias_table_dict = extract_table_aliases(sql_text)

# Display the extracted alias-table dictionary
print("Extracted alias-table dictionary:", alias_table_dict)


Captured FROM clause: AHNAF.TITLE AS Q1, AHNAF.MOVIE_LINK AS Q2, AHNAF.MOVIE_KEYWORD AS Q3, AHNAF.MOVIE_INFO AS Q4, AHNAF.MOVIE_COMPANIES AS Q5, AHNAF.LINK_TYPE AS Q6, AHNAF.KEYWORD AS Q7, AHNAF.COMPANY_TYPE AS Q8, AHNAF.COMPANY_NAME AS Q9
Extracted alias-table dictionary: {'Q1': 'AHNAF.TITLE', 'Q2': 'AHNAF.MOVIE_LINK', 'Q3': 'AHNAF.MOVIE_KEYWORD', 'Q4': 'AHNAF.MOVIE_INFO', 'Q5': 'AHNAF.MOVIE_COMPANIES', 'Q6': 'AHNAF.LINK_TYPE', 'Q7': 'AHNAF.KEYWORD', 'Q8': 'AHNAF.COMPANY_TYPE', 'Q9': 'AHNAF.COMPANY_NAME'}


In [28]:
df

Unnamed: 0,EXPLAIN_TIME,OPERATOR_ID,OPERATOR_TYPE,SOURCE_ID,TABLE,PREDICATE1,PREDICATE2,PREDICATE3,FILTER_FACTOR1,FILTER_FACTOR2,FILTER_FACTOR3,JOIN_KEY
0,2024-12-03-20.44.07.528574,1,RETURN,,,,,,,,,
1,2024-12-03-20.44.07.528574,2,TQ,,,,,,,,,
2,2024-12-03-20.44.07.528574,3,HSJOIN,,,,,,,,,Q4.MOVIE_ID = Q1.ID
3,2024-12-03-20.44.07.528574,4,TBSCAN,-1.0,AHNAF.TITLE,Q1.PRODUCTION_YEAR <= 2020,1950 <= Q1.PRODUCTION_YEAR,,0.971485,0.894801,,
4,2024-12-03-20.44.07.528574,5,HSJOIN,,,,,,,,,Q5.MOVIE_ID = Q4.MOVIE_ID
5,2024-12-03-20.44.07.528574,6,TBSCAN,-1.0,AHNAF.MOVIE_INFO,"Q4.INFO IN ('Sweden ', 'Norway ', 'Germany ', 'Denmark ', 'Swedish ', 'Denish ', 'Norwegian ', 'German ', 'USA ', 'CANADA ', 'Netherlands ', 'Brazil ', 'UK ', 'Belgium ', 'Finland ', 'Hungary ', 'Estonia ', 'worldwide ', 'Australia ', 'Spain ', 'France ', 'Japan ', 'Columbia ', 'Slovenia ', 'Israel ', 'Venezuela ', 'Nigeria ', 'Philippines ', 'New Zealand ', 'Ireland ', 'Romania ', 'Non-USA ', 'Bulgaria ', 'Argentina ', 'Malaysia ', 'Singapore ', 'Turkey ', 'Sri Lanka ', 'Italy ', 'Indonesia ', 'South Korea ', 'Vietnam ', 'Slovakia ', 'Czech Republic ', 'China ', 'Portugal ', 'Greece ', 'Republic of Macedonia', 'Serbia ', 'Jamaica ', 'Switzerland ', 'Yugoslavia ', 'Mexico ', 'Austria ', 'Russia ')",,,0.029517,,,
6,2024-12-03-20.44.07.528574,7,HSJOIN,,,,,,,,,Q5.COMPANY_ID = Q9.ID
7,2024-12-03-20.44.07.528574,8,TBSCAN,-1.0,AHNAF.COMPANY_NAME,Q9.COUNTRY_CODE = '[us]',,,0.359728,,,
8,2024-12-03-20.44.07.528574,9,HSJOIN,,,,,,,,,Q5.COMPANY_TYPE_ID = Q8.ID
9,2024-12-03-20.44.07.528574,10,HSJOIN,,,,,,,,,Q2.MOVIE_ID = Q5.MOVIE_ID


In [29]:
pd.set_option('display.max_colwidth', None)

In [30]:
import pandas as pd
import re

# # Sample alias-table dictionary extracted from SQL
# alias_table_dict = {
#     'Q1': 'TPCDS.CUSTOMER',
#     'Q2': 'TPCDS.DATE_DIM2'
# }

# Function to replace all aliases in the predicate with the corresponding table names using alias_table_dict
def replace_alias_with_table(predicate, alias_table_dict):
    if isinstance(predicate, str):
        # Replace all aliases in the string with the corresponding table names
        for alias, table_name in alias_table_dict.items():
            predicate = re.sub(fr'\b{alias}\b\.', f'{table_name}.', predicate)
    return predicate

# Apply the replacement to each predicate column
for index, row in df.iterrows():
    df.at[index, 'PREDICATE1'] = replace_alias_with_table(row['PREDICATE1'], alias_table_dict)
    df.at[index, 'PREDICATE2'] = replace_alias_with_table(row['PREDICATE2'], alias_table_dict)
    df.at[index, 'PREDICATE3'] = replace_alias_with_table(row['PREDICATE3'], alias_table_dict)
    df.at[index, 'JOIN_KEY'] = replace_alias_with_table(row['JOIN_KEY'], alias_table_dict)

# Display the updated DataFrame
print(df)


                  EXPLAIN_TIME  OPERATOR_ID OPERATOR_TYPE  SOURCE_ID                  TABLE  \
0   2024-12-03-20.44.07.528574            1        RETURN        NaN                    NaN   
1   2024-12-03-20.44.07.528574            2        TQ            NaN                    NaN   
2   2024-12-03-20.44.07.528574            3        HSJOIN        NaN                    NaN   
3   2024-12-03-20.44.07.528574            4        TBSCAN       -1.0            AHNAF.TITLE   
4   2024-12-03-20.44.07.528574            5        HSJOIN        NaN                    NaN   
5   2024-12-03-20.44.07.528574            6        TBSCAN       -1.0       AHNAF.MOVIE_INFO   
6   2024-12-03-20.44.07.528574            7        HSJOIN        NaN                    NaN   
7   2024-12-03-20.44.07.528574            8        TBSCAN       -1.0     AHNAF.COMPANY_NAME   
8   2024-12-03-20.44.07.528574            9        HSJOIN        NaN                    NaN   
9   2024-12-03-20.44.07.528574           10       

In [31]:
df

Unnamed: 0,EXPLAIN_TIME,OPERATOR_ID,OPERATOR_TYPE,SOURCE_ID,TABLE,PREDICATE1,PREDICATE2,PREDICATE3,FILTER_FACTOR1,FILTER_FACTOR2,FILTER_FACTOR3,JOIN_KEY
0,2024-12-03-20.44.07.528574,1,RETURN,,,,,,,,,
1,2024-12-03-20.44.07.528574,2,TQ,,,,,,,,,
2,2024-12-03-20.44.07.528574,3,HSJOIN,,,,,,,,,AHNAF.MOVIE_INFO.MOVIE_ID = AHNAF.TITLE.ID
3,2024-12-03-20.44.07.528574,4,TBSCAN,-1.0,AHNAF.TITLE,AHNAF.TITLE.PRODUCTION_YEAR <= 2020,1950 <= AHNAF.TITLE.PRODUCTION_YEAR,,0.971485,0.894801,,
4,2024-12-03-20.44.07.528574,5,HSJOIN,,,,,,,,,AHNAF.MOVIE_COMPANIES.MOVIE_ID = AHNAF.MOVIE_INFO.MOVIE_ID
5,2024-12-03-20.44.07.528574,6,TBSCAN,-1.0,AHNAF.MOVIE_INFO,"AHNAF.MOVIE_INFO.INFO IN ('Sweden ', 'Norway ', 'Germany ', 'Denmark ', 'Swedish ', 'Denish ', 'Norwegian ', 'German ', 'USA ', 'CANADA ', 'Netherlands ', 'Brazil ', 'UK ', 'Belgium ', 'Finland ', 'Hungary ', 'Estonia ', 'worldwide ', 'Australia ', 'Spain ', 'France ', 'Japan ', 'Columbia ', 'Slovenia ', 'Israel ', 'Venezuela ', 'Nigeria ', 'Philippines ', 'New Zealand ', 'Ireland ', 'Romania ', 'Non-USA ', 'Bulgaria ', 'Argentina ', 'Malaysia ', 'Singapore ', 'Turkey ', 'Sri Lanka ', 'Italy ', 'Indonesia ', 'South Korea ', 'Vietnam ', 'Slovakia ', 'Czech Republic ', 'China ', 'Portugal ', 'Greece ', 'Republic of Macedonia', 'Serbia ', 'Jamaica ', 'Switzerland ', 'Yugoslavia ', 'Mexico ', 'Austria ', 'Russia ')",,,0.029517,,,
6,2024-12-03-20.44.07.528574,7,HSJOIN,,,,,,,,,AHNAF.MOVIE_COMPANIES.COMPANY_ID = AHNAF.COMPANY_NAME.ID
7,2024-12-03-20.44.07.528574,8,TBSCAN,-1.0,AHNAF.COMPANY_NAME,AHNAF.COMPANY_NAME.COUNTRY_CODE = '[us]',,,0.359728,,,
8,2024-12-03-20.44.07.528574,9,HSJOIN,,,,,,,,,AHNAF.MOVIE_COMPANIES.COMPANY_TYPE_ID = AHNAF.COMPANY_TYPE.ID
9,2024-12-03-20.44.07.528574,10,HSJOIN,,,,,,,,,AHNAF.MOVIE_LINK.MOVIE_ID = AHNAF.MOVIE_COMPANIES.MOVIE_ID


# Create a dictionary of column distributions

In [26]:
hist = {}

In [27]:
# Iterating through the dataframe and checking the values in PREDICATE1, PREDICATE2, and PREDICATE3
for index, row in df.iterrows():
    if row['PREDICATE1'] is not None:
        print(f"Row {index}, PREDICATE1: {row['PREDICATE1']}")
    if row['PREDICATE2'] is not None:
        print(f"Row {index}, PREDICATE2: {row['PREDICATE2']}")
    if row['PREDICATE3'] is not None:
        print(f"Row {index}, PREDICATE3: {row['PREDICATE3']}")


Row 0, PREDICATE1: nan
Row 0, PREDICATE2: nan
Row 0, PREDICATE3: nan
Row 1, PREDICATE1: nan
Row 1, PREDICATE2: nan
Row 1, PREDICATE3: nan
Row 2, PREDICATE1: nan
Row 2, PREDICATE2: nan
Row 2, PREDICATE3: nan
Row 3, PREDICATE1: AHNAF.TITLE.PRODUCTION_YEAR <= 2020
Row 3, PREDICATE2: 1950 <= AHNAF.TITLE.PRODUCTION_YEAR
Row 4, PREDICATE1: nan
Row 4, PREDICATE2: nan
Row 4, PREDICATE3: nan
Row 5, PREDICATE1: AHNAF.MOVIE_INFO.INFO IN ('Sweden               ', 'Norway               ', 'Germany              ', 'Denmark              ', 'Swedish              ', 'Denish               ', 'Norwegian            ', 'German               ', 'USA                  ', 'CANADA               ', 'Netherlands          ', 'Brazil               ', 'UK                   ', 'Belgium              ', 'Finland              ', 'Hungary              ', 'Estonia              ', 'worldwide            ', 'Australia            ', 'Spain                ', 'France               ', 'Japan                ', 'Columbia          

In [28]:
df[['PREDICATE1', 'PREDICATE2', 'PREDICATE3']]

Unnamed: 0,PREDICATE1,PREDICATE2,PREDICATE3
0,,,
1,,,
2,,,
3,AHNAF.TITLE.PRODUCTION_YEAR <= 2020,1950 <= AHNAF.TITLE.PRODUCTION_YEAR,
4,,,
5,"AHNAF.MOVIE_INFO.INFO IN ('Sweden ', 'Norway ', 'Germany ', 'Denmark ', 'Swedish ', 'Denish ', 'Norwegian ', 'German ', 'USA ', 'CANADA ', 'Netherlands ', 'Brazil ', 'UK ', 'Belgium ', 'Finland ', 'Hungary ', 'Estonia ', 'worldwide ', 'Australia ', 'Spain ', 'France ', 'Japan ', 'Columbia ', 'Slovenia ', 'Israel ', 'Venezuela ', 'Nigeria ', 'Philippines ', 'New Zealand ', 'Ireland ', 'Romania ', 'Non-USA ', 'Bulgaria ', 'Argentina ', 'Malaysia ', 'Singapore ', 'Turkey ', 'Sri Lanka ', 'Italy ', 'Indonesia ', 'South Korea ', 'Vietnam ', 'Slovakia ', 'Czech Republic ', 'China ', 'Portugal ', 'Greece ', 'Republic of Macedonia', 'Serbia ', 'Jamaica ', 'Switzerland ', 'Yugoslavia ', 'Mexico ', 'Austria ', 'Russia ')",,
6,,,
7,AHNAF.COMPANY_NAME.COUNTRY_CODE = '[us]',,
8,,,
9,,,


In [29]:
df.columns

Index(['EXPLAIN_TIME', 'OPERATOR_ID', 'OPERATOR_TYPE', 'TABLE', 'PREDICATE1', 'PREDICATE2', 'PREDICATE3', 'JOIN_KEY'], dtype='object')

# Parse predicate text for extracting (col, op, val) and storing them in a new column

In [44]:
import pandas as pd
import re

# Function to parse a single predicate
def parse_predicate(predicate):
    print('parse_predicate: ', predicate)
    if predicate is None or pd.isna(predicate):
        return None
    # Match basic predicate (col, op, val)
    basic_pattern = r'(?P<col>[\w\.]+)\s*(?P<op><=|>=|<|>|=|!=|LIKE)\s*(?P<val>.+)'
    # Match IN predicate
    in_pattern = r'(?P<col>[\w\.]+)\s+IN\s+\((?P<val>.+?)\)'

    # Try to match basic predicate
    match = re.match(basic_pattern, predicate.strip())
    if match:
        col = match.group('col')
        op = match.group('op')
        val = match.group('val')
        return (col, op, val)

    # Try to match IN predicate
    match = re.match(in_pattern, predicate.strip(), flags=re.IGNORECASE)
    if match:
        col = match.group('col')
        op = 'IN'
        val = match.group('val')
       
        # Split by commas and clean spaces inside quotes
        # Split by commas and clean spaces inside quotes
        val_list = [
            f"'{v.strip().strip('\'').strip()}'" if v.strip().startswith("'") else v.strip()
            for v in val.split(',')
        ]
        
        return (col, op, val_list)

    return None

# Create PARSED_PREDICATE columns dynamically
for i in range(1, 4):  # Assuming PREDICATE1, PREDICATE2, PREDICATE3
    predicate_col = f'PREDICATE{i}'
    parsed_col = f'PARSED_PREDICATE{i}'
    df[parsed_col] = df[predicate_col].apply(parse_predicate)

# Display the result
#import ace_tools as tools; tools.display_dataframe_to_user(name="Parsed Predicates DataFrame with Dynamic Parsing", dataframe=df)


parse_predicate:  nan
parse_predicate:  nan
parse_predicate:  nan
parse_predicate:  AHNAF.TITLE.PRODUCTION_YEAR <= 2020
parse_predicate:  nan
parse_predicate:  AHNAF.MOVIE_INFO.INFO IN ('Sweden               ', 'Norway               ', 'Germany              ', 'Denmark              ', 'Swedish              ', 'Denish               ', 'Norwegian            ', 'German               ', 'USA                  ', 'CANADA               ', 'Netherlands          ', 'Brazil               ', 'UK                   ', 'Belgium              ', 'Finland              ', 'Hungary              ', 'Estonia              ', 'worldwide            ', 'Australia            ', 'Spain                ', 'France               ', 'Japan                ', 'Columbia             ', 'Slovenia             ', 'Israel               ', 'Venezuela            ', 'Nigeria              ', 'Philippines          ', 'New Zealand          ', 'Ireland              ', 'Romania              ', 'Non-USA              ', 'Bulgaria     

In [45]:
df

Unnamed: 0,EXPLAIN_TIME,OPERATOR_ID,OPERATOR_TYPE,TABLE,PREDICATE1,PREDICATE2,PREDICATE3,JOIN_KEY,PARSED_PREDICATE1,PARSED_PREDICATE2,PARSED_PREDICATE3
0,2024-12-03-20.44.07.528574,1,RETURN,,,,,,,,
1,2024-12-03-20.44.07.528574,2,TQ,,,,,,,,
2,2024-12-03-20.44.07.528574,3,HSJOIN,,,,,AHNAF.MOVIE_INFO.MOVIE_ID = AHNAF.TITLE.ID,,,
3,2024-12-03-20.44.07.528574,4,TBSCAN,AHNAF.TITLE,AHNAF.TITLE.PRODUCTION_YEAR <= 2020,1950 <= AHNAF.TITLE.PRODUCTION_YEAR,,,"(AHNAF.TITLE.PRODUCTION_YEAR, <=, 2020)","(1950, <=, AHNAF.TITLE.PRODUCTION_YEAR)",
4,2024-12-03-20.44.07.528574,5,HSJOIN,,,,,AHNAF.MOVIE_COMPANIES.MOVIE_ID = AHNAF.MOVIE_INFO.MOVIE_ID,,,
5,2024-12-03-20.44.07.528574,6,TBSCAN,AHNAF.MOVIE_INFO,"AHNAF.MOVIE_INFO.INFO IN ('Sweden ', 'Norway ', 'Germany ', 'Denmark ', 'Swedish ', 'Denish ', 'Norwegian ', 'German ', 'USA ', 'CANADA ', 'Netherlands ', 'Brazil ', 'UK ', 'Belgium ', 'Finland ', 'Hungary ', 'Estonia ', 'worldwide ', 'Australia ', 'Spain ', 'France ', 'Japan ', 'Columbia ', 'Slovenia ', 'Israel ', 'Venezuela ', 'Nigeria ', 'Philippines ', 'New Zealand ', 'Ireland ', 'Romania ', 'Non-USA ', 'Bulgaria ', 'Argentina ', 'Malaysia ', 'Singapore ', 'Turkey ', 'Sri Lanka ', 'Italy ', 'Indonesia ', 'South Korea ', 'Vietnam ', 'Slovakia ', 'Czech Republic ', 'China ', 'Portugal ', 'Greece ', 'Republic of Macedonia', 'Serbia ', 'Jamaica ', 'Switzerland ', 'Yugoslavia ', 'Mexico ', 'Austria ', 'Russia ')",,,,"(AHNAF.MOVIE_INFO.INFO, IN, ['Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Denish', 'Norwegian', 'German', 'USA', 'CANADA', 'Netherlands', 'Brazil', 'UK', 'Belgium', 'Finland', 'Hungary', 'Estonia', 'worldwide', 'Australia', 'Spain', 'France', 'Japan', 'Columbia', 'Slovenia', 'Israel', 'Venezuela', 'Nigeria', 'Philippines', 'New Zealand', 'Ireland', 'Romania', 'Non-USA', 'Bulgaria', 'Argentina', 'Malaysia', 'Singapore', 'Turkey', 'Sri Lanka', 'Italy', 'Indonesia', 'South Korea', 'Vietnam', 'Slovakia', 'Czech Republic', 'China', 'Portugal', 'Greece', 'Republic of Macedonia', 'Serbia', 'Jamaica', 'Switzerland', 'Yugoslavia', 'Mexico', 'Austria', 'Russia'])",,
6,2024-12-03-20.44.07.528574,7,HSJOIN,,,,,AHNAF.MOVIE_COMPANIES.COMPANY_ID = AHNAF.COMPANY_NAME.ID,,,
7,2024-12-03-20.44.07.528574,8,TBSCAN,AHNAF.COMPANY_NAME,AHNAF.COMPANY_NAME.COUNTRY_CODE = '[us]',,,,"(AHNAF.COMPANY_NAME.COUNTRY_CODE, =, '[us]')",,
8,2024-12-03-20.44.07.528574,9,HSJOIN,,,,,AHNAF.MOVIE_COMPANIES.COMPANY_TYPE_ID = AHNAF.COMPANY_TYPE.ID,,,
9,2024-12-03-20.44.07.528574,10,HSJOIN,,,,,AHNAF.MOVIE_LINK.MOVIE_ID = AHNAF.MOVIE_COMPANIES.MOVIE_ID,,,


In [205]:
import re
import pandas as pd

# Function to parse predicate and extract the column
def parse_predicate_column(predicate):
    # Regular expression to match the format <column> <operator> <value>
    if isinstance(predicate, str):
        match = re.match(r'(\S+)\s*(>=|<=|>|<|=|!=)\s*(.*)', predicate)
        if match:
            column, _, _ = match.groups()
            return column
    return None

# Function to generate SQL queries for non-None predicates
def generate_sql_queries(df):
    queries = []
    for index, row in df.iterrows():
        table = row['TABLE']
        if table:
            # Check each predicate column (PREDICATE1, PREDICATE2, PREDICATE3)
            for predicate_col in ['PREDICATE1', 'PREDICATE2', 'PREDICATE3']:
                predicate = row[predicate_col]
                if isinstance(predicate, str):  # Ensure the predicate is a string
                    column = parse_predicate_column(predicate)
                    if column:
                        # Generate SQL query with ORDER BY to ensure sorting in ascending order
                        query = f"SELECT {column} FROM {table} ORDER BY {column} ASC;"
                        queries.append(query)
                        print(f"Generated query: {query}")
    return queries

# # Example DataFrame
# data = {
#     'TABLE': ['table1', 'table2', None],
#     'PREDICATE1': ['col1 >= 10', None, 'col3 < 5'],
#     'PREDICATE2': ['col2 <= 20', 'col4 != 100', None],
#     'PREDICATE3': [None, 'col5 = 50', 'col6 > 15']
# }

# df = pd.DataFrame(data)

# Call the function to generate and print the queries
sql_queries = generate_sql_queries(df)

Generated query: SELECT TPCDS.WEB_SALES.WS_ITEM_SK FROM TPCDS.WEB_SALES ORDER BY TPCDS.WEB_SALES.WS_ITEM_SK ASC;
Generated query: SELECT TPCDS.WEB_SALES.WS_NET_PAID_INC_TAX FROM TPCDS.WEB_SALES ORDER BY TPCDS.WEB_SALES.WS_NET_PAID_INC_TAX ASC;


In [206]:
# Dictionary to store the results
results_dict = {}

# List of SQL queries generated from the earlier steps
# sql_queries = [
#     "SELECT TPCDS.DATE_DIM2.D_YEAR FROM TPCDS.DATE_DIM2;",
#     "SELECT TPCDS.DATE_DIM2.D_MOY FROM TPCDS.DATE_DIM2;"
# ]

# Iterate through each query
for query in sql_queries:
    # Running the SQL query using Db2 magic command and getting the result into a pandas dataframe
    result_df = %sql {query}
    
    # Extracting the column name from the query (the first column in SELECT)
    full_column_name = query.split()[1]  # This is like 'TPCDS.DATE_DIM2.D_YEAR'
    
    # The actual column name may be simplified (e.g., 'D_YEAR'). Get it from the result_df columns.
    # We assume that the first column in result_df corresponds to the selected column.
    actual_column_name = result_df.columns[0]  # Take the first column of the dataframe

    # Converting the dataframe column into a list
    result_list = result_df[actual_column_name].tolist()
    
    # Adding the result list to the dictionary with the full column name as the key
    results_dict[full_column_name] = result_list

# Print the resulting dictionary
#print(results_dict)


In [207]:
results_dict.keys()

dict_keys(['TPCDS.WEB_SALES.WS_ITEM_SK', 'TPCDS.WEB_SALES.WS_NET_PAID_INC_TAX'])

In [208]:
import numpy as np
import pandas as pd

# Dictionary to store the bin boundaries for each column
bin_boundaries_dict = {}

# Iterate over each key (column name) and the associated list of values in the dictionary
for column, values_list in results_dict.items():
    if len(values_list) > 0:
        # Convert the list of values into a numpy array, ignoring NA values
        values_array = np.array([v for v in values_list if pd.notna(v)], dtype=float)
        
        if len(values_array) == 0:
            print(f"Column {column} has no valid numeric data.")
            continue
        
        # Increase the maximum value by 0.001 for exclusive upper boundary
        max_value = values_array.max() + 0.001
        values_array[values_array.argmax()] = max_value
        
        # Calculate the bin boundaries using quantiles (50 equal-sized bins)
        bin_edges = np.quantile(values_array, np.linspace(0, 1, 51))  # 50 bins (51 edges)
        
        # Store the bin boundaries in the dictionary for future retrieval
        bin_boundaries_dict[column] = bin_edges
        
        # Display the bin boundaries
        print(f"Column: {column}")
        print(f"Bin boundaries: {bin_edges}\n")

# To retrieve bin boundaries later
# For example, to retrieve the bin boundaries for 'TPCDS.DATE_DIM2.D_YEAR':
#retrieved_boundaries = bin_boundaries_dict['TPCDS.DATE_DIM2.D_YEAR']
#print(f"Retrieved bin boundaries for TPCDS.DATE_DIM2.D_YEAR: {retrieved_boundaries}")


Column: TPCDS.WEB_SALES.WS_ITEM_SK
Bin boundaries: [1.0000000e+00 3.6100000e+02 7.2300000e+02 1.0840000e+03 1.4400000e+03
 1.7980000e+03 2.1579600e+03 2.5220000e+03 2.8810000e+03 3.2380000e+03
 3.5960000e+03 3.9580000e+03 4.3180000e+03 4.6790000e+03 5.0390000e+03
 5.4010000e+03 5.7640000e+03 6.1180000e+03 6.4780000e+03 6.8370000e+03
 7.2010000e+03 7.5610000e+03 7.9230000e+03 8.2830000e+03 8.6430000e+03
 9.0040000e+03 9.3670000e+03 9.7310000e+03 1.0093000e+04 1.0456000e+04
 1.0819000e+04 1.1175000e+04 1.1535000e+04 1.1898000e+04 1.2258000e+04
 1.2618000e+04 1.2975000e+04 1.3336000e+04 1.3695000e+04 1.4054000e+04
 1.4412000e+04 1.4767000e+04 1.5127000e+04 1.5485000e+04 1.5846000e+04
 1.6202000e+04 1.6565000e+04 1.6928000e+04 1.7282000e+04 1.7643000e+04
 1.8000001e+04]

Column: TPCDS.WEB_SALES.WS_NET_PAID_INC_TAX
Bin boundaries: [0.0000000e+00 5.5316000e+00 1.9970000e+01 3.6550000e+01 5.5570000e+01
 7.6520000e+01 9.9850000e+01 1.2535000e+02 1.5364280e+02 1.8466440e+02
 2.1852000e+02 2.554

# Retrieve the bin boundaries for each non None predicate

In [209]:
import pandas as pd
import re

# Example dataframe
# data = {
#     'EXPLAIN_TIME': ['2024-10-08-04.48.13.056756']*4,
#     'OPERATOR_ID': [1, 2, 3, 4],
#     'OPERATOR_TYPE': ['RETURN', 'HSJOIN', 'TBSCAN', 'TBSCAN'],
#     'TABLE': [None, None, 'TPCDS.CUSTOMER', 'TPCDS.DATE_DIM2'],
#     'PREDICATE1': [None, None, None, 'TPCDS.DATE_DIM2.D_YEAR >= 1958'],
#     'PREDICATE2': [None, None, None, 'TPCDS.DATE_DIM2.D_MOY = 12'],
#     'PREDICATE3': [None, None, None, None],
#     'JOIN_KEY': [None, 'TPCDS.DATE_DIM2.D_DATE_SK = TPCDS.CUSTOMER.C_FIRST_SHIPTO_DATE_SK', None, None]
# }

# df = pd.DataFrame(data)

# Define a pattern to extract the column part from the predicate
pattern = r'([A-Z0-9_]+\.[A-Z0-9_]+\.[A-Z0-9_]+)'

# Iterate over each row and check PREDICATE1, PREDICATE2, PREDICATE3
for index, row in df.iterrows():
    for col in ['PREDICATE1', 'PREDICATE2', 'PREDICATE3']:
        predicate = row[col]
        if isinstance(predicate, str):
            # Extract the column part from the predicate
            match = re.search(pattern, predicate)
            if match:
                column_name = match.group(1)
                # Retrieve bin boundaries from the dictionary
                if column_name in bin_boundaries_dict:
                    bin_boundaries = bin_boundaries_dict[column_name]
                    print(f"Predicate: {predicate}")
                    print(f"Column: {column_name}")
                    print(f"Bin boundaries: {bin_boundaries}\n")


Predicate: TPCDS.WEB_SALES.WS_ITEM_SK >= 16111
Column: TPCDS.WEB_SALES.WS_ITEM_SK
Bin boundaries: [1.0000000e+00 3.6100000e+02 7.2300000e+02 1.0840000e+03 1.4400000e+03
 1.7980000e+03 2.1579600e+03 2.5220000e+03 2.8810000e+03 3.2380000e+03
 3.5960000e+03 3.9580000e+03 4.3180000e+03 4.6790000e+03 5.0390000e+03
 5.4010000e+03 5.7640000e+03 6.1180000e+03 6.4780000e+03 6.8370000e+03
 7.2010000e+03 7.5610000e+03 7.9230000e+03 8.2830000e+03 8.6430000e+03
 9.0040000e+03 9.3670000e+03 9.7310000e+03 1.0093000e+04 1.0456000e+04
 1.0819000e+04 1.1175000e+04 1.1535000e+04 1.1898000e+04 1.2258000e+04
 1.2618000e+04 1.2975000e+04 1.3336000e+04 1.3695000e+04 1.4054000e+04
 1.4412000e+04 1.4767000e+04 1.5127000e+04 1.5485000e+04 1.5846000e+04
 1.6202000e+04 1.6565000e+04 1.6928000e+04 1.7282000e+04 1.7643000e+04
 1.8000001e+04]

Predicate: TPCDS.WEB_SALES.WS_NET_PAID_INC_TAX <= +00183.48
Column: TPCDS.WEB_SALES.WS_NET_PAID_INC_TAX
Bin boundaries: [0.0000000e+00 5.5316000e+00 1.9970000e+01 3.6550000e+0

# computing fraction

In [210]:
import re
import numpy as np

# Initialize the dictionary to store the results
bin_coverage_dict = {}

# Function to extract the operator and value from a predicate
def extract_value(predicate):
    match = re.search(r'([<>]=?|=)\s*(\d+)', predicate)
    if match:
        operator, value = match.groups()
        return operator, int(value)
    return None, None

# Function to check which bins are covered by the predicate
def check_bins_coverage(operator, value, bin_edges):
    bin_coverage = []

    for i in range(len(bin_edges) - 1):
        bin_start = bin_edges[i]
        bin_end = bin_edges[i + 1]
        bin_range = bin_end - bin_start  # The width of the bin

        # Handle '>='
        if operator == '>=':
            if value <= bin_start:
                bin_coverage.append(1)  # Bin is entirely covered
            elif value > bin_end:
                bin_coverage.append(0)  # No coverage
            else:
                # Partial coverage - fraction of bin covered
                covered_fraction = (bin_end - value) / bin_range
                bin_coverage.append(covered_fraction)

        # Handle '>'
        elif operator == '>':
            if value < bin_start:
                bin_coverage.append(1)  # Bin is entirely covered
            elif value >= bin_end:
                bin_coverage.append(0)  # No coverage
            else:
                covered_fraction = (bin_end - value) / bin_range
                bin_coverage.append(covered_fraction)

        # Handle '<='
        elif operator == '<=':
            if value >= bin_end:
                bin_coverage.append(1)  # Bin is entirely covered
            elif value < bin_start:
                bin_coverage.append(0)  # No coverage
            else:
                covered_fraction = (value - bin_start) / bin_range
                bin_coverage.append(covered_fraction)

        # Handle '<'
        elif operator == '<':
            if value > bin_end:
                bin_coverage.append(1)  # Bin is entirely covered
            elif value <= bin_start:
                bin_coverage.append(0)  # No coverage
            else:
                covered_fraction = (value - bin_start) / bin_range
                bin_coverage.append(covered_fraction)

        # Handle '='
        elif operator == '=':
            if bin_start <= value < bin_end:
                bin_coverage.append(1)  # Exact match within the bin
            else:
                bin_coverage.append(0)

    return bin_coverage

# Iterate over each row and check PREDICATE1, PREDICATE2, PREDICATE3
for index, row in df.iterrows():
    for col in ['PREDICATE1', 'PREDICATE2', 'PREDICATE3']:
        predicate = row[col]
        if isinstance(predicate, str):
            # Extract the column part from the predicate
            match = re.search(pattern, predicate)
            if match:
                column_name = match.group(1)

                # Retrieve bin boundaries from the dictionary
                if column_name in bin_boundaries_dict:
                    bin_edges = bin_boundaries_dict[column_name]

                    # Extract the operator and value from the predicate
                    operator, value = extract_value(predicate)

                    if operator and value is not None:
                        # Check which bins are covered
                        bin_coverage = check_bins_coverage(operator, value, bin_edges)

                        # Store the result in the dictionary with the predicate as the key
                        bin_coverage_dict[predicate] = bin_coverage

                        print(f"Predicate: {predicate}")
                        print(f"Column: {column_name}")
                        print(f"Bin boundaries: {bin_edges}")
                        print(f"Bin coverage: {bin_coverage}\n")

# Now you can access the results in bin_coverage_dict
print(bin_coverage_dict)


Predicate: TPCDS.WEB_SALES.WS_ITEM_SK >= 16111
Column: TPCDS.WEB_SALES.WS_ITEM_SK
Bin boundaries: [1.0000000e+00 3.6100000e+02 7.2300000e+02 1.0840000e+03 1.4400000e+03
 1.7980000e+03 2.1579600e+03 2.5220000e+03 2.8810000e+03 3.2380000e+03
 3.5960000e+03 3.9580000e+03 4.3180000e+03 4.6790000e+03 5.0390000e+03
 5.4010000e+03 5.7640000e+03 6.1180000e+03 6.4780000e+03 6.8370000e+03
 7.2010000e+03 7.5610000e+03 7.9230000e+03 8.2830000e+03 8.6430000e+03
 9.0040000e+03 9.3670000e+03 9.7310000e+03 1.0093000e+04 1.0456000e+04
 1.0819000e+04 1.1175000e+04 1.1535000e+04 1.1898000e+04 1.2258000e+04
 1.2618000e+04 1.2975000e+04 1.3336000e+04 1.3695000e+04 1.4054000e+04
 1.4412000e+04 1.4767000e+04 1.5127000e+04 1.5485000e+04 1.5846000e+04
 1.6202000e+04 1.6565000e+04 1.6928000e+04 1.7282000e+04 1.7643000e+04
 1.8000001e+04]
Bin coverage: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2556179775280899, 1, 1, 1, 

# Calculating Tree Node heights

In [211]:
df

Unnamed: 0,EXPLAIN_TIME,OPERATOR_ID,OPERATOR_TYPE,TABLE,PREDICATE1,PREDICATE2,PREDICATE3,JOIN_KEY
0,2024-10-08-04.52.21.704464,1,RETURN,,,,,
1,2024-10-08-04.52.21.704464,2,HSJOIN,,,,,TPCDS.DATE_DIM2.D_DATE_SK = TPCDS.WEB_SALES.WS_SHIP_DATE_SK
2,2024-10-08-04.52.21.704464,3,TBSCAN,TPCDS.DATE_DIM2,,,,
3,2024-10-08-04.52.21.704464,4,FETCH,TPCDS.WEB_SALES,TPCDS.WEB_SALES.WS_ITEM_SK >= 16111,TPCDS.WEB_SALES.WS_NET_PAID_INC_TAX <= +00183.48,,
4,2024-10-08-04.52.21.704464,5,RIDSCN,,,,,
5,2024-10-08-04.52.21.704464,6,SORT,,,,,
6,2024-10-08-04.52.21.704464,7,IXSCAN,SYSIBM.SQL240509064049930,,,,


In [212]:
# Read the CSV file
file_path = 'SimpleQueriesSQ12c/EXPLAIN_STREAM.csv'
df_stream = pd.read_csv(file_path)

# Filter for the given EXPLAIN_TIME value and non-NaN OBJECT_NAME
df_stream = df_stream.loc[(df_stream['EXPLAIN_TIME'] == query1_ts)].copy()

In [213]:
df_stream.columns

Index(['EXPLAIN_REQUESTER', 'EXPLAIN_TIME', 'SOURCE_NAME', 'SOURCE_SCHEMA', 'SOURCE_VERSION', 'EXPLAIN_LEVEL', 'STMTNO', 'SECTNO', 'STREAM_ID', 'SOURCE_TYPE', 'SOURCE_ID', 'TARGET_TYPE', 'TARGET_ID', 'OBJECT_SCHEMA', 'OBJECT_NAME', 'STREAM_COUNT', 'COLUMN_COUNT', 'PREDICATE_ID', 'COLUMN_NAMES', 'PMID', 'SINGLE_NODE', 'PARTITION_COLUMNS', 'SEQUENCE_SIZES', 'OBJECT_TENANTID'], dtype='object')

In [214]:
df_stream = df_stream[['SOURCE_ID', 'TARGET_ID']]

In [215]:
df_stream.head()

Unnamed: 0,SOURCE_ID,TARGET_ID
569,-1,3
570,3,2
571,-1,7
572,7,6
573,6,5


In [216]:
df_stream

Unnamed: 0,SOURCE_ID,TARGET_ID
569,-1,3
570,3,2
571,-1,7
572,7,6
573,6,5
574,5,4
575,-1,4
576,4,2
577,2,1


In [217]:

import pandas as pd
import numpy as np

# Step 1: Generate the adjacency matrix (from previous code)
# data = {
#     'SOURCE_ID': [-1, 3, -1, 4, 2],
#     'TARGET_ID': [3, 2, 4, 2, 1]
# }
# df = pd.DataFrame(data)

# Extract all unique nodes from SOURCE_ID and TARGET_ID
node_ids = set(df_stream['SOURCE_ID']).union(set(df_stream['TARGET_ID'])) - {-1}
node_ids = sorted(node_ids)

# Map node ids to indices for the adjacency matrix
node_to_index = {node: idx for idx, node in enumerate(node_ids)}

# Initialize the adjacency matrix with zeros
n = len(node_ids)
adj_matrix = np.zeros((n, n), dtype=int)

# Fill the adjacency matrix based on the parent-child relationships
for _, row in df_stream.iterrows():
    source = row['SOURCE_ID']
    target = row['TARGET_ID']
    
    if source != -1:
        parent_idx = node_to_index[target]
        child_idx = node_to_index[source]
        adj_matrix[parent_idx][child_idx] = 1

# Convert to DataFrame for better display
adj_matrix_df = pd.DataFrame(adj_matrix, index=node_ids, columns=node_ids)
print("Adjacency Matrix:")
print(adj_matrix_df)


Adjacency Matrix:
   1  2  3  4  5  6  7
1  0  1  0  0  0  0  0
2  0  0  1  1  0  0  0
3  0  0  0  0  0  0  0
4  0  0  0  0  1  0  0
5  0  0  0  0  0  1  0
6  0  0  0  0  0  0  1
7  0  0  0  0  0  0  0


In [218]:
# Reset the index of the adjacency matrix to be 0-based
adj_matrix_df.index = adj_matrix_df.index - 1  # Convert row indices to 0-based
adj_matrix_df.columns = adj_matrix_df.columns - 1  # Convert column indices to 0-based

print("Adjacency Matrix with 0-Based Index:")
print(adj_matrix_df)

Adjacency Matrix with 0-Based Index:
   0  1  2  3  4  5  6
0  0  1  0  0  0  0  0
1  0  0  1  1  0  0  0
2  0  0  0  0  0  0  0
3  0  0  0  0  1  0  0
4  0  0  0  0  0  1  0
5  0  0  0  0  0  0  1
6  0  0  0  0  0  0  0


# Find the BFS sequence

In [219]:
import numpy as np
from collections import deque

def bfs_from_adj_matrix(adj_matrix, start_node):
    """
    Perform BFS on a graph represented by an adjacency matrix.

    Args:
        adj_matrix (np.ndarray): Adjacency matrix of the graph.
        start_node (int): Starting node for BFS.

    Returns:
        list: List of nodes in the order they are visited.
    """
    num_nodes = adj_matrix.shape[0]  # Number of nodes
    visited = [False] * num_nodes    # Track visited nodes
    queue = deque([start_node])      # Initialize queue with the start node
    visited[start_node] = True       # Mark the start node as visited

    bfs_order = []  # List to store BFS traversal order

    while queue:
        node = queue.popleft()       # Dequeue the next node
        bfs_order.append(node)       # Record the visited node

        # Check all neighbors (columns in the row of this node)
        for neighbor, is_connected in enumerate(adj_matrix[node]):
            if is_connected and not visited[neighbor]:  # If connected and not visited
                queue.append(neighbor)                  # Enqueue the neighbor
                visited[neighbor] = True                # Mark as visited

    return bfs_order

# Convert adj_matrix_df to NumPy array
adj_matrix_np = adj_matrix_df.values

# Perform BFS starting from node 0
bfs_order = bfs_from_adj_matrix(adj_matrix_np, start_node=0)
print("BFS Row Index Order:", bfs_order)


BFS Row Index Order: [0, 1, 2, 3, 4, 5, 6]


In [220]:
# Convert 0-based indexed adjacency matrix to adj_list
adj_list = []

# Iterate over the adjacency matrix
for parent_idx, row in enumerate(adj_matrix):  # parent_idx is the row index
    for child_idx, is_child in enumerate(row):  # child_idx is the column index
        if is_child:  # If there's a connection (value is 1)
            adj_list.append((parent_idx, child_idx))  # Use 0-based indices directly

adj_list_tensor = torch.LongTensor(np.array(adj_list))

print("Adjacency List:")
print(adj_list_tensor)


Adjacency List:
tensor([[0, 1],
        [1, 2],
        [1, 3],
        [3, 4],
        [4, 5],
        [5, 6]])


In [221]:
# Step 1: Perform BFS to get the order of nodes
bfs_order = bfs_from_adj_matrix(adj_matrix, start_node=0)  # Adjust start_node as needed

# Step 2: Create a mapping from BFS order to indices for sorting
bfs_index_map = {node: i for i, node in enumerate(bfs_order)}

# Step 3: Reorder adj_list based on BFS order
adj_list = [
    (parent, child)
    for parent in bfs_order
    for child, is_child in enumerate(adj_matrix[parent])
    if is_child  # Keep only valid edges
]

# Step 4: Convert to tensor if needed
adj_list_tensor = torch.LongTensor(np.array(adj_list))

print("Reordered Adjacency List:")
print(adj_list_tensor)

Reordered Adjacency List:
tensor([[0, 1],
        [1, 2],
        [1, 3],
        [3, 4],
        [4, 5],
        [5, 6]])


In [222]:
edge_index = adj_list_tensor.t()

In [223]:
edge_index

tensor([[0, 1, 1, 3, 4, 5],
        [1, 2, 3, 4, 5, 6]])

In [224]:
from model.database_util import floyd_warshall_rewrite

In [225]:
df

Unnamed: 0,EXPLAIN_TIME,OPERATOR_ID,OPERATOR_TYPE,TABLE,PREDICATE1,PREDICATE2,PREDICATE3,JOIN_KEY
0,2024-10-08-04.52.21.704464,1,RETURN,,,,,
1,2024-10-08-04.52.21.704464,2,HSJOIN,,,,,TPCDS.DATE_DIM2.D_DATE_SK = TPCDS.WEB_SALES.WS_SHIP_DATE_SK
2,2024-10-08-04.52.21.704464,3,TBSCAN,TPCDS.DATE_DIM2,,,,
3,2024-10-08-04.52.21.704464,4,FETCH,TPCDS.WEB_SALES,TPCDS.WEB_SALES.WS_ITEM_SK >= 16111,TPCDS.WEB_SALES.WS_NET_PAID_INC_TAX <= +00183.48,,
4,2024-10-08-04.52.21.704464,5,RIDSCN,,,,,
5,2024-10-08-04.52.21.704464,6,SORT,,,,,
6,2024-10-08-04.52.21.704464,7,IXSCAN,SYSIBM.SQL240509064049930,,,,


In [226]:
bfs_order

[0, 1, 2, 3, 4, 5, 6]

In [227]:
# Reorder the DataFrame rows based on BFS order
df = df.iloc[bfs_order].reset_index(drop=True)

df

Unnamed: 0,EXPLAIN_TIME,OPERATOR_ID,OPERATOR_TYPE,TABLE,PREDICATE1,PREDICATE2,PREDICATE3,JOIN_KEY
0,2024-10-08-04.52.21.704464,1,RETURN,,,,,
1,2024-10-08-04.52.21.704464,2,HSJOIN,,,,,TPCDS.DATE_DIM2.D_DATE_SK = TPCDS.WEB_SALES.WS_SHIP_DATE_SK
2,2024-10-08-04.52.21.704464,3,TBSCAN,TPCDS.DATE_DIM2,,,,
3,2024-10-08-04.52.21.704464,4,FETCH,TPCDS.WEB_SALES,TPCDS.WEB_SALES.WS_ITEM_SK >= 16111,TPCDS.WEB_SALES.WS_NET_PAID_INC_TAX <= +00183.48,,
4,2024-10-08-04.52.21.704464,5,RIDSCN,,,,,
5,2024-10-08-04.52.21.704464,6,SORT,,,,,
6,2024-10-08-04.52.21.704464,7,IXSCAN,SYSIBM.SQL240509064049930,,,,


In [228]:
df.shape
N = df.shape[0]

In [229]:
if len(edge_index) == 0:
            shortest_path_result = np.array([[0]])
            path = np.array([[0]])
            adj = torch.tensor([[0]]).bool()
else:
            adj = torch.zeros([N,N], dtype=torch.bool)
            adj[edge_index[0,:], edge_index[1,:]] = True
            
            shortest_path_result = floyd_warshall_rewrite(adj.numpy())
        
rel_pos = torch.from_numpy((shortest_path_result)).long()

In [230]:
rel_pos

tensor([[ 0,  1,  2,  2,  3,  4,  5],
        [60,  0,  1,  1,  2,  3,  4],
        [60, 60,  0, 60, 60, 60, 60],
        [60, 60, 60,  0,  1,  2,  3],
        [60, 60, 60, 60,  0,  1,  2],
        [60, 60, 60, 60, 60,  0,  1],
        [60, 60, 60, 60, 60, 60,  0]])

In [231]:
def pad_attn_bias_unsqueeze(x, padlen):
    xlen = x.size(0)
    if xlen < padlen:
        new_x = x.new_zeros([padlen, padlen], dtype=x.dtype).fill_(float('-inf'))
        new_x[:xlen, :xlen] = x
        new_x[xlen:, :xlen] = 0
        x = new_x
    return x.unsqueeze(0)

In [232]:
attn_bias = torch.zeros([N+1,N+1], dtype=torch.float)
attn_bias[1:, 1:][rel_pos >= rel_pos_max] = float('-inf')

In [233]:
attn_bias = pad_attn_bias_unsqueeze(attn_bias, max_node + 1)

In [234]:
def pad_rel_pos_unsqueeze(x, padlen):
    x = x + 1
    xlen = x.size(0)
    if xlen < padlen:
        new_x = x.new_zeros([padlen, padlen], dtype=x.dtype)
        new_x[:xlen, :xlen] = x
        x = new_x
    return x.unsqueeze(0)

In [235]:
rel_pos.shape

torch.Size([7, 7])

In [236]:
rel_pos

tensor([[ 0,  1,  2,  2,  3,  4,  5],
        [60,  0,  1,  1,  2,  3,  4],
        [60, 60,  0, 60, 60, 60, 60],
        [60, 60, 60,  0,  1,  2,  3],
        [60, 60, 60, 60,  0,  1,  2],
        [60, 60, 60, 60, 60,  0,  1],
        [60, 60, 60, 60, 60, 60,  0]])

# creating attn_bias tensor

In [237]:
attn_bias = torch.zeros([N+1,N+1], dtype=torch.float)
attn_bias[1:, 1:][rel_pos >= rel_pos_max] = float('-inf')
#attn_bias = pad_attn_bias_unsqueeze(attn_bias, max_node + 1)

In [238]:
attn_bias = pad_attn_bias_unsqueeze(attn_bias, max_node + 1)
rel_pos = pad_rel_pos_unsqueeze(rel_pos, max_node)

In [239]:
attn_bias.shape

torch.Size([1, 31, 31])

In [240]:
rel_pos.shape

torch.Size([1, 30, 30])

# Computing longest path for each node

In [241]:
# Step 6: Define a function to compute the longest path from a node to a leaf
def longest_path_from_node(node_idx, adj_matrix, memo):
    # If this node's longest path is already computed, return the cached result
    if node_idx in memo:
        return memo[node_idx]
    
    # If the node is a leaf node (no outgoing edges)
    if np.sum(adj_matrix[node_idx]) == 0:
        memo[node_idx] = 0  # Longest path from a leaf is 0
        return 0
    
    # Otherwise, recursively calculate the longest path from this node to a leaf
    max_length = 0
    for child_idx in range(len(adj_matrix)):
        if adj_matrix[node_idx][child_idx] == 1:  # There's a connection to this child
            max_length = max(max_length, 1 + longest_path_from_node(child_idx, adj_matrix, memo))
    
    memo[node_idx] = max_length  # Memoize the result
    return memo[node_idx]

memo = {}
longest_paths = {}

for node in node_ids:
    node_idx = node_to_index[node]
    longest_paths[node] = longest_path_from_node(node_idx, adj_matrix, memo) + 1

# Step 8: Display the longest path for each node
print("\nLongest Path from Each Node to a Leaf:")
for node, length in longest_paths.items():
    print(f"Node {node}: Longest path = {length}")


Longest Path from Each Node to a Leaf:
Node 1: Longest path = 6
Node 2: Longest path = 5
Node 3: Longest path = 1
Node 4: Longest path = 4
Node 5: Longest path = 3
Node 6: Longest path = 2
Node 7: Longest path = 1


In [242]:
type(longest_paths)

dict

In [243]:
longest_paths.keys()

dict_keys([1, 2, 3, 4, 5, 6, 7])

In [244]:
# Add a new column 'height' based on the matching OPERATOR_ID
df['height'] = df['OPERATOR_ID'].map(longest_paths)

In [245]:
df.shape

(7, 9)

In [246]:
df

Unnamed: 0,EXPLAIN_TIME,OPERATOR_ID,OPERATOR_TYPE,TABLE,PREDICATE1,PREDICATE2,PREDICATE3,JOIN_KEY,height
0,2024-10-08-04.52.21.704464,1,RETURN,,,,,,6
1,2024-10-08-04.52.21.704464,2,HSJOIN,,,,,TPCDS.DATE_DIM2.D_DATE_SK = TPCDS.WEB_SALES.WS_SHIP_DATE_SK,5
2,2024-10-08-04.52.21.704464,3,TBSCAN,TPCDS.DATE_DIM2,,,,,1
3,2024-10-08-04.52.21.704464,4,FETCH,TPCDS.WEB_SALES,TPCDS.WEB_SALES.WS_ITEM_SK >= 16111,TPCDS.WEB_SALES.WS_NET_PAID_INC_TAX <= +00183.48,,,4
4,2024-10-08-04.52.21.704464,5,RIDSCN,,,,,,3
5,2024-10-08-04.52.21.704464,6,SORT,,,,,,2
6,2024-10-08-04.52.21.704464,7,IXSCAN,SYSIBM.SQL240509064049930,,,,,1


# Creating a tensor of heights

In [247]:
heights = df['height'].values

In [248]:
heights

array([6, 5, 1, 4, 3, 2, 1])

In [249]:
heights = pad_1d_unsqueeze(torch.LongTensor(heights), max_node)


In [250]:
heights

tensor([[7, 6, 2, 5, 4, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]])

In [251]:
heights.shape

torch.Size([1, 30])

In [252]:
N

7

In [253]:
heights

tensor([[7, 6, 2, 5, 4, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]])

# Look up the `SORT_SHRHEAP_TOP` from the activity files

In [254]:
### Step 1: look up the `APPL_ID`, 'ACTIVITY_ID', 'UOW_ID'
# Define the file path and the timestamp value
file_path = 'SimpleQueriesSQ12c/success.csv'  # Replace with your CSV file path

# Read the CSV file, selecting only relevant columns
column_headers = ['QUERYID', 'APPL_ID', 'UOW_ID', 'ACTIVITY_ID', 'EXPLAIN_TIME', 'QUERY']
df_success = pd.read_csv(file_path, header=None)

df_success.columns = column_headers

In [255]:
df_success.shape

(6957, 6)

In [256]:
df_query1 = df_success[df_success['EXPLAIN_TIME'] == query1_ts]

In [257]:
df_query1

Unnamed: 0,QUERYID,APPL_ID,UOW_ID,ACTIVITY_ID,EXPLAIN_TIME,QUERY
81,83,*LOCAL.shaikhq.241008114819,656.0,1.0,2024-10-08-04.52.21.704464,"SELECT TPCDS.WEB_SALES.WS_EXT_TAX , TPCDS.DATE_DIM2.D_DOM FROM TPCDS.WEB_SALES INNER JOIN TPCDS.DATE_DIM2 ON TPCDS.DATE_DIM2.D_DATE_SK = TPCDS.WEB_SALES.WS_SHIP_DATE_SK WHERE TPCDS.WEB_SALES.WS_NET_PAID_INC_TAX <= +00183.48 AND TPCDS.WEB_SALES.WS_ITEM_SK >= 16111"


In [258]:
query1_APPL_ID = df_query1['APPL_ID'].values[0]
query1_UOW_ID = int(df_query1['UOW_ID'].values[0])
query1_ACTIVITY_ID = int(df_query1['ACTIVITY_ID'].values[0])

In [259]:
query1_APPL_ID

'*LOCAL.shaikhq.241008114819'

In [260]:
query1_UOW_ID

656

In [261]:
query1_ACTIVITY_ID

1

In [262]:
# Now read the ACTIVITY_DB2ACTIVITIES.csv file

file_path = 'SimpleQueriesSQ12c/ACTIVITY_DB2ACTIVITIES.csv'  # Replace with your CSV file path

# Read the CSV file, selecting only relevant columns
columns_to_read = ['APPL_ID', 'UOW_ID', 'ACTIVITY_ID', 'SORT_SHRHEAP_TOP']
df_activity = pd.read_csv(file_path, usecols=columns_to_read, encoding='ISO-8859-1')

df_activity_query1 = df_activity[(df_activity['APPL_ID'] == query1_APPL_ID) & 
                                  (df_activity['UOW_ID'] == query1_UOW_ID) &
                                 (df_activity['ACTIVITY_ID'] == query1_ACTIVITY_ID)]


In [263]:
df_activity_query1.shape

(0, 4)

In [264]:
df_activity_view = %sql SELECT * FROM ACTIVITY_VIEW; 

In [265]:
df_activity_view.columns

Index(['ACTIVITY_ID', 'APPL_ID', 'UOW_ID', 'SORT_SHRHEAP_TOP', 'STMT_TEXT'], dtype='object')

In [266]:
df_success.columns

Index(['QUERYID', 'APPL_ID', 'UOW_ID', 'ACTIVITY_ID', 'EXPLAIN_TIME', 'QUERY'], dtype='object')

In [267]:
df_success.shape

(6957, 6)

In [268]:
df_activity_view.shape

(3229, 5)

In [269]:
output_df = pd.merge(
    df_success,
    df_activity_view,
    left_on=['ACTIVITY_ID', 'APPL_ID', 'UOW_ID', 'QUERY'],
    right_on=['ACTIVITY_ID', 'APPL_ID', 'UOW_ID', 'STMT_TEXT'],
    how='inner'
)

In [270]:
output_df.shape

(2536, 8)

In [271]:
output_df = output_df[['QUERYID', 'EXPLAIN_TIME', 'QUERY', 'SORT_SHRHEAP_TOP']]

In [272]:
output_df.shape

(2536, 4)

In [273]:
output_df.to_csv("query_list.csv")

In [274]:
# output_df.sample(10)

In [275]:
output_df[output_df['EXPLAIN_TIME'] == query1_ts]

Unnamed: 0,QUERYID,EXPLAIN_TIME,QUERY,SORT_SHRHEAP_TOP
30,83,2024-10-08-04.52.21.704464,"SELECT TPCDS.WEB_SALES.WS_EXT_TAX , TPCDS.DATE_DIM2.D_DOM FROM TPCDS.WEB_SALES INNER JOIN TPCDS.DATE_DIM2 ON TPCDS.DATE_DIM2.D_DATE_SK = TPCDS.WEB_SALES.WS_SHIP_DATE_SK WHERE TPCDS.WEB_SALES.WS_NET_PAID_INC_TAX <= +00183.48 AND TPCDS.WEB_SALES.WS_ITEM_SK >= 16111",632


## List the final set of dataframes, which I will convert to tensors

In [276]:
# get the SORT_SHRHEAP_TOP
raw_costs = output_df[output_df['EXPLAIN_TIME'] == query1_ts]['SORT_SHRHEAP_TOP'].values.tolist()
QUERYID = output_df[output_df['EXPLAIN_TIME'] == query1_ts]['QUERYID'].values[0]

print(raw_costs)

[632]


In [277]:
type(raw_costs)

list

In [278]:
import torch
from model.util import Normalizer

# cost_norm = Normalizer(1, 100)
# cost_norm = Normalizer(-3.61192, 12.290855)
#cost_norm = Normalizer(5, 2611)
cost_norm = Normalizer(1.609, 7.865)
cost_labels = torch.from_numpy(cost_norm.normalize_labels(raw_costs))

In [279]:
cost_labels

tensor([0.7736], dtype=torch.float64)

In [280]:
QUERYID

'83'

In [281]:
# get the node features
df

Unnamed: 0,EXPLAIN_TIME,OPERATOR_ID,OPERATOR_TYPE,TABLE,PREDICATE1,PREDICATE2,PREDICATE3,JOIN_KEY,height
0,2024-10-08-04.52.21.704464,1,RETURN,,,,,,6
1,2024-10-08-04.52.21.704464,2,HSJOIN,,,,,TPCDS.DATE_DIM2.D_DATE_SK = TPCDS.WEB_SALES.WS_SHIP_DATE_SK,5
2,2024-10-08-04.52.21.704464,3,TBSCAN,TPCDS.DATE_DIM2,,,,,1
3,2024-10-08-04.52.21.704464,4,FETCH,TPCDS.WEB_SALES,TPCDS.WEB_SALES.WS_ITEM_SK >= 16111,TPCDS.WEB_SALES.WS_NET_PAID_INC_TAX <= +00183.48,,,4
4,2024-10-08-04.52.21.704464,5,RIDSCN,,,,,,3
5,2024-10-08-04.52.21.704464,6,SORT,,,,,,2
6,2024-10-08-04.52.21.704464,7,IXSCAN,SYSIBM.SQL240509064049930,,,,,1


In [282]:
df.shape

(7, 9)

In [283]:
df.columns

Index(['EXPLAIN_TIME', 'OPERATOR_ID', 'OPERATOR_TYPE', 'TABLE', 'PREDICATE1', 'PREDICATE2', 'PREDICATE3', 'JOIN_KEY', 'height'], dtype='object')

In [284]:
df_node_feat = df[['OPERATOR_ID', 'OPERATOR_TYPE', 'TABLE', 'PREDICATE1', 'PREDICATE2', 'PREDICATE3', 'JOIN_KEY', 'height']]

In [285]:
df_node_feat

Unnamed: 0,OPERATOR_ID,OPERATOR_TYPE,TABLE,PREDICATE1,PREDICATE2,PREDICATE3,JOIN_KEY,height
0,1,RETURN,,,,,,6
1,2,HSJOIN,,,,,TPCDS.DATE_DIM2.D_DATE_SK = TPCDS.WEB_SALES.WS_SHIP_DATE_SK,5
2,3,TBSCAN,TPCDS.DATE_DIM2,,,,,1
3,4,FETCH,TPCDS.WEB_SALES,TPCDS.WEB_SALES.WS_ITEM_SK >= 16111,TPCDS.WEB_SALES.WS_NET_PAID_INC_TAX <= +00183.48,,,4
4,5,RIDSCN,,,,,,3
5,6,SORT,,,,,,2
6,7,IXSCAN,SYSIBM.SQL240509064049930,,,,,1


In [286]:
# Generate the bitmap indicating NaN (0) or non-NaN (1) in PREDICATE1, PREDICATE2, and PREDICATE3
bitmap = df_node_feat[['PREDICATE1', 'PREDICATE2', 'PREDICATE3']].notna().astype(int).values.tolist()

# Display the bitmap
print(bitmap)

df_node_feat['PREDICATE_MASK'] = bitmap

[[0, 0, 0], [0, 0, 0], [0, 0, 0], [1, 1, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]


In [287]:
df_node_feat

Unnamed: 0,OPERATOR_ID,OPERATOR_TYPE,TABLE,PREDICATE1,PREDICATE2,PREDICATE3,JOIN_KEY,height,PREDICATE_MASK
0,1,RETURN,,,,,,6,"[0, 0, 0]"
1,2,HSJOIN,,,,,TPCDS.DATE_DIM2.D_DATE_SK = TPCDS.WEB_SALES.WS_SHIP_DATE_SK,5,"[0, 0, 0]"
2,3,TBSCAN,TPCDS.DATE_DIM2,,,,,1,"[0, 0, 0]"
3,4,FETCH,TPCDS.WEB_SALES,TPCDS.WEB_SALES.WS_ITEM_SK >= 16111,TPCDS.WEB_SALES.WS_NET_PAID_INC_TAX <= +00183.48,,,4,"[1, 1, 0]"
4,5,RIDSCN,,,,,,3,"[0, 0, 0]"
5,6,SORT,,,,,,2,"[0, 0, 0]"
6,7,IXSCAN,SYSIBM.SQL240509064049930,,,,,1,"[0, 0, 0]"


In [288]:
# Gather all non-NaN values from PREDICATE1, PREDICATE2, and PREDICATE3 columns
non_nan_predicates = df_node_feat[['PREDICATE1', 'PREDICATE2', 'PREDICATE3']].stack().dropna().tolist()

# Display the gathered non-NaN predicate values
print(non_nan_predicates)

['TPCDS.WEB_SALES.WS_ITEM_SK >= 16111', 'TPCDS.WEB_SALES.WS_NET_PAID_INC_TAX <= +00183.48']


In [289]:
# Transform and combine each predicate into a single quoted string with [SEP]
transformed_predicates = [
    f"'col: {parts[0]} [SEP] op: {parts[1]} [SEP] val: {parts[2]}'"
    for predicate in predicates
    if isinstance(predicate, str)  # Ensure it's a string
    for parts in [predicate.split(' ', 2)]
]

# Display the transformed predicates
for transformed in transformed_predicates:
    print(transformed)

'col: Q1.WS_ITEM_SK [SEP] op: >= [SEP] val: 16111'
'col: Q1.WS_NET_PAID_INC_TAX [SEP] op: <= [SEP] val: +00183.48'


In [290]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Original predicates
predicates = ['TPCDS.DATE_DIM2.D_DOM <= 31', 'TPCDS.DATE_DIM2.D_SAME_DAY_LQ >= 2414988']

# Generate embeddings for each predicate
embeddings = model.encode(predicates)

# Create a dictionary where the original string is the key, and the embedding is the value
predicate_embedding_dict = {predicate: embedding for predicate, embedding in zip(predicates, embeddings)}

# Display the dictionary
for predicate, embedding in predicate_embedding_dict.items():
    print(f"Predicate: {predicate}")
    print(f"Embedding: {embedding}")
    print(f"Shape: {embedding.shape}")
    print()

Predicate: TPCDS.DATE_DIM2.D_DOM <= 31
Embedding: [ 5.68409041e-02  3.00318245e-02  2.42162403e-02  2.12519094e-02
 -1.58383953e-03 -2.48856619e-02 -7.23423511e-02  4.81826700e-02
  1.32549116e-02  5.29146753e-02  5.97863160e-02 -4.96302657e-02
 -5.22931628e-02 -5.83586618e-02 -3.75243872e-02 -5.78090269e-03
 -5.04243895e-02 -9.57685895e-03  1.07128389e-01  5.30022494e-02
 -7.80352717e-03 -1.60809234e-02 -2.30016652e-02  6.48317263e-02
 -2.53171846e-02  3.90802249e-02  3.76865454e-02 -5.04741035e-02
  2.67909542e-02  3.89553756e-02 -4.15233262e-02  3.67712006e-02
 -5.25134690e-02  4.03491072e-02  2.97929160e-02  3.70051451e-02
 -6.39251843e-02  6.46820962e-02 -1.71961784e-02 -3.37731913e-02
  3.25575471e-02 -6.00705072e-02  6.06203601e-02 -1.85770988e-02
 -4.40071262e-02  1.28346933e-02 -1.52654620e-02  2.59828521e-03
 -1.62462294e-02 -9.70714632e-03  6.12187141e-05  2.49067452e-02
 -7.18900096e-03 -2.00947206e-02 -4.07133065e-02  2.63743121e-02
 -1.39704207e-02 -3.80217377e-03  5.8495

In [291]:
predicate_embedding_dict.keys()

dict_keys(['TPCDS.DATE_DIM2.D_DOM <= 31', 'TPCDS.DATE_DIM2.D_SAME_DAY_LQ >= 2414988'])

In [292]:
predicate_embedding_dict['TPCDS.DATE_DIM2.D_DOM <= 31']

array([ 5.68409041e-02,  3.00318245e-02,  2.42162403e-02,  2.12519094e-02,
       -1.58383953e-03, -2.48856619e-02, -7.23423511e-02,  4.81826700e-02,
        1.32549116e-02,  5.29146753e-02,  5.97863160e-02, -4.96302657e-02,
       -5.22931628e-02, -5.83586618e-02, -3.75243872e-02, -5.78090269e-03,
       -5.04243895e-02, -9.57685895e-03,  1.07128389e-01,  5.30022494e-02,
       -7.80352717e-03, -1.60809234e-02, -2.30016652e-02,  6.48317263e-02,
       -2.53171846e-02,  3.90802249e-02,  3.76865454e-02, -5.04741035e-02,
        2.67909542e-02,  3.89553756e-02, -4.15233262e-02,  3.67712006e-02,
       -5.25134690e-02,  4.03491072e-02,  2.97929160e-02,  3.70051451e-02,
       -6.39251843e-02,  6.46820962e-02, -1.71961784e-02, -3.37731913e-02,
        3.25575471e-02, -6.00705072e-02,  6.06203601e-02, -1.85770988e-02,
       -4.40071262e-02,  1.28346933e-02, -1.52654620e-02,  2.59828521e-03,
       -1.62462294e-02, -9.70714632e-03,  6.12187141e-05,  2.49067452e-02,
       -7.18900096e-03, -

In [293]:
df_node_feat.head()

Unnamed: 0,OPERATOR_ID,OPERATOR_TYPE,TABLE,PREDICATE1,PREDICATE2,PREDICATE3,JOIN_KEY,height,PREDICATE_MASK
0,1,RETURN,,,,,,6,"[0, 0, 0]"
1,2,HSJOIN,,,,,TPCDS.DATE_DIM2.D_DATE_SK = TPCDS.WEB_SALES.WS_SHIP_DATE_SK,5,"[0, 0, 0]"
2,3,TBSCAN,TPCDS.DATE_DIM2,,,,,1,"[0, 0, 0]"
3,4,FETCH,TPCDS.WEB_SALES,TPCDS.WEB_SALES.WS_ITEM_SK >= 16111,TPCDS.WEB_SALES.WS_NET_PAID_INC_TAX <= +00183.48,,,4,"[1, 1, 0]"
4,5,RIDSCN,,,,,,3,"[0, 0, 0]"


In [294]:
import numpy as np

# Example embedding dictionary
embedding_dim = 384  # Replace with the actual dimension of your embeddings
embedding_dict = {
    'TPCDS.DATE_DIM2.D_DOM <= 31': np.random.rand(embedding_dim).tolist(),
    'TPCDS.DATE_DIM2.D_SAME_DAY_LQ >= 2414988': np.random.rand(embedding_dim).tolist()
}

# Function to generate new embedding columns
def add_embedding_columns(row, columns, embedding_dict, embedding_dim):
    for col in columns:
        embedding_col = f"{col}_EMBEDDING"  # Create a new column name
        if pd.notna(row[col]) and row[col] in embedding_dict:
            row[embedding_col] = embedding_dict[row[col]]  # Add embedding to the new column
        else:
            row[embedding_col] = [0.0] * embedding_dim  # Add zeros to the new column
    return row

# List of predicate columns to process
predicate_columns = ['PREDICATE1', 'PREDICATE2', 'PREDICATE3']

# Apply the transformation row by row
df_node_feat = df_node_feat.apply(
    add_embedding_columns,
    axis=1,
    columns=predicate_columns,
    embedding_dict=predicate_embedding_dict,
    embedding_dim=embedding_dim
)

# Check the updated DataFrame
print(df_node_feat.head())


   OPERATOR_ID OPERATOR_TYPE            TABLE                           PREDICATE1                                        PREDICATE2  PREDICATE3                                                     JOIN_KEY  height PREDICATE_MASK                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       PREDICATE1_EMBEDDING  \
0            1        RETURN              NaN                                  NaN                                               NaN         NaN                                                          NaN       6      [0, 0, 0]  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

In [295]:
df_node_feat.columns

Index(['OPERATOR_ID', 'OPERATOR_TYPE', 'TABLE', 'PREDICATE1', 'PREDICATE2', 'PREDICATE3', 'JOIN_KEY', 'height', 'PREDICATE_MASK', 'PREDICATE1_EMBEDDING', 'PREDICATE2_EMBEDDING', 'PREDICATE3_EMBEDDING'], dtype='object')

In [296]:
df_node_feat[['OPERATOR_ID', 'OPERATOR_TYPE', 'TABLE', 'JOIN_KEY', 'height']]

Unnamed: 0,OPERATOR_ID,OPERATOR_TYPE,TABLE,JOIN_KEY,height
0,1,RETURN,,,6
1,2,HSJOIN,,TPCDS.DATE_DIM2.D_DATE_SK = TPCDS.WEB_SALES.WS_SHIP_DATE_SK,5
2,3,TBSCAN,TPCDS.DATE_DIM2,,1
3,4,FETCH,TPCDS.WEB_SALES,,4
4,5,RIDSCN,,,3
5,6,SORT,,,2
6,7,IXSCAN,SYSIBM.SQL240509064049930,,1


In [297]:
# Initialize encoding dictionaries and counters starting at 1 for each column
op_type_encoder = {}
table_encoder = {}
join_key_encoder = {}

# Helper function for label encoding with NaN values set to 0
def label_encode_with_nan(value, encoder, counter_start=1):
    if pd.isna(value):
        return 0, counter_start  # Return 0 for NaN and do not increment the counter
    if value not in encoder:
        encoder[value] = counter_start
        counter_start += 1
    return encoder[value], counter_start

# Label encode each column
op_type_counter, table_counter, join_key_counter = 1, 1, 1
encoded_operator_type = []
encoded_table = []
encoded_join_key = []

for op, tbl, jk in zip(df_node_feat['OPERATOR_TYPE'], df_node_feat['TABLE'], df_node_feat['JOIN_KEY']):
    # Encode OPERATOR_TYPE
    encoded_op, op_type_counter = label_encode_with_nan(op, op_type_encoder, op_type_counter)
    encoded_operator_type.append(encoded_op)
    
    # Encode TABLE
    encoded_tbl, table_counter = label_encode_with_nan(tbl, table_encoder, table_counter)
    encoded_table.append(encoded_tbl)
    
    # Encode JOIN_KEY
    encoded_jk, join_key_counter = label_encode_with_nan(jk, join_key_encoder, join_key_counter)
    encoded_join_key.append(encoded_jk)

# Add encoded columns to DataFrame
df_node_feat['ENCODED_OPERATOR_TYPE'] = encoded_operator_type
df_node_feat['ENCODED_TABLE'] = encoded_table
df_node_feat['ENCODED_JOIN_KEY'] = encoded_join_key

# Display the DataFrame with encoded columns and PREDICATE_MASK
print(df_node_feat[['OPERATOR_TYPE', 'TABLE', 'JOIN_KEY', 'PREDICATE_MASK', 'ENCODED_OPERATOR_TYPE', 'ENCODED_TABLE', 'ENCODED_JOIN_KEY']])
print("\nOPERATOR_TYPE Encoding Dictionary:", op_type_encoder)
print("TABLE Encoding Dictionary:", table_encoder)
print("JOIN_KEY Encoding Dictionary:", join_key_encoder)

  OPERATOR_TYPE                      TABLE                                                     JOIN_KEY PREDICATE_MASK  ENCODED_OPERATOR_TYPE  ENCODED_TABLE  ENCODED_JOIN_KEY
0        RETURN                        NaN                                                          NaN      [0, 0, 0]                      1              0                 0
1        HSJOIN                        NaN  TPCDS.DATE_DIM2.D_DATE_SK = TPCDS.WEB_SALES.WS_SHIP_DATE_SK      [0, 0, 0]                      2              0                 1
2        TBSCAN            TPCDS.DATE_DIM2                                                          NaN      [0, 0, 0]                      3              1                 0
3        FETCH             TPCDS.WEB_SALES                                                          NaN      [1, 1, 0]                      4              2                 0
4        RIDSCN                        NaN                                                          NaN      [0, 0, 0]       

In [298]:
df_node_feat.columns

Index(['OPERATOR_ID', 'OPERATOR_TYPE', 'TABLE', 'PREDICATE1', 'PREDICATE2', 'PREDICATE3', 'JOIN_KEY', 'height', 'PREDICATE_MASK', 'PREDICATE1_EMBEDDING', 'PREDICATE2_EMBEDDING', 'PREDICATE3_EMBEDDING', 'ENCODED_OPERATOR_TYPE', 'ENCODED_TABLE', 'ENCODED_JOIN_KEY'], dtype='object')

In [299]:
df_node_feat

Unnamed: 0,OPERATOR_ID,OPERATOR_TYPE,TABLE,PREDICATE1,PREDICATE2,PREDICATE3,JOIN_KEY,height,PREDICATE_MASK,PREDICATE1_EMBEDDING,PREDICATE2_EMBEDDING,PREDICATE3_EMBEDDING,ENCODED_OPERATOR_TYPE,ENCODED_TABLE,ENCODED_JOIN_KEY
0,1,RETURN,,,,,,6,"[0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]",1,0,0
1,2,HSJOIN,,,,,TPCDS.DATE_DIM2.D_DATE_SK = TPCDS.WEB_SALES.WS_SHIP_DATE_SK,5,"[0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]",2,0,1
2,3,TBSCAN,TPCDS.DATE_DIM2,,,,,1,"[0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]",3,1,0
3,4,FETCH,TPCDS.WEB_SALES,TPCDS.WEB_SALES.WS_ITEM_SK >= 16111,TPCDS.WEB_SALES.WS_NET_PAID_INC_TAX <= +00183.48,,,4,"[1, 1, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]",4,2,0
4,5,RIDSCN,,,,,,3,"[0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]",5,0,0
5,6,SORT,,,,,,2,"[0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]",6,0,0
6,7,IXSCAN,SYSIBM.SQL240509064049930,,,,,1,"[0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]",7,3,0


In [300]:
df_node_feat[['ENCODED_OPERATOR_TYPE', 'ENCODED_TABLE', 'ENCODED_JOIN_KEY', 'PREDICATE_MASK']]

Unnamed: 0,ENCODED_OPERATOR_TYPE,ENCODED_TABLE,ENCODED_JOIN_KEY,PREDICATE_MASK
0,1,0,0,"[0, 0, 0]"
1,2,0,1,"[0, 0, 0]"
2,3,1,0,"[0, 0, 0]"
3,4,2,0,"[1, 1, 0]"
4,5,0,0,"[0, 0, 0]"
5,6,0,0,"[0, 0, 0]"
6,7,3,0,"[0, 0, 0]"


In [301]:
# Initialize the new columns with a list of 50 zeros
df_node_feat['PREDICATE1_HIST'] = [[0] * 50] * len(df_node_feat)
df_node_feat['PREDICATE2_HIST'] = [[0] * 50] * len(df_node_feat)
df_node_feat['PREDICATE3_HIST'] = [[0] * 50] * len(df_node_feat)

In [302]:
bin_coverage_dict.keys()

dict_keys(['TPCDS.WEB_SALES.WS_ITEM_SK >= 16111'])

In [303]:
# Update PREDICATE1_HIST, PREDICATE2_HIST, and PREDICATE3_HIST based on corresponding predicate values
for index, row in df_node_feat.iterrows():
    # Update PREDICATE1_HIST based on PREDICATE1 values
    predicate1_value = row['PREDICATE1']
    if pd.notna(predicate1_value) and predicate1_value in bin_coverage_dict:
        value = bin_coverage_dict[predicate1_value]
        if isinstance(value, list):  # Check if the value is a list
            df_node_feat.at[index, 'PREDICATE1_HIST'] = [round(v, 3) for v in value]
        else:
            df_node_feat.at[index, 'PREDICATE1_HIST'] = round(value, 3)
        
    # Update PREDICATE2_HIST based on PREDICATE2 values
    predicate2_value = row['PREDICATE2']
    if pd.notna(predicate2_value) and predicate2_value in bin_coverage_dict:
        value = bin_coverage_dict[predicate2_value]
        if isinstance(value, list):  # Check if the value is a list
            df_node_feat.at[index, 'PREDICATE2_HIST'] = [round(v, 3) for v in value]
        else:
            df_node_feat.at[index, 'PREDICATE2_HIST'] = round(value, 3)
        
    # Update PREDICATE3_HIST based on PREDICATE3 values
    predicate3_value = row['PREDICATE3']
    if pd.notna(predicate3_value) and predicate3_value in bin_coverage_dict:
        value = bin_coverage_dict[predicate3_value]
        if isinstance(value, list):  # Check if the value is a list
            df_node_feat.at[index, 'PREDICATE3_HIST'] = [round(v, 3) for v in value]
        else:
            df_node_feat.at[index, 'PREDICATE3_HIST'] = round(value, 3)

In [304]:
df_node_feat[['OPERATOR_ID', 'ENCODED_OPERATOR_TYPE', 'ENCODED_TABLE', 'ENCODED_JOIN_KEY', 'PREDICATE_MASK', 'PREDICATE1_HIST', 'PREDICATE2_HIST', 'PREDICATE3_HIST', 'height']]

Unnamed: 0,OPERATOR_ID,ENCODED_OPERATOR_TYPE,ENCODED_TABLE,ENCODED_JOIN_KEY,PREDICATE_MASK,PREDICATE1_HIST,PREDICATE2_HIST,PREDICATE3_HIST,height
0,1,1,0,0,"[0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",6
1,2,2,0,1,"[0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",5
2,3,3,1,0,"[0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1
3,4,4,2,0,"[1, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.256, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",4
4,5,5,0,0,"[0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",3
5,6,6,0,0,"[0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",2
6,7,7,3,0,"[0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1


In [305]:
x_cols = ['ENCODED_OPERATOR_TYPE', 'ENCODED_JOIN_KEY', 'PREDICATE_MASK', 'PREDICATE1_HIST', 'PREDICATE2_HIST', 'PREDICATE3_HIST', 'ENCODED_TABLE']

In [306]:
df_node_feat[x_cols]

Unnamed: 0,ENCODED_OPERATOR_TYPE,ENCODED_JOIN_KEY,PREDICATE_MASK,PREDICATE1_HIST,PREDICATE2_HIST,PREDICATE3_HIST,ENCODED_TABLE
0,1,0,"[0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0
1,2,1,"[0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0
2,3,0,"[0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1
3,4,0,"[1, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.256, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",2
4,5,0,"[0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0
5,6,0,"[0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0
6,7,0,"[0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",3


# Making a tensor of node features

In [307]:
df.shape

(7, 9)

In [308]:
df_node_feat

Unnamed: 0,OPERATOR_ID,OPERATOR_TYPE,TABLE,PREDICATE1,PREDICATE2,PREDICATE3,JOIN_KEY,height,PREDICATE_MASK,PREDICATE1_EMBEDDING,PREDICATE2_EMBEDDING,PREDICATE3_EMBEDDING,ENCODED_OPERATOR_TYPE,ENCODED_TABLE,ENCODED_JOIN_KEY,PREDICATE1_HIST,PREDICATE2_HIST,PREDICATE3_HIST
0,1,RETURN,,,,,,6,"[0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]",1,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,2,HSJOIN,,,,,TPCDS.DATE_DIM2.D_DATE_SK = TPCDS.WEB_SALES.WS_SHIP_DATE_SK,5,"[0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]",2,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,3,TBSCAN,TPCDS.DATE_DIM2,,,,,1,"[0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]",3,1,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,4,FETCH,TPCDS.WEB_SALES,TPCDS.WEB_SALES.WS_ITEM_SK >= 16111,TPCDS.WEB_SALES.WS_NET_PAID_INC_TAX <= +00183.48,,,4,"[1, 1, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]",4,2,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.256, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,5,RIDSCN,,,,,,3,"[0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]",5,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5,6,SORT,,,,,,2,"[0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]",6,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
6,7,IXSCAN,SYSIBM.SQL240509064049930,,,,,1,"[0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]",7,3,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [309]:
df_node_feat.columns

Index(['OPERATOR_ID', 'OPERATOR_TYPE', 'TABLE', 'PREDICATE1', 'PREDICATE2', 'PREDICATE3', 'JOIN_KEY', 'height', 'PREDICATE_MASK', 'PREDICATE1_EMBEDDING', 'PREDICATE2_EMBEDDING', 'PREDICATE3_EMBEDDING', 'ENCODED_OPERATOR_TYPE', 'ENCODED_TABLE', 'ENCODED_JOIN_KEY', 'PREDICATE1_HIST', 'PREDICATE2_HIST', 'PREDICATE3_HIST'], dtype='object')

In [310]:
len(df_node_feat)

7

In [311]:
df_node_feat['SAMPLE'] = [[0] * 1000 for _ in range(len(df_node_feat))]

In [312]:
df_node_feat.columns

Index(['OPERATOR_ID', 'OPERATOR_TYPE', 'TABLE', 'PREDICATE1', 'PREDICATE2', 'PREDICATE3', 'JOIN_KEY', 'height', 'PREDICATE_MASK', 'PREDICATE1_EMBEDDING', 'PREDICATE2_EMBEDDING', 'PREDICATE3_EMBEDDING', 'ENCODED_OPERATOR_TYPE', 'ENCODED_TABLE', 'ENCODED_JOIN_KEY', 'PREDICATE1_HIST', 'PREDICATE2_HIST', 'PREDICATE3_HIST', 'SAMPLE'], dtype='object')

In [313]:
# Concatenate values across each row
tensor_data = []
for _, row in df_node_feat.iterrows():
    # Flatten all values into a single list
    flat_row = np.concatenate([
        [row['ENCODED_OPERATOR_TYPE']],
        [row['ENCODED_JOIN_KEY']],
        row['PREDICATE1_EMBEDDING'],
        row['PREDICATE2_EMBEDDING'],
        row['PREDICATE3_EMBEDDING'],
        row['PREDICATE_MASK'],
        row['PREDICATE1_HIST'],
        row['PREDICATE2_HIST'],
        row['PREDICATE3_HIST'],
        [row['ENCODED_TABLE']]
    ])
    tensor_data.append(flat_row)

# Convert the list of concatenated rows into a tensor
x = torch.tensor(tensor_data, dtype=torch.float32)
x = x.unsqueeze(0)

In [314]:
x.shape

torch.Size([1, 7, 1308])

In [315]:
def pad_2d_unsqueeze(x, padlen):
    # dont know why add 1, comment out first
#    x = x + 1 # pad id = 0
    _, xlen, xdim = x.size()
    if xlen < padlen:
        new_x = x.new_zeros([padlen, xdim], dtype=x.dtype) + 1
        new_x[:xlen, :] = x
        x = new_x
    return x.unsqueeze(0)

def pad_1d_unsqueeze(x, padlen):
    x = x + 1 # pad id = 0
    xlen = x.size(1)
    if xlen < padlen:
        new_x = x.new_zeros([padlen], dtype=x.dtype)
        new_x[:xlen] = x
        x = new_x
    return x.unsqueeze(0)

In [316]:
x = pad_2d_unsqueeze(x, max_node)

In [317]:
x.shape

torch.Size([1, 30, 1308])

In [318]:
heights.shape

torch.Size([1, 30])

In [319]:
attn_bias.shape

torch.Size([1, 31, 31])

In [320]:
attn_bias

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
          -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
          -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., -inf, 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
          -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., -inf, -inf, 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
          -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., -inf, -inf, -inf, 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
          -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., -inf, -inf,

## Define Model

In [321]:
class Args:
    # bs = 1024
    # SQ: smaller batch size
    bs = 1
    #lr = 0.001
    lr = 0.001
    # epochs = 200
    epochs = 5
    clip_size = 50
    embed_size = 64
    pred_hid = 128
    ffn_dim = 128
    head_size = 12
    n_layers = 8
    dropout = 0.1
    sch_decay = 0.6
    # device = 'cuda:0'
    device = 'cpu'
    newpath = './results/full/cost/'
    to_predict = 'cost'
args = Args()

import os
if not os.path.exists(args.newpath):
    os.makedirs(args.newpath)

In [322]:
from model.model import QueryFormer

model = QueryFormer(emb_size = args.embed_size ,ffn_dim = args.ffn_dim, head_size = args.head_size, \
                 dropout = args.dropout, n_layers = args.n_layers, \
                 use_sample = False, use_hist = True, \
                 pred_hid = args.pred_hid
                )

In [323]:
from model.dataset import PlanTreeDataset

In [324]:
cost_labels

tensor([0.7736], dtype=torch.float64)

In [325]:
raw_costs

[632]

In [326]:
cost_labels

tensor([0.7736], dtype=torch.float64)

In [327]:
cost_labels

tensor([0.7736], dtype=torch.float64)

In [328]:
raw_costs

[632]

In [329]:
# Initialize the PlanTreeDataset with optional costs
dataset = PlanTreeDataset(1, x, attn_bias, rel_pos, heights, cost_labels, raw_costs)

# Print the dataset information
print(dataset)


<model.dataset.PlanTreeDataset object at 0x7f015ea9b020>


In [330]:
dataset[0]

({'x': tensor([[[1., 0., 0.,  ..., 0., 0., 0.],
           [2., 1., 0.,  ..., 0., 0., 0.],
           [3., 0., 0.,  ..., 0., 0., 1.],
           ...,
           [1., 1., 1.,  ..., 1., 1., 1.],
           [1., 1., 1.,  ..., 1., 1., 1.],
           [1., 1., 1.,  ..., 1., 1., 1.]]]),
  'attn_bias': tensor([[[0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
            -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
           [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
            -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
           [0., -inf, 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
            -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
           [0., -inf, -inf, 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -i

In [331]:
dataset[0]

({'x': tensor([[[1., 0., 0.,  ..., 0., 0., 0.],
           [2., 1., 0.,  ..., 0., 0., 0.],
           [3., 0., 0.,  ..., 0., 0., 1.],
           ...,
           [1., 1., 1.,  ..., 1., 1., 1.],
           [1., 1., 1.,  ..., 1., 1., 1.],
           [1., 1., 1.,  ..., 1., 1., 1.]]]),
  'attn_bias': tensor([[[0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
            -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
           [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
            -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
           [0., -inf, 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
            -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
           [0., -inf, -inf, 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -i

In [332]:
# Get the length of the dataset
print(f"Dataset size: {len(dataset)}")

# Access a single sample
sample, label = dataset[0]

# Print the sample contents
print("Sample contents:")
print("Feature Matrix (x):", sample['x'].shape)
print("Attention Bias (attn_bias):", sample['attn_bias'].shape)
print("Relative Positions (rel_pos):", sample['rel_pos'].shape)
print("Heights (heights):", sample['heights'].shape)
print("Label:", label)

Dataset size: 1
Sample contents:
Feature Matrix (x): torch.Size([1, 30, 1308])
Attention Bias (attn_bias): torch.Size([1, 31, 31])
Relative Positions (rel_pos): torch.Size([1, 30, 30])
Heights (heights): torch.Size([1, 30])
Label: (tensor(0.7736, dtype=torch.float64), tensor(632))


In [333]:
# Example numpy label
import numpy as np
import torch.nn as nn
import importlib

from model import trainer
importlib.reload(trainer)
from  model.trainer import train_single


crit = nn.MSELoss()

# Train the model with the numpy label
trained_model = train_single(model, dataset, dataset, crit, cost_norm, args)


idxs:  [0]
type(batch):  <class 'model.database_util.Batch'>
batch_labels:  <class 'tuple'>
batch_labels:  ((tensor(0.7736, dtype=torch.float64), tensor(632)),)
type(batch_labels):  <class 'tuple'>
type(l):  <class 'tuple'>
At QueryFormer Forward - Attention Bias: tensor([[[0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
          -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
          -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., -inf, 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
          -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., -inf, -inf, 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
          -inf, 

histEmb:  tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.3255,  0.2079, -0.5367,  ...,  0.9899,  0.9641,  0.4577],
        [-0.3255,  0.2079, -0.5367,  ...,  0.9899,  0.9641,  0.4577],
        [-0.3255,  0.2079, -0.5367,  ...,  0.9899,  0.9641,  0.4577]],
       grad_fn=<DivBackward0>)
histEmb shape:  torch.Size([30, 64])
final shape:  torch.Size([30, 1408])
QueryFormer Forward - node_feature:  tensor([[[-2.1015e-03,  1.2098e-01,  9.6435e-02,  ..., -2.8428e-03,
           2.2812e-02,  1.6221e-01],
         [-3.1620e-04,  3.5302e-02,  3.6206e-01,  ...,  5.9183e-02,
          -1.5153e-03, -1.4192e-03],
         [-3.2544e-04, -1.7621e-03,  4.1178e-02,  ..., -5.3324e-04,
           3.1442e-01,  1.4066e-01],
         ...,
         [-9.5248e-03,  6.8215e-01,  2.9544e-01,  ..., -1.0499e-03,
           1.5353

## Find the min and max values for `SORT_SHRHEAP_TOP`

In [334]:
df_query_list = pd.read_csv("query_list.csv")

In [335]:
df_query_list.describe()

Unnamed: 0.1,Unnamed: 0,QUERYID,SORT_SHRHEAP_TOP
count,2536.0,2536.0,2536.0
mean,1267.5,3496.109621,435.967666
std,732.224465,2039.324957,362.367795
min,0.0,1.0,5.0
25%,633.75,1694.5,79.0
50%,1267.5,3549.0,361.0
75%,1901.25,5287.75,802.0
max,2535.0,6996.0,2611.0
