# Create a single query model using embeddings

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy.stats import loguniform
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import RepeatedKFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn import neural_network
import seaborn as sns
import matplotlib.cm as cm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout 
from tensorflow.keras.layers import BatchNormalization 
from tensorflow.keras.callbacks import ModelCheckpoint # new!
import os # new!
# import seaborn as sns
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout 
# from tensorflow.keras.layers import BatchNormalization 
# from tensorflow.keras.callbacks import ModelCheckpoint # new!


2025-02-06 20:12:21.194255: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
batch_size = 10
box_plot_title = 'Memory Estimation Error (MB)'
pd.set_option('display.max_columns', None)
cluster_set = [5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100]

In [None]:
import pandas as pd

# Read the CSV file into a DataFrame
df_success = pd.read_csv('utils/success_db2_est.csv')

# Display the first few rows to verify
df_success.head()


Unnamed: 0,QUERYID,APPL_ID,UOW_ID,ACTIVITY_ID,EXPLAIN_TIME,SORT_SHRHEAP_TOP,Db2_ESTIMATE,QUERY
0,1,*LOCAL.db2inst1.241204044349,4,1,2024-12-03-20.43.44.200877,66844.0,66835,"SELECT cn.name AS company_name, lt.link AS li..."
1,2,*LOCAL.db2inst1.241204044349,12,1,2024-12-03-20.43.45.954590,66860.0,66835,"SELECT cn.name AS company_name, lt.link AS li..."
2,3,*LOCAL.db2inst1.241204044349,20,1,2024-12-03-20.43.47.182565,66821.0,66837,"SELECT cn.name AS company_name, lt.link AS li..."
3,4,*LOCAL.db2inst1.241204044349,28,1,2024-12-03-20.43.48.420714,66825.0,66835,"SELECT cn.name AS company_name, lt.link AS li..."
4,5,*LOCAL.db2inst1.241204044349,36,1,2024-12-03-20.43.51.205244,38663.0,38565,"SELECT cn.name AS company_name, lt.link AS li..."


In [4]:
df_success.shape

(2332, 8)

# Reading validation set

In [5]:
import json
import pandas as pd

# Path to the JSON file
val_embeddings_path = "val_embeddings.json"

# Load the JSON file
with open(val_embeddings_path, "r") as f:
    val_embeddings_data = json.load(f)

# Convert to DataFrame
val_embeddings_df = pd.DataFrame({
    "file_name": list(val_embeddings_data.keys()),
    "embedding": list(val_embeddings_data.values())
})

# Display the DataFrame
print(val_embeddings_df.head())


                                  file_name  \
0  query_1952_2024-12-03-21.34.02.429279.pt   
1   query_993_2024-12-03-21.10.08.618148.pt   
2  query_1591_2024-12-03-21.23.17.861948.pt   
3   query_793_2024-12-03-21.05.49.462784.pt   
4  query_2262_2024-12-03-21.44.00.539251.pt   

                                           embedding  
0  [-6.322105407714844, -1.2161089181900024, 0.43...  
1  [-7.167495250701904, 1.262258529663086, -1.407...  
2  [-6.322115421295166, -1.2161129713058472, 0.43...  
3  [-7.363716125488281, -0.7742083072662354, 1.37...  
4  [1.0355658531188965, 6.791748523712158, -3.595...  


In [6]:
val_embeddings_df.head()

Unnamed: 0,file_name,embedding
0,query_1952_2024-12-03-21.34.02.429279.pt,"[-6.322105407714844, -1.2161089181900024, 0.43..."
1,query_993_2024-12-03-21.10.08.618148.pt,"[-7.167495250701904, 1.262258529663086, -1.407..."
2,query_1591_2024-12-03-21.23.17.861948.pt,"[-6.322115421295166, -1.2161129713058472, 0.43..."
3,query_793_2024-12-03-21.05.49.462784.pt,"[-7.363716125488281, -0.7742083072662354, 1.37..."
4,query_2262_2024-12-03-21.44.00.539251.pt,"[1.0355658531188965, 6.791748523712158, -3.595..."


In [7]:
# Extract QUERYID and EXPLAIN_TIME
val_embeddings_df["QUERYID"] = val_embeddings_df["file_name"].apply(lambda x: x.split("_")[1])
val_embeddings_df["EXPLAIN_TIME"] = val_embeddings_df["file_name"].apply(lambda x: x.split("_")[2].replace(".pt", ""))

# Convert QUERYID to integer (if needed)
val_embeddings_df["QUERYID"] = val_embeddings_df["QUERYID"].astype(int)

In [8]:
val_embeddings_df.head()

Unnamed: 0,file_name,embedding,QUERYID,EXPLAIN_TIME
0,query_1952_2024-12-03-21.34.02.429279.pt,"[-6.322105407714844, -1.2161089181900024, 0.43...",1952,2024-12-03-21.34.02.429279
1,query_993_2024-12-03-21.10.08.618148.pt,"[-7.167495250701904, 1.262258529663086, -1.407...",993,2024-12-03-21.10.08.618148
2,query_1591_2024-12-03-21.23.17.861948.pt,"[-6.322115421295166, -1.2161129713058472, 0.43...",1591,2024-12-03-21.23.17.861948
3,query_793_2024-12-03-21.05.49.462784.pt,"[-7.363716125488281, -0.7742083072662354, 1.37...",793,2024-12-03-21.05.49.462784
4,query_2262_2024-12-03-21.44.00.539251.pt,"[1.0355658531188965, 6.791748523712158, -3.595...",2262,2024-12-03-21.44.00.539251


In [None]:
import pandas as pd

# Assuming val_embeddings_df and df_success are already defined

# Perform the join
result_df = pd.merge(
    val_embeddings_df,
    df_success,
    on=['QUERYID', 'EXPLAIN_TIME'],  # Match on QUERYID and EXPLAIN_TIME
    how='inner'  # Inner join to keep only matching rows
)

# Display the resulting DataFrame
print(result_df.head())


                                  file_name  \
0  query_1952_2024-12-03-21.34.02.429279.pt   
1   query_993_2024-12-03-21.10.08.618148.pt   
2  query_1591_2024-12-03-21.23.17.861948.pt   
3   query_793_2024-12-03-21.05.49.462784.pt   
4  query_2262_2024-12-03-21.44.00.539251.pt   

                                           embedding  QUERYID  \
0  [-6.322105407714844, -1.2161089181900024, 0.43...     1952   
1  [-7.167495250701904, 1.262258529663086, -1.407...      993   
2  [-6.322115421295166, -1.2161129713058472, 0.43...     1591   
3  [-7.363716125488281, -0.7742083072662354, 1.37...      793   
4  [1.0355658531188965, 6.791748523712158, -3.595...     2262   

                 EXPLAIN_TIME                       APPL_ID  UOW_ID  \
0  2024-12-03-21.34.02.429279  *LOCAL.db2inst1.241204044349   15612   
1  2024-12-03-21.10.08.618148  *LOCAL.db2inst1.241204044349    7940   
2  2024-12-03-21.23.17.861948  *LOCAL.db2inst1.241204044349   12724   
3  2024-12-03-21.05.49.462784  *LOCAL.db2i

In [10]:
val_embeddings_df.shape

(467, 4)

In [11]:
result_df.shape

(467, 10)

In [12]:
df_test = result_df.copy()[['embedding', 'SORT_SHRHEAP_TOP', 'Db2_ESTIMATE']]

# Loading Training Set

In [13]:
import json
import pandas as pd

# Path to the JSON file
train_embeddings_path = "train_embeddings.json"

# Load the JSON file
with open(train_embeddings_path, "r") as f:
    train_embeddings_path_embeddings_data = json.load(f)

# Convert to DataFrame
train_embeddings_df = pd.DataFrame({
    "file_name": list(train_embeddings_path_embeddings_data.keys()),
    "embedding": list(train_embeddings_path_embeddings_data.values())
})

# Display the DataFrame
# print(train_embeddings_path_embeddings_df.head())

# Extract QUERYID and EXPLAIN_TIME
train_embeddings_df["QUERYID"] = train_embeddings_df["file_name"].apply(lambda x: x.split("_")[1])
train_embeddings_df["EXPLAIN_TIME"] = train_embeddings_df["file_name"].apply(lambda x: x.split("_")[2].replace(".pt", ""))

# Convert QUERYID to integer (if needed)
train_embeddings_df["QUERYID"] = train_embeddings_df["QUERYID"].astype(int)

# Perform the join
result_df = pd.merge(
    train_embeddings_df,
    df_success,
    on=['QUERYID', 'EXPLAIN_TIME'],  # Match on QUERYID and EXPLAIN_TIME
    how='inner'  # Inner join to keep only matching rows
)

df_train = result_df.copy()[['embedding', 'SORT_SHRHEAP_TOP', 'Db2_ESTIMATE']]

In [14]:
df_train.shape

(1865, 3)

In [None]:
df_train.head()

Unnamed: 0,embedding,SORT_SHRHEAP_TOP,Db2_ESTIMATE
0,"[-0.5597949028015137, 6.249405384063721, 2.199...",58512.0,58512
1,"[-0.13114792108535767, 2.876124858856201, -0.3...",9951.0,23455
2,"[-0.5244169235229492, 9.943038940429688, -3.58...",58512.0,58512
3,"[-9.048543930053711, -3.4981002807617188, 3.39...",14175.0,23455
4,"[-9.814127922058105, -6.830276012420654, 3.638...",66823.0,66821


In [16]:
df_train.columns

Index(['embedding', 'SORT_SHRHEAP_TOP', 'Db2_ESTIMATE'], dtype='object')

In [17]:
# Rename columns for df_train
df_train.rename(columns={
    'embedding': 'sql_embedding',
    'SORT_SHRHEAP_TOP': 'actual',
    'Db2_ESTIMATE': 'db2'
}, inplace=True)

df_train = df_train[['sql_embedding', 'db2', 'actual']]

# Rename columns for df_test
df_test.rename(columns={
    'embedding': 'sql_embedding',
    'SORT_SHRHEAP_TOP': 'actual',
    'Db2_ESTIMATE': 'db2'
}, inplace=True)

df_test = df_test[['sql_embedding', 'db2', 'actual']]

# Verify the changes
print(df_train.head())
print(df_test.head())


                                       sql_embedding    db2   actual
0  [-0.5597949028015137, 6.249405384063721, 2.199...  58512  58512.0
1  [-0.13114792108535767, 2.876124858856201, -0.3...  23455   9951.0
2  [-0.5244169235229492, 9.943038940429688, -3.58...  58512  58512.0
3  [-9.048543930053711, -3.4981002807617188, 3.39...  23455  14175.0
4  [-9.814127922058105, -6.830276012420654, 3.638...  66821  66823.0
                                       sql_embedding    db2   actual
0  [-6.322105407714844, -1.2161089181900024, 0.43...  37907  35153.0
1  [-7.167495250701904, 1.262258529663086, -1.407...  18834   9951.0
2  [-6.322115421295166, -1.2161129713058472, 0.43...  38047  38815.0
3  [-7.363716125488281, -0.7742083072662354, 1.37...  23455  14175.0
4  [1.0355658531188965, 6.791748523712158, -3.595...  66821  66821.0


In [None]:
df_train.head()

Unnamed: 0,sql_embedding,db2,actual
0,"[-0.5597949028015137, 6.249405384063721, 2.199...",58512,58512.0
1,"[-0.13114792108535767, 2.876124858856201, -0.3...",23455,9951.0
2,"[-0.5244169235229492, 9.943038940429688, -3.58...",58512,58512.0
3,"[-9.048543930053711, -3.4981002807617188, 3.39...",23455,14175.0
4,"[-9.814127922058105, -6.830276012420654, 3.638...",66821,66823.0


In [None]:
df_test.head()

Unnamed: 0,sql_embedding,db2,actual
0,"[-6.322105407714844, -1.2161089181900024, 0.43...",37907,35153.0
1,"[-7.167495250701904, 1.262258529663086, -1.407...",18834,9951.0
2,"[-6.322115421295166, -1.2161129713058472, 0.43...",38047,38815.0
3,"[-7.363716125488281, -0.7742083072662354, 1.37...",23455,14175.0
4,"[1.0355658531188965, 6.791748523712158, -3.595...",66821,66821.0


In [20]:
df_train['db2'] = df_train['db2'] * 4000 / 1000000
df_train['actual'] = df_train['actual'] * 4000 / 1000000

In [21]:
df_test['db2'] = df_test['db2'] * 4000 / 1000000
df_test['actual'] = df_test['actual'] * 4000 / 1000000

In [None]:
df_train.head()

Unnamed: 0,sql_embedding,db2,actual
0,"[-0.5597949028015137, 6.249405384063721, 2.199...",234.048,234.048
1,"[-0.13114792108535767, 2.876124858856201, -0.3...",93.82,39.804
2,"[-0.5244169235229492, 9.943038940429688, -3.58...",234.048,234.048
3,"[-9.048543930053711, -3.4981002807617188, 3.39...",93.82,56.7
4,"[-9.814127922058105, -6.830276012420654, 3.638...",267.284,267.292


In [23]:
df_train.shape

(1865, 3)

In [24]:
df_test.shape

(467, 3)

In [25]:
df_test.head()

Unnamed: 0,sql_embedding,db2,actual
0,"[-6.322105407714844, -1.2161089181900024, 0.43...",151.628,140.612
1,"[-7.167495250701904, 1.262258529663086, -1.407...",75.336,39.804
2,"[-6.322115421295166, -1.2161129713058472, 0.43...",152.188,155.26
3,"[-7.363716125488281, -0.7742083072662354, 1.37...",93.82,56.7
4,"[1.0355658531188965, 6.791748523712158, -3.595...",267.284,267.284


In [26]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1865 entries, 0 to 1864
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   sql_embedding  1865 non-null   object 
 1   db2            1865 non-null   float64
 2   actual         1865 non-null   float64
dtypes: float64(2), object(1)
memory usage: 43.8+ KB


In [None]:
df_train[['sql_embedding', 'actual']].head()

Unnamed: 0,sql_embedding,actual
0,"[-0.5597949028015137, 6.249405384063721, 2.199...",234.048
1,"[-0.13114792108535767, 2.876124858856201, -0.3...",39.804
2,"[-0.5244169235229492, 9.943038940429688, -3.58...",234.048
3,"[-9.048543930053711, -3.4981002807617188, 3.39...",56.7
4,"[-9.814127922058105, -6.830276012420654, 3.638...",267.292


In [28]:
df_train[['db2', 'actual']].head()

Unnamed: 0,db2,actual
0,234.048,234.048
1,93.82,39.804
2,234.048,234.048
3,93.82,56.7
4,267.284,267.292


In [29]:
df_train_copy = df_train.copy()

In [30]:
df_train.head()

Unnamed: 0,sql_embedding,db2,actual
0,"[-0.5597949028015137, 6.249405384063721, 2.199...",234.048,234.048
1,"[-0.13114792108535767, 2.876124858856201, -0.3...",93.82,39.804
2,"[-0.5244169235229492, 9.943038940429688, -3.58...",234.048,234.048
3,"[-9.048543930053711, -3.4981002807617188, 3.39...",93.82,56.7
4,"[-9.814127922058105, -6.830276012420654, 3.638...",267.284,267.292


In [31]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler

df = df_train_copy

# Normalize 'feat1'
scaler = StandardScaler()
df['db2_normalized'] = scaler.fit_transform(df[['db2']])

# Convert to PyTorch tensors
feat1_tensor = torch.tensor(df['db2_normalized'].values, dtype=torch.float32).view(-1, 1)
actual_tensor = torch.tensor(df['actual'].values, dtype=torch.float32).view(-1, 1)

# Define the embedding model
class EmbeddingModel(nn.Module):
    def __init__(self, input_dim=1, embedding_dim=64):
        super(EmbeddingModel, self).__init__()
        self.embedding = nn.Linear(input_dim, embedding_dim)
        self.fc = nn.Linear(embedding_dim, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.embedding(x)
        x = self.relu(x)
        x = self.fc(x)
        return x

# Initialize model, loss function, and optimizer
model = EmbeddingModel()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(feat1_tensor)
    loss = criterion(outputs, actual_tensor)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Extract embeddings
model.eval()
with torch.no_grad():
    embeddings = model.embedding(feat1_tensor).numpy()

# Ensure the DataFrame can store list objects
df['db2_embedding'] = None  # Initialize the column
df = df.astype({'db2_embedding': 'object'})  # Set the dtype to object

# Assign embeddings to the DataFrame
df['db2_embedding'] = [emb.tolist() for emb in embeddings]

# Drop the normalized column if no longer needed
df = df.drop(columns=['db2_normalized'])

print(df)


Epoch [10/100], Loss: 33472.9141
Epoch [20/100], Loss: 33390.2148
Epoch [30/100], Loss: 33307.1602
Epoch [40/100], Loss: 33222.4375
Epoch [50/100], Loss: 33135.3281
Epoch [60/100], Loss: 33045.1953
Epoch [70/100], Loss: 32951.4180
Epoch [80/100], Loss: 32853.4141
Epoch [90/100], Loss: 32750.7305
Epoch [100/100], Loss: 32642.9941
                                          sql_embedding      db2   actual  \
0     [-0.5597949028015137, 6.249405384063721, 2.199...  234.048  234.048   
1     [-0.13114792108535767, 2.876124858856201, -0.3...   93.820   39.804   
2     [-0.5244169235229492, 9.943038940429688, -3.58...  234.048  234.048   
3     [-9.048543930053711, -3.4981002807617188, 3.39...   93.820   56.700   
4     [-9.814127922058105, -6.830276012420654, 3.638...  267.284  267.292   
...                                                 ...      ...      ...   
1860  [-7.942427635192871, -2.4830620288848877, 2.98...  267.284  267.284   
1861  [0.4651572108268738, 1.5341832637786865, -1.79.

In [32]:
df.columns

Index(['sql_embedding', 'db2', 'actual', 'db2_embedding'], dtype='object')

In [33]:
df_train_v2 = df[['sql_embedding', 'db2_embedding', 'db2', 'actual']]

In [34]:
df_train_v2.head()

Unnamed: 0,sql_embedding,db2_embedding,db2,actual
0,"[-0.5597949028015137, 6.249405384063721, 2.199...","[-0.05979561805725098, -0.43220221996307373, -...",234.048,234.048
1,"[-0.13114792108535767, 2.876124858856201, -0.3...","[1.2345705032348633, -0.4280064105987549, 1.63...",93.82,39.804
2,"[-0.5244169235229492, 9.943038940429688, -3.58...","[-0.05979561805725098, -0.43220221996307373, -...",234.048,234.048
3,"[-9.048543930053711, -3.4981002807617188, 3.39...","[1.2345705032348633, -0.4280064105987549, 1.63...",93.82,56.7
4,"[-9.814127922058105, -6.830276012420654, 3.638...","[-0.36657845973968506, -0.43319669365882874, -...",267.284,267.292


In [35]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

df = df_train_v2

# Concatenate embeddings to form the feature set
df['combined_embedding'] = df.apply(lambda row: row['sql_embedding'] + row['db2_embedding'], axis=1)

# Keep 'combined_embedding' as a DataFrame column to retain indexing
X = df[['combined_embedding']]  
y = df['actual']

# Split the data while retaining DataFrame structure
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert combined embedding column to NumPy arrays after splitting
X_train = np.vstack(X_train['combined_embedding'].values)
X_test = np.vstack(X_test['combined_embedding'].values)

# Initialize and train the XGBoost model
model = XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

# Make predictions with the XGBoost model
xgb_pred = model.predict(X_test)

# Evaluate the XGBoost model
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_pred))
xgb_mape = mean_absolute_percentage_error(y_test, xgb_pred)

# Ensure 'db2' predictions are aligned with the test indices
db2_test_pred = df.loc[y_test.index, 'db2'].values

# Evaluate 'db2' predictions
db2_rmse = np.sqrt(mean_squared_error(y_test, db2_test_pred))
db2_mape = mean_absolute_percentage_error(y_test, db2_test_pred)

# Print the evaluation metrics
print(f'XGBoost Model - RMSE: {xgb_rmse:.2f}, MAPE: {xgb_mape:.2%}')
print(f'db2 Predictions - RMSE: {db2_rmse:.2f}, MAPE: {db2_mape:.2%}')


XGBoost Model - RMSE: 11.76, MAPE: 15.80%
db2 Predictions - RMSE: 49.10, MAPE: 47.57%
