# Create a single query model using embeddings

In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy.stats import loguniform
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import RepeatedKFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn import neural_network
import seaborn as sns
import matplotlib.cm as cm
import os # new!

In [3]:
# Notebook-wide settings.
# Show every column when a DataFrame is displayed (no column limit).
pd.set_option('display.max_columns', None)

batch_size = 10
box_plot_title = 'Memory Estimation Error (MB)'
# Candidate cluster counts: 5, 10, ..., 100.
cluster_set = list(range(5, 101, 5))

In [None]:
import pandas as pd

# Load utils/success_db2_est.csv into df_success; later cells inner-join the
# embedding frames against it on QUERYID and EXPLAIN_TIME.
success_csv = 'utils/success_db2_est.csv'
df_success = pd.read_csv(success_csv)

# Preview the first rows as a sanity check on the load.
df_success.head()


In [None]:
# (rows, columns) of the table loaded from the CSV.
df_success.shape

# Reading validation set

In [None]:
import json
import pandas as pd

# Validation-set embeddings file. The loaded object is used as a mapping:
# its keys become "file_name" and its values become "embedding".
val_embeddings_path = "val_embeddings.json"

with open(val_embeddings_path, "r") as json_file:
    val_embeddings_data = json.load(json_file)

# One row per JSON entry, in the mapping's own order.
val_embeddings_df = pd.DataFrame({
    "file_name": list(val_embeddings_data),
    "embedding": list(val_embeddings_data.values()),
})

# Quick look at the result.
print(val_embeddings_df.head())


In [None]:
# Notebook-rendered view of the first rows (the previous cell printed the same preview as text).
val_embeddings_df.head()

In [None]:
# Derive the join keys from each file name by splitting it on "_":
#   field 1 -> QUERYID (cast to int)
#   field 2 -> EXPLAIN_TIME, with any ".pt" removed
# A name with fewer than three "_"-separated fields raises IndexError.
val_embeddings_df["QUERYID"] = (
    val_embeddings_df["file_name"]
    .map(lambda name: name.split("_")[1])
    .astype(int)
)
val_embeddings_df["EXPLAIN_TIME"] = val_embeddings_df["file_name"].map(
    lambda name: name.split("_")[2].replace(".pt", "")
)

In [None]:
# Check the QUERYID and EXPLAIN_TIME columns just derived from file_name.
val_embeddings_df.head()

In [None]:
import pandas as pd

# Attach the df_success columns to each validation embedding. The inner join
# keeps only rows whose (QUERYID, EXPLAIN_TIME) pair appears in both frames.
result_df = val_embeddings_df.merge(
    df_success,
    on=['QUERYID', 'EXPLAIN_TIME'],
    how='inner',
)

# Preview the joined rows.
print(result_df.head())


In [None]:
# (rows, columns) of the validation embeddings before the join.
val_embeddings_df.shape

In [None]:
# (rows, columns) after the inner join; compare with val_embeddings_df.shape
# to see how the join changed the row count.
result_df.shape

In [None]:
# Keep only the embedding and the two SORT_SHRHEAP_TOP / Db2_ESTIMATE columns,
# as an independent copy of the joined validation frame.
test_columns = ['embedding', 'SORT_SHRHEAP_TOP', 'Db2_ESTIMATE']
df_test = result_df[test_columns].copy()

# Loading Training Set

In [None]:
import json
import pandas as pd

# Training-set embeddings file. The loaded object is used as a mapping:
# its keys become "file_name" and its values become "embedding".
train_embeddings_path = "train_embeddings.json"

with open(train_embeddings_path, "r") as json_file:
    train_embeddings_path_embeddings_data = json.load(json_file)

# One row per JSON entry, in the mapping's own order.
train_embeddings_df = pd.DataFrame({
    "file_name": list(train_embeddings_path_embeddings_data),
    "embedding": list(train_embeddings_path_embeddings_data.values()),
})

# Derive the join keys from each file name by splitting it on "_":
#   field 1 -> QUERYID (cast to int)
#   field 2 -> EXPLAIN_TIME, with any ".pt" removed
train_embeddings_df["QUERYID"] = (
    train_embeddings_df["file_name"]
    .map(lambda name: name.split("_")[1])
    .astype(int)
)
train_embeddings_df["EXPLAIN_TIME"] = train_embeddings_df["file_name"].map(
    lambda name: name.split("_")[2].replace(".pt", "")
)

# Inner join: keep only embeddings whose (QUERYID, EXPLAIN_TIME) pair is in df_success.
# NOTE: this rebinds result_df, which previously held the validation join, so
# the cell that builds df_test from result_df must not be re-run after this one.
result_df = train_embeddings_df.merge(
    df_success,
    on=['QUERYID', 'EXPLAIN_TIME'],
    how='inner',
)

# Same three columns as df_test, as an independent copy.
df_train = result_df[['embedding', 'SORT_SHRHEAP_TOP', 'Db2_ESTIMATE']].copy()

In [None]:
# (rows, columns) of the joined training frame.
df_train.shape

In [None]:
# First rows of the training frame (columns still carry their original names here).
df_train.head()

In [None]:
# Column labels before the rename in the next cell.
df_train.columns

In [None]:
# Rename columns for df_train
df_train.rename(columns={
    'embedding': 'sql_embedding',
    'SORT_SHRHEAP_TOP': 'actual',
    'Db2_ESTIMATE': 'db2'
}, inplace=True)

df_train = df_train[['sql_embedding', 'db2', 'actual']]

# Rename columns for df_test
df_test.rename(columns={
    'embedding': 'sql_embedding',
    'SORT_SHRHEAP_TOP': 'actual',
    'Db2_ESTIMATE': 'db2'
}, inplace=True)

df_test = df_test[['sql_embedding', 'db2', 'actual']]

# Verify the changes
print(df_train.head())
print(df_test.head())


In [None]:
# Notebook-rendered view of the renamed training frame.
df_train.head()

In [None]:
# Notebook-rendered view of the renamed test frame.
df_test.head()

In [None]:
# Rescale both memory columns by 4000 / 1000000.
# NOTE(review): presumably converts page counts to MB (the plot title is in MB); confirm the page size.
# Re-running this cell rescales again, so run it exactly once.
for memory_column in ('db2', 'actual'):
    df_train[memory_column] = df_train[memory_column].mul(4000).div(1000000)

In [None]:
# Apply the same 4000 / 1000000 rescale to the test frame so both frames share units.
# Re-running this cell rescales again, so run it exactly once.
for memory_column in ('db2', 'actual'):
    df_test[memory_column] = df_test[memory_column].mul(4000).div(1000000)

In [None]:
# Check db2 and actual after the * 4000 / 1000000 rescale.
df_train.head()

In [None]:
# (rows, columns) of the training frame.
df_train.shape

In [None]:
# (rows, columns) of the test frame.
df_test.shape

In [None]:
# Check db2 and actual on the test frame after the rescale.
df_test.head()

In [None]:
# Summary of the training frame: column dtypes, non-null counts, memory usage.
df_train.info()

In [None]:
# Model input (sql_embedding) next to the regression target (actual).
df_train[['sql_embedding', 'actual']].head()

In [None]:
# db2 value next to the actual value for the first rows.
df_train[['db2', 'actual']].head()

In [None]:
# Keep a snapshot of the training frame as it stands at this point.
# The embedding cells are Python objects, so their contents are shared with df_train, not duplicated.
df_train_copy = df_train.copy(deep=True)

In [None]:
# Last look at the training frame before the embeddings are scaled by db2.
df_train.head()

In [None]:
import numpy as np

# Working frame for the model cells below.
# NOTE(review): `df` was never assigned anywhere in the visible notebook, so
# this cell used to fail with NameError on a fresh kernel. It is assumed here
# to be the training frame -- confirm that df_train is the intended source.
df = df_train.copy()

# Multiply each embedding vector by the db2 scalar of the same row.
# Inputs are read from df_train, which this cell never modifies, so re-running
# the cell gives the same result instead of scaling the embeddings twice.
# The pairing is positional, which is safe because df is a fresh copy of df_train.
df['sql_embedding'] = [
    (np.asarray(embedding) * db2_value).tolist()
    for embedding, db2_value in zip(df_train['sql_embedding'], df_train['db2'])
]

# Print the updated DataFrame
print(df.head())


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# Features stay a one-column DataFrame so the split keeps df's row labels;
# those labels are used below to look up the matching db2 values.
X = df[['sql_embedding']]
y = df['actual']

# Reproducible 80/20 split of df.
features_train, features_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Stack the per-row embedding lists into 2-D arrays (one row per sample).
X_train = np.vstack(features_train['sql_embedding'].to_numpy())
X_test = np.vstack(features_test['sql_embedding'].to_numpy())

# Fit the gradient-boosted regressor on the stacked embeddings.
model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
model.fit(X_train, y_train)

# Score the model on the held-out rows.
xgb_pred = model.predict(X_test)
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_pred))
xgb_mape = mean_absolute_percentage_error(y_test, xgb_pred)

# Baseline: the db2 column for exactly the same held-out rows, selected by label.
db2_test_pred = df['db2'].loc[y_test.index].to_numpy()
db2_rmse = np.sqrt(mean_squared_error(y_test, db2_test_pred))
db2_mape = mean_absolute_percentage_error(y_test, db2_test_pred)

# Report both sets of metrics side by side.
print(f'XGBoost Model - RMSE: {xgb_rmse:.2f}, MAPE: {xgb_mape:.2%}')
print(f'db2 Predictions - RMSE: {db2_rmse:.2f}, MAPE: {db2_mape:.2%}')
