In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy.stats import loguniform
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import RepeatedKFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn import neural_network
import seaborn as sns
import matplotlib.cm as cm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout 
from tensorflow.keras.layers import BatchNormalization 
from tensorflow.keras.callbacks import ModelCheckpoint # new!
import os # new!
# import seaborn as sns
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout 
# from tensorflow.keras.layers import BatchNormalization 
# from tensorflow.keras.callbacks import ModelCheckpoint # new!


In [None]:
# Notebook-wide settings
pd.set_option('display.max_columns', None)  # never truncate wide frames when displaying
box_plot_title = 'Memory Estimation Error (MB)'  # x-axis label used by box_plot
batch_size = 10  # number of queries summed into one workload row
cluster_set = list(range(5, 101, 5))  # values of k to try: 5, 10, ..., 100

In [None]:
import pandas as pd

# Read the CSV file into a DataFrame. Later cells join it to the embeddings on
# QUERYID / EXPLAIN_TIME and take SORT_SHRHEAP_TOP and Db2_ESTIMATE from it.
df_success = pd.read_csv('utils/success_db2_est.csv')

# Display the first few rows to verify
df_success.head()


In [None]:
# (rows, columns) of the table loaded from the CSV
df_success.shape

# Reading validation set

In [None]:
import json
import pandas as pd

# JSON object stored on disk: keys are file names, values are the embeddings
val_embeddings_path = "val_embeddings.json"

# Parse the whole file into a dict
with open(val_embeddings_path, "r") as f:
    val_embeddings_data = json.load(f)

# One row per entry: key -> 'file_name', value -> 'embedding'
val_embeddings_df = pd.DataFrame(
    list(val_embeddings_data.items()),
    columns=["file_name", "embedding"],
)

# Quick look at the result
print(val_embeddings_df.head())


In [None]:
# Rich display of the first validation rows (file_name, embedding)
val_embeddings_df.head()

In [None]:
# File names are '_'-separated: field 1 is the query id (stored as int),
# field 2 is the explain time with the '.pt' suffix stripped (kept as a string).
val_embeddings_df["QUERYID"] = (
    val_embeddings_df["file_name"].map(lambda name: name.split("_")[1]).astype(int)
)
val_embeddings_df["EXPLAIN_TIME"] = val_embeddings_df["file_name"].map(
    lambda name: name.split("_")[2].replace(".pt", "")
)

In [None]:
# Check the QUERYID / EXPLAIN_TIME columns parsed from file_name
val_embeddings_df.head()

In [None]:
import pandas as pd

# Requires val_embeddings_df and df_success from the cells above.
# Inner join on (QUERYID, EXPLAIN_TIME): rows without a partner on the other
# side are dropped.
result_df = val_embeddings_df.merge(
    df_success,
    how='inner',
    on=['QUERYID', 'EXPLAIN_TIME'],
)

# Show the first joined rows
print(result_df.head())


In [None]:
# Size of the validation embeddings frame before the join
val_embeddings_df.shape

In [None]:
# Size after the inner join with df_success; compare with the cell above to see
# whether any embeddings found no matching (QUERYID, EXPLAIN_TIME) row
result_df.shape

In [None]:
# Independent copy of just the three columns the rest of the notebook uses
df_test = result_df[['embedding', 'SORT_SHRHEAP_TOP', 'Db2_ESTIMATE']].copy()

# Loading Training Set

In [None]:
import json
import pandas as pd

# JSON object stored on disk: keys are file names, values are the embeddings
train_embeddings_path = "train_embeddings.json"

# Parse the whole file into a dict
with open(train_embeddings_path, "r") as f:
    train_embeddings_path_embeddings_data = json.load(f)

# One row per entry: key -> 'file_name', value -> 'embedding'
train_embeddings_df = pd.DataFrame(
    list(train_embeddings_path_embeddings_data.items()),
    columns=["file_name", "embedding"],
)

# File names are '_'-separated: field 1 is the query id (stored as int),
# field 2 is the explain time with the '.pt' suffix stripped (kept as a string).
train_embeddings_df["QUERYID"] = (
    train_embeddings_df["file_name"].map(lambda name: name.split("_")[1]).astype(int)
)
train_embeddings_df["EXPLAIN_TIME"] = train_embeddings_df["file_name"].map(
    lambda name: name.split("_")[2].replace(".pt", "")
)

# Inner join on (QUERYID, EXPLAIN_TIME). This rebinds result_df, which until
# now held the validation join.
result_df = train_embeddings_df.merge(
    df_success,
    how='inner',
    on=['QUERYID', 'EXPLAIN_TIME'],
)

# Independent copy of just the three columns the rest of the notebook uses
df_train = result_df[['embedding', 'SORT_SHRHEAP_TOP', 'Db2_ESTIMATE']].copy()

In [None]:
# Number of training rows that survived the inner join (and the 3 kept columns)
df_train.shape

In [None]:
# First training rows, still with the original column names
df_train.head()

In [None]:
# Column names before they are renamed in the next cell
df_train.columns

In [None]:
# Give both splits the short column names used from here on and put them in
# the order sql_embedding, db2, actual.
df_train = df_train.rename(columns={
    'embedding': 'sql_embedding',
    'SORT_SHRHEAP_TOP': 'actual',
    'Db2_ESTIMATE': 'db2'
})[['sql_embedding', 'db2', 'actual']]

df_test = df_test.rename(columns={
    'embedding': 'sql_embedding',
    'SORT_SHRHEAP_TOP': 'actual',
    'Db2_ESTIMATE': 'db2'
})[['sql_embedding', 'db2', 'actual']]

# Confirm the new layout
print(df_train.head())
print(df_test.head())


In [None]:
# Rich display of the renamed training frame
df_train.head()

In [None]:
# Rich display of the renamed test frame
df_test.head()

In [None]:
# Rescale both memory columns by 4000 / 1000000.
# NOTE(review): presumably converts 4000-byte pages to MB -- confirm the unit.
# Not idempotent: re-running this cell rescales the columns again.
df_train[['db2', 'actual']] = df_train[['db2', 'actual']] * 4000 / 1000000

In [None]:
# Same rescaling as for the training frame (factor 4000 / 1000000).
# NOTE(review): presumably converts 4000-byte pages to MB -- confirm the unit.
# Not idempotent: re-running this cell rescales the columns again.
df_test[['db2', 'actual']] = df_test[['db2', 'actual']] * 4000 / 1000000

In [None]:
# Training rows after db2 / actual were rescaled
df_train.head()

In [None]:
# Shape is unchanged by the renaming and rescaling above
df_train.shape

# Standardizing embeddings

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Stack the per-row embeddings into 2D arrays (one row per query)
train_embeddings = np.array(df_train['sql_embedding'].tolist())
test_embeddings = np.array(df_test['sql_embedding'].tolist())

# Fit the scaler on the training embeddings only ...
scaler = StandardScaler()
standardized_train_embeddings = scaler.fit(train_embeddings).transform(train_embeddings)

# ... and apply that same fitted scaler to the test embeddings
standardized_test_embeddings = scaler.transform(test_embeddings)

# Replace the raw embeddings with their standardized rows (one 1D array per cell)
df_train['sql_embedding'] = list(standardized_train_embeddings)
df_test['sql_embedding'] = list(standardized_test_embeddings)

# Persist the fitted scaler so the same transform can be reapplied later
import joblib
joblib.dump(scaler, "embedding_scaler.pkl")

print("Standardization applied and saved.")

In [None]:
# Training rows with standardized embeddings
df_train.head()

# 2. Train and Evaluate a Model

In [None]:
def batch_predict(estimator, batch_size, X, Y):
    """Sum actual, Db2 and model estimates over random batches of rows.

    Parameters
    ----------
    estimator : object
        Fitted model exposing ``predict(X)``.
    batch_size : int
        Number of rows summed into each batch. Rows left over after the last
        full batch are ignored.
    X : numpy.ndarray
        2D feature matrix, one row per query.
    Y : numpy.ndarray
        2D array with one row per query.
        NOTE(review): the last two columns are assumed to be (actual, db2),
        in that order -- confirm against the caller.

    Returns
    -------
    pandas.DataFrame
        Columns ``['actual', 'db2', 'ml']``, one row per batch.
    """
    predicted = estimator.predict(X)
    # Predictions become the new last column: [..., actual, db2, ml].
    Y = np.insert(Y, Y.shape[1], predicted, axis=1)

    # Fixed seed keeps the batching reproducible; a local generator leaves the
    # global NumPy RNG state untouched.
    indices = np.arange(X.shape[0])
    np.random.RandomState(42).shuffle(indices)
    num_batches = X.shape[0] // batch_size

    rows = []
    for ibat in range(num_batches):
        start = ibat * batch_size
        # Slice ends are exclusive, so this takes exactly batch_size rows.
        batch = Y[indices[start:start + batch_size]]
        rows.append({'actual': batch[:, -3].sum(),
                     'db2': batch[:, -2].sum(),
                     'ml': batch[:, -1].sum()})

    # Build the frame once; DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(rows, columns=['actual', 'db2', 'ml'])

def rmse(Y):
    """Return the rounded RMSE of each estimate column against ``Y['actual']``.

    The first column of ``Y`` is skipped (the callers in this notebook put
    'actual' there); every remaining column is scored.

    Returns
    -------
    dict
        Maps column name to ``round(sqrt(MSE))``.
    """
    truth = Y['actual'].values
    return {
        name: np.round(np.sqrt(mean_squared_error(truth, Y[name].values)))
        for name in Y.columns.values[1:]
    }
    
def calculate_residuals(Y):
    """Subtract the first column of ``Y`` from each of the other columns.

    Returns
    -------
    pandas.DataFrame
        One column per non-first column of ``Y``, in the same order, holding
        ``Y[col] - Y[first column]``.
    """
    reference = Y[Y.columns[0]]
    others = Y.columns[1:]
    return pd.DataFrame({name: Y[name] - reference for name in others}, columns=others)

def box_plot(Y, length, height):
    """Draw horizontal box plots of the residuals and save them to 'job_err.png'.

    Parameters
    ----------
    Y : pandas.DataFrame
        Passed to ``calculate_residuals``; one box is drawn per residual column.
    length, height : float
        Figure width and height in inches.
    """
    df_residuals = calculate_residuals(Y)
    sns.set_style("whitegrid", {'axes.grid' : False})
    f = plt.figure(figsize=[length,height])
    plt.rcParams.update({'font.size': 16})
    ax = f.add_subplot(111)
    sns.boxplot(data=df_residuals, ax=ax, showfliers = True, orient="h")
    ax.set_xlabel(xlabel=box_plot_title,fontsize=22)
    # Tick labels on both the top and bottom edge (booleans, not the legacy 'on').
    ax.tick_params(axis='x', labeltop=True, labelbottom=True)
    ax.xaxis.set_ticks_position('both')
    f.tight_layout()
    # savefig is a Figure method (Axes has none); save before show() because
    # some backends release the figure once it has been shown.
    f.savefig('job_err.png')
    plt.show()
def residual_plot(Y, x_max=9000):
    """Scatter residuals against predicted values, one series per estimate column.

    Parameters
    ----------
    Y : pandas.DataFrame
        First column is the reference; every other column is an estimate whose
        residual comes from ``calculate_residuals``.
    x_max : float, optional
        Right edge of the x-axis and of the zero line (default 9000).
    """
    Y_predicted = Y.iloc[:,1:]
    print('Y_predicted ', Y_predicted.shape)
    cols = Y_predicted.columns
    markers = ['8', 'P', '*', 'h', 'X','+','^','s','o']
    colors = cm.rainbow(np.linspace(0, 1, len(cols)))

    Y_residuals = calculate_residuals(Y)
    print('Y_residuals ', Y_residuals.shape)

    for plot_index, col in enumerate(cols):
        plt.scatter(Y_predicted[col], Y_residuals[col],
                    edgecolor='white',
                    # `color=` takes one RGBA value unambiguously; `c=` would
                    # colour-map the 4 numbers when a series has exactly 4 points.
                    color=colors[plot_index],
                    # Wrap around so more than 9 series do not raise IndexError.
                    marker=markers[plot_index % len(markers)],
                    label=col)
    plt.xlabel('Predicted values')
    plt.ylabel('Residuals')
    plt.legend(loc='upper left')
    plt.hlines(y=0, xmin=0, xmax=x_max, color='black', lw=2)
    plt.xlim([0, x_max])
    plt.tight_layout()
    plt.show()

# Clustering 

In [None]:
from sklearn.cluster import KMeans
import numpy as np

def get_clusters(k, data, km):
    """Assign every row of ``data`` to a KMeans cluster of its 'sql_embedding'.

    Parameters
    ----------
    k : int
        Number of clusters; only used when a new model is fitted.
    data : pandas.DataFrame
        Must have a 'sql_embedding' column of equal-length vectors.
    km : fitted KMeans or None
        ``None`` fits a new model on ``data`` (training set). Otherwise the
        given model is only used to predict (test set).

    Returns
    -------
    (km, clustered) : tuple
        The model used and a copy of ``data`` with a float 'cluster' column.
        ``data`` itself is not modified.
    """
    X = np.vstack(data['sql_embedding'].values)  # stack per-row vectors into a 2D array
    if km is not None:
        print('clustering test dataset')
    else:
        print('clustering train dataset')
        km = KMeans(n_clusters=k,
                    init='k-means++',
                    n_init=10,
                    max_iter=300,
                    random_state=0)
        km.fit(X)
    y_kmeans = km.predict(X)

    # inertia_ is not recomputed here: for a pre-fitted km it is the value
    # stored on the model, not a score for `data`.
    print('Distortion: %.2f' % km.inertia_)

    clustered = data.copy()
    # Assign the whole column at once, by position. The labels stay float so
    # that pd.get_dummies names the columns 'cluster_<i>.0', which is what
    # create_workload looks for.
    clustered['cluster'] = np.asarray(y_kmeans, dtype=float)

    return km, clustered


# Workload

In [None]:
import pandas as pd
import numpy as np

def create_workload(batch_size, data, k):
    """Sum consecutive groups of ``batch_size`` queries into one workload row each.

    Parameters
    ----------
    batch_size : int
        Rows per workload. Rows left over after the last full batch are dropped.
    data : pandas.DataFrame
        Must contain 'db2', 'actual' and 'cluster' (cluster label per row).
    k : int
        Number of clusters; fixes the set of one-hot columns in the output.

    Returns
    -------
    pandas.DataFrame
        One float row per batch with columns
        ``['db2', 'actual', 'cluster_0.0', ..., 'cluster_<k-1>.0']``: the summed
        db2 / actual values and the number of queries per cluster.
    """
    df_data = data[['db2', 'actual', 'cluster']].copy()

    # Cast the labels to float first so the one-hot columns are always named
    # 'cluster_<i>.0'. With integer labels they would be 'cluster_<i>' and
    # every expected column would silently come out as zeros.
    df_data['cluster'] = df_data['cluster'].astype(float)
    df_data = pd.get_dummies(df_data, columns=['cluster'], dtype=float)

    # Fixed column order; clusters with no member in `data` become all-zero columns.
    cluster_columns = [f'cluster_{i}.0' for i in range(k)]
    df_data = df_data.reindex(columns=['db2', 'actual'] + cluster_columns, fill_value=0.0)

    num_batches = len(df_data) // batch_size
    if num_batches == 0:
        # No full batch: return an empty frame that still has the expected columns.
        return df_data.iloc[:0].reset_index(drop=True).astype(float)

    # Rows 0..batch_size-1 form batch 0, the next batch_size rows batch 1, etc.
    used = df_data.iloc[:num_batches * batch_size]
    batch_ids = np.arange(len(used)) // batch_size
    return used.groupby(batch_ids).sum().reset_index(drop=True).astype(float)


# Create Workloads

In [None]:
import os

# Ensure the folder exists (exist_ok avoids the check-then-create race)
output_folder = "cluster_data"
os.makedirs(output_folder, exist_ok=True)

for k in cluster_set:
    # Fit KMeans on the training embeddings, then label the test set with that
    # same fitted model.
    km, df_train_clusters = get_clusters(k, df_train, None)
    km, df_test_clusters = get_clusters(k, df_test, km)

    # Collapse every batch_size queries into one workload row
    workload_train = create_workload(batch_size, df_train_clusters, k)
    workload_test = create_workload(batch_size, df_test_clusters, k)

    file_name_train = os.path.join(output_folder, f'train_workloads_final_{k}_clusters.csv')
    file_name_test = os.path.join(output_folder, f'test_workloads_final_{k}_clusters.csv')

    # These files are read back by the "Run Models" loop
    workload_train.to_csv(file_name_train, index=False)
    workload_test.to_csv(file_name_test, index=False)

    print(f"k = {k} is done")

# ML Models

In [None]:
import numpy as np

def run_model(model, train_data, test_data, model_type, k):
    """Fit ``model`` on the training workloads and store its predictions.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    train_data : pandas.DataFrame
        Must contain 'actual' (the target). Every column other than 'actual'
        and 'db2' is used as a feature.
    test_data : pandas.DataFrame
        Must contain the same feature columns; 'actual' is not needed here.
    model_type : str
        Name of the column that receives the predictions.
    k : int
        Unused; kept so existing callers keep working.

    Returns
    -------
    (train_data, test_data) : tuple
        The same two objects that were passed in. Both are modified in place:
        a ``model_type`` column is added to each.
    """
    # Every column except the target and the Db2 baseline is a feature.
    input_cols = [col for col in train_data.columns if col not in ('actual', 'db2')]

    # float32 for compatibility with the estimators used in this notebook
    train_X = train_data[input_cols].to_numpy(dtype='float32')
    train_y = train_data['actual'].to_numpy(dtype='float32')
    test_X = test_data[input_cols].to_numpy(dtype='float32')

    # Fit the model and make predictions
    model.fit(train_X, train_y)
    train_data[model_type] = model.predict(train_X)
    test_data[model_type] = model.predict(test_X)

    print(f"{model_type} done")
    return train_data, test_data


# Run Models

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import neural_network
import xgboost as xgb
import pandas as pd
import numpy as np

# One RMSE dict per value of k is appended to each list by the loop below
results_train, results_test = [], []

# NOTE(review): this re-defines run_model, which is already defined in the
# "ML Models" cell above. Keep the two in sync or delete one of them.
def run_model(model, train_data, test_data, model_type, k):
    """Fit ``model`` on the training workloads and store its predictions.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    train_data : pandas.DataFrame
        Must contain 'actual' (the target). Every column other than 'actual'
        and 'db2' is used as a feature.
    test_data : pandas.DataFrame
        Must contain the same feature columns; 'actual' is not needed here.
    model_type : str
        Name of the column that receives the predictions.
    k : int
        Unused; kept so existing callers keep working.

    Returns
    -------
    (train_data, test_data) : tuple
        The same two objects that were passed in. Both are modified in place:
        a ``model_type`` column is added to each.
    """
    # Every column except the target and the Db2 baseline is a feature.
    input_cols = [col for col in train_data.columns if col not in ('actual', 'db2')]

    # float32 for compatibility with the estimators used in this notebook
    train_X = train_data[input_cols].to_numpy(dtype='float32')
    train_y = train_data['actual'].to_numpy(dtype='float32')
    test_X = test_data[input_cols].to_numpy(dtype='float32')

    # Fit the model and make predictions
    model.fit(train_X, train_y)
    train_data[model_type] = model.predict(train_X)
    test_data[model_type] = model.predict(test_X)

    print(f"{model_type} done")
    return train_data, test_data

for k in cluster_set:
    # Workload files written by the "Create Workloads" cell for this k
    file_name_train = f'cluster_data/train_workloads_final_{k}_clusters.csv'
    file_name_test = f'cluster_data/test_workloads_final_{k}_clusters.csv'
    workloads_train = pd.read_csv(file_name_train)
    workloads_test = pd.read_csv(file_name_test)

    # New, unfitted estimators are built for every k.

    # Linear Regression - Ridge
    ridge = make_pipeline(
        StandardScaler(),
        Ridge(
            alpha=3.2573287932867558,
            fit_intercept=True,
            solver='lsqr',
            random_state=42
        )
    )

    # Decision Tree Regressor
    dtr = DecisionTreeRegressor(
        random_state=42,
        min_samples_leaf=3,
        max_features=5,
        max_depth=30,
        criterion='squared_error'
    )

    # Random Forest Regressor
    random_forest = RandomForestRegressor(
        n_estimators=2000,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='sqrt',
        max_depth=50,
        bootstrap=True,
        random_state=42
    )

    # XGBoost Model
    xgboost = xgb.XGBRegressor(
        colsample_bytree=0.7165235326918536,
        gamma=0.2573287932867558,
        learning_rate=0.3895603296024942,
        max_depth=2,
        n_estimators=165,
        objective='reg:squarederror',
        subsample=0.3234123573173331,
        random_state=42
    )

    # Neural Network
    optimal_nn_model = Pipeline([
        ('scaler', StandardScaler()),  # Scale input features
        ('nn', neural_network.MLPRegressor(
            max_iter=1000,
            alpha=0.001,
            activation='identity',
            learning_rate='constant',
            random_state=6,
            hidden_layer_sizes=(10, 20),
            solver='lbfgs'
        ))
    ])

    # (result column name, estimator), in the order the columns should appear
    models = [
        ('ridge', ridge),
        ('dtr', dtr),
        ('random_forest', random_forest),
        ('xgboost', xgboost),
        ('nn', optimal_nn_model),
    ]

    # 'actual' must stay the first column: rmse() skips column 0 and scores the rest
    train_results = workloads_train[['actual', 'db2']].copy()
    test_results = workloads_test[['actual', 'db2']].copy()

    for model_type, model in models:
        workloads_train, workloads_test = run_model(
            model,
            workloads_train,
            workloads_test,
            model_type,
            k
        )
        train_results[model_type] = workloads_train[model_type].values
        test_results[model_type] = workloads_test[model_type].values
        # Remove the prediction again, otherwise run_model would feed it to the
        # next estimator as an input feature.
        workloads_train = workloads_train.drop(columns=model_type)
        workloads_test = workloads_test.drop(columns=model_type)

    # Calculate RMSE
    rmse_train = rmse(train_results)
    rmse_test = rmse(test_results)

    # Add cluster info
    rmse_train['cluster'] = k
    rmse_test['cluster'] = k

    # Append results to lists
    results_train.append(rmse_train)
    results_test.append(rmse_test)

    print(f"Models for k = {k} are done")
    print("-------------------------------")

# Convert lists to DataFrames (one row per k)
df_results_train = pd.DataFrame(results_train)
df_results_test = pd.DataFrame(results_test)


In [None]:
# Persist the per-k training RMSE table (the index is written as a first,
# unnamed column) and display up to 30 rows of it
df_results_train.to_csv("df_results_train.csv")
df_results_train.head(30)

In [None]:
# Persist the per-k test RMSE table (the index is written as a first,
# unnamed column) and display up to 30 rows of it
df_results_test.to_csv("df_results_test.csv")
df_results_test.head(30)

In [None]:
# Reload the test RMSE table from disk. Because the file was saved with its
# index, the frame also gets an 'Unnamed: 0' column.
df = pd.read_csv("df_results_test.csv")

In [None]:

# ref: https://www.python-graph-gallery.com/5-control-width-and-space-in-barplots
# NOTE: despite the names, `mapes` holds the neural-network RMSE for each run
# and `batch_sizes` the number of clusters k used in that run.
mapes = df['nn']
batch_sizes = df['cluster']

# One bar per row of the results table, so the plot follows cluster_set
# instead of assuming exactly 20 entries.
x_pos = list(range(len(df)))

fig, ax = plt.subplots()
fig.set_tight_layout(True)

width = 10
height = 3.5
fig.set_size_inches(width, height)

bars = ax.bar(x_pos, mapes, fill=False, hatch='ooo')

# Dict-style access works on every matplotlib version
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

plt.xticks(x_pos, batch_sizes)

# https://stackoverflow.com/questions/72970649/how-to-label-bars-with-multiple-custom-values
# Write each bar's value just above it
for c in ax.containers:
    ax.bar_label(c, label_type='edge', padding=1, fontsize=11, fontstyle='italic')

ax.tick_params(axis='both', which='major', labelsize=12)

plt.xlabel("Cluster Size", labelpad=5, fontsize=15)
# rotation: https://stackoverflow.com/questions/42100114/rotating-title-of-y-axis-to-be-horizontal-in-matplotlib
# labelpad: https://stackoverflow.com/questions/21539018/how-to-change-separation-between-tick-labels-and-axis-labels-in-matplotlib
plt.ylabel("RMSE", labelpad=5, fontsize=15)

fig.savefig('job_cluster_result.png', bbox_inches='tight', dpi=300)

plt.show()