In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy.stats import loguniform
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import RepeatedKFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn import neural_network
import seaborn as sns
import matplotlib.cm as cm
from sklearn.model_selection import validation_curve
import os # new!

In [None]:
# Number of queries that create_workload() sums into one workload batch.
batch_size = 10
# X-axis label that box_plot() puts on the residual box plot.
box_plot_title = 'Memory Estimation Error (MB)'
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# cluster_set = [5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100]

# 1. Loading training and test datasets

In [None]:
def load_data(path='tpcds_query_train.csv'):
    """Read the query dataset from a CSV file.

    Parameters
    ----------
    path : str or path-like, default 'tpcds_query_train.csv'
        Location of the CSV file. The default keeps the original behaviour
        of reading 'tpcds_query_train.csv' from the working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed file, exactly as returned by ``pd.read_csv``.
    """
    return pd.read_csv(path)

In [None]:
# Load the dataset returned by load_data() into `df`.
df = load_data()

In [None]:
# (rows, columns) of the loaded data.
df.shape

In [None]:
# Preview the first rows.
df.head()

In [None]:
# List the column names available in the loaded data.
df.columns

In [None]:
# Add a 1-based queryId as the first column.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to re-run on the same `df`.
if 'queryId' not in df.columns:
    df.insert(0, 'queryId', range(1, len(df) + 1))

In [None]:
# Preview `df` again after the queryId column was inserted at position 0.
df.head()

# 2. Train and Evaluate a Model

In [None]:
def batch_predict(estimator, batch_size, X, Y):
    """Predict with `estimator` and sum the results over shuffled batches.

    The predictions for `X` are appended to `Y` as a new last column, the rows
    are shuffled with a fixed seed (42), and every group of `batch_size` rows
    is summed column-wise. Rows left over after the last full batch are
    ignored.

    Parameters
    ----------
    estimator : object
        Fitted model exposing ``predict(X)``.
    batch_size : int
        Number of rows summed into each batch.
    X : array-like with ``.shape``
        Feature matrix; one row per query.
    Y : numpy.ndarray, 2-D
        Per-query values; must have at least two columns.

    Returns
    -------
    pandas.DataFrame
        One row per batch with columns 'actual', 'db2' and 'ml', filled from
        the last, second-to-last and third-to-last column of the extended `Y`.

    Notes
    -----
    NOTE(review): the predictions are inserted as the LAST column, yet the
    last column is reported as 'actual' and the third-to-last as 'ml'. Confirm
    the intended column order of `Y` with the caller; the positional mapping
    is kept exactly as it was.
    """
    predicted = estimator.predict(X)
    # Append the predictions as a new last column of Y.
    Y = np.insert(Y, Y.shape[1], predicted, axis=1)

    n_rows = X.shape[0]
    indices = np.arange(n_rows)
    # Fixed seed so the batch composition is reproducible between runs.
    np.random.seed(42)
    np.random.shuffle(indices)
    num_batches = n_rows // batch_size

    columns = ['actual', 'db2', 'ml']
    records = []

    for ibat in range(num_batches):
        start = ibat * batch_size
        # Slice ends are exclusive: `start + batch_size` yields exactly
        # batch_size rows (the previous `- 1` silently dropped one row).
        end = start + batch_size

        ibat_Y = Y[indices[start:end]]

        records.append({'actual': ibat_Y[:, -1].sum(),
                        'db2': ibat_Y[:, -2].sum(),
                        'ml': ibat_Y[:, -3].sum()})

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(records, columns=columns)

def rmse(Y):
    """Return the rounded RMSE of each non-first column of `Y` against 'actual'.

    Parameters
    ----------
    Y : pandas.DataFrame
        Must contain an 'actual' column; every column after the first one is
        scored against it.

    Returns
    -------
    dict
        Maps column name to ``np.round(sqrt(MSE))`` for that column.
    """
    reference = Y['actual'].values
    return {
        name: np.round(np.sqrt(mean_squared_error(reference, Y[name].values)))
        for name in Y.columns.values[1:]
    }
    
def calculate_residuals(Y):
    """Subtract the first column of `Y` from every other column.

    Parameters
    ----------
    Y : pandas.DataFrame
        The first column is the reference; the remaining columns are the
        estimates to compare against it.

    Returns
    -------
    pandas.DataFrame
        Same columns as ``Y.columns[1:]``, each holding ``Y[col] - Y[first]``.
    """
    baseline = Y[Y.columns[0]]
    other_cols = Y.columns[1:]
    return pd.DataFrame(
        {name: Y[name] - baseline for name in other_cols},
        columns=other_cols,
    )

def box_plot(Y, length, height):
    """Draw a horizontal box plot of the residuals in `Y` and save it.

    Residuals come from calculate_residuals(Y) (every column minus the first
    one). The figure is written to 'job_err.png' and then shown.

    Parameters
    ----------
    Y : pandas.DataFrame
        Frame whose first column is the reference value.
    length, height : float
        Figure width and height passed to ``figsize``.
    """
    df_residuals = calculate_residuals(Y)
    sns.set_style("whitegrid", {'axes.grid' : False})
    f = plt.figure(figsize=[length,height])
    plt.rcParams.update({'font.size': 16})
    ax = f.add_subplot(111)
    sns.boxplot(data=df_residuals, ax=ax, showfliers=True, orient="h")
    ax.set_xlabel(xlabel=box_plot_title, fontsize=22)
    # Show x tick labels on both the top and the bottom edge.
    ax.tick_params(axis='x', labeltop=True, labelbottom=True)
    ax.xaxis.set_ticks_position('both')
    f.tight_layout()
    # savefig is a Figure method (Axes has none), and it must run before
    # plt.show(): with non-interactive backends the figure may be released
    # once it has been shown, which would save an empty image.
    f.savefig('job_err.png')
    plt.show()
def residual_plot(Y, xmax=9000):
    """Scatter residuals against predicted values, one series per estimator.

    Parameters
    ----------
    Y : pandas.DataFrame
        First column is the reference; every other column is a prediction.
    xmax : float, default 9000
        Right limit of the x axis and of the zero reference line.
    """
    Y_predicted = Y.iloc[:,1:]
    print('Y_predicted ', Y_predicted.shape)
    cols = Y_predicted.columns
    markers = ['8', 'P', '*', 'h', 'X','+','^','s','o']
    colors = cm.rainbow(np.linspace(0, 1, len(cols)))

    Y_residuals = calculate_residuals(Y)
    print('Y_residuals ', Y_residuals.shape)

    for plot_index, col in enumerate(cols):
        plt.scatter(Y_predicted[col], Y_residuals[col],
                    edgecolor='white',
                    # A single RGBA row must be passed as a 2-D array:
                    # a flat 4-vector is ambiguous with four values to be
                    # colormapped when the series has exactly four points.
                    c=colors[plot_index].reshape(1, -1),
                    # Cycle through the markers so more than nine series
                    # no longer raise IndexError.
                    marker=markers[plot_index % len(markers)],
                    label=col)
    plt.xlabel('Predicted values')
    plt.ylabel('Residuals')
    plt.legend(loc='upper left')
    plt.hlines(y=0, xmin=0, xmax=xmax, color='black', lw=2)
    plt.xlim([0, xmax])
    plt.tight_layout()
    plt.show()
    
def create_workload(batch_size, data):
    """Group consecutive queries into batches and sum them into workloads.

    Each batch row holds the summed 'db2' and 'actual' values plus, for every
    cluster label present in `data`, the number of queries of that cluster in
    the batch (column ``cluster_<label>``). Rows left over after the last
    full batch are ignored.

    Parameters
    ----------
    batch_size : int
        Number of consecutive rows per batch.
    data : pandas.DataFrame
        Must contain 'queryId', 'db2', 'actual' and 'cluster' columns.

    Returns
    -------
    (pandas.DataFrame, list of list)
        The per-batch sums, and the queryIds that went into each batch (same
        order as the rows of the frame).
    """
    # Work on a copy so the caller's frame is never modified.
    df_data = data[['queryId', 'db2', 'actual', 'cluster']].copy()

    labels = np.sort(df_data['cluster'].unique())
    cluster_columns = [f"cluster_{int(c)}" for c in labels]

    # Replace the column instead of assigning through `.loc[:, ...]`: the
    # latter writes in place and keeps a float dtype, which makes get_dummies
    # emit 'cluster_1.0'-style names that never match `cluster_columns`.
    df_data["cluster"] = df_data["cluster"].astype("int64")

    df_data = pd.get_dummies(df_data, columns=["cluster"], dtype=int)

    # Fix the column order; any expected cluster column that is absent is
    # created and filled with 0.
    value_columns = ['db2', 'actual'] + cluster_columns
    df_data = df_data.reindex(columns=['queryId'] + value_columns, fill_value=0)

    df_batches = []
    query_ids_per_batch = []  # queryIds that make up each batch
    num_batches = len(df_data) // batch_size

    for ibat in range(num_batches):
        ibat_Y = df_data.iloc[ibat * batch_size:(ibat + 1) * batch_size]

        # queryId is an identifier, not a quantity: keep it out of the sums.
        df_batches.append(ibat_Y.drop(columns=['queryId']).sum())
        query_ids_per_batch.append(ibat_Y['queryId'].tolist())

    if not df_batches:
        # No full batch: still return the expected columns.
        return pd.DataFrame(columns=value_columns), query_ids_per_batch

    batches_df = pd.DataFrame(df_batches)

    return batches_df, query_ids_per_batch

# Clustering 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import numpy as np
import pandas as pd

def get_clusters(k, data, km=None, test_size=0.3, min_rows_per_cluster=10, random_state=42):
    """Split `data`, cluster the training features and label both splits.

    A KMeans model is fitted on the training features (unless `km` is given),
    clusters with fewer than `min_rows_per_cluster` training rows are pruned,
    and every train and test row is assigned to the nearest remaining
    centroid. The resulting labels start at 1 and follow the order of the
    kept clusters (largest training cluster first).

    Parameters
    ----------
    k : int
        Number of clusters for a newly trained KMeans; unused when `km` is given.
    data : pandas.DataFrame
        Must contain 'queryId', 'db2' and 'actual'; all other columns are
        used as clustering features.
    km : fitted clustering model, optional
        Model to reuse instead of training a new one.
    test_size : float, default 0.3
        Passed to ``train_test_split``.
    min_rows_per_cluster : int, default 10
        Minimum number of training rows a cluster needs to be kept.
    random_state : int, default 42
        Seed for both the split and KMeans.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Copies of the train and test splits with an added int64 'cluster' column.

    Raises
    ------
    ValueError
        If no cluster has at least `min_rows_per_cluster` training rows.
    """
    train_data, test_data = train_test_split(
        data,
        test_size=test_size,
        random_state=random_state,
        shuffle=True
    )

    # Features only (drop queryId, db2, actual)
    non_feature_cols = ['queryId', 'db2', 'actual']
    X_train = train_data.drop(columns=non_feature_cols)
    X_test = test_data.drop(columns=non_feature_cols)

    if km is not None:
        print('Using provided clustering model')
    else:
        print('Training initial clustering model')
        km = KMeans(
            n_clusters=k,
            init='k-means++',
            n_init=10,
            max_iter=300,
            random_state=random_state
        )
        km.fit(X_train)

    # Initial assignment, used only to measure cluster sizes.
    initial_train_clusters = km.predict(X_train)

    cluster_counts = pd.Series(initial_train_clusters).value_counts()
    keep_clusters = cluster_counts[cluster_counts >= min_rows_per_cluster].index.tolist()
    print(f"Kept {len(keep_clusters)} clusters after pruning (≥ {min_rows_per_cluster} members each)")

    if not keep_clusters:
        # Without this check the nearest-centroid search below fails with an
        # opaque "argmin of an empty sequence" error.
        raise ValueError(
            f"No cluster has at least {min_rows_per_cluster} training rows; "
            "lower min_rows_per_cluster or use fewer clusters."
        )

    kept_centroids = km.cluster_centers_[keep_clusters]

    def assign_to_kept_centroids(X, kept_centroids):
        """Return, per row of X, the position of the nearest kept centroid."""
        distances = cdist(X, kept_centroids, metric='euclidean')  # (n_samples, n_kept_clusters)
        return np.argmin(distances, axis=1)

    # Final labels are positions in `kept_centroids`, shifted to start at 1.
    relabeled_train_clusters = assign_to_kept_centroids(X_train, kept_centroids) + 1
    relabeled_test_clusters = assign_to_kept_centroids(X_test, kept_centroids) + 1

    train_df = train_data.copy()
    train_df['cluster'] = relabeled_train_clusters.astype('int64')

    test_df = test_data.copy()
    test_df['cluster'] = relabeled_test_clusters.astype('int64')

    # Inertia of the KMeans model as fitted, i.e. before pruning/reassignment.
    print('Distortion (Training Inertia): %.2f' % km.inertia_)

    return train_df, test_df

# Workload

In [None]:
# Size of the full dataset before it is split and clustered.
df.shape

In [None]:
# Request k KMeans clusters; get_clusters() prunes clusters with fewer than
# 200 training rows and reassigns every row to the nearest kept centroid.
k = 110
df_train, df_test = get_clusters(k, df, min_rows_per_cluster=200)

In [None]:
# Training split with the added 'cluster' column.
df_train.head()

In [None]:
# Number of clusters that remain after pruning.
df_train["cluster"].nunique()

In [None]:
# Training rows per cluster.
df_train['cluster'].value_counts()

In [None]:
import matplotlib.pyplot as plt

# Training rows per cluster, ordered by cluster label.
cluster_counts = df_train["cluster"].value_counts().sort_index()

# Bar chart drawn through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(cluster_counts.index, cluster_counts.values)

ax.set_xlabel("Cluster")
ax.set_ylabel("Number of Rows")
ax.set_title("Distribution of Rows by Cluster")
# One tick per cluster label.
ax.set_xticks(cluster_counts.index)

fig.tight_layout()
plt.show()

# Create Workloads

In [None]:
# Columns of the training split (includes the 'cluster' column added by get_clusters).
df_train.columns

In [None]:
# import os

# # Ensure the folder exists
# output_folder = "cluster_data"
# if not os.path.exists(output_folder):
#     os.makedirs(output_folder)
    
# Sum every `batch_size` consecutive queries into one workload row; the second
# return value lists the queryIds that went into each workload.
df_train_workloads, train_query_ids_per_batch = create_workload(batch_size, df_train)
df_test_workloads, test_query_ids_per_batch = create_workload(batch_size, df_test)

In [None]:
# (workloads, columns) of the training workloads.
df_train_workloads.shape

In [None]:
# Number of training workloads for which queryIds were recorded.
len(train_query_ids_per_batch)

In [None]:
# queryIds that make up the first training workload.
train_query_ids_per_batch[0]

In [None]:
# (workloads, columns) of the test workloads.
df_test_workloads.shape

In [None]:
# Find columns that are missing in test
missing_cols = set(df_train_workloads.columns) - set(df_test_workloads.columns)

# Add missing columns with value 0
for col in missing_cols:
    df_test_workloads[col] = 0

# Reorder columns to match train
df_test_workloads = df_test_workloads[df_train_workloads.columns]

In [None]:
# Shape of the test workloads after their columns were aligned to the training frame.
df_test_workloads.shape

In [None]:
# Preview the training workloads.
df_train_workloads.head()

In [None]:
# Preview the test workloads.
df_test_workloads.head()

In [None]:
# Column names of the test workloads.
df_test_workloads.columns

In [None]:
# Shape of the test workloads (same check as above).
df_test_workloads.shape

Align the two workload frames on their columns, filling any column that is missing from the test frame with 0.

In [None]:

def load_train_data():
    """Build the training matrices from the global `df_train_workloads`.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The values of every 'cluster_*' column (2-D) and the flattened
        'actual' column (1-D). The selected column names and both shapes are
        printed.
    """
    workloads = df_train_workloads

    feature_cols = list(filter(lambda name: name.startswith('cluster_'), workloads.columns))
    print(feature_cols)

    features = workloads[feature_cols]
    # Select as a one-column frame, then flatten to 1-D.
    target = workloads[['actual']].values.ravel()

    print('X.shape: ', features.shape)
    print('y.shape: ', target.shape)

    return features.values, target


def load_test_data():
    """Build the test inputs from the global `df_test_workloads`.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame, list)
        The 'cluster_*' feature columns, the ['db2', 'actual'] columns, and
        the global `test_query_ids_per_batch`. The selected column names and
        both shapes are printed.
    """
    workloads = df_test_workloads

    feature_cols = list(filter(lambda name: name.startswith('cluster_'), workloads.columns))
    print(feature_cols)

    features = workloads[feature_cols]
    targets = workloads[['db2', 'actual']]

    print('X.shape: ', features.shape)
    print('y.shape: ', targets.shape)

    return features, targets, test_query_ids_per_batch

def my_validation_curve(estimator_name, estimator, param_name, param_range):
    """Plot and print train/validation RMSE across values of one hyper-parameter.

    Runs sklearn's ``validation_curve`` with 10-fold CV on the global `X` and
    `y`, converts the negated MSE scores to RMSE, and averages over the folds.

    Parameters
    ----------
    estimator_name : str
        Name used in the plot title.
    estimator : estimator object
        Model passed to ``validation_curve``.
    param_name : str
        Hyper-parameter to vary.
    param_range : sequence
        Values to try for `param_name`.
    """
    neg_mse_train, neg_mse_valid = validation_curve(
        estimator, X, y,
        param_name=param_name,
        param_range=param_range,
        cv=10,
        scoring="neg_mean_squared_error",
    )

    # Scores are negated MSE: take the magnitude, then the root.
    train_scores = np.sqrt(np.abs(neg_mse_train))
    valid_scores = np.sqrt(np.abs(neg_mse_valid))

    print(len(train_scores))
    print(len(valid_scores))

    # Average over the folds (axis 1) for each parameter value.
    train_scores_mean = train_scores.mean(axis=1)
    valid_scores_mean = valid_scores.mean(axis=1)

    plt.title("Validation Curve with " + estimator_name)
    plt.xlabel(param_name)
    plt.ylabel("RMSE")
    plt.plot(param_range, train_scores_mean, label="train rmse")
    plt.plot(param_range, valid_scores_mean, label="validation rmse")

    plt.legend(loc='lower right')
    plt.show()

    df_scores = pd.DataFrame({
        'param': param_range,
        'train_rmse': [round(score, 2) for score in train_scores_mean],
        'valid_rmse': [round(score, 2) for score in valid_scores_mean],
    })
    print(df_scores)
    
from sklearn.metrics import mean_squared_error
import numpy as np

def cross_validate(model, n_folds=10):
    """Run a manual k-fold cross-validation of `model` on the training workloads.

    The data from load_train_data() is cut into `n_folds` contiguous
    validation blocks of equal size; rows beyond ``n_folds * block_size`` are
    always part of the training portion. `model` is refitted in place on every
    fold, so after the call it is fitted on the last fold's training portion
    only.

    Parameters
    ----------
    model : estimator object
        Must expose ``fit(X, y)`` and ``predict(X)``.
    n_folds : int, default 10
        Number of folds. Previously the fold count was read from the global
        `k`, which the notebook also uses for the number of KMeans clusters.

    Returns
    -------
    (float, float)
        ``sqrt(mean(train MSE))`` and ``sqrt(mean(validation MSE))`` over the folds.

    Raises
    ------
    ValueError
        If `n_folds` is smaller than 2 or larger than the number of samples.
    """
    X, y = load_train_data()
    train_data = X.copy()
    train_targets = y.copy()

    if n_folds < 2:
        raise ValueError(f"n_folds must be at least 2, got {n_folds}")

    num_val_samples = len(train_data) // n_folds
    if num_val_samples == 0:
        raise ValueError(
            f"Cannot build {n_folds} folds from {len(train_data)} samples"
        )

    all_train_scores = []
    all_scores = []

    for i in range(n_folds):
        print(f"Processing fold #{i}")
        val_start = i * num_val_samples
        val_end = val_start + num_val_samples

        val_data = train_data[val_start:val_end]
        val_targets = train_targets[val_start:val_end]

        # Everything outside the validation block is used for training.
        partial_train_data = np.concatenate(
            [train_data[:val_start], train_data[val_end:]], axis=0)
        partial_train_targets = np.concatenate(
            [train_targets[:val_start], train_targets[val_end:]], axis=0)

        model.fit(partial_train_data, partial_train_targets)

        train_mse = mean_squared_error(partial_train_targets, model.predict(partial_train_data))
        val_mse = mean_squared_error(val_targets, model.predict(val_data))

        all_train_scores.append(train_mse)
        all_scores.append(val_mse)

    # RMSE of the fold-averaged MSE.
    train_rmse = np.sqrt(np.mean(all_train_scores))
    val_rmse = np.sqrt(np.mean(all_scores))

    print('train rmse:', train_rmse)
    print('validation rmse:', val_rmse)

    return train_rmse, val_rmse

In [None]:
# Training feature matrix and target vector, kept as globals for the model cells.
X, y = load_train_data()

In [None]:
# Collects (train_rmse, validation_rmse) per model name.
rmse_scores = {}

# Ridge Cross-Validate

In [None]:
ridge = Ridge(
        fit_intercept=True,
        solver='lsqr',
        alpha=1.0,
        random_state=42)

rmse_scores['Ridge'] = cross_validate(ridge)

# cross_validate() refits the estimator on every fold, so at this point
# `ridge` is fitted on the last fold's training portion only. Refit it on the
# full training set, as the other model cells do: `ridge` is the estimator
# that produces the test-set predictions later in the notebook.
ridge.fit(X, y)

# Decision Tree Regression - Cross-Validate

In [None]:
# Decision-tree hyper-parameters, gathered in one place.
tree_params = dict(
    max_depth=5,
    min_samples_split=4,
    min_samples_leaf=0.23,
    random_state=33,
)
tree = DecisionTreeRegressor(**tree_params)

# Store (train_rmse, validation_rmse) from the manual cross-validation.
rmse_scores['Decision Tree'] = cross_validate(tree)

# Refit on the full training set.
tree.fit(X, y)

# Forest final model - using tuned HP from AutoAI

In [None]:
# Random-forest hyper-parameters, gathered in one place.
forest_params = dict(
    n_estimators=17,
    max_depth=5,
    max_features=0.6109469920813564,
    min_samples_split=5,
    min_samples_leaf=4,
    random_state=33,
)
forest = RandomForestRegressor(**forest_params)

# Store (train_rmse, validation_rmse) from the manual cross-validation.
rmse_scores['Random Forest'] = cross_validate(forest)

# Refit on the full training set.
forest.fit(X, y)

# XGB Regressor

In [None]:
from xgboost import XGBRegressor

# XGBoost regressor with a fixed hyper-parameter set. It is cross-validated
# first and then refitted on the full training set (X, y).
# NOTE(review): both `random_state` and `seed` are passed; confirm that the
# installed xgboost version accepts `seed` next to `random_state`.
xgb_regressor = XGBRegressor(
    # --- general ----------------------------------------------------------
    objective="reg:squarederror",        # default for regression
    base_score=0.5,
    booster="gbtree",
    random_state=33,                     # controls all randomness
    seed=33,                             # still accepted (alias for random_state)

    # --- tree construction ------------------------------------------------
    tree_method="hist",                  # faster than "exact" on most CPUs
    device="cpu",                        # set to "cuda" for GPU training
    n_jobs=1,                            # threads (was `nthread`)
    n_estimators=879,
    learning_rate=0.1814227666290778,
    max_depth=1,
    min_child_weight=2,
    max_delta_step=0,

    # --- column / row sampling -------------------------------------------
    subsample=0.04694370939809412,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=1,

    # --- regularisation ---------------------------------------------------
    gamma=0.0,                           # alias for `min_split_loss`
    reg_alpha=1.0,
    reg_lambda=0.40529327440922186,

    # --- misc -------------------------------------------------------------
    interaction_constraints="",
    monotone_constraints="()",
    num_parallel_tree=1,
    scale_pos_weight=1,
    verbosity=0,                         # replaces deprecated `silent`
    validate_parameters=True,            # still supported
)


# Store (train_rmse, validation_rmse) from the manual cross-validation.
rmse_scores['XGBoost'] = cross_validate(xgb_regressor)

# cross_validate() refits the model on every fold; refit on the full training set.
xgb_regressor.fit(X, y)

# Deep Neural Network (DNN)

In [None]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron with six hidden layers and an identity activation,
# trained with the L-BFGS solver.
dnn_model = MLPRegressor(
    hidden_layer_sizes=(48, 39, 27, 16, 7, 5),
    activation='identity',
    solver='lbfgs',
    alpha=0.001,
    learning_rate='constant',
    max_iter=500,
    random_state=6,
)

# Store (train_rmse, validation_rmse) from the manual cross-validation.
rmse_scores['DNN'] = cross_validate(dnn_model)

# Refit on the full training set.
dnn_model.fit(X, y)

In [None]:
import matplotlib.pyplot as plt

# Display labels shown on the x axis for each model key.
label_mapping = {
    'Ridge': 'Ridge (LearnedWMP)',
    'Decision Tree': 'Decision Tree (Baseline)',
    'Random Forest': 'Random Forest (Baseline)',
    'XGBoost': 'XGBoost (Baseline)',
    'DNN': 'DNN (LearnedWMP)'
}

# rmse_scores maps model key -> (train RMSE, validation RMSE).
score_keys = list(rmse_scores.keys())
train_rmses = [rmse_scores[key][0] for key in score_keys]
val_rmses = [rmse_scores[key][1] for key in score_keys]
model_names = [label_mapping.get(key, key) for key in score_keys]

# Best model = first entry with the lowest validation RMSE.
best_index = val_rmses.index(min(val_rmses))

# Bar geometry and colours.
x = range(len(model_names))
width = 0.35
train_color = '#a6cee3'  # light blue for Train RMSE
val_color = '#fdbf6f'    # light orange for Validation RMSE

fig, ax = plt.subplots(figsize=(10, 5))

# Train bars sit left of each tick, validation bars right of it.
train_bars = ax.bar([pos - width/2 for pos in x], train_rmses, width,
                    label='Train RMSE', color=train_color)
val_bars = ax.bar([pos + width/2 for pos in x], val_rmses, width,
                  label='Validation RMSE', color=val_color)

# Write each bar's value just above it.
for bar in [*train_bars, *val_bars]:
    height = bar.get_height()
    ax.annotate(f'{height:.2f}',
                xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3),
                textcoords="offset points",
                ha='center', va='bottom', fontsize=8)

# Anchor the "best model" note at the lower of the winner's two bars,
# offset 20 points upwards.
best_bar_height = min(train_rmses[best_index], val_rmses[best_index])
ax.annotate(
    '⬇ Best Model',
    xy=(best_index, best_bar_height),
    xytext=(0, 20),
    textcoords='offset points',
    ha='center',
    va='bottom',
    fontsize=10,
    color='#1f9e44',
    fontweight='bold'
)

# Titles, ticks and legend.
ax.set_ylabel('RMSE', fontsize=12)
ax.set_title('Train vs Validation RMSE by Model', fontsize=14, weight='bold', pad=20)
ax.set_xticks(list(x))
ax.set_xticklabels(model_names, rotation=15, ha='right')
ax.legend()

# No grid, no top/right frame lines.
ax.grid(False)
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

# Start at zero and leave 5 units of headroom above the tallest bar.
ax.set_ylim(bottom=0, top=max(train_rmses + val_rmses) + 5)

plt.tight_layout()
plt.show()

In [None]:
# Fit the DNN on the full training set.
# NOTE(review): the DNN cell above already ends with the same fit call, and
# the test predictions below come from `ridge`; confirm this refit is needed.
dnn_model.fit(X, y)

In [None]:
# Test features, the ['db2', 'actual'] targets, and the queryIds of each test workload.
X_test, Y_test, list_query_ids = load_test_data()

In [None]:
# (workloads, features) of the test inputs.
X_test.shape

In [None]:
# (workloads, 2): the 'db2' and 'actual' columns.
Y_test.shape

In [None]:
# Preview the test features.
X_test.head()

In [None]:
# Number of test workloads for which queryIds were recorded.
len(test_query_ids_per_batch)

In [None]:
# Add the Ridge predictions as a 'ridge' column on a fresh copy of Y_test.
Y_test = Y_test.assign(ridge=ridge.predict(X_test.values))

In [None]:
# Ensure 'actual' is the first column
cols = ['actual'] + [col for col in Y_test.columns if col != 'actual']
Y_test = Y_test[cols]

In [None]:
# Preview the test targets with 'actual' first and the Ridge predictions added.
Y_test.head()

In [None]:
# Column order after moving 'actual' to the front.
Y_test.columns

In [None]:
import numpy as np
import pandas as pd

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Computes ``mean(|(y_true - y_pred) / (y_true + 1e-10)|) * 100``; the small
    constant in the denominator avoids a division by exactly zero.

    Parameters
    ----------
    y_true, y_pred : array-like
        Reference and predicted values of the same shape.
    """
    epsilon = 1e-10
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / (actual + epsilon)
    return np.abs(relative_error).mean() * 100

# Y_test is expected to hold the 'actual', 'db2' and 'ridge' columns.

# MAPE of the db2 estimates
mape_db2 = mean_absolute_percentage_error(Y_test['actual'], Y_test['db2'])

# MAPE of the Ridge predictions
# NOTE(review): the variable is called `mape_dnn` but holds the Ridge MAPE.
mape_dnn = mean_absolute_percentage_error(Y_test['actual'], Y_test['ridge'])

print(f"MAPE (db2): {mape_db2:.3f}%")
print(f"MAPE (ridge): {mape_dnn:.3f}%")

# Comparing Db2 vs Ridge

In [None]:
import pandas as pd
import numpy as np

# Assume Y_test is your DataFrame

# Absolute percentage error of each method, per test workload.
actual_values = Y_test['actual']
for method in ('db2', 'ridge'):
    Y_test[f'{method}_mape'] = (
        np.abs((Y_test[method] - actual_values) / (actual_values + 1e-10)) * 100
    )

# Split the workloads by which method has the strictly smaller error;
# exact ties end up in neither subset.
db2_wins = Y_test['db2_mape'] < Y_test['ridge_mape']
ridge_wins = Y_test['ridge_mape'] < Y_test['db2_mape']
db2_better = Y_test[db2_wins]
ridge_better = Y_test[ridge_wins]

print(f"Number of examples where DB2 is better (by MAPE): {len(db2_better)}")
print(f"Number of examples where Ridge is better (by MAPE): {len(ridge_better)}")

# (Optional) View
# print(db2_better[['actual', 'db2', 'ridge', 'db2_mape', 'ridge_mape']])
# print(ridge_better[['actual', 'db2', 'ridge', 'db2_mape', 'ridge_mape']])

In [None]:
# Preview Y_test with the per-row 'db2_mape' and 'ridge_mape' columns.
Y_test.head()

In [None]:
# Summary statistics of every numeric column of Y_test.
Y_test.describe()

# count the number of rows where db2_mape is below 1%

In [None]:
# Number of test workloads whose DB2 error is under 1 %.
count_db2_mape_below_1 = Y_test['db2_mape'].lt(1).sum()
print(f"Number of rows where DB2 MAPE < 1%: {count_db2_mape_below_1}")

### Rows where DB2 MAPE < 1% (DB2 MAPE shown next to Ridge MAPE)


In [None]:
# Workloads whose DB2 error is under 1 %, in their original order.
db2_mape_below_1 = Y_test.loc[Y_test['db2_mape'] < 1]

# Only the two error columns are needed for the listing.
result = db2_mape_below_1.loc[:, ['db2_mape', 'ridge_mape']]

# One line per workload, keyed by its original row index.
print("Original Row Index | DB2 MAPE (%) | Ridge MAPE (%)")
for idx, row in zip(result.index, result.to_dict('records')):
    print(f"{idx:<18} {row['db2_mape']:.4f}%         {row['ridge_mape']:.4f}%")


### Rows where DB2 MAPE < 1% and Ridge MAPE ≥ 5%

In [None]:
# Workloads where DB2 is within 1 % while Ridge is off by 5 % or more.
db2_mape_below_1 = Y_test[Y_test['db2_mape'] < 1]
filtered = Y_test[(Y_test['db2_mape'] < 1) & (Y_test['ridge_mape'] >= 5)]

# The ten with the smallest DB2 error.
filtered_sorted = filtered.sort_values(by='db2_mape')
top_10_filtered = filtered_sorted.head(10)

result = top_10_filtered.loc[:, ['db2_mape', 'ridge_mape']]

# One line per workload, keyed by its original row index.
print("Original Row Index | DB2 MAPE (%) | Ridge MAPE (%)")
for idx, row in zip(result.index, result.to_dict('records')):
    print(f"{idx:<18} {row['db2_mape']:.4f}%         {row['ridge_mape']:.4f}%")

### Top 10 Rows Where Ridge MAPE < 1% and DB2 MAPE ≥ 5% (Sorted by Smallest Ridge MAPE)

In [None]:
# Workloads where Ridge is within 1 % while DB2 is off by 5 % or more.
ridge_mape_below_1 = Y_test[Y_test['ridge_mape'] < 1]
filtered = Y_test[(Y_test['ridge_mape'] < 1) & (Y_test['db2_mape'] >= 5)]

# The ten with the smallest Ridge error.
filtered_sorted = filtered.sort_values(by='ridge_mape')
top_10_filtered = filtered_sorted.head(10)

result = top_10_filtered.loc[:, ['db2_mape', 'ridge_mape']]

# One line per workload, keyed by its original row index.
print("Original Row Index | DB2 MAPE (%) | Ridge MAPE (%)")
for idx, row in zip(result.index, result.to_dict('records')):
    print(f"{idx:<18} {row['db2_mape']:.4f}%         {row['ridge_mape']:.4f}%")

# Find the 10 workloads where Db2 made the smallest error (i.e., best predictions)

In [None]:
# Sort the workloads where DB2 beat Ridge by ascending DB2 error.
db2_better_sorted = db2_better.sort_values(by='db2_mape')

# Pick the top 10 rows (not top 10%)
top_db2_rows = db2_better_sorted.head(10)

# Show their original DataFrame index
print("Top 10 DB2 predictions with smallest MAPE (original index):")
print(top_db2_rows.index.tolist())

# Print the lowest MAPE among these top 10.
# NOTE(review): the name `threshold_mape` suggests the largest MAPE in the
# top 10 (.max()) was intended; the code and the message report the minimum.
threshold_mape = top_db2_rows['db2_mape'].min()
print(f"Lowest MAPE in top 10 best DB2 predictions: {threshold_mape:.4f}%")

In [None]:
# Ten workloads with the smallest DB2 error among those where DB2 beat Ridge.
db2_better_sorted = db2_better.sort_values(by='db2_mape')
top_db2_rows = db2_better_sorted.head(10)
top_indices = top_db2_rows.index.tolist()

# Gather the queryIds behind those workloads; a workload index without a
# matching entry in test_query_ids_per_batch is reported and skipped.
collected_items = []
for idx in top_indices:
    if idx >= len(test_query_ids_per_batch):
        print(f"Warning: Index {idx} is out of range in test_query_ids_per_batch")
        continue
    item = test_query_ids_per_batch[idx]
    # Entries may be a list of ids or a single id.
    collected_items += item if isinstance(item, list) else [item]

final_sorted_list = sorted(collected_items)

print("\nFinal sorted list of query IDs:")
print(final_sorted_list)

# Find the 10 workloads where LearnedWMP (Ridge) made the smallest error (i.e., best predictions)

In [None]:
# Ten workloads with the smallest Ridge error among those where Ridge beat DB2.
ridge_better_sorted = ridge_better.sort_values(by='ridge_mape')
top_ridge_rows = ridge_better_sorted.head(10)
top_ridge_indices = top_ridge_rows.index.tolist()

print("Top 10 Ridge predictions with smallest MAPE (original indices):")
print(top_ridge_indices)

# Gather the queryIds behind those workloads; a workload index without a
# matching entry in test_query_ids_per_batch is reported and skipped.
collected_ridge_items = []
for idx in top_ridge_indices:
    if idx >= len(test_query_ids_per_batch):
        print(f"Warning: Index {idx} is out of range in test_query_ids_per_batch")
        continue
    item = test_query_ids_per_batch[idx]
    # Entries may be a list of ids or a single id.
    collected_ridge_items += item if isinstance(item, list) else [item]

final_sorted_ridge_list = sorted(collected_ridge_items)

print("\nFinal sorted list of query IDs for Ridge:")
print(final_sorted_ridge_list)

# Smallest Ridge MAPE within the top 10.
threshold_ridge_mape = top_ridge_rows['ridge_mape'].min()
print(f"\nLowest MAPE in top 10 best Ridge predictions: {threshold_ridge_mape:.4f}%")


### Fetch the original queries that are present in each of the above two lists

In [None]:
# Test queries that belong to the best-predicted DB2 workloads ...
in_db2_list = df_test['queryId'].isin(final_sorted_list)
df_test_db2_selected = df_test.loc[in_db2_list]

# ... and to the best-predicted Ridge workloads.
in_ridge_list = df_test['queryId'].isin(final_sorted_ridge_list)
df_test_ridge_selected = df_test.loc[in_ridge_list]

print(f"Number of rows selected for DB2 best predictions: {len(df_test_db2_selected)}")
print(f"Number of rows selected for Ridge best predictions: {len(df_test_ridge_selected)}")

# Optional: View a few rows
# print(df_test_db2_selected.head())
# print(df_test_ridge_selected.head())


In [None]:
# Preview the queries behind the best DB2 workloads.
df_test_db2_selected.head()

In [None]:
# Preview the queries behind the best Ridge workloads.
df_test_ridge_selected.head()

In [None]:
# Column dtypes of the Ridge selection.
df_test_ridge_selected.dtypes

### Identify Features with Noticeable Mean Differences Between the DB2 and Ridge Query Sets

In [None]:
import pandas as pd
import numpy as np

# Query-plan features compared between the two query sets.
features = [
    'TQ', 'TQ COUNT', 'TBSCAN', 'TBSCAN COUNT', 'SORT', 'SORT COUNT',
    'FILTER', 'FILTER COUNT', 'HS JOIN', 'HS JOIN COUNT',
    'TEMP', 'TEMP COUNT', 'GRPBY', 'GRPBY COUNT', 'UNIQUE', 'UNIQUE COUNT'
]

# Mean / median / std per feature for each query set.
stat_columns = ['mean', '50%', 'std']
summary_db2 = (
    df_test_db2_selected[features].describe().T[stat_columns]
    .rename(columns={'50%': 'median'})
)
summary_ridge = (
    df_test_ridge_selected[features].describe().T[stat_columns]
    .rename(columns={'50%': 'median'})
)

# Side-by-side table: *_db2 columns next to *_ridge columns.
comparison_summary = summary_db2.join(
    summary_ridge,
    lsuffix='_db2',
    rsuffix='_ridge'
)

# Keep the features whose group means differ by more than the threshold.
threshold = 0.05  # Tune as needed
group_means = [
    (name, df_test_db2_selected[name].mean(), df_test_ridge_selected[name].mean())
    for name in features
]
interesting_features = [
    (name, mean_db2, mean_ridge, abs(mean_db2 - mean_ridge))
    for name, mean_db2, mean_ridge in group_means
    if abs(mean_db2 - mean_ridge) > threshold
]

# Report the differences.
print("\nFeatures with noticeable mean differences (> {:.2f}):".format(threshold))
for feature, db2_mean, ridge_mean, diff in interesting_features:
    print(f"{feature}: DB2 mean = {db2_mean:.4f}, Ridge mean = {ridge_mean:.4f}, Difference = {diff:.4f}")

# One sentence per feature, phrased from the DB2 point of view.
print("\nGenerated Insight Sentences for DB2:\n")

for feature, db2_mean, ridge_mean, diff in interesting_features:
    if db2_mean < ridge_mean:
        print(f"In queries where DB2 predictions were more accurate, the feature '{feature}' had a lower average value ({db2_mean:.4f}) compared to Ridge ({ridge_mean:.4f}). This suggests DB2 performs better when '{feature}' is relatively lower.")
    else:
        print(f"In queries where DB2 predictions were more accurate, the feature '{feature}' had a higher average value ({db2_mean:.4f}) compared to Ridge ({ridge_mean:.4f}). This suggests DB2 performs better when '{feature}' is relatively higher.")

# The same comparison, phrased from the Ridge point of view.
print("\nGenerated Insight Sentences for Ridge:\n")

for feature, db2_mean, ridge_mean, diff in interesting_features:
    if ridge_mean < db2_mean:
        print(f"In queries where Ridge predictions were more accurate, the feature '{feature}' had a lower average value ({ridge_mean:.4f}) compared to DB2 ({db2_mean:.4f}). This suggests Ridge performs better when '{feature}' is relatively lower.")
    else:
        print(f"In queries where Ridge predictions were more accurate, the feature '{feature}' had a higher average value ({ridge_mean:.4f}) compared to DB2 ({db2_mean:.4f}). This suggests Ridge performs better when '{feature}' is relatively higher.")
