In [58]:
import matplotlib.pyplot as pyplot
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

def preprocess_df(df):
    """Collapse one-hot operation/device indicator columns into label columns.

    Every group of indicator columns sharing a prefix is decoded into a single
    label column holding the column-name suffix of the indicator equal to 1:

        opName_*      -> currOp
        p_opName_*    -> prevOp
        mapDevice_*   -> currDev
        p_mapDevice_* -> prevDev

    Rows where no indicator of a group equals 1 get ``None``. If several
    indicators of a group equal 1 on the same row, the one appearing last in
    ``df.columns`` wins.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``inputRow``, ``requestID`` and ``execTime`` plus the
        indicator columns described above. It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``inputRow``, ``currOp``, ``prevOp``,
        ``currDev``, ``prevDev``, ``operatorLoc`` (the input's ``requestID``)
        and ``execTime``, in that order.

    Raises
    ------
    KeyError
        If ``inputRow``, ``requestID`` or ``execTime`` is missing from ``df``.
    """
    # (output label column, prefix of the one-hot columns it is decoded from)
    one_hot_groups = (
        ('currOp', 'opName_'),
        ('prevOp', 'p_opName_'),
        ('currDev', 'mapDevice_'),
        ('prevDev', 'p_mapDevice_'),
    )

    # Build the result in a fresh frame so the caller's DataFrame is left
    # untouched (no helper columns leak into it).
    final_df = df[['inputRow']].copy()

    for label_col, prefix in one_hot_groups:
        final_df[label_col] = None
        for col in df.columns:
            if not col.startswith(prefix):
                continue
            # Slice the prefix off instead of str.replace(): replace() would
            # also strip later occurrences of the prefix text inside the name.
            final_df.loc[df[col] == 1, label_col] = col[len(prefix):]

    # 'requestID' is exposed downstream under the name 'operatorLoc'.
    final_df['operatorLoc'] = df['requestID']
    final_df['execTime'] = df['execTime']

    return final_df


In [None]:
# Load the raw one-hot encoded data, decode it, and fit the regression pipeline.
# A single seed is shared by the split and the regressor so that
# Restart & Run All reproduces the same model and figures.
RANDOM_STATE = 42

raw_df = pd.read_csv("./X_train_q1.csv")
df = preprocess_df(raw_df)

# Hold out 10% of the rows for evaluation. `df` is rebound to the training
# split (later cells read `df.columns`); `t_df` is the held-out split.
df, t_df = train_test_split(df, test_size=0.1, random_state=RANDOM_STATE)
y_train = df['execTime']
x_train = df.drop(['execTime'], axis=1)
# The test features/target must come from the held-out split `t_df`, not from
# the training split, otherwise every downstream "test" metric is measured on
# data the model was fitted on.
y_test = t_df['execTime']
x_test = t_df.drop(['execTime'], axis=1)

categorical_columns = ['currOp', 'prevOp', 'currDev', 'prevDev', 'operatorLoc']
numerical_columns = ['inputRow']

# Categories unseen during fit and missing values are both encoded as -1.
categorical_encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value", unknown_value=-1, encoded_missing_value=-1
)
numerical_pipe = SimpleImputer(strategy="mean")

# Transformed feature order is: categorical columns first, then numerical.
preprocessing = ColumnTransformer(
    [
        ("cat", categorical_encoder, categorical_columns),
        ("num", numerical_pipe, numerical_columns),
    ],
    verbose_feature_names_out=False,
)

model = Pipeline(
    [
        ("preprocess", preprocessing),
        (
            "regressor",
            GradientBoostingRegressor(n_estimators=300, random_state=RANDOM_STATE),
        ),
    ]
)

model.fit(x_train, y_train)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Plot the impurity-based (MDI) feature importances of the fitted regressor.
feature_names = model[:-1].get_feature_names_out()
# The first transformed feature and its importance are dropped together so
# names and values stay paired.
# NOTE(review): with the ColumnTransformer order used at fit time this is
# presumably 'currOp' -- confirm that excluding it from the figure is intended.
feature_names = np.delete(feature_names, 0)
values = np.delete(model[-1].feature_importances_, 0)
mdi_importances = pd.Series(
    values, index=feature_names
).sort_values(ascending=True)

y_pos = np.arange(len(mdi_importances))
colors = sns.color_palette('gnuplot2', len(mdi_importances))

ax = mdi_importances.plot.barh(color=colors)
ax.set_title("Feature Importances (MDI)", fontweight='bold', fontsize=14)
# Tick labels must follow the SORTED series index: the bars are drawn in
# sorted order, so labelling with the unsorted `feature_names` would attach
# the wrong feature name to each bar.
ax.set_yticks(y_pos, labels=mdi_importances.index, fontweight='bold', fontsize=14)
ax.set_xlabel("MDI scores", fontweight='bold', fontsize=14)
ax.figure.tight_layout()
plt.savefig("./img/q1_MDI.pdf", format='pdf', bbox_inches="tight")


In [None]:
from sklearn.inspection import permutation_importance

# Permutation importances of the fitted pipeline, evaluated on (x_test, y_test).
result = permutation_importance(
    model, x_test, y_test, n_repeats=10, random_state=42, n_jobs=2
)

# Rows of `result.importances` follow the column order of the frame that was
# permuted, so feature labels are taken from `x_test` itself. Using
# `df.columns` only matched by accident: `df` also carries the target column
# 'execTime', which happened to be last.
feature_columns = x_test.columns

sorted_importances_idx = result.importances_mean.argsort()
# argsort is ascending, so this removes the feature with the largest mean
# importance from the figure.
# NOTE(review): presumably done so the remaining features are readable on a
# shared axis -- confirm this exclusion is intended.
sorted_importances_idx = np.delete(sorted_importances_idx, -1)
importances = pd.DataFrame(
    result.importances[sorted_importances_idx].T,
    columns=feature_columns[sorted_importances_idx],
)

# Box-plot positions start at 1, hence the +1 offset for the tick locations.
y_pos = np.arange(len(sorted_importances_idx)) + 1
ax = importances.plot.box(vert=False, whis=10)
ax.set_title("Permutation Importances (PI)", fontweight='bold', fontsize=14)
ax.axvline(x=0, color="k", linestyle="--")
# No `scoring` is passed, so the estimator's default score is used; for a
# regressor that is R^2, not accuracy.
ax.set_xlabel("Decrease in $R^2$ score", fontweight='bold', fontsize=14)
ax.set_yticks(y_pos, labels=importances.columns, fontweight='bold', fontsize=14)
ax.grid()
ax.figure.tight_layout()
plt.savefig("./img/q1_PI.pdf", format='pdf', bbox_inches="tight")