In [24]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

Load the layout data

In [2]:
class LayoutDataProcessor:
    def __init__(self, directory, split):
        self.directory = os.path.join(directory, split)
        self.data = []
        self.feature_occurrences = {}
        self.initialized = False

    def load_data(self):
        for filename in tqdm(os.listdir(self.directory)):
            filepath = os.path.join(self.directory, filename)
            self.process_file(filepath, filename)
        self.calculate_occurrence_rates()

    def process_file(self, filepath, filename):
        data = np.load(filepath)
        node_config_ids = data['node_config_ids']
        node_config_feat = data['node_config_feat']
        config_runtime = data['config_runtime']
        node_feat = data['node_feat']
        node_opcode = data['node_opcode']
        node_feat_avg = np.mean(node_feat, axis=0)  # Calculate average node features

        # Initialize feature occurrence tracking if not done yet
        if not self.initialized:
            self.initialize_feature_occurrences(node_config_feat.shape[2])
            self.initialized = True

        # Process each configuration
        for i in range(len(config_runtime)):
            # Configuration feature array for the current configuration
            current_config_features = node_config_feat[i, :, :]

            # Append features to the data dictionary
            row = {
                'config_id': f"{filename}",
                'runtime': config_runtime[i],
                'node_feat_avg': node_feat_avg.tolist(),  # Add the average node features
            }

            # Add node_config_feat features and update feature occurrences
            self.add_config_features(row, current_config_features)

            self.data.append(row)

    def initialize_feature_occurrences(self, num_features):
        for i in range(num_features):
            self.feature_occurrences[f"feature_{i}"] = {}

    def add_config_features(self, row, config_features):
        for feature_index in range(config_features.shape[1]):
            feature_name = f"feature_{feature_index}"
            feature_value = config_features[0, feature_index]
            row[feature_name] = feature_value

            # Update occurrence counts for each feature
            self.feature_occurrences[feature_name].setdefault(feature_value, 0)
            self.feature_occurrences[feature_name][feature_value] += 1


    def calculate_occurrence_rates(self):
        # Calculate occurrence rates for each feature
        for row in self.data:
            for feature_name, occurrences in self.feature_occurrences.items():
                feature_value = row.get(feature_name)
                if feature_value is not None:  # Ensure the feature was recorded for this config
                    total_occurrences = sum(occurrences.values())
                    row[feature_name + '_rate'] = occurrences[feature_value] / total_occurrences

    def get_dataframe(self):
        df = pd.DataFrame(self.data)
        # Unpack 'node_feat_avg' into separate columns
        node_feat_avg_df = df.pop('node_feat_avg').apply(pd.Series)
        node_feat_avg_df.columns = [f'node_feat_avg_{i}' for i in range(node_feat_avg_df.shape[1])]
        # Concatenate with the original DataFrame
        df = pd.concat([df, node_feat_avg_df], axis=1)
        return df

Working with validation set of xla:default collection

In [3]:
processor = LayoutDataProcessor('/kaggle/input/predict-ai-model-runtime/npz_all/npz/layout/xla/default', 'valid')
processor.load_data()
df_valid = processor.get_dataframe()

100%|██████████| 7/7 [00:10<00:00,  1.55s/it]


In [4]:
df_valid.columns

Index(['config_id', 'runtime', 'feature_0', 'feature_1', 'feature_2',
       'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7',
       ...
       'node_feat_avg_130', 'node_feat_avg_131', 'node_feat_avg_132',
       'node_feat_avg_133', 'node_feat_avg_134', 'node_feat_avg_135',
       'node_feat_avg_136', 'node_feat_avg_137', 'node_feat_avg_138',
       'node_feat_avg_139'],
      dtype='object', length=178)

minmax scale target by config_id

In [5]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Iterate over each config_id and scale the target column within each group
for config_id in tqdm(df_valid['config_id'].unique()):
    # Selecting the rows corresponding to the current config_id
    idx = df_valid['config_id'] == config_id
    # Scaling the target column for the current group
    df_valid.loc[idx, 'runtime'] = scaler.fit_transform(df_valid.loc[idx, ['runtime']])

100%|██████████| 7/7 [00:00<00:00, 50.45it/s]


In [6]:
df_valid.to_csv('processed_layout_data.csv', index=False)

Load data from here

In [8]:
df_valid = pd.read_csv('processed_layout_data.csv')

In [9]:
df_valid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66021 entries, 0 to 66020
Columns: 178 entries, config_id to node_feat_avg_139
dtypes: float64(177), object(1)
memory usage: 89.7+ MB


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

train test split, ensure config_ids are not split across train and test splits

In [11]:
unique_config_ids = df_valid['config_id'].unique()
train_config_ids, test_config_ids = train_test_split(unique_config_ids, test_size=0.2, random_state=42)

# Creating train and test dataframes based on config_id
train_df = df_valid[df_valid['config_id'].isin(train_config_ids)]
test_df = df_valid[df_valid['config_id'].isin(test_config_ids)]

# Separating features and target variable
X_train = train_df.drop(['config_id', 'runtime',], axis=1)
y_train = train_df['runtime']
X_test = test_df.drop(['config_id', 'runtime',], axis=1)
y_test = test_df['runtime']

In [12]:
!pip install LightGBM
from lightgbm import LGBMRegressor



In [13]:
df_valid['runtime'].describe()

count    66021.000000
mean         0.072605
std          0.192265
min          0.000000
25%          0.001642
50%          0.003572
75%          0.012358
max          1.000000
Name: runtime, dtype: float64

best params from hyperparameter tuning

In [15]:
lasso_alpha=100
ridge_alpha=100
lgbm_params = {'colsample_bytree': 0.9101057142919446,
 'learning_rate': 0.1,
 'min_child_samples': 340,
 'min_child_weight': 0.07452212998940164,
 'n_estimators': 500,
 'num_leaves': 15,
 'reg_alpha': 0.1,
 'reg_lambda': 10,
 'subsample': 0.861899732308057}


lr = LinearRegression()
lasso = Lasso(alpha = lasso_alpha)
ridge = Ridge(alpha = ridge_alpha)
lgbm = LGBMRegressor(**lgbm_params)

models = [lr, lasso, ridge, lgbm]
for model in tqdm(models):
    model.fit(X_train, y_train)

100%|██████████| 4/4 [00:05<00:00,  1.44s/it]


In [18]:
# Making predictions
train_predictions_lin = lin_reg.predict(X_train)
train_predictions_lasso = lasso_reg.predict(X_train)
train_predictions_ridge = ridge_reg.predict(X_train)
train_predictions_lgbm = lgbm.predict(X_train)

predictions_lin = lin_reg.predict(X_test)
predictions_lasso = lasso_reg.predict(X_test)
predictions_ridge = ridge_reg.predict(X_test)
predictions_lgbm = lgbm.predict(X_test)

# Calculating R² scores
r2_lin = r2_score(y_test, predictions_lin)
r2_lasso = r2_score(y_test, predictions_lasso)
r2_ridge = r2_score(y_test, predictions_ridge)
r2_lgbm = r2_score(y_test, predictions_lgbm)

r2_train_lin = r2_score(y_train, train_predictions_lin)
r2_train_lasso = r2_score(y_train, train_predictions_lasso)
r2_train_ridge = r2_score(y_train, train_predictions_ridge)
r2_train_lgbm = r2_score(y_train, train_predictions_lgbm)

print("R² Scores Train:")
print(f"Linear Regression: {r2_train_lin}")
print(f"Lasso Regression: {r2_train_lasso}")
print(f"Ridge Regression: {r2_train_ridge}")
print(f"LGBM Regression: {r2_train_lgbm}")

print("R² Scores Test:")
print(f"Linear Regression: {r2_lin}")
print(f"Lasso Regression: {r2_lasso}")
print(f"Ridge Regression: {r2_ridge}")
print(f"LGBM Regression: {r2_lgbm}")

R² Scores Train:
Linear Regression: 0.05764880730913158
Lasso Regression: 0.004776706348410431
Ridge Regression: 0.05844506844669206
LGBM Regression: 0.05266208305872844
R² Scores Test:
Linear Regression: -3885482292290225.0
Lasso Regression: -0.04879012493730284
Ridge Regression: -6.227965279646386
LGBM Regression: 0.10789453730516885


calculate rankings and kendall tau correlation

In [22]:
def rank_configurations(predictions, full_df):
    ranked_configurations = []

    # Create a mapping of DataFrame indices to the range of indices in predictions
    index_mapping = {idx: i for i, idx in enumerate(full_df.index)}

    # Group data by 'config_id' and process each group
    for config_id, group in full_df.groupby('config_id'):
        # Get the corresponding prediction indices for the current group
        prediction_indices = [index_mapping[idx] for idx in group.index]

        # Rank configurations by predicted runtime
        ranked_indices = group.index[np.argsort(predictions[prediction_indices])]

        # Store the original indices of the ranked configurations
        ranked_configurations.append(list(ranked_indices))

    return ranked_configurations

In [20]:
from scipy.stats import kendalltau

def calculate_kendall_tau(predicted_rankings, true_rankings):
    kendall_tau_scores = []

    for predicted, true in zip(predicted_rankings, true_rankings):
        tau, _ = kendalltau(predicted, true)
        kendall_tau_scores.append(tau)

    # Calculate the average Kendall tau correlation
    average_kendall_tau = sum(kendall_tau_scores) / len(kendall_tau_scores)
    return average_kendall_tau



In [25]:
epsilon = 1e-7  # Small value to add to zero targets

train_df['runtime'] = train_df['runtime'].apply(lambda x: x if x != 0 else x + epsilon)
test_df['runtime'] = test_df['runtime'].apply(lambda x: x if x != 0 else x + epsilon)

In [29]:
# Use the existing predictions to rank configurations
ranked_train_lr = rank_configurations(train_predictions_lin, train_df)
ranked_test_lr = rank_configurations(predictions_lin, test_df)

ranked_train_lasso = rank_configurations(train_predictions_lasso, train_df)
ranked_test_lasso = rank_configurations(predictions_lasso, test_df)

ranked_train_ridge = rank_configurations(train_predictions_ridge, train_df)
ranked_test_ridge = rank_configurations(predictions_ridge, test_df)

ranked_train_lgbm = rank_configurations(train_predictions_lgbm, train_df)
ranked_test_lgbm = rank_configurations(predictions_lgbm, test_df)

true_ranked_train = rank_configurations(y_train.to_numpy(), train_df)
true_ranked_test = rank_configurations(y_test.to_numpy(), test_df)

In [30]:
kendall_tau_lr_train = calculate_kendall_tau(ranked_train_lr, true_ranked_train)
kendall_tau_lr_test = calculate_kendall_tau(ranked_test_lr, true_ranked_test)

kendall_tau_lasso_train = calculate_kendall_tau(ranked_train_lasso, true_ranked_train)
kendall_tau_lasso_test = calculate_kendall_tau(ranked_test_lasso, true_ranked_test)

kendall_tau_ridge_train = calculate_kendall_tau(ranked_train_ridge, true_ranked_train)
kendall_tau_ridge_test = calculate_kendall_tau(ranked_test_ridge, true_ranked_test)

kendall_tau_lgbm_train = calculate_kendall_tau(ranked_train_lgbm, true_ranked_train)
kendall_tau_lgbm_test = calculate_kendall_tau(ranked_test_lgbm, true_ranked_test)

# Print Kendall tau correlations
print("Kendall tau (LR, Train):", kendall_tau_lr_train)
print("Kendall tau (LR, Test):", kendall_tau_lr_test)

print("Kendall tau (Lasso, Train):", kendall_tau_lasso_train)
print("Kendall tau (Lasso, Test):", kendall_tau_lasso_test)

print("Kendall tau (Ridge, Train):", kendall_tau_ridge_train)
print("Kendall tau (Ridge, Test):", kendall_tau_ridge_test)

print("Kendall tau (LGBM, Train):", kendall_tau_lgbm_train)
print("Kendall tau (LGBM, Test):", kendall_tau_lgbm_test)

Kendall tau (LR, Train): 0.006997150492382141
Kendall tau (LR, Test): 0.0020229098793555344
Kendall tau (Lasso, Train): -0.005999331660060098
Kendall tau (Lasso, Test): 0.005620911624383509
Kendall tau (Ridge, Train): 0.006997150492382141
Kendall tau (Ridge, Test): -0.0016531340360336991
Kendall tau (LGBM, Train): -0.004237329881009363
Kendall tau (LGBM, Test): 0.004196686767445572


Code below is old, used for hyperparameter search.

In [None]:
#X_train = X_train.drop(columns=['node_opcode'])
#X_test = X_test.drop('node_opcode', axis=1)
# Training models
lin_reg = LinearRegression().fit(X_train, y_train)

alpha_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Setting up GridSearchCV for Lasso Regression
lasso = Lasso()
grid_search_lasso = GridSearchCV(estimator=lasso, param_grid=alpha_grid, cv=3, scoring='neg_mean_squared_error',verbose=4)
grid_search_lasso.fit(X_train, y_train)
lasso_reg = grid_search_lasso.best_estimator_
print("Lasso Alpha")
print(grid_search_lasso.best_params_['alpha'])

ridge = Ridge()
grid_search_ridge = GridSearchCV(estimator=ridge, param_grid=alpha_grid, cv=3, scoring='neg_mean_squared_error',verbose=4)
grid_search_ridge.fit(X_train, y_train)
ridge_reg = grid_search_ridge.best_estimator_
print("Ridge Alpha")
print(grid_search_ridge.best_params_['alpha'])

# Making predictions
train_predictions_lin = lin_reg.predict(X_train)
train_predictions_lasso = lasso_reg.predict(X_train)
train_predictions_ridge = ridge_reg.predict(X_train)

predictions_lin = lin_reg.predict(X_test)
predictions_lasso = lasso_reg.predict(X_test)
predictions_ridge = ridge_reg.predict(X_test)

# Calculating R² scores
r2_lin = r2_score(y_test, predictions_lin)
r2_lasso = r2_score(y_test, predictions_lasso)
r2_ridge = r2_score(y_test, predictions_ridge)

r2_train_lin = r2_score(y_train, train_predictions_lin)
r2_train_lasso = r2_score(y_train, train_predictions_lasso)
r2_train_ridge = r2_score(y_train, train_predictions_ridge)

print("R² Scores Train:")
print(f"Linear Regression: {r2_train_lin}")
print(f"Lasso Regression: {r2_train_lasso}")
print(f"Ridge Regression: {r2_train_ridge}")

print("R² Scores Test:")
print(f"Linear Regression: {r2_lin}")
print(f"Lasso Regression: {r2_lasso}")
print(f"Ridge Regression: {r2_ridge}")

In [15]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

In [16]:
param_dist = {
    'num_leaves': sp_randint(3, 50), 
    'min_child_samples': sp_randint(5, 500), 
    'min_child_weight': sp_uniform(0.01, 0.1),
    'subsample': sp_uniform(0.8, 0.2),
    'colsample_bytree': sp_uniform(0.8, 0.2),
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50],
    'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 250, 500, 1000, 1500]
}

In [17]:
lgbm = LGBMRegressor()

In [18]:
random_search = RandomizedSearchCV(lgbm, param_distributions=param_dist, n_iter=25, cv=4, scoring='neg_mean_squared_error', verbose=4)
random_search.fit(X_train, y_train)

Fitting 4 folds for each of 25 candidates, totalling 100 fits
[CV 1/4] END colsample_bytree=0.9752494804449268, learning_rate=0.2, min_child_samples=194, min_child_weight=0.021260066697622752, n_estimators=1000, num_leaves=45, reg_alpha=10, reg_lambda=0, subsample=0.8377882425034783;, score=-0.019 total time=   6.2s
[CV 2/4] END colsample_bytree=0.9752494804449268, learning_rate=0.2, min_child_samples=194, min_child_weight=0.021260066697622752, n_estimators=1000, num_leaves=45, reg_alpha=10, reg_lambda=0, subsample=0.8377882425034783;, score=-0.030 total time=   5.9s
[CV 3/4] END colsample_bytree=0.9752494804449268, learning_rate=0.2, min_child_samples=194, min_child_weight=0.021260066697622752, n_estimators=1000, num_leaves=45, reg_alpha=10, reg_lambda=0, subsample=0.8377882425034783;, score=-0.038 total time=   4.3s
[CV 4/4] END colsample_bytree=0.9752494804449268, learning_rate=0.2, min_child_samples=194, min_child_weight=0.021260066697622752, n_estimators=1000, num_leaves=45, reg_a

In [19]:
best_lgbm = random_search.best_estimator_

In [20]:
random_search.best_params_

{'colsample_bytree': 0.9101057142919446,
 'learning_rate': 0.1,
 'min_child_samples': 340,
 'min_child_weight': 0.07452212998940164,
 'n_estimators': 500,
 'num_leaves': 15,
 'reg_alpha': 0.1,
 'reg_lambda': 10,
 'subsample': 0.861899732308057}

In [21]:
train_preds_lgb = best_lgbm.predict(X_train)

preds_lgb = best_lgbm.predict(X_test)

# Calculating R² scores
r2_lgb = r2_score(y_test, preds_lgb)

r2_train_lgb=r2_score(y_train, train_preds_lgb)

print("R² Scores Train:")
print(f"LGBM: {r2_train_lgb}")

print("R² Scores Test:")
print(f"LGBM: {r2_lgb}")

R² Scores Train:
LGBM: 0.05266208305872844
R² Scores Test:
LGBM: 0.10789453730516918


In [22]:
def rank_configurations(predictions, full_df):
    ranked_configurations = []

    # Create a mapping of DataFrame indices to the range of indices in predictions
    index_mapping = {idx: i for i, idx in enumerate(full_df.index)}

    # Group data by 'config_id' and process each group
    for config_id, group in full_df.groupby('config_id'):
        # Get the corresponding prediction indices for the current group
        prediction_indices = [index_mapping[idx] for idx in group.index]

        # Rank configurations by predicted runtime
        ranked_indices = group.index[np.argsort(predictions[prediction_indices])]

        # Store the original indices of the ranked configurations
        ranked_configurations.append(list(ranked_indices))

    return ranked_configurations
def calculate_top_k_slowdown(predicted_rankings, full_df, runtime_column='runtime', k=5):
    total_slowdown = 0

    for predicted in tqdm(predicted_rankings):
        # Extract the top-k predicted configurations
        top_k_predicted = predicted[:k]

        # Best runtime among top-k predicted configurations
        best_runtime_top_k = full_df.loc[top_k_predicted, runtime_column].max()

        # Best runtime among all configurations in the model group
        config_id = full_df.loc[top_k_predicted[0], 'config_id']
        best_runtime_all = full_df[full_df['config_id'] == config_id][runtime_column].max()

        # Calculate the speedup instead of slowdown
        speedup = best_runtime_all / best_runtime_top_k

        # Speedup should always be >= 1; if it's not, cap it at 1 to avoid negative slowdown
        speedup = max(speedup, 1)

        # Calculate the slowdown as the inverse of speedup, minus 1 to get the additional time taken
        slowdown = (1 / speedup) - 1
        total_slowdown += slowdown

    # Average slowdown across all models
    average_slowdown = total_slowdown / len(predicted_rankings)
    return average_slowdown

In [23]:
epsilon = 1e-7  # Small value to add to zero runtimes

# Adjusting the 'runtime' column in train_df and test_df
train_df['runtime'] = train_df['runtime'].apply(lambda x: x if x != 0 else x + epsilon)
test_df['runtime'] = test_df['runtime'].apply(lambda x: x if x != 0 else x + epsilon)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['runtime'] = train_df['runtime'].apply(lambda x: x if x != 0 else x + epsilon)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['runtime'] = test_df['runtime'].apply(lambda x: x if x != 0 else x + epsilon)


In [24]:
from scipy.stats import kendalltau

def calculate_kendall_tau(predicted_rankings, true_rankings):
    kendall_tau_scores = []

    for predicted, true in zip(predicted_rankings, true_rankings):
        tau, _ = kendalltau(predicted, true)
        kendall_tau_scores.append(tau)

    # Calculate the average Kendall tau correlation
    average_kendall_tau = sum(kendall_tau_scores) / len(kendall_tau_scores)
    return average_kendall_tau



In [25]:
# Use the existing predictions to rank configurations
ranked_train_lr = rank_configurations(train_predictions_lin, train_df)
ranked_test_lr = rank_configurations(predictions_lin, test_df)

ranked_train_lasso = rank_configurations(train_predictions_lasso, train_df)
ranked_test_lasso = rank_configurations(predictions_lasso, test_df)

ranked_train_ridge = rank_configurations(train_predictions_ridge, train_df)
ranked_test_ridge = rank_configurations(predictions_ridge, test_df)

ranked_train_lgbm = rank_configurations(train_preds_lgb, train_df)
ranked_test_lgbm = rank_configurations(preds_lgb, test_df)

true_ranked_train = rank_configurations(y_train.to_numpy(), train_df)
true_ranked_test = rank_configurations(y_test.to_numpy(), test_df)

# Calculate and print the average top-k slowdown for the train predictions
average_slowdown_lr_train = calculate_top_k_slowdown(ranked_train_lr, train_df)
average_slowdown_lasso_train = calculate_top_k_slowdown(ranked_train_lasso, train_df)
average_slowdown_ridge_train = calculate_top_k_slowdown(ranked_train_ridge, train_df)
average_slowdown_lgbm_train = calculate_top_k_slowdown(ranked_train_lgbm, train_df)

print("Average Top-k Slowdown (LR, Train):", average_slowdown_lr_train)
print("Average Top-k Slowdown (Lasso, Train):", average_slowdown_lasso_train)
print("Average Top-k Slowdown (Ridge, Train):", average_slowdown_ridge_train)
print("Average Top-k Slowdown (LGBM, Train):", average_slowdown_lgbm_train)

# Calculate and print the average top-k slowdown for the test predictions
average_slowdown_lr_test = calculate_top_k_slowdown(ranked_test_lr, test_df)
average_slowdown_lasso_test = calculate_top_k_slowdown(ranked_test_lasso, test_df)
average_slowdown_ridge_test = calculate_top_k_slowdown(ranked_test_ridge, test_df)
average_slowdown_lgbm_test = calculate_top_k_slowdown(ranked_test_lgbm, test_df)

print("Average Top-k Slowdown (LR, Test):", average_slowdown_lr_test)
print("Average Top-k Slowdown (Lasso, Test):", average_slowdown_lasso_test)
print("Average Top-k Slowdown (Ridge, Test):", average_slowdown_ridge_test)
print("Average Top-k Slowdown (LGBM, Test):", average_slowdown_lgbm_test)

100%|██████████| 5/5 [00:00<00:00, 62.18it/s]
100%|██████████| 5/5 [00:00<00:00, 64.96it/s]
100%|██████████| 5/5 [00:00<00:00, 66.71it/s]
100%|██████████| 5/5 [00:00<00:00, 65.76it/s]


Average Top-k Slowdown (LR, Train): -0.6946932266504761
Average Top-k Slowdown (Lasso, Train): -0.3532765048264646
Average Top-k Slowdown (Ridge, Train): -0.6946932266504761
Average Top-k Slowdown (LGBM, Train): -0.8358003766532823


100%|██████████| 2/2 [00:00<00:00, 186.59it/s]
100%|██████████| 2/2 [00:00<00:00, 191.72it/s]
100%|██████████| 2/2 [00:00<00:00, 219.74it/s]
100%|██████████| 2/2 [00:00<00:00, 232.15it/s]

Average Top-k Slowdown (LR, Test): -0.7586100348166744
Average Top-k Slowdown (Lasso, Test): -0.49931061635827995
Average Top-k Slowdown (Ridge, Test): -0.7586071071254612
Average Top-k Slowdown (LGBM, Test): -0.9962259675304663





In [26]:
# Example usage:
kendall_tau_lr_train = calculate_kendall_tau(ranked_train_lr, true_ranked_train)
kendall_tau_lr_test = calculate_kendall_tau(ranked_test_lr, true_ranked_test)

kendall_tau_lasso_train = calculate_kendall_tau(ranked_train_lasso, true_ranked_train)
kendall_tau_lasso_test = calculate_kendall_tau(ranked_test_lasso, true_ranked_test)

kendall_tau_ridge_train = calculate_kendall_tau(ranked_train_ridge, true_ranked_train)
kendall_tau_ridge_test = calculate_kendall_tau(ranked_test_ridge, true_ranked_test)

kendall_tau_lgbm_train = calculate_kendall_tau(ranked_train_lgbm, true_ranked_train)
kendall_tau_lgbm_test = calculate_kendall_tau(ranked_test_lgbm, true_ranked_test)

# Print Kendall tau correlations
print("Kendall tau (LR, Train):", kendall_tau_lr_train)
print("Kendall tau (LR, Test):", kendall_tau_lr_test)

print("Kendall tau (Lasso, Train):", kendall_tau_lasso_train)
print("Kendall tau (Lasso, Test):", kendall_tau_lasso_test)

print("Kendall tau (Ridge, Train):", kendall_tau_ridge_train)
print("Kendall tau (Ridge, Test):", kendall_tau_ridge_test)

print("Kendall tau (LGBM, Train):", kendall_tau_lgbm_train)
print("Kendall tau (LGBM, Test):", kendall_tau_lgbm_test)

Kendall tau (LR, Train): 0.006997150492382141
Kendall tau (LR, Test): -0.0008056718828987619
Kendall tau (Lasso, Train): -0.005999331660060098
Kendall tau (Lasso, Test): 0.005620911624383509
Kendall tau (Ridge, Train): 0.006997150492382141
Kendall tau (Ridge, Test): 0.0012241014961667743
Kendall tau (LGBM, Train): -0.004237329881009363
Kendall tau (LGBM, Test): 0.004196686767445572
