In [3]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [22]:
class LayoutDataProcessor:
    def __init__(self, directory, split):
        self.directory = os.path.join(directory, split)
        self.data = []
        self.feature_occurrences = {}

    def load_data(self):
        for filename in tqdm(os.listdir(self.directory)):
            filepath = os.path.join(self.directory, filename)
            self.process_file(filepath, filename)
        self.calculate_occurrence_rates()

    def process_file(self, filepath, filename):
        data = np.load(filepath)
        node_config_ids = data['node_config_ids']
        node_config_feat = data['node_config_feat']
        config_runtime = data['config_runtime']
        node_feat = data['node_feat']
        node_feat_avg = np.mean(node_feat, axis=0)
        opcodes = data['node_opcode']
        dist, _ = np.histogram(data['node_opcode'], bins=np.arange(0, 120, 4))
        dist = np.argmax(dist)

        # Initialize feature occurrence tracking if not done yet
        if not self.feature_occurrences:
            for i in range(node_config_feat.shape[2]):  # For each feature in node_config_feat
                self.feature_occurrences[f"feature_{i}"] = {}

        # Process each configuration
        for i in range(len(config_runtime)):
            # Configuration feature array for the current configuration
            current_config_features = node_config_feat[i, :, :]

            # Append features to the data dictionary
            row = {
                'config_id': f"{filename}",
                'node_feat_avg': node_feat_avg,
                'opcode': dist,
                'runtime': config_runtime[i]      
            }

            for feature_index in range(current_config_features.shape[1]):
                feature_name = f"feature_{feature_index}"
                feature_value = current_config_features[0, feature_index]  # Assuming features are not multi-dimensional
                row[feature_name] = feature_value

                # Update occurrence counts for each feature
                self.feature_occurrences[feature_name].setdefault(feature_value, 0)
                self.feature_occurrences[feature_name][feature_value] += 1

            self.data.append(row)

    def calculate_occurrence_rates(self):
        for row in self.data:
            for feature_name, occurrences in self.feature_occurrences.items():
                feature_value = row[feature_name]
                total_occurrences = sum(occurrences.values())
                row[feature_name + '_rate'] = occurrences[feature_value] / total_occurrences

    def get_dataframe(self):
        return pd.DataFrame(self.data)


In [34]:
processor = LayoutDataProcessor('/kaggle/input/predict-ai-model-runtime/npz_all/npz/layout/xla/random', 'valid')
processor.load_data()
df_valid = processor.get_dataframe()

100%|██████████| 7/7 [00:07<00:00,  1.07s/it]


In [35]:
df_valid.columns

Index(['config_id', 'node_feat_avg', 'opcode', 'runtime', 'feature_0',
       'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5',
       'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10',
       'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15',
       'feature_16', 'feature_17', 'feature_0_rate', 'feature_1_rate',
       'feature_2_rate', 'feature_3_rate', 'feature_4_rate', 'feature_5_rate',
       'feature_6_rate', 'feature_7_rate', 'feature_8_rate', 'feature_9_rate',
       'feature_10_rate', 'feature_11_rate', 'feature_12_rate',
       'feature_13_rate', 'feature_14_rate', 'feature_15_rate',
       'feature_16_rate', 'feature_17_rate'],
      dtype='object')

In [38]:
# Unpack 'node_feat_avg'
node_feat_avg_df = df_valid['node_feat_avg'].apply(pd.Series)
node_feat_avg_df.columns = [f'node_feat_avg_{i}' for i in range(node_feat_avg_df.shape[1])]

# Concatenate with the original DataFrame
df_valid = pd.concat([df_valid.drop(['node_feat_avg'], axis=1), node_feat_avg_df], axis=1)

In [39]:
import warnings

# Use this at the beginning of the cell where you want to suppress future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Iterate over each config_id and scale the target column within each group
for config_id in tqdm(df_valid['config_id'].unique()):
    # Selecting the rows corresponding to the current config_id
    idx = df_valid['config_id'] == config_id
    # Scaling the target column for the current group
    df_valid.loc[idx, 'runtime'] = scaler.fit_transform(df_valid.loc[idx, ['runtime']])

100%|██████████| 7/7 [00:00<00:00, 83.61it/s]


In [40]:
df_valid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59809 entries, 0 to 59808
Columns: 179 entries, config_id to node_feat_avg_139
dtypes: float32(158), float64(19), int64(1), object(1)
memory usage: 45.6+ MB


In [41]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [42]:
unique_config_ids = df_valid['config_id'].unique()
train_config_ids, test_config_ids = train_test_split(unique_config_ids, test_size=0.1, random_state=42)

# Creating train and test dataframes based on config_id
train_df = df_valid[df_valid['config_id'].isin(train_config_ids)]
test_df = df_valid[df_valid['config_id'].isin(test_config_ids)]

# Separating features and target variable
X_train = train_df.drop(['config_id', 'runtime',], axis=1)
y_train = train_df['runtime']
X_test = test_df.drop(['config_id', 'runtime',], axis=1)
y_test = test_df['runtime']

In [43]:
!pip install LightGBM
from lightgbm import LGBMRegressor



In [44]:
df_valid['runtime'].describe()

count    59809.000000
mean         0.481909
std          0.256118
min          0.000000
25%          0.272015
50%          0.482651
75%          0.682399
max          1.000000
Name: runtime, dtype: float64

In [45]:
X_train.head()

Unnamed: 0,opcode,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,node_feat_avg_130,node_feat_avg_131,node_feat_avg_132,node_feat_avg_133,node_feat_avg_134,node_feat_avg_135,node_feat_avg_136,node_feat_avg_137,node_feat_avg_138,node_feat_avg_139
5704,14,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.0,0.0,1.0,0.000316,1.50332,1.128991,0.746127,0.367373,0.0,0.0
5705,14,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.0,0.0,1.0,0.000316,1.50332,1.128991,0.746127,0.367373,0.0,0.0
5706,14,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,...,0.0,0.0,1.0,0.000316,1.50332,1.128991,0.746127,0.367373,0.0,0.0
5707,14,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,...,0.0,0.0,1.0,0.000316,1.50332,1.128991,0.746127,0.367373,0.0,0.0
5708,14,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,...,0.0,0.0,1.0,0.000316,1.50332,1.128991,0.746127,0.367373,0.0,0.0


In [46]:
# Training models
lin_reg = LinearRegression().fit(X_train, y_train)

alpha_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Setting up GridSearchCV for Lasso Regression
lasso = Lasso()
grid_search_lasso = GridSearchCV(estimator=lasso, param_grid=alpha_grid, cv=3, scoring='neg_mean_squared_error',verbose=4)
grid_search_lasso.fit(X_train, y_train)
lasso_reg = grid_search_lasso.best_estimator_
print("Lasso Alpha")
print(grid_search_lasso.best_params_['alpha'])

ridge = Ridge()
grid_search_ridge = GridSearchCV(estimator=ridge, param_grid=alpha_grid, cv=3, scoring='neg_mean_squared_error',verbose=4)
grid_search_ridge.fit(X_train, y_train)
ridge_reg = grid_search_ridge.best_estimator_
print("Ridge Alpha")
print(grid_search_ridge.best_params_['alpha'])

# Making predictions
train_predictions_lin = lin_reg.predict(X_train)
train_predictions_lasso = lasso_reg.predict(X_train)
train_predictions_ridge = ridge_reg.predict(X_train)

predictions_lin = lin_reg.predict(X_test)
predictions_lasso = lasso_reg.predict(X_test)
predictions_ridge = ridge_reg.predict(X_test)

# Calculating R² scores
r2_lin = r2_score(y_test, predictions_lin)
r2_lasso = r2_score(y_test, predictions_lasso)
r2_ridge = r2_score(y_test, predictions_ridge)

r2_train_lin = r2_score(y_train, train_predictions_lin)
r2_train_lasso = r2_score(y_train, train_predictions_lasso)
r2_train_ridge = r2_score(y_train, train_predictions_ridge)

print("R² Scores Train:")
print(f"Linear Regression: {r2_train_lin}")
print(f"Lasso Regression: {r2_train_lasso}")
print(f"Ridge Regression: {r2_train_ridge}")

print("R² Scores Test:")
print(f"Linear Regression: {r2_lin}")
print(f"Lasso Regression: {r2_lasso}")
print(f"Ridge Regression: {r2_ridge}")

Fitting 3 folds for each of 6 candidates, totalling 18 fits


  model = cd_fast.enet_coordinate_descent(


[CV 1/3] END ......................alpha=0.001;, score=-0.054 total time=   2.9s


  model = cd_fast.enet_coordinate_descent(


[CV 2/3] END ......................alpha=0.001;, score=-0.052 total time=   3.4s


  model = cd_fast.enet_coordinate_descent(


[CV 3/3] END ......................alpha=0.001;, score=-0.086 total time=   3.1s
[CV 1/3] END .......................alpha=0.01;, score=-0.054 total time=   1.9s
[CV 2/3] END .......................alpha=0.01;, score=-0.052 total time=   0.9s


  model = cd_fast.enet_coordinate_descent(


[CV 3/3] END .......................alpha=0.01;, score=-0.096 total time=   4.2s
[CV 1/3] END ........................alpha=0.1;, score=-0.052 total time=   0.4s
[CV 2/3] END ........................alpha=0.1;, score=-0.052 total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV 3/3] END ........................alpha=0.1;, score=-0.503 total time=   1.5s
[CV 1/3] END ..........................alpha=1;, score=-0.052 total time=   0.2s
[CV 2/3] END ..........................alpha=1;, score=-0.052 total time=   0.2s


  model = cd_fast.enet_coordinate_descent(


[CV 3/3] END ..........................alpha=1;, score=-0.516 total time=   1.4s
[CV 1/3] END .........................alpha=10;, score=-0.052 total time=   0.2s
[CV 2/3] END .........................alpha=10;, score=-0.052 total time=   0.2s
[CV 3/3] END .........................alpha=10;, score=-0.082 total time=   0.2s
[CV 1/3] END ........................alpha=100;, score=-0.052 total time=   0.2s
[CV 2/3] END ........................alpha=100;, score=-0.052 total time=   0.2s
[CV 3/3] END ........................alpha=100;, score=-0.082 total time=   0.2s
Lasso Alpha
100
Fitting 3 folds for each of 6 candidates, totalling 18 fits


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 1/3] END ......................alpha=0.001;, score=-0.054 total time=   0.2s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 2/3] END ......................alpha=0.001;, score=-0.052 total time=   0.3s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 3/3] END ......................alpha=0.001;, score=-1.019 total time=   0.3s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 1/3] END .......................alpha=0.01;, score=-0.054 total time=   0.3s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 2/3] END .......................alpha=0.01;, score=-0.052 total time=   0.3s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 3/3] END .......................alpha=0.01;, score=-1.141 total time=   0.3s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 1/3] END ........................alpha=0.1;, score=-0.054 total time=   0.3s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 2/3] END ........................alpha=0.1;, score=-0.052 total time=   0.2s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 3/3] END ........................alpha=0.1;, score=-1.152 total time=   0.2s
[CV 1/3] END ..........................alpha=1;, score=-0.054 total time=   0.2s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 2/3] END ..........................alpha=1;, score=-0.052 total time=   0.3s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 3/3] END ..........................alpha=1;, score=-1.153 total time=   0.3s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 1/3] END .........................alpha=10;, score=-0.054 total time=   0.3s


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


[CV 2/3] END .........................alpha=10;, score=-0.052 total time=   0.2s
[CV 3/3] END .........................alpha=10;, score=-1.153 total time=   0.2s
[CV 1/3] END ........................alpha=100;, score=-0.054 total time=   0.2s
[CV 2/3] END ........................alpha=100;, score=-0.052 total time=   0.2s
[CV 3/3] END ........................alpha=100;, score=-1.153 total time=   0.2s
Ridge Alpha
0.001


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


R² Scores Train:
Linear Regression: 0.12396062550909659
Lasso Regression: 0.010075987000404907
Ridge Regression: 0.12396071360878158
R² Scores Test:
Linear Regression: -2735027331518.5166
Lasso Regression: -0.4317563699902309
Ridge Regression: -0.7223594254125365


In [47]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

In [48]:
param_dist = {
    'num_leaves': sp_randint(3, 50), 
    'min_child_samples': sp_randint(5, 500), 
    'min_child_weight': sp_uniform(0.01, 0.1),
    'subsample': sp_uniform(0.8, 0.2),
    'colsample_bytree': sp_uniform(0.8, 0.2),
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50],
    'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 250, 500, 1000, 1500]
}

In [49]:
# lgbm = LGBMRegressor(device='gpu', gpu_device=0)

In [50]:
# random_search = RandomizedSearchCV(lgbm, param_distributions=param_dist, n_iter=25, cv=4, scoring='neg_mean_squared_error', verbose=4)
# random_search.fit(X_train, y_train)

Fitting 4 folds for each of 25 candidates, totalling 100 fits
[CV 1/4] END colsample_bytree=0.9279941415646209, learning_rate=0.001, min_child_samples=126, min_child_weight=0.03034545288186176, n_estimators=1500, num_leaves=9, reg_alpha=10, reg_lambda=1, subsample=0.9127737852631039;, score=-0.052 total time=  10.1s
[CV 2/4] END colsample_bytree=0.9279941415646209, learning_rate=0.001, min_child_samples=126, min_child_weight=0.03034545288186176, n_estimators=1500, num_leaves=9, reg_alpha=10, reg_lambda=1, subsample=0.9127737852631039;, score=-0.051 total time=   5.7s
[CV 3/4] END colsample_bytree=0.9279941415646209, learning_rate=0.001, min_child_samples=126, min_child_weight=0.03034545288186176, n_estimators=1500, num_leaves=9, reg_alpha=10, reg_lambda=1, subsample=0.9127737852631039;, score=-0.056 total time=   6.4s
[CV 4/4] END colsample_bytree=0.9279941415646209, learning_rate=0.001, min_child_samples=126, min_child_weight=0.03034545288186176, n_estimators=1500, num_leaves=9, reg_a

In [51]:
# best_lgbm = random_search.best_estimator_

In [52]:
# random_search.best_params_

{'colsample_bytree': 0.8466950347494685,
 'learning_rate': 0.01,
 'min_child_samples': 76,
 'min_child_weight': 0.02027055141333768,
 'n_estimators': 100,
 'num_leaves': 26,
 'reg_alpha': 0.1,
 'reg_lambda': 5,
 'subsample': 0.8259488679896974}

In [72]:
lgbm = LGBMRegressor(colsample_bytree= 0.9029527732718773,
 learning_rate= 0.03,
 min_child_samples= 465,
 min_child_weight= 0.02476027076966974,
 n_estimators= 800,
 num_leaves= 16,
 reg_alpha= 10,
 reg_lambda= 0.1,
 subsample= 0.9672960197116944,
 device='gpu', gpu_device=0)

In [73]:
train_preds_lgb = best_lgbm.predict(X_train)

preds_lgb = best_lgbm.predict(X_test)

# Calculating R² scores
r2_lgb = r2_score(y_test, preds_lgb)

r2_train_lgb=r2_score(y_train, train_preds_lgb)

print("R² Scores Train:")
print(f"LGBM: {r2_train_lgb}")

print("R² Scores Test:")
print(f"LGBM: {r2_lgb}")

R² Scores Train:
LGBM: 0.1097554792201132
R² Scores Test:
LGBM: -0.2598196001648281


In [74]:
def rank_configurations(predictions, full_df):
    ranked_configurations = []

    # Create a mapping of DataFrame indices to the range of indices in predictions
    index_mapping = {idx: i for i, idx in enumerate(full_df.index)}

    # Group data by 'config_id' and process each group
    for config_id, group in full_df.groupby('config_id'):
        # Get the corresponding prediction indices for the current group
        prediction_indices = [index_mapping[idx] for idx in group.index]

        # Rank configurations by predicted runtime
        ranked_indices = group.index[np.argsort(predictions[prediction_indices])]

        # Store the original indices of the ranked configurations
        ranked_configurations.append(list(ranked_indices))

    return ranked_configurations
def kendall_tau_correlation(predicted_rankings, full_df, runtime_column='runtime', k=100):
    num_models = len(predicted_rankings)

    for predicted in tqdm(predicted_rankings):
        # Extract the top-k predicted configurations
        top_k_predicted = predicted[:k]

        # Best runtime among top-k predicted configurations
        best_runtime_top_k = full_df.loc[top_k_predicted, runtime_column].min()

        # Best runtime among all configurations in the model group
        config_id = full_df.loc[top_k_predicted[0], 'config_id']
        best_runtime_all = full_df[full_df['config_id'] == config_id][runtime_column].min()

        # Calculate slowdown for this model
        slowdown = 1 - ((best_runtime_top_k / best_runtime_all) - 1)
        total_slowdown += slowdown

    # Average slowdown across all models
    average_slowdown = total_slowdown / num_models
    return average_slowdown

In [78]:
epsilon = 1e-3  # Small value to add to zero runtimes

# Adjusting the 'runtime' column in train_df and test_df
train_df['runtime'] = train_df['runtime'].apply(lambda x: x if x != 0 else x + epsilon)
test_df['runtime'] = test_df['runtime'].apply(lambda x: x if x != 0 else x + epsilon)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['runtime'] = train_df['runtime'].apply(lambda x: x if x != 0 else x + epsilon)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['runtime'] = test_df['runtime'].apply(lambda x: x if x != 0 else x + epsilon)


In [139]:
# Use the existing predictions to rank configurations
ranked_train_lr = rank_configurations(train_predictions_lin, train_df)
ranked_test_lr = rank_configurations(predictions_lin, test_df)

ranked_train_lasso = rank_configurations(train_predictions_lasso, train_df)
ranked_test_lasso = rank_configurations(predictions_lasso, test_df)

ranked_train_ridge = rank_configurations(train_predictions_ridge, train_df)
ranked_test_ridge = rank_configurations(predictions_ridge, test_df)

ranked_train_lgbm = rank_configurations(train_preds_lgb, train_df)
ranked_test_lgbm = rank_configurations(preds_lgb, test_df)

true_ranked_train = rank_configurations(y_train.to_numpy(), train_df)
true_ranked_test = rank_configurations(y_test.to_numpy(), test_df)

# Calculate and print the average top-k slowdown for the train predictions
average_slowdown_lr_train = calculate_top_k_slowdown(ranked_train_lr, train_df)
average_slowdown_lasso_train = calculate_top_k_slowdown(ranked_train_lasso, train_df)
average_slowdown_ridge_train = calculate_top_k_slowdown(ranked_train_ridge, train_df)
average_slowdown_lgbm_train = calculate_top_k_slowdown(ranked_train_lgbm, train_df)

print("Average Top-k Slowdown (LR, Train):", average_slowdown_lr_train)
print("Average Top-k Slowdown (Lasso, Train):", average_slowdown_lasso_train)
print("Average Top-k Slowdown (Ridge, Train):", average_slowdown_ridge_train)
print("Average Top-k Slowdown (LGBM, Train):", average_slowdown_lgbm_train)

# Calculate and print the average top-k slowdown for the test predictions
average_slowdown_lr_test = calculate_top_k_slowdown(ranked_test_lr, test_df)
average_slowdown_lasso_test = calculate_top_k_slowdown(ranked_test_lasso, test_df)
average_slowdown_ridge_test = calculate_top_k_slowdown(ranked_test_ridge, test_df)
average_slowdown_lgbm_test = calculate_top_k_slowdown(ranked_test_lgbm, test_df)

print("Average Top-k Slowdown (LR, Test):", average_slowdown_lr_test)
print("Average Top-k Slowdown (Lasso, Test):", average_slowdown_lasso_test)
print("Average Top-k Slowdown (Ridge, Test):", average_slowdown_ridge_test)
print("Average Top-k Slowdown (LGBM, Test):", average_slowdown_lgbm_test)

100%|██████████| 6/6 [00:00<00:00, 94.34it/s]
100%|██████████| 6/6 [00:00<00:00, 97.39it/s]
100%|██████████| 6/6 [00:00<00:00, 97.75it/s]
100%|██████████| 6/6 [00:00<00:00, 97.09it/s]


Average Top-k Slowdown (LR, Train): -447.7875080767162
Average Top-k Slowdown (Lasso, Train): -386.977590915324
Average Top-k Slowdown (Ridge, Train): -447.7875080767162
Average Top-k Slowdown (LGBM, Train): -470.68899396729734


100%|██████████| 1/1 [00:00<00:00, 186.20it/s]
100%|██████████| 1/1 [00:00<00:00, 232.78it/s]
100%|██████████| 1/1 [00:00<00:00, 226.74it/s]
100%|██████████| 1/1 [00:00<00:00, 147.26it/s]

Average Top-k Slowdown (LR, Test): -2.21434105189962
Average Top-k Slowdown (Lasso, Test): -2.21434105189962
Average Top-k Slowdown (Ridge, Test): -2.21434105189962
Average Top-k Slowdown (LGBM, Test): -2.21434105189962





In [164]:
import math

true_ranked_test = list(np.array(true_ranked_test).flatten())
ranked_test_lgbm = list(np.array(ranked_test_lgbm).flatten())

disc = 0
for i in range(len(true_ranked_test[::100])):
    for j in range(i, len(true_ranked_test[::100])):
        if (true_ranked_test.index(i)-ranked_test_lgbm.index(i)) * (true_ranked_test.index(j)-ranked_test_lgbm.index(j)) < 0:
            disc = disc + 1
            
print(1 - (2 * 10000 * disc / (math.comb(len(true_ranked_test), 2))))

-0.0033842083556820945


In [156]:
import math
from itertools import chain

ranked_train_lgbm_flat = ranked_train_lgbm[0]
true_ranked_train_flat = true_ranked_train[0]

for q in range(1, 6):
    true_ranked_train_flat = list(chain(true_ranked_train_flat, true_ranked_train[q]))
    ranked_train_lgbm_flat = list(chain(ranked_train_lgbm_flat, ranked_train_lgbm[q]))

In [160]:
np.unique(np.array(true_ranked_train_flat)).shape[0]

54105

In [163]:
true_ranked_train_flat = list(np.array(true_ranked_train_flat).flatten())
ranked_train_lgbm_flat = list(np.array(ranked_train_lgbm_flat).flatten())

disc = 0
for i in true_ranked_train_flat[::100]:
    for j in ranked_train_lgbm_flat[::100]:
        if (true_ranked_train_flat.index(i)-ranked_train_lgbm_flat.index(i)) * (true_ranked_train_flat.index(j)-ranked_train_lgbm_flat.index(j)) < 0:
            disc = disc + 1

print(1 - (2 * 10000 * disc / (math.comb(len(true_ranked_train_flat), 2))))

-1.0019561254483196


In [137]:
np.array(true_ranked_train[0]).min()

23545

In [79]:
#ranked_train_lgbm[0]
train_df['runtime']

5704     0.002164
5705     0.003032
5706     0.673605
5707     0.997220
5708     0.323587
           ...   
59804    0.421472
59805    0.382515
59806    0.424939
59807    0.590373
59808    0.284799
Name: runtime, Length: 54105, dtype: float64