In [35]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [48]:
class LayoutDataProcessor:
    def __init__(self, directory, split):
        self.directory = os.path.join(directory, split)
        self.data = []
        self.feature_occurrences = {}

    def load_data(self):
        for filename in tqdm(os.listdir(self.directory)):
            filepath = os.path.join(self.directory, filename)
            self.process_file(filepath, filename)
        self.calculate_occurrence_rates()

    def process_file(self, filepath, filename):
        data = np.load(filepath)
        node_config_ids = data['node_config_ids']
        node_config_feat = data['node_config_feat']
        config_runtime = data['config_runtime']
        node_feat = data['node_feat']
        node_opcode = data['node_opcode']

        # Initialize feature occurrence tracking if not done yet
        if not self.feature_occurrences:
            for i in range(node_config_feat.shape[2]):  # For each feature in node_config_feat
                self.feature_occurrences[f"feature_{i}"] = {}

        # Process each configuration
        for i in range(len(config_runtime)):
            # Configuration feature array for the current configuration
            current_config_features = node_config_feat[i, :, :]

            # Append features to the data dictionary
            row = {
                'config_id': f"{filename}",
                'runtime': config_runtime[i],
            }

            for feature_index in range(current_config_features.shape[1]):
                feature_name = f"feature_{feature_index}"
                feature_value = current_config_features[0, feature_index]  # Assuming features are not multi-dimensional
                row[feature_name] = feature_value

                # Update occurrence counts for each feature
                self.feature_occurrences[feature_name].setdefault(feature_value, 0)
                self.feature_occurrences[feature_name][feature_value] += 1

            self.data.append(row)

    def calculate_occurrence_rates(self):
        for row in self.data:
            for feature_name, occurrences in self.feature_occurrences.items():
                feature_value = row[feature_name]
                total_occurrences = sum(occurrences.values())
                row[feature_name + '_rate'] = occurrences[feature_value] / total_occurrences

    def get_dataframe(self):
        return pd.DataFrame(self.data)


In [49]:
processor = LayoutDataProcessor('/kaggle/input/predict-ai-model-runtime/npz_all/npz/layout/xla/default', 'valid')
processor.load_data()
df_valid = processor.get_dataframe()

100%|██████████| 7/7 [00:07<00:00,  1.09s/it]


In [None]:
df_valid.columns

In [50]:
import warnings

# Use this at the beginning of the cell where you want to suppress future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Iterate over each config_id and scale the target column within each group
for config_id in tqdm(df_valid['config_id'].unique()):
    # Selecting the rows corresponding to the current config_id
    idx = df_valid['config_id'] == config_id
    # Scaling the target column for the current group
    df_valid.loc[idx, 'runtime'] = scaler.fit_transform(df_valid.loc[idx, ['runtime']])

100%|██████████| 7/7 [00:00<00:00, 80.42it/s]


In [51]:
df_valid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66021 entries, 0 to 66020
Data columns (total 38 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   config_id        66021 non-null  object 
 1   runtime          66021 non-null  float64
 2   feature_0        66021 non-null  float32
 3   feature_1        66021 non-null  float32
 4   feature_2        66021 non-null  float32
 5   feature_3        66021 non-null  float32
 6   feature_4        66021 non-null  float32
 7   feature_5        66021 non-null  float32
 8   feature_6        66021 non-null  float32
 9   feature_7        66021 non-null  float32
 10  feature_8        66021 non-null  float32
 11  feature_9        66021 non-null  float32
 12  feature_10       66021 non-null  float32
 13  feature_11       66021 non-null  float32
 14  feature_12       66021 non-null  float32
 15  feature_13       66021 non-null  float32
 16  feature_14       66021 non-null  float32
 17  feature_15  

In [87]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [88]:
unique_config_ids = df_valid['config_id'].unique()
train_config_ids, test_config_ids = train_test_split(unique_config_ids, test_size=0.1, random_state=42)

# Creating train and test dataframes based on config_id
train_df = df_valid[df_valid['config_id'].isin(train_config_ids)]
test_df = df_valid[df_valid['config_id'].isin(test_config_ids)]

# Separating features and target variable
X_train = train_df.drop(['config_id', 'runtime',], axis=1)
y_train = train_df['runtime']
X_test = test_df.drop(['config_id', 'runtime',], axis=1)
y_test = test_df['runtime']

In [42]:
!pip install LightGBM
from lightgbm import LGBMRegressor





In [54]:
df_valid['runtime'].describe()

count    66021.000000
mean         0.072605
std          0.192265
min          0.000000
25%          0.001642
50%          0.003572
75%          0.012358
max          1.000000
Name: runtime, dtype: float64

In [89]:
# Training models
lin_reg = LinearRegression().fit(X_train, y_train)

alpha_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Setting up GridSearchCV for Lasso Regression
lasso = Lasso()
grid_search_lasso = GridSearchCV(estimator=lasso, param_grid=alpha_grid, cv=3, scoring='neg_mean_squared_error',verbose=4)
grid_search_lasso.fit(X_train, y_train)
lasso_reg = grid_search_lasso.best_estimator_
print("Lasso Alpha")
print(grid_search_lasso.best_params_['alpha'])

ridge = Ridge()
grid_search_ridge = GridSearchCV(estimator=ridge, param_grid=alpha_grid, cv=3, scoring='neg_mean_squared_error',verbose=4)
grid_search_ridge.fit(X_train, y_train)
ridge_reg = grid_search_ridge.best_estimator_
print("Ridge Alpha")
print(grid_search_ridge.best_params_['alpha'])

# Making predictions
train_predictions_lin = lin_reg.predict(X_train)
train_predictions_lasso = lasso_reg.predict(X_train)
train_predictions_ridge = ridge_reg.predict(X_train)

predictions_lin = lin_reg.predict(X_test)
predictions_lasso = lasso_reg.predict(X_test)
predictions_ridge = ridge_reg.predict(X_test)

# Calculating R² scores
r2_lin = r2_score(y_test, predictions_lin)
r2_lasso = r2_score(y_test, predictions_lasso)
r2_ridge = r2_score(y_test, predictions_ridge)

r2_train_lin = r2_score(y_train, train_predictions_lin)
r2_train_lasso = r2_score(y_train, train_predictions_lasso)
r2_train_ridge = r2_score(y_train, train_predictions_ridge)

print("R² Scores Train:")
print(f"Linear Regression: {r2_train_lin}")
print(f"Lasso Regression: {r2_train_lasso}")
print(f"Ridge Regression: {r2_train_ridge}")

print("R² Scores Test:")
print(f"Linear Regression: {r2_lin}")
print(f"Lasso Regression: {r2_lasso}")
print(f"Ridge Regression: {r2_ridge}")

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END ......................alpha=0.001;, score=-0.026 total time=   0.1s
[CV 2/3] END ......................alpha=0.001;, score=-0.033 total time=   0.1s
[CV 3/3] END ......................alpha=0.001;, score=-0.049 total time=   0.1s
[CV 1/3] END .......................alpha=0.01;, score=-0.026 total time=   0.1s
[CV 2/3] END .......................alpha=0.01;, score=-0.033 total time=   0.1s
[CV 3/3] END .......................alpha=0.01;, score=-0.050 total time=   0.1s
[CV 1/3] END ........................alpha=0.1;, score=-0.026 total time=   0.1s
[CV 2/3] END ........................alpha=0.1;, score=-0.033 total time=   0.1s
[CV 3/3] END ........................alpha=0.1;, score=-0.050 total time=   0.1s
[CV 1/3] END ..........................alpha=1;, score=-0.026 total time=   0.1s
[CV 2/3] END ..........................alpha=1;, score=-0.033 total time=   0.1s
[CV 3/3] END ..........................alpha=1;, 

In [90]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

In [91]:
param_dist = {
    'num_leaves': sp_randint(3, 50), 
    'min_child_samples': sp_randint(5, 500), 
    'min_child_weight': sp_uniform(0.01, 0.1),
    'subsample': sp_uniform(0.8, 0.2),
    'colsample_bytree': sp_uniform(0.8, 0.2),
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50],
    'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 250, 500, 1000, 1500]
}

In [92]:
lgbm = LGBMRegressor(device='gpu', gpu_device=0)

In [93]:
random_search = RandomizedSearchCV(lgbm, param_distributions=param_dist, n_iter=25, cv=4, scoring='neg_mean_squared_error', verbose=4)
random_search.fit(X_train, y_train)

Fitting 4 folds for each of 25 candidates, totalling 100 fits
[CV 1/4] END colsample_bytree=0.896350649181155, learning_rate=0.2, min_child_samples=478, min_child_weight=0.030930281034185818, n_estimators=1500, num_leaves=13, reg_alpha=7, reg_lambda=20, subsample=0.839600787134435;, score=-0.028 total time=   0.8s
[CV 2/4] END colsample_bytree=0.896350649181155, learning_rate=0.2, min_child_samples=478, min_child_weight=0.030930281034185818, n_estimators=1500, num_leaves=13, reg_alpha=7, reg_lambda=20, subsample=0.839600787134435;, score=-0.030 total time=   0.8s
[CV 3/4] END colsample_bytree=0.896350649181155, learning_rate=0.2, min_child_samples=478, min_child_weight=0.030930281034185818, n_estimators=1500, num_leaves=13, reg_alpha=7, reg_lambda=20, subsample=0.839600787134435;, score=-0.037 total time=   1.7s
[CV 4/4] END colsample_bytree=0.896350649181155, learning_rate=0.2, min_child_samples=478, min_child_weight=0.030930281034185818, n_estimators=1500, num_leaves=13, reg_alpha=7,

In [94]:
best_lgbm = random_search.best_estimator_

In [95]:
random_search.best_params_

{'colsample_bytree': 0.9961676915049176,
 'learning_rate': 0.1,
 'min_child_samples': 237,
 'min_child_weight': 0.09198541565307419,
 'n_estimators': 500,
 'num_leaves': 34,
 'reg_alpha': 0.1,
 'reg_lambda': 0,
 'subsample': 0.9630288148888715}

In [96]:
train_preds_lgb = best_lgbm.predict(X_train)

preds_lgb = best_lgbm.predict(X_test)

# Calculating R² scores
r2_lgb = r2_score(y_test, preds_lgb)

r2_train_lgb=r2_score(y_train, train_preds_lgb)

print("R² Scores Train:")
print(f"LGBM: {r2_train_lgb}")

print("R² Scores Test:")
print(f"LGBM: {r2_lgb}")

R² Scores Train:
LGBM: 0.05275089918397746
R² Scores Test:
LGBM: 0.2729291941547807


In [97]:
def rank_configurations(predictions, full_df):
    ranked_configurations = []

    # Create a mapping of DataFrame indices to the range of indices in predictions
    index_mapping = {idx: i for i, idx in enumerate(full_df.index)}

    # Group data by 'config_id' and process each group
    for config_id, group in full_df.groupby('config_id'):
        # Get the corresponding prediction indices for the current group
        prediction_indices = [index_mapping[idx] for idx in group.index]

        # Rank configurations by predicted runtime
        ranked_indices = group.index[np.argsort(predictions[prediction_indices])]

        # Store the original indices of the ranked configurations
        ranked_configurations.append(list(ranked_indices))

    return ranked_configurations
def calculate_top_k_slowdown(predicted_rankings, full_df, runtime_column='runtime', k=5):
    total_slowdown = 0
    num_models = len(predicted_rankings)

    for predicted in tqdm(predicted_rankings):
        # Extract the top-k predicted configurations
        top_k_predicted = predicted[:k]

        # Best runtime among top-k predicted configurations
        best_runtime_top_k = full_df.loc[top_k_predicted, runtime_column].min()

        # Best runtime among all configurations in the model group
        config_id = full_df.loc[top_k_predicted[0], 'config_id']
        best_runtime_all = full_df[full_df['config_id'] == config_id][runtime_column].min()

        # Calculate slowdown for this model
        slowdown = 1 - ((best_runtime_top_k / best_runtime_all) - 1)
        total_slowdown += slowdown

    # Average slowdown across all models
    average_slowdown = total_slowdown / num_models
    return average_slowdown

In [98]:
epsilon = 1e-3  # Small value to add to zero runtimes

# Adjusting the 'runtime' column in train_df and test_df
train_df['runtime'] = train_df['runtime'].apply(lambda x: x if x != 0 else x + epsilon)
test_df['runtime'] = test_df['runtime'].apply(lambda x: x if x != 0 else x + epsilon)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['runtime'] = train_df['runtime'].apply(lambda x: x if x != 0 else x + epsilon)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['runtime'] = test_df['runtime'].apply(lambda x: x if x != 0 else x + epsilon)


In [100]:
# Use the existing predictions to rank configurations
ranked_train_lr = rank_configurations(train_predictions_lin, train_df)
ranked_test_lr = rank_configurations(predictions_lin, test_df)

ranked_train_lasso = rank_configurations(train_predictions_lasso, train_df)
ranked_test_lasso = rank_configurations(predictions_lasso, test_df)

ranked_train_ridge = rank_configurations(train_predictions_ridge, train_df)
ranked_test_ridge = rank_configurations(predictions_ridge, test_df)

ranked_train_lgbm = rank_configurations(train_preds_lgb, train_df)
ranked_test_lgbm = rank_configurations(preds_lgb, test_df)

true_ranked_train = rank_configurations(y_train.to_numpy(), train_df)
true_ranked_test = rank_configurations(y_test.to_numpy(), test_df)

# Calculate and print the average top-k slowdown for the train predictions
average_slowdown_lr_train = calculate_top_k_slowdown(ranked_train_lr, train_df)
average_slowdown_lasso_train = calculate_top_k_slowdown(ranked_train_lasso, train_df)
average_slowdown_ridge_train = calculate_top_k_slowdown(ranked_train_ridge, train_df)
average_slowdown_lgbm_train = calculate_top_k_slowdown(ranked_train_lgbm, train_df)

print("Average Top-k Slowdown (LR, Train):", average_slowdown_lr_train)
print("Average Top-k Slowdown (Lasso, Train):", average_slowdown_lasso_train)
print("Average Top-k Slowdown (Ridge, Train):", average_slowdown_ridge_train)
print("Average Top-k Slowdown (LGBM, Train):", average_slowdown_lgbm_train)

# Calculate and print the average top-k slowdown for the test predictions
average_slowdown_lr_test = calculate_top_k_slowdown(ranked_test_lr, test_df)
average_slowdown_lasso_test = calculate_top_k_slowdown(ranked_test_lasso, test_df)
average_slowdown_ridge_test = calculate_top_k_slowdown(ranked_test_ridge, test_df)
average_slowdown_lgbm_test = calculate_top_k_slowdown(ranked_test_lgbm, test_df)

print("Average Top-k Slowdown (LR, Test):", average_slowdown_lr_test)
print("Average Top-k Slowdown (Lasso, Test):", average_slowdown_lasso_test)
print("Average Top-k Slowdown (Ridge, Test):", average_slowdown_ridge_test)
print("Average Top-k Slowdown (LGBM, Test):", average_slowdown_lgbm_test)

100%|██████████| 6/6 [00:00<00:00, 101.32it/s]
100%|██████████| 6/6 [00:00<00:00, 102.06it/s]
100%|██████████| 6/6 [00:00<00:00, 102.86it/s]
100%|██████████| 6/6 [00:00<00:00, 102.70it/s]


Average Top-k Slowdown (LR, Train): -54.65356033924483
Average Top-k Slowdown (Lasso, Train): -111.33296888100442
Average Top-k Slowdown (Ridge, Train): -54.65356033924483
Average Top-k Slowdown (LGBM, Train): -51.382065562165934


100%|██████████| 1/1 [00:00<00:00, 292.47it/s]
100%|██████████| 1/1 [00:00<00:00, 169.15it/s]
100%|██████████| 1/1 [00:00<00:00, 352.08it/s]
100%|██████████| 1/1 [00:00<00:00, 386.39it/s]

Average Top-k Slowdown (LR, Test): -108.47818290451092
Average Top-k Slowdown (Lasso, Test): -35716.980673294915
Average Top-k Slowdown (Ridge, Test): -112.51145646496042
Average Top-k Slowdown (LGBM, Test): -110.82068141018068





In [84]:
#ranked_train_lgbm[0]
train_df['runtime']

8085     0.297965
8086     0.298058
8087     0.297518
8088     0.298140
8089     0.298250
           ...   
66016    0.064773
66017    0.000451
66018    0.000313
66019    0.434318
66020    0.001615
Name: runtime, Length: 57936, dtype: float64