In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
tqdm.pandas()

In [None]:
#Just featurize each config with its 1) config-level features and 2) average all node features and assign to the config
#Output is simple dataframe with one config per row, its runtime, and features
class TileDataExtractor:
    """Flatten the ``.npz`` files of one split into one record per configuration.

    Each file contributes ``len(config_feat)`` records. Every record carries that
    configuration's own feature vector, runtime and runtime normalizer, plus two
    values shared by all configurations of the file: the mean of the node
    features and a summary of the node opcodes.

    Note that the ``'config_id'`` field holds the *file name*, so it is identical
    for all configurations that come from the same file.
    """

    def __init__(self, directory, split):
        """Remember the split directory and start with an empty record list.

        :param directory: Root directory that contains one sub-directory per split.
        :param split: Name of the split sub-directory to read.
        """
        self.directory = os.path.join(directory, split)
        self.data = []

    def load_data(self):
        """Process every entry of the split directory, appending to ``self.data``."""
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and therefore any seeded split built on it) reproducible.
        for filename in tqdm(sorted(os.listdir(self.directory))):
            filepath = os.path.join(self.directory, filename)
            self.process_file(filepath, filename)

    def process_file(self, filepath, filename):
        """Append one record per configuration found in ``filepath``.

        :param filepath: Path of the ``.npz`` archive to read.
        :param filename: Value stored in the ``'config_id'`` field of every record.
        """
        # The archive keeps a file handle open until it is closed, so read
        # everything that is needed inside the context manager.
        with np.load(filepath) as data:
            config_feat = data['config_feat']
            node_feat_avg = np.mean(data['node_feat'], axis=0)
            runtime = data['config_runtime']
            runtime_norm = data['config_runtime_normalizers']
            dist, _ = np.histogram(data['node_opcode'], bins=np.arange(0, 120, 2))

        # Index of the most populated histogram bin. Bins are two opcodes wide,
        # so this is a bin index, not an opcode value.
        dist1 = np.argmax(dist)

        for i in range(len(config_feat)):
            row = {
                'config_id': f"{filename}",
                'config_feat': config_feat[i],
                'node_feat_avg': node_feat_avg,
                'opcode': dist1,
                'runtime': runtime[i],
                'runtime_norm': runtime_norm[i]
            }
            self.data.append(row)

    def get_dataframe(self):
        """Return the collected records as a DataFrame with one row per record."""
        return pd.DataFrame(self.data)

In [None]:
#extractor = TileDataExtractor('/kaggle/input/predict-ai-model-runtime/npz_all/npz/tile/xla', 'train')
#extractor.load_data()
#df_train = extractor.get_dataframe()

In [None]:
# Unpack 'config_feat'
#config_feat_df = df_train['config_feat'].apply(pd.Series)
#config_feat_df.columns = [f'config_feat_{i}' for i in range(config_feat_df.shape[1])]

# Unpack 'node_feat_avg'
#node_feat_avg_df = df_train['node_feat_avg'].apply(pd.Series)
#node_feat_avg_df.columns = [f'node_feat_avg_{i}' for i in range(node_feat_avg_df.shape[1])]

# Concatenate with the original DataFrame
#df_train = pd.concat([df_train.drop(['config_feat', 'node_feat_avg'], axis=1), config_feat_df, node_feat_avg_df], axis=1)

In [None]:
# Read every file of the 'valid' split and collect one row per configuration.
extractor = TileDataExtractor('/kaggle/input/predict-ai-model-runtime/npz_all/npz/tile/xla', 'valid')
extractor.load_data()
df_valid = extractor.get_dataframe()

In [None]:
# Preview the first rows; 'config_feat' and 'node_feat_avg' still hold whole arrays here.
df_valid.head()

In [None]:
# Drop the extractor (and its list of row dicts) now that df_valid has been built.
del extractor

In [None]:
# Unpack 'config_feat': stack the per-row arrays into one 2-D block instead of
# building a pd.Series for every row with .apply(pd.Series), which is far slower.
config_feat_df = pd.DataFrame(np.stack(df_valid['config_feat'].to_numpy()), index=df_valid.index)
config_feat_df.columns = [f'config_feat_{i}' for i in range(config_feat_df.shape[1])]

# Unpack 'node_feat_avg' the same way
node_feat_avg_df = pd.DataFrame(np.stack(df_valid['node_feat_avg'].to_numpy()), index=df_valid.index)
node_feat_avg_df.columns = [f'node_feat_avg_{i}' for i in range(node_feat_avg_df.shape[1])]

# Concatenate with the original DataFrame (the shared index keeps rows aligned)
df_valid = pd.concat([df_valid.drop(['config_feat', 'node_feat_avg'], axis=1), config_feat_df, node_feat_avg_df], axis=1)

In [None]:
#extractor = TileDataExtractor('/kaggle/input/predict-ai-model-runtime/npz_all/npz/tile/xla', 'test')
#extractor.load_data()
#df_test = extractor.get_dataframe()

In [None]:
# Unpack 'config_feat'
#config_feat_df = df_test['config_feat'].apply(pd.Series)
#config_feat_df.columns = [f'config_feat_{i}' for i in range(config_feat_df.shape[1])]

# Unpack 'node_feat_avg'
#node_feat_avg_df = df_test['node_feat_avg'].apply(pd.Series)
#node_feat_avg_df.columns = [f'node_feat_avg_{i}' for i in range(node_feat_avg_df.shape[1])]

# Concatenate with the original DataFrame
#df_test = pd.concat([df_test.drop(['config_feat', 'node_feat_avg'], axis=1), config_feat_df, node_feat_avg_df], axis=1)

In [None]:
# The unpacked columns now live in df_valid; release the temporary frames.
del config_feat_df
del node_feat_avg_df

In [None]:
#df_train['target'] = df_train['runtime'] / df_train['runtime_norm']
#df_train = df_train.drop(columns=['runtime', 'runtime_norm'])

In [None]:
# Target = runtime divided by its normalizer. 'runtime' and 'runtime_norm' stay in
# the frame, so they must be excluded explicitly when building feature matrices.
df_valid['target'] = df_valid['runtime'] / df_valid['runtime_norm']

In [None]:
#df_test['target'] = df_test['runtime'] / df_test['runtime_norm']
#df_test = df_test.drop(columns=['runtime', 'runtime_norm'])

In [None]:
%%capture --no-stderr

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Iterate over each config_id and scale the target column within each group
for config_id in df_valid['config_id'].unique():
    # Selecting the rows corresponding to the current config_id
    idx = df_valid['config_id'] == config_id
    
    # Scaling the target column for the current group
    df_valid.loc[idx, 'target'] = scaler.fit_transform(df_valid.loc[idx, ['target']])

#df_valid['target'] = (df_valid['target']-np.mean(df_valid['target']))/(np.std(df_valid['target'])+1e-5)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Split on config_id values rather than on rows, so all rows that share a
# config_id end up on the same side of the split.
unique_config_ids = df_valid['config_id'].unique()
train_config_ids, test_config_ids = train_test_split(
    unique_config_ids,
    test_size=0.2,
    random_state=42,
)

# Row subsets for each side of the split
train_df = df_valid.loc[df_valid['config_id'].isin(train_config_ids)]
test_df = df_valid.loc[df_valid['config_id'].isin(test_config_ids)]

# Features are everything except the target, the group key and the two
# columns the target was computed from.
X_train = train_df.drop(columns=['target', 'config_id', 'runtime', 'runtime_norm'])
y_train = train_df['target']
X_test = test_df.drop(columns=['target', 'config_id', 'runtime', 'runtime_norm'])
y_test = test_df['target']

In [None]:
# Sanity-check which columns are used as model features.
X_train.head()

In [None]:
!pip install LightGBM
from lightgbm import LGBMRegressor

In [None]:
# Regularisation strengths (obtained from gridsearchcv)
lasso_alpha = 0.01
ridge_alpha = 0.001

# Linear baselines
lr = LinearRegression()
lasso = Lasso(alpha=lasso_alpha)
ridge = Ridge(alpha=ridge_alpha)

# Gradient-boosted trees; hyper-parameters are collected in one mapping and
# passed as keyword arguments.
lgbm_params = {
    'colsample_bytree': 0.9029527732718773,
    'learning_rate': 0.03,
    'min_child_samples': 465,
    'min_child_weight': 0.02476027076966974,
    'n_estimators': 1400,
    'num_leaves': 16,
    'reg_alpha': 10,
    'reg_lambda': 0.1,
    'subsample': 0.9672960197116944,
}
lgbm = LGBMRegressor(**lgbm_params)

# Fit every model on the same training data
models = [lr, lasso, ridge, lgbm]
for model in tqdm(models):
    model.fit(X_train, y_train)

In [None]:
# Inspect which distinct values the 'opcode' feature takes.
np.unique(df_valid["opcode"])

In [None]:
# Predictions of every fitted model, first on the training rows, then on the held-out rows
train_preds_lr, train_preds_lasso, train_preds_ridge, train_preds_lgbm = [
    fitted.predict(X_train) for fitted in (lr, lasso, ridge, lgbm)
]
test_preds_lr, test_preds_lasso, test_preds_ridge, test_preds_lgbm = [
    fitted.predict(X_test) for fitted in (lr, lasso, ridge, lgbm)
]

# R² on the held-out rows
test_r2_lr, test_r2_lasso, test_r2_ridge, test_r2_lgbm = [
    r2_score(y_test, preds)
    for preds in (test_preds_lr, test_preds_lasso, test_preds_ridge, test_preds_lgbm)
]

# R² on the training rows
train_r2_lr, train_r2_lasso, train_r2_ridge, train_r2_lgbm = [
    r2_score(y_train, preds)
    for preds in (train_preds_lr, train_preds_lasso, train_preds_ridge, train_preds_lgbm)
]

# Report in a fixed model order
model_labels = ("Linear Regression", "Lasso Regression", "Ridge Regression", "LGBM Regression")

print("R² Scores Train:")
for label, score in zip(model_labels, (train_r2_lr, train_r2_lasso, train_r2_ridge, train_r2_lgbm)):
    print(f"{label}: {score}")

print("R² Scores Test:")
for label, score in zip(model_labels, (test_r2_lr, test_r2_lasso, test_r2_ridge, test_r2_lgbm)):
    print(f"{label}: {score}")

Everything below this point is old code.

In [None]:
unique_config_ids = df_valid['config_id'].unique()
train_config_ids, test_config_ids = train_test_split(unique_config_ids, test_size=0.2, random_state=42)

# Creating train and test dataframes based on config_id
train_df = df_valid[df_valid['config_id'].isin(train_config_ids)]
test_df = df_valid[df_valid['config_id'].isin(test_config_ids)]

# Separating features and target variable.
# 'target' is computed from 'runtime' and 'runtime_norm', and both columns are
# still present in df_valid, so they are dropped here as well; keeping them as
# features would leak the target into the models.
X_train = train_df.drop(['target', 'config_id', 'runtime', 'runtime_norm'], axis=1)
y_train = train_df['target']
X_test = test_df.drop(['target', 'config_id', 'runtime', 'runtime_norm'], axis=1)
y_test = test_df['target']

# Free the intermediate frames and id arrays; only the X/y splits are used below
del train_df
del test_df
del train_config_ids
del test_config_ids
del unique_config_ids

# Training models
lin_reg = LinearRegression().fit(X_train, y_train)

# Candidate regularisation strengths shared by the Lasso and Ridge searches
alpha_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Setting up GridSearchCV for Lasso Regression
lasso = Lasso()
grid_search_lasso = GridSearchCV(estimator=lasso, param_grid=alpha_grid, cv=3, scoring='neg_mean_squared_error',verbose=4)
grid_search_lasso.fit(X_train, y_train)
lasso_reg = grid_search_lasso.best_estimator_
print("Lasso Alpha")
print(grid_search_lasso.best_params_['alpha'])

# Same search for Ridge Regression
ridge = Ridge()
grid_search_ridge = GridSearchCV(estimator=ridge, param_grid=alpha_grid, cv=3, scoring='neg_mean_squared_error',verbose=4)
grid_search_ridge.fit(X_train, y_train)
ridge_reg = grid_search_ridge.best_estimator_
print("Ridge Alpha")
print(grid_search_ridge.best_params_['alpha'])

# Making predictions
train_predictions_lin = lin_reg.predict(X_train)
train_predictions_lasso = lasso_reg.predict(X_train)
train_predictions_ridge = ridge_reg.predict(X_train)

predictions_lin = lin_reg.predict(X_test)
predictions_lasso = lasso_reg.predict(X_test)
predictions_ridge = ridge_reg.predict(X_test)

# Calculating R² scores
r2_lin = r2_score(y_test, predictions_lin)
r2_lasso = r2_score(y_test, predictions_lasso)
r2_ridge = r2_score(y_test, predictions_ridge)

r2_train_lin = r2_score(y_train, train_predictions_lin)
r2_train_lasso = r2_score(y_train, train_predictions_lasso)
r2_train_ridge = r2_score(y_train, train_predictions_ridge)

print("R² Scores Train:")
print(f"Linear Regression: {r2_train_lin}")
print(f"Lasso Regression: {r2_train_lasso}")
print(f"Ridge Regression: {r2_train_ridge}")

print("R² Scores Test:")
print(f"Linear Regression: {r2_lin}")
print(f"Lasso Regression: {r2_lasso}")
print(f"Ridge Regression: {r2_ridge}")

In [None]:
# Remove the LightGBM package from the hard-coded python3.6 site-packages path.
# NOTE(review): the path is tied to one interpreter version - confirm it exists in the current image.
!rm -r /opt/conda/lib/python3.6/site-packages/lightgbm
# Fetch the LightGBM sources together with their git submodules.
!git clone --recursive https://github.com/Microsoft/LightGBM
# Install the Boost development packages (presumably needed by the GPU build - confirm).
!apt-get install -y -qq libboost-all-dev

In [None]:
%%bash
# Configure and compile LightGBM from the cloned sources with USE_GPU enabled.
# Stop at the first failing command: without this, a failed `cd LightGBM`
# would let the following rm/mkdir/cmake run in the wrong directory.
set -e
cd LightGBM
# -f: do not fail when there is no previous build directory to remove
rm -rf build
mkdir build
cd build
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
make -j$(nproc)

In [None]:
#!cd LightGBM/python-package/;python3 setup.py install --precompile

In [None]:
# Write an OpenCL vendor (ICD) file that names the NVIDIA OpenCL library.
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
# Delete the cloned source tree, including anything built inside it.
# NOTE(review): the install step in the previous cell is commented out, so the
# build appears to be discarded here without being installed - confirm.
!rm -r LightGBM

In [None]:
# Print the GPU status reported by the NVIDIA driver tool.
!nvidia-smi

In [None]:
from lightgbm import LGBMRegressor

In [None]:
# Base estimator for the randomized search below.
# The original passed dict-style `'name': value` pairs directly inside the call,
# which is a syntax error; hyper-parameters must be keyword arguments.
lgbm = LGBMRegressor(
    colsample_bytree=0.9029527732718773,
    learning_rate=0.05,
    min_child_samples=465,
    min_child_weight=0.02476027076966974,
    n_estimators=1500,
    num_leaves=26,
    reg_alpha=10,
    reg_lambda=0.1,
    subsample=0.9672960197116944,
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

In [None]:
# Search space for the randomized LightGBM search.
# sp_randint(low, high) draws integers from [low, high);
# sp_uniform(loc, scale) draws floats from [loc, loc + scale].
# Plain lists are sampled uniformly from their elements.
param_dist = dict(
    num_leaves=sp_randint(3, 50),
    min_child_samples=sp_randint(5, 500),
    min_child_weight=sp_uniform(0.01, 0.1),
    subsample=sp_uniform(0.8, 0.2),
    colsample_bytree=sp_uniform(0.8, 0.2),
    reg_alpha=[0, 1e-1, 1, 2, 5, 7, 10],
    reg_lambda=[0, 1e-1, 1, 5, 10, 20, 50],
    learning_rate=[0.001, 0.005, 0.01, 0.05, 0.1, 0.2],
    n_estimators=[100, 250, 500, 1000, 1500],
)

In [None]:
# Randomized hyper-parameter search over param_dist.
# random_state fixes which 25 candidates are sampled, so re-running the notebook
# evaluates the same configurations (42 matches the seed used for the data split).
# NOTE(review): cv=4 partitions rows, not config_id groups, so rows that share a
# config_id may land in different folds - confirm this is acceptable.
random_search = RandomizedSearchCV(
    lgbm,
    param_distributions=param_dist,
    n_iter=25,
    cv=4,
    scoring='neg_mean_squared_error',
    verbose=4,
    random_state=42,
)
random_search.fit(X_train, y_train)

In [None]:
# Keep the estimator exposed by the search as its best candidate.
best_lgbm = random_search.best_estimator_

In [None]:
# Display the hyper-parameters of the winning candidate.
random_search.best_params_

In [None]:
# Predictions of the tuned model on the training and held-out rows
train_preds_lgb = best_lgbm.predict(X_train)
preds_lgb = best_lgbm.predict(X_test)

# R² on the held-out rows, then on the training rows
r2_lgb = r2_score(y_test, preds_lgb)
r2_train_lgb = r2_score(y_train, train_preds_lgb)

# Report train first, then test
for heading, score in (("R² Scores Train:", r2_train_lgb), ("R² Scores Test:", r2_lgb)):
    print(heading)
    print(f"LGBM: {score}")

In [None]:
# Read every file of the 'test' split and collect one row per configuration.
extractor = TileDataExtractor('/kaggle/input/predict-ai-model-runtime/npz_all/npz/tile/xla', 'test')
extractor.load_data()
df_test = extractor.get_dataframe()

In [None]:
# Unpack 'config_feat': stack the per-row arrays into one 2-D block instead of
# building a pd.Series for every row with .apply(pd.Series), which is far slower.
config_feat_df = pd.DataFrame(np.stack(df_test['config_feat'].to_numpy()), index=df_test.index)
config_feat_df.columns = [f'config_feat_{i}' for i in range(config_feat_df.shape[1])]

# Unpack 'node_feat_avg' the same way
node_feat_avg_df = pd.DataFrame(np.stack(df_test['node_feat_avg'].to_numpy()), index=df_test.index)
node_feat_avg_df.columns = [f'node_feat_avg_{i}' for i in range(node_feat_avg_df.shape[1])]

# Concatenate with the original DataFrame (the shared index keeps rows aligned)
df_test = pd.concat([df_test.drop(['config_feat', 'node_feat_avg'], axis=1), config_feat_df, node_feat_avg_df], axis=1)

In [None]:
df_test = df_test.drop

In [None]:
lgbm_tile_preds = best_lgbm.predict()

The rest of the code below is not working yet; ignore it.

In [None]:
# Rebuild the config_id-level split with the same seed as before, so the row
# subsets line up with the predictions computed earlier.
unique_config_ids = df_valid['config_id'].unique()
train_config_ids, test_config_ids = train_test_split(
    unique_config_ids,
    test_size=0.2,
    random_state=42,
)

# Row subsets for each side of the split
train_df = df_valid.loc[df_valid['config_id'].isin(train_config_ids)]
test_df = df_valid.loc[df_valid['config_id'].isin(test_config_ids)]

In [None]:
def rank_predictions_2d(predictions, test_df):
    """Dense-rank predictions within each ``config_id`` group.

    :param predictions: One predicted value per row of ``test_df``.
    :param test_df: DataFrame with a ``'config_id'`` column; it is not modified.
    :return: List with one inner list per ``config_id`` (in sorted key order);
        each inner list holds the ranks of that group's rows in their original
        row order. Rank 1 is the smallest prediction and ties share a rank.
    """
    # Attach the predictions to a fresh frame so the caller's frame stays untouched
    scored = test_df[['config_id']].assign(predicted_target=predictions)

    # Rank inside each group
    scored['rank'] = scored.groupby('config_id')['predicted_target'].rank(method='dense')

    # One list of ranks per group
    return [group_ranks.tolist() for _, group_ranks in scored.groupby('config_id')['rank']]

# The helper defined above is named rank_predictions_2d; the original calls used
# the undefined name `rank_predictions` and raised NameError.

# Per-group ranks of each model's predictions on the training rows
train_ranked_predictions_lin = rank_predictions_2d(train_predictions_lin, train_df)
train_ranked_predictions_lasso = rank_predictions_2d(train_predictions_lasso, train_df)
train_ranked_predictions_ridge = rank_predictions_2d(train_predictions_ridge, train_df)
train_ranked_predictions_lgbm = rank_predictions_2d(train_preds_lgb, train_df)

# Per-group ranks of the true training targets
train_true_rankings = rank_predictions_2d(y_train, train_df)

# Per-group ranks of each model's predictions on the held-out rows
ranked_predictions_lin = rank_predictions_2d(predictions_lin, test_df)
ranked_predictions_lasso = rank_predictions_2d(predictions_lasso, test_df)
ranked_predictions_ridge = rank_predictions_2d(predictions_ridge, test_df)
ranked_predictions_lgbm = rank_predictions_2d(preds_lgb, test_df)

# Ranking true target values
true_rankings = rank_predictions_2d(y_test, test_df)

In [None]:
def calculate_slowdown_metric(true_runtimes, predicted_ranks, k):
    """
    Calculate the slowdown metric.

    The ``k`` entries with the smallest predicted rank are selected, and the
    best (smallest) true runtime among them is compared with the best true
    runtime overall: ``1 - (best_in_top_k / best_overall - 1)``. A value of 1
    means the overall best entry is among the top-K.

    :param true_runtimes: 1-D sequence or array of true runtimes.
    :param predicted_ranks: 1-D sequence or array of predicted rankings, same length.
    :param k: Top-K predictions to consider (must be at least 1).
    :return: Slowdown metric value.
    :raises ValueError: If the inputs are empty, differ in length, or ``k < 1``.
    """
    # Accept plain lists as well as arrays: index-array selection below only
    # works on ndarrays.
    true_runtimes = np.asarray(true_runtimes)
    predicted_ranks = np.asarray(predicted_ranks)

    if true_runtimes.shape != predicted_ranks.shape:
        raise ValueError("true_runtimes and predicted_ranks must have the same shape")
    if true_runtimes.size == 0:
        raise ValueError("true_runtimes must not be empty")
    if k < 1:
        raise ValueError("k must be at least 1")

    # Get indices of top-K predictions; a stable sort keeps tied ranks in
    # their original order so the result is deterministic.
    top_k_indices = np.argsort(predicted_ranks, kind='stable')[:k]

    # Best runtime among the top-K predictions
    best_runtime_top_k = np.min(true_runtimes[top_k_indices])

    # Best runtime in all configurations
    best_runtime_all = np.min(true_runtimes)

    # Calculate slowdown metric
    slowdown = 1 - ((best_runtime_top_k / best_runtime_all) - 1)
    return slowdown

In [None]:
def average_slowdown_metric(true_runtimes, predicted_rankings, k=5):
    """
    Average the per-group slowdown metric over all groups.

    :param true_runtimes: 2D collection of true runtimes, one row per config_id.
    :param predicted_rankings: 2D collection of predicted rankings, one row per config_id,
        in the same row order as ``true_runtimes``.
    :param k: Top-K predictions to consider.
    :return: Mean of ``calculate_slowdown_metric`` over the rows.
    """
    n_configurations = len(true_runtimes)

    # Row i of both inputs describes the same group
    per_group_slowdowns = (
        calculate_slowdown_metric(true_runtimes[row], predicted_rankings[row], k)
        for row in range(n_configurations)
    )

    return sum(per_group_slowdowns) / n_configurations

In [None]:
# NOTE(review): the first argument passed below holds ranks of the target
# (train_true_rankings / true_rankings), while average_slowdown_metric documents
# that parameter as true runtimes - confirm this is intended.

# Average slowdown of every model on the train set
(avg_slowdown_train_lin,
 avg_slowdown_train_lasso,
 avg_slowdown_train_ridge,
 avg_slowdown_train_lgbm) = [
    average_slowdown_metric(train_true_rankings, ranked)
    for ranked in (
        train_ranked_predictions_lin,
        train_ranked_predictions_lasso,
        train_ranked_predictions_ridge,
        train_ranked_predictions_lgbm,
    )
]

# Average slowdown of every model on the test set
(avg_slowdown_test_lin,
 avg_slowdown_test_lasso,
 avg_slowdown_test_ridge,
 avg_slowdown_test_lgbm) = [
    average_slowdown_metric(true_rankings, ranked)
    for ranked in (
        ranked_predictions_lin,
        ranked_predictions_lasso,
        ranked_predictions_ridge,
        ranked_predictions_lgbm,
    )
]

# Print the results in a fixed model order
slowdown_labels = ("Linear Regression", "Lasso Regression", "Ridge Regression", "LightGBM")

print("Average Slowdown Metrics (Train Set):")
for label, value in zip(
    slowdown_labels,
    (avg_slowdown_train_lin, avg_slowdown_train_lasso, avg_slowdown_train_ridge, avg_slowdown_train_lgbm),
):
    print(f"{label}: {value}")

print("\nAverage Slowdown Metrics (Test Set):")
for label, value in zip(
    slowdown_labels,
    (avg_slowdown_test_lin, avg_slowdown_test_lasso, avg_slowdown_test_ridge, avg_slowdown_test_lgbm),
):
    print(f"{label}: {value}")