In [14]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# pandas display settings for this notebook.
pd.options.display.max_columns = None         # no cap on the number of columns shown
pd.options.display.width = None               # no fixed character width for the repr
pd.options.display.expand_frame_repr = False  # do not split wide frames across several blocks

In [15]:
load = ""

# Read the four split files (features and targets, train and test) from
# ./datasets_after_clean_2.
X_train = pd.read_csv('./datasets_after_clean_2/eagle_data_all_completed_X_train.csv')
X_test = pd.read_csv('./datasets_after_clean_2/eagle_data_all_completed_X_test.csv')
y_train = pd.read_csv('./datasets_after_clean_2/eagle_data_all_completed_y_train.csv')
y_test = pd.read_csv('./datasets_after_clean_2/eagle_data_all_completed_y_test.csv')

# Row counts, in the order X_train, X_test, y_train, y_test.
print(*(len(split) for split in (X_train, X_test, y_train, y_test)))

18306 2937 18306 2937


## Target encoding for qos and partition on whole dataset

In [16]:
import pandas as pd

def target_encode_df(df, columns, target_col, base_label=20, step=5):
    """Replace categories with ordinal labels ranked by their mean target.

    For every column in ``columns`` the categories are ordered by the mean of
    ``target_col`` (ascending). The lowest-mean category receives
    ``base_label``, the next one ``base_label + step``, and so on.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified.
    columns : iterable of str
        Categorical columns to encode.
    target_col : str
        Column whose per-category mean defines the ranking.
    base_label : number, default 20
        Label given to the category with the smallest mean target.
    step : number, default 5
        Increment between consecutive labels.

    Returns
    -------
    (pandas.DataFrame, dict)
        The encoded copy of ``df`` and a ``{column: {category: label}}`` dict.
        Values without an entry in the mapping become NaN (``Series.map``).
    """
    encoded = df.copy()
    encoding_maps = {}

    for col in columns:
        # Categories ordered from lowest to highest mean target.
        ranked_categories = (
            encoded.groupby(col)[target_col]
            .mean()
            .sort_values()
            .index
        )

        label_map = {}
        for rank, category in enumerate(ranked_categories):
            label_map[category] = base_label + rank * step

        encoded[col] = encoded[col].map(label_map)
        encoding_maps[col] = label_map

    return encoded, encoding_maps

# ------------------------
# 1. Merge train & test
# ------------------------

def _as_run_time_series(target):
    """Return ``target`` as a 1-D Series named 'run_time'.

    ``pd.read_csv`` yields a DataFrame even for a single-column file, and
    setting ``.name`` on a DataFrame does not name any column, so the column
    is extracted explicitly. A Series input is only renamed.

    Raises
    ------
    ValueError
        If a DataFrame with anything other than exactly one column is given.
    """
    if isinstance(target, pd.DataFrame):
        if target.shape[1] != 1:
            raise ValueError(
                f"Expected a single target column, got {target.shape[1]} columns"
            )
        target = target.iloc[:, 0]
    return target.rename('run_time')


y_train = _as_run_time_series(y_train)
y_test = _as_run_time_series(y_test)

if len(X_train) != len(y_train) or len(X_test) != len(y_test):
    raise ValueError("Feature and target splits must have the same number of rows")

# Build the tagged frames without mutating the loaded X_train / X_test.
# The target is attached by position (row i of X pairs with row i of y).
df_train = X_train.assign(**{'__dataset__': 'train', 'run_time': y_train.to_numpy()})
df_test = X_test.assign(**{'__dataset__': 'test', 'run_time': y_test.to_numpy()})

# Train rows first, then test rows, on a fresh 0..n-1 index.
df_all = pd.concat([df_train, df_test], axis=0).reset_index(drop=True)

# -------------------------------
# 2. Target-based label encode
# -------------------------------

cols_to_encode = ['qos', 'partition']  # your target categorical columns

# Fit the category -> label maps on the TRAINING rows only. Fitting on the
# merged frame would let test-set run_time values decide the encoding of the
# features (target leakage).
encodings = target_encode_df(df_train, cols_to_encode, target_col='run_time')[1]

# Apply the train-fitted maps to every row (train and test).
for col in cols_to_encode:
    encoded_col = df_all[col].map(encodings[col])
    # Categories that never occur in the training rows have no label.
    unseen = encoded_col.isna() & df_all[col].notna()
    if unseen.any():
        print(
            f"Warning: {int(unseen.sum())} rows in '{col}' have categories "
            f"not seen in training; they are left as NaN."
        )
    df_all[col] = encoded_col

# -------------------------------------
# 3. Split back to X_train and X_test
# -------------------------------------

is_train = df_all['__dataset__'] == 'train'
is_test = df_all['__dataset__'] == 'test'

X_train = df_all[is_train].drop(columns=['run_time', '__dataset__'])
y_train = df_all.loc[is_train, 'run_time']

X_test = df_all[is_test].drop(columns=['run_time', '__dataset__'])
y_test = df_all.loc[is_test, 'run_time']

# You now have:
# X_train, y_train, X_test, y_test (encoded), plus df_all (still carrying
# 'run_time' and '__dataset__') and the fitted `encodings`.

In [17]:
# # ✅ Ensure y_train and y_test are Series with correct name
# y_train = y_train.squeeze()
# y_test = y_test.squeeze()

# y_train.name = 'run_time'
# y_test.name = 'run_time'

# # ✅ Concatenate cleanly
# train_df = pd.concat([X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
# test_df = pd.concat([X_test.reset_index(drop=True), y_test.reset_index(drop=True)], axis=1)

# df_ = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)

In [18]:
import numpy as np
# Job type classification from the requested nodes, processors and GPUs.
nodes, cpus, gpus = (
    df_all[name].values
    for name in ('nodes_req', 'processors_req', 'gpus_req')
)

# (label, condition) pairs; np.select takes the first matching condition, so
# the order of this list is significant.
job_type_rules = [
    ("distributed_gpu", (gpus > 0) & (nodes > 1)),
    ("single_node_gpu", (gpus > 0) & (nodes == 1)),
    ("distributed", (gpus == 0) & (nodes > 1)),
    ("multithreaded", (gpus == 0) & (nodes == 1) & (cpus > 1)),
]
conditions = [rule for _label, rule in job_type_rules]
choices = [label for label, _rule in job_type_rules]

# Rows matching none of the rules are labelled "serial".
df_all['job_type'] = np.select(conditions, choices, default="serial")

# Integer id for each job type.
job_type_map = {
    "serial": 0,
    "multithreaded": 1,
    "distributed": 2,
    "single_node_gpu": 3,
    "distributed_gpu": 4,
}
df_all['job_type_id'] = df_all['job_type'].map(job_type_map).astype(int)

# Store job_id as strings.
df_all['job_id'] = df_all['job_id'].astype(str)

| QoS | Supported Partition(s) | Max Job Duration | Max Resources per Job |
|---|---|---|---|
| debug | intel, amd, gpu, l40s, condo_amd | 30 min | 2 nodes, 2 GPUs |
| normal (default) | intel, amd, condo_amd | 1 week | 1024 cores |
| long | intel, amd, condo_amd | 2 weeks | 1 node |

In [19]:
# Ratio / interaction features built from the requested resources.
# NOTE(review): a zero denominator yields inf or NaN here — confirm that
# nodes_req and processors_req are never 0 upstream.
df_all['processors_req_nodes'] = df_all['processors_req'].div(df_all['nodes_req'])  # processors per node
df_all['processors_req_mem'] = df_all['mem_req'].div(df_all['processors_req'])      # memory per processor
df_all['nodes_req_mem'] = df_all['mem_req'].div(df_all['nodes_req'])                # memory per node
df_all['partition_wallclock'] = df_all['partition'].mul(df_all['wallclock_req'])    # partition value x requested wallclock

In [20]:
# 1. Original split sizes. df_all was built as train rows followed by test
#    rows, so the positional slices below recover the two splits.
n_train = len(X_train)
n_test = len(X_test)

# The positional re-split is only valid while df_all holds exactly these rows.
if n_train + n_test != len(df_all):
    raise ValueError(
        f"df_all has {len(df_all)} rows but the splits have "
        f"{n_train} + {n_test}; cannot re-split by position."
    )

# 2. Recreate X and y from df_all. Besides the target, the '__dataset__'
#    bookkeeping column is removed so that it is not written out as a feature.
X_all = (
    df_all
    .drop(columns=['run_time'])
    .drop(columns=['__dataset__'], errors='ignore')
)
y_all = df_all['run_time']

# 3. Resplit exactly as before
X_train = X_all.iloc[:n_train].copy()
X_test = X_all.iloc[n_train:].copy()

y_train = y_all.iloc[:n_train].copy()
y_test = y_all.iloc[n_train:].copy()

## Label encoding

In [21]:
# Columns that receive integer label encoding.
categorical_features = ['user_account_name']

def factorize_train_then_test_dynamic(dataset, cat_columns):
    """Integer-encode categorical columns by order of first appearance.

    Each column in ``cat_columns`` is cast to ``str`` and every distinct value
    gets the next integer code (0, 1, 2, ...) in the order it first occurs.

    Every mapping is guaranteed to contain an ``'Other'`` key: if the data has
    no literal ``'Other'`` value, the next free code is reserved for it. A
    mapping without that key makes ``apply_factorization_to_test`` raise
    ``KeyError``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to encode. It is copied; the caller's frame is left untouched.
    cat_columns : iterable of str
        Names of the columns to encode.

    Returns
    -------
    (pandas.DataFrame, dict)
        The encoded copy and a ``{column: {category_str: code}}`` dict.
    """
    dataset = dataset.copy()
    mapping_dict = {}

    for col in cat_columns:
        as_text = dataset[col].astype(str)
        cat_to_int = {
            category: code for code, category in enumerate(as_text.unique())
        }
        # Reserve a fallback code for values that only show up later.
        cat_to_int.setdefault('Other', len(cat_to_int))

        dataset[col] = as_text.map(cat_to_int)
        mapping_dict[col] = cat_to_int

    return dataset, mapping_dict


def apply_factorization_to_test(dataset, mapping_dict):
    """Encode columns of ``dataset`` with previously built category mappings.

    Works on a copy of ``dataset``. For each ``column -> {category: code}``
    entry in ``mapping_dict`` the column is cast to ``str`` and translated to
    integer codes. Values missing from the mapping receive the code stored
    under ``'Other'``.

    Raises
    ------
    KeyError
        If a column's mapping has no ``'Other'`` entry.

    Returns
    -------
    pandas.DataFrame
        The encoded copy; encoded columns have integer dtype.
    """
    encoded = dataset.copy()

    for col, codes in mapping_dict.items():
        # The fallback bucket must exist before anything is translated.
        if 'Other' not in codes:
            raise KeyError(f"'Other' not found in mapping for column {col}")
        fallback = codes['Other']

        def to_code(value, _codes=codes, _fallback=fallback):
            # Known categories keep their code; everything else is 'Other'.
            return _codes.get(value, _fallback)

        as_text = encoded[col].astype(str)
        encoded[col] = as_text.map(to_code).astype(int)

    return encoded

# Build the category -> code mappings from the training features, then reuse
# the same mappings for the test features.
X_train, mappings = factorize_train_then_test_dynamic(X_train, categorical_features)
X_test = apply_factorization_to_test(X_test, mappings)

In [22]:
import os

# Folder that receives the transformed splits.
directory_name = "datasets_after_transformation"

# Report an existing folder; otherwise create it (including missing parents).
if os.path.exists(directory_name):
    print(f"Directory '{directory_name}' already exists.")
else:
    os.makedirs(directory_name)
    print(f"Directory '{directory_name}' created.")

Directory 'datasets_after_transformation' already exists.


In [23]:
# Write each split to its CSV file (no index column), in the order
# X_train, X_test, y_train, y_test.
split_outputs = [
    (X_train, 'datasets_after_transformation/eagle_data_all_completed_X_train.csv'),
    (X_test, 'datasets_after_transformation/eagle_data_all_completed_X_test.csv'),
    (y_train, 'datasets_after_transformation/eagle_data_all_completed_y_train.csv'),
    (y_test, 'datasets_after_transformation/eagle_data_all_completed_y_test.csv'),
]
for split_frame, destination in split_outputs:
    split_frame.to_csv(destination, index=False)