<!-- # Here were cleaning goes -->

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)        # Don't wrap lines
pd.set_option('display.expand_frame_repr', False)  # Don't break into multiple lines

In [2]:
df = pd.read_csv(f"datasets_after_clean_1/eagle_data_all_completed.csv")

In [3]:
df_ = df.copy()

In [4]:
df_.describe()

Unnamed: 0,job_id,wallclock_req,nodes_req,processors_req,gpus_req,mem_req,run_time
count,293633.0,293633.0,293633.0,293633.0,293633.0,293633.0,293633.0
mean,11467310.0,45296.287611,1.330634,18.901598,0.03267,134633.4,5024.657777
std,105343.1,72034.289642,2.5657,72.158847,0.406572,284096.0,21649.96045
min,11293600.0,60.0,1.0,1.0,0.0,5000.0,3.0
25%,11375370.0,14400.0,1.0,1.0,0.0,85248.0,50.0
50%,11464450.0,14400.0,1.0,1.0,0.0,85248.0,82.0
75%,11546640.0,36000.0,1.0,24.0,0.0,85248.0,905.0
max,11684750.0,864000.0,200.0,5400.0,32.0,17049600.0,827806.0


In [5]:
df_.columns

Index(['job_id', 'user', 'account', 'partition', 'qos', 'wallclock_req',
       'nodes_req', 'processors_req', 'gpus_req', 'mem_req', 'submit_time',
       'end_time', 'run_time', 'name', 'work_dir', 'submit_line',
       'job_length'],
      dtype='object')

## Correct job_id ( clusterize jobs having same into one cluster to make model deffrentiate between normal jobs and job arrays, generally having close runtime)

In [6]:
resource_cols = [
    'wallclock_req', 'nodes_req', 'processors_req', 'partition', 'qos',
    'gpus_req', 'mem_req', 'submit_line', 'work_dir', 'name','user', 'account',
]

# Step 1: Generate a group ID based on resource configuration
df_['group_id'] = df_.groupby(resource_cols).ngroup()

# Step 2: Count how many times each group_id appears
group_counts = df_['group_id'].value_counts()

# Step 3: Keep original job_id if group is unique; otherwise use group_id
df_['job_id'] = df_.apply(
    lambda row: row['group_id'] if group_counts[row['group_id']] > 1 else row['job_id'],
    axis=1
)

# Step 4: (Optional) drop group_id column
df_.drop(columns='group_id', inplace=True)

## Split into train and test

In [None]:
import pandas as pd

# Sort data by submit_time ascending
df_sorted = df_.sort_values('submit_time').reset_index(drop=True)

# Calculate split index
split_index = int(len(df_) * 0.99)

# Define target column name
target_col = 'run_time'

# Split features and target
X = df_sorted.drop(columns=[target_col])
y = df_sorted[target_col]

# Split into train and test sets chronologically
X_train = X.iloc[:split_index]
X_test = X.iloc[split_index:]

y_train = y.iloc[:split_index]
y_test = y.iloc[split_index:]


print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (290696, 16)
X_test shape: (2937, 16)
y_train shape: (290696,)
y_test shape: (2937,)


In [9]:
X_train['qos'].value_counts()


qos
normal     267175
high        17135
standby      6386
Name: count, dtype: int64

## Delete outliers ?

In [8]:
# # Calculate IQR bounds
# Q1 = y_train.quantile(0.25)
# Q3 = y_train.quantile(0.75)
# IQR = Q3 - Q1

# lower_bound = Q1 - 1.5 * IQR
# upper_bound = Q3 + 1.5 * IQR

# # Identify outliers
# outliers_mask = (y_train < lower_bound) | (y_train > upper_bound)

# # Print number of outliers
# print("Nbr of outliers:", outliers_mask.sum())

# # Access outlier runtimes
# outlier_y = y_train[outliers_mask]

# # Min/Max runtime among outliers (in hours)
# print("Min outlier run_time (hrs):", outlier_y.min() / 3600)
# print("Max outlier run_time (hrs):", outlier_y.max() / 3600)

# # Remove outliers for training
# X_train = X_train[~outliers_mask]
# y_train = y_train[~outliers_mask]

# # Min/Max runtime after removing outliers (in hours)
# print("Min run_time (hrs):", y_train.min() / 3600)
# print("Max run_time (hrs):", y_train.max() / 3600)


## Balance data

In [None]:
import pandas as pd

# Step 1: Split by qos category
X_train_normal = X_train[X_train['qos'] == 'normal']
X_train_high = X_train[X_train['qos'] == 'high']
X_train_standby = X_train[X_train['qos'] == 'standby']

y_train_normal = y_train.loc[X_train_normal.index]
y_train_high = y_train.loc[X_train_high.index]
y_train_standby = y_train.loc[X_train_standby.index]

# ✅ Step 2: Downsample 'high' to match smallest class (or define target_size manually)
target_size = min(len(X_train_normal), len(X_train_standby))  # Or set: target_size = 1000

X_train_high_downsampled = X_train_high.sample(n=target_size, random_state=42)
y_train_high_downsampled = y_train_high.loc[X_train_high_downsampled.index]

# ✅ Step 3: Recombine: keep normal + standby, only use downsampled high
X_train = pd.concat([X_train_normal, X_train_standby, X_train_high_downsampled], ignore_index=True)
y_train = pd.concat([y_train_normal, y_train_standby, y_train_high_downsampled], ignore_index=True)

# Step 2: Downsample 'normal' to same size as minority class
target_size = min(len(X_train_standby), len(X_train_high)) 

X_train_normal_downsampled = X_train_normal.sample(n=target_size, random_state=42)
y_train_normal_downsampled = y_train_normal.loc[X_train_normal_downsampled.index]

# Recombine all downsampled sets
X_train = pd.concat([X_train_normal_downsampled, X_train_standby, X_train_high_downsampled], ignore_index=True)
y_train = pd.concat([y_train_normal_downsampled, y_train_standby, y_train_high_downsampled], ignore_index=True)



# ✅ Step 4: Shuffle both X and y together
shuffled = X_train.copy()
shuffled['__y__'] = y_train
shuffled = shuffled.sample(frac=1, random_state=42).reset_index(drop=True)

# ✅ Step 5: Final X_train and y_train
y_train = shuffled.pop('__y__')
X_train = shuffled

# ✅ Final check
print(X_train['qos'].value_counts())
print(f"X_train shape: {X_train.shape}")
# print(f"y_train shape: {y_train.shape}")


qos
high       6386
normal     6386
standby    6386
Name: count, dtype: int64
X_train shape: (19158, 16)


In [10]:
# Step 1: Get top 5 partitions by count
top_partitions = X_train['partition'].value_counts().nlargest(5).index

# Step 2: Create mask and apply to both X_train and y_train
mask = X_train['partition'].isin(top_partitions)

X_train = X_train[mask].copy()
y_train = y_train[mask].copy()

# Optional: Check result
print(X_train['partition'].value_counts())
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

partition
partition007    6051
partition001    5480
partition026    3329
partition028    3002
partition006     444
Name: count, dtype: int64
X_train shape: (18306, 16)
y_train shape: (18306,)


In [None]:
X_train['user_account_name'] = X_train['user'] + '@' + X_train['account'] + '@' + X_train['name']
user_counts = X_train['user_account_name'].value_counts()

X_test['user_account_name'] = X_test['user'] + '@' + X_test['account'] + '@' + X_test['name']
threshold = 200  # Keep users with more than 200 jobs
frequent_users = user_counts[user_counts > threshold].index

X_train['user_account_name'] = X_train['user_account_name'].apply(lambda x: x if x in frequent_users else 'Other')
X_train['user_account_name'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['user_account_name'] = X_test['user'] + '@' + X_test['account'] + '@' + X_test['name']


user_account_name
Other                                9605
user0716@account0464@name00295747    3183
user0095@account0529@name00011523    2531
user0295@account0472@name02650679    1072
user0776@account0402@name00015238     858
user0864@account0072@name02494232     366
user0819@account0033@name00004484     258
user0227@account0033@name00004484     221
user0322@account0180@name00059927     212
Name: count, dtype: int64

In [12]:
import os

# Define the directory name
directory_name = "datasets_after_clean_2"

# Check if the directory exists, if not, create it
if not os.path.exists(directory_name):
    os.makedirs(directory_name)
    print(f"Directory '{directory_name}' created.")
else:
    print(f"Directory '{directory_name}' already exists.")

Directory 'datasets_after_clean_2' already exists.


In [13]:
X_train.to_csv('datasets_after_clean_2/eagle_data_all_completed_X_train.csv', index=False)
X_test.to_csv('datasets_after_clean_2/eagle_data_all_completed_X_test.csv', index=False)
y_train.to_csv('datasets_after_clean_2/eagle_data_all_completed_y_train.csv', index=False)
y_test.to_csv('datasets_after_clean_2/eagle_data_all_completed_y_test.csv', index=False)