In [1]:
%cd "F:/University/Master's Thesis/Datasets"

F:\University\Master's Thesis\Datasets


In [2]:
import os
import warnings

warnings.filterwarnings("ignore")  # avoid printing out absolute paths

In [3]:
import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

from pytorch_forecasting import Baseline, NHiTS, TimeSeriesDataSet
from pytorch_forecasting.data import NaNLabelEncoder
from pytorch_forecasting.data.examples import generate_ar_data
from pytorch_forecasting.metrics import MAE, SMAPE, MQF2DistributionLoss, QuantileLoss

In [4]:
# Load BGL dataset into a DataFrame (Replace 'path_to_your_file' with the actual file path)
file_path = 'bgl_structured_data.csv'
df = pd.read_csv(file_path)
df.head(5)

Unnamed: 0,LineId,Label,Timestamp,Date,Node,Time,NodeRepeat,Type,Component,Level,Content,EventId,EventTemplate,ParameterList
0,1186991,-,1119695655,2005.06.25,R32-M0-NA-C:J14-U11,2005-06-25-03.34.15.417108,R32-M0-NA-C:J14-U11,RAS,KERNEL,INFO,254619084 double-hummer alignment exceptions,6265c739,<*> double-hummer alignment exceptions,['254619084']
1,1186992,-,1119695655,2005.06.25,R32-M0-NA-C:J10-U11,2005-06-25-03.34.15.438456,R32-M0-NA-C:J10-U11,RAS,KERNEL,INFO,255442463 double-hummer alignment exceptions,6265c739,<*> double-hummer alignment exceptions,['255442463']
2,1186993,-,1119695655,2005.06.25,R32-M0-NA-C:J10-U11,2005-06-25-03.34.15.451519,R32-M0-NA-C:J10-U11,RAS,KERNEL,INFO,256867576 double-hummer alignment exceptions,6265c739,<*> double-hummer alignment exceptions,['256867576']
3,1186994,-,1119695655,2005.06.25,R32-M0-NA-C:J06-U11,2005-06-25-03.34.15.472919,R32-M0-NA-C:J06-U11,RAS,KERNEL,INFO,255099253 double-hummer alignment exceptions,6265c739,<*> double-hummer alignment exceptions,['255099253']
4,1186995,-,1119695655,2005.06.25,R32-M0-NA-C:J06-U11,2005-06-25-03.34.15.486042,R32-M0-NA-C:J06-U11,RAS,KERNEL,INFO,258522516 double-hummer alignment exceptions,6265c739,<*> double-hummer alignment exceptions,['258522516']


In [5]:
df["Time"] = pd.to_datetime(df["Time"], format='%Y-%m-%d-%H.%M.%S.%f')

# Calculate time_idx for seconds
df["time_idx"] = (
    df["Time"].dt.year * 365 * 24 * 60 * 60 +  # Convert years to seconds
    df["Time"].dt.month * 30 * 24 * 60 * 60 +  # Convert months to seconds (assuming 30 days per month)
    df["Time"].dt.day * 24 * 60 * 60 +  # Convert days to seconds
    df["Time"].dt.hour * 60 * 60 +  # Convert hours to seconds
    df["Time"].dt.minute * 60 +  # Convert minutes to seconds
    df["Time"].dt.second  # Seconds
)

# Adjust time_idx to start from 0
df["time_idx"] -= df["time_idx"].min()
df = df.sort_values(by="time_idx")

# Assuming 'df' is your DataFrame
df = df.drop_duplicates(subset=['time_idx', 'Node', "EventId"], keep=False)

# Display the DataFrame with the new time_idx column
print(df[["Node", "Level", "time_idx", "Component", "EventTemplate"]])

                        Node Level  time_idx Component  \
1644188  R02-M1-N0-C:J12-U11  INFO        15    KERNEL   
1644253  R02-M1-N0-C:J12-U11  INFO        38    KERNEL   
1644260  R02-M1-N0-C:J12-U11  INFO        40    KERNEL   
1644269  R02-M1-N0-C:J12-U11  INFO        44    KERNEL   
1644292  R02-M1-N0-C:J12-U11  INFO        52    KERNEL   
...                      ...   ...       ...       ...   
2356265  R37-M1-N0-C:J02-U11  INFO  15353606    KERNEL   
2356266  R51-M1-N0-C:J08-U11  INFO  15353606    KERNEL   
2356267  R37-M0-N0-C:J05-U11  INFO  15353606    KERNEL   
2356259  R10-M1-NC-C:J05-U01  INFO  15353606    KERNEL   
2356251  R57-M1-N7-C:J11-U11  INFO  15353606    KERNEL   

                                             EventTemplate  
1644188           instruction cache parity error corrected  
1644253           instruction cache parity error corrected  
1644260           instruction cache parity error corrected  
1644269           instruction cache parity error corrected 

In [6]:
unique_values = df['EventId'].unique()
print(len(unique_values))

516


In [7]:
df['Node'].fillna('-', inplace=True)

df['Node'] = df['Node'].astype('str')
df['Component'] = df['Component'].astype('str')
df['Level'] = df['Level'].astype('str')
df['EventId'] = df['EventId'].astype('str')
df['EventTemplate'] = df['EventTemplate'].astype('str')

df['Node'] = df['Node'].astype('category')
df['Component'] = df['Component'].astype('category')
df['Level'] = df['Level'].astype('category')
df['EventId'] = df['EventId'].astype('category')
df['EventTemplate'] = df['EventTemplate'].astype('category')

In [8]:
data_type = df['time_idx'].dtype
print("time_idx: " + str(data_type))

data_type = df['Node'].dtype
print("Node: " + str(data_type))

data_type = df['Component'].dtype
print("Component: " + str(data_type))

data_type = df['Level'].dtype
print("Level: " + str(data_type))

data_type = df['EventId'].dtype
print("EventId: " + str(data_type))

data_type = df['EventTemplate'].dtype
print("EventTemplate: " + str(data_type))

time_idx: int64
Node: category
Component: category
Level: category
EventId: category
EventTemplate: category


In [9]:
# Replace 'Column_Name' with the name of the column you're interested in
empty_rows = df['Node'].isnull().sum()

print(f"Number of empty rows in 'Node': {empty_rows}")


Number of empty rows in 'Node': 0


In [10]:
df.head()

Unnamed: 0,LineId,Label,Timestamp,Date,Node,Time,NodeRepeat,Type,Component,Level,Content,EventId,EventTemplate,ParameterList,time_idx
1644188,237496,-,1117838585,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03 15:43:05.980712,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,3aa50e45,instruction cache parity error corrected,[],15
1644253,237561,-,1117838608,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03 15:43:28.948656,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,3aa50e45,instruction cache parity error corrected,[],38
1644260,237568,-,1117838610,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03 15:43:30.008754,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,3aa50e45,instruction cache parity error corrected,[],40
1644269,237577,-,1117838614,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03 15:43:34.978903,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,3aa50e45,instruction cache parity error corrected,[],44
1644292,237600,-,1117838622,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03 15:43:42.042890,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,3aa50e45,instruction cache parity error corrected,[],52


In [11]:
top_lengths = df['Node'].value_counts().head(30)
print(top_lengths)

UNKNOWN_LOCATION       4607
R30-M0-N7-C:J09-U01    3912
R63-M0-N3-C:J02-U11    3816
R35-M0-N0-C:J07-U01    2972
R37-M1-NC-C:J02-U11    1824
R06-M1-N6-C:J15-U01    1812
R20-M1-NF-C:J10-U01    1618
R16-M1-N2-C:J17-U01    1555
R10-M0-N7-C:J17-U01    1518
R00-M1-NF-C:J13-U11    1246
R55-M0-N9-C:J06-U11    1217
R10-M1-N5-C:J04-U11     950
R07-M0-N6-C:J11-U01     898
R63-M1-NC-C:J06-U01     897
-                       801
R21-M0-ND-C:J04-U01     767
R15-M1-N6-C:J04-U11     731
R03-M1-NF-C:J07-U01     697
R23-M0-N8-C:J15-U11     689
R01-M0-N8-C:J02-U11     686
R22-M1-N3-C:J06-U01     671
R73-M1-N1-C:J16-U11     670
R32-M0-NF-C:J14-U01     658
R61-M1-ND-C:J07-U11     656
R03-M1-N9-C:J09-U11     654
R11-M1-N3-C:J07-U11     647
R32-M1-N9-C:J16-U11     647
R25-M1-ND-C:J09-U01     646
R16-M1-N2-C:J16-U11     645
R17-M0-N0-C:J10-U01     635
Name: Node, dtype: int64


In [12]:
#nodes_to_remove = ['R30-M0-N9-C:J16-U01', 'R02-M1-N0-C:J12-U11', "-", "UNKNOWN_LOCATION", "R16-M1-N2-C:J17-U01", "R26-M0-N0-I:J18-U11", "R02-M0-N4-C:J04-U11"]  # Example list of nodes to remove
nodes_to_remove = ["UNKNOWN_LOCATION", "-"]  # Example list of nodes to remove

# Removing rows where 'Node' column matches specified nodes
df = df[~df['Node'].isin(nodes_to_remove)]

In [13]:
total_rows = len(df)  # Retrieves the length of the DataFrame (number of rows)
print("Total number of rows:", total_rows)


Total number of rows: 2553869


In [None]:
# create dataset and dataloaders
max_encoder_length = 60
max_prediction_length = 20

training_cutoff = data["time_idx"].max() - max_prediction_length

context_length = max_encoder_length
prediction_length = max_prediction_length

training = TimeSeriesDataSet(
    data[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="EventId",
    categorical_encoders={"Node": NaNLabelEncoder().fit(data.Node), "EventId": NaNLabelEncoder().fit(data.EventId),"Component": NaNLabelEncoder().fit(data.Component), "Level": NaNLabelEncoder().fit(data.Level)},
    group_ids=["Node"],
    # only unknown variable is "value" - and N-HiTS can also not take any additional variables
    time_varying_unknown_reals=[],
    time_varying_unknown_categoricals=["Component", "Level", "EventId"],  # Adjust this based on features
    max_encoder_length=context_length,
    max_prediction_length=prediction_length,
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    allow_missing_timesteps=True
)

validation = TimeSeriesDataSet.from_dataset(training, data, min_prediction_idx=training_cutoff + 1)
batch_size = 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)