In [None]:
import pandas as pd

# Source of the structured HDFS log CSV (HDFS_100k file in the loglizer repository).
HDFS_LOGS_URL = "https://raw.githubusercontent.com/logpai/loglizer/refs/heads/master/data/HDFS/HDFS_100k.log_structured.csv"

# Download the CSV over HTTP and parse it into a DataFrame.
logs_df = pd.read_csv(HDFS_LOGS_URL)

# Report (rows, columns); the trailing expression renders a preview of the first rows.
print("Loaded structured logs:", logs_df.shape)
logs_df.head()


In [None]:
# Source of the anomaly label CSV (same loglizer HDFS data directory as the logs).
HDFS_LABELS_URL = "https://raw.githubusercontent.com/logpai/loglizer/refs/heads/master/data/HDFS/anomaly_label.csv"

# Download the CSV over HTTP and parse it into a DataFrame.
labels_df = pd.read_csv(HDFS_LABELS_URL)

# Report (rows, columns); the trailing expression renders a preview of the first rows.
print("Loaded anomaly labels:", labels_df.shape)
labels_df.head()


In [None]:
import pandas as pd

# Pull the first `blk_<digits>` / `blk_-<digits>` token out of each log message.
# Rows whose Content has no such token get NaN in the new column.
block_id_pattern = r'(blk_-?\d+)'
logs_df['BlockId'] = logs_df['Content'].str.extract(block_id_pattern, expand=False)

# Count distinct block IDs (nunique ignores NaN by default).
n_unique_blocks = logs_df['BlockId'].nunique()
print("Unique Block IDs:", n_unique_blocks)


Unique Block IDs: 7940


In [None]:
# Number of log rows per block ID; groupby drops rows whose BlockId is NaN.
events_per_block = logs_df.groupby("BlockId").size()

# Turn the Series into a two-column frame: BlockId, NumEvents.
block_summary = events_per_block.rename("NumEvents").reset_index()
block_summary.head()


In [None]:
# Display the BlockId column of the per-block summary.
block_summary.BlockId


Unnamed: 0,BlockId
0,blk_-1001553972418305662
1,blk_-1010952805175971965
2,blk_-1011482868748761910
3,blk_-1011537904811654030
4,blk_-1015291919896450721
...,...
7935,blk_989409441141247289
7936,blk_989757635324841895
7937,blk_992037777600190687
7938,blk_993316727245644324


In [None]:
import pandas as pd

# Recompute the per-line block ID and the per-block event counts so this cell
# does not depend on the exact state left by earlier cells (both steps are idempotent).
logs_df['BlockId'] = logs_df['Content'].str.extract(r'(blk_-?\d+)')

block_summary = logs_df.groupby("BlockId").size().reset_index(name="NumEvents")

# Encode the textual labels as integers: Normal -> 0, Anomaly -> 1.
# Guarded so that re-running this cell is safe: Series.map turns every value
# missing from the mapping into NaN, so mapping an already-encoded (numeric)
# column a second time would silently wipe all labels.
label_codes = {'Normal': 0, 'Anomaly': 1}
if not pd.api.types.is_numeric_dtype(labels_df['Label']):
    labels_df['Label'] = labels_df['Label'].map(label_codes)

# Keep only labels for blocks that actually occur in the loaded log sample.
filtered_labels = labels_df[labels_df.BlockId.isin(block_summary.BlockId)]

# One row per block: BlockId, NumEvents, plus the label columns.
merged = block_summary.merge(filtered_labels, on="BlockId", how="inner")

# Discard blocks with fewer than 5 log events.
usable = merged[merged.NumEvents >= 5]

print("Usable blocks:", usable.shape)
print(usable.Label.value_counts())


Usable blocks: (7817, 3)
Label
0    7626
1     191
Name: count, dtype: int64


In [None]:
import pandas as pd

# Sample sizes per class and the seed that makes the draw repeatable.
N_ANOMALY, N_NORMAL, SEED = 30, 70, 42

# Draw a fixed-size random sample from each class (Label 1 = anomaly, 0 = normal).
anomaly_pool = usable[usable.Label == 1]
normal_pool = usable[usable.Label == 0]
anomaly_blocks = anomaly_pool.sample(N_ANOMALY, random_state=SEED)
normal_blocks = normal_pool.sample(N_NORMAL, random_state=SEED)

# Stack anomalies first, then normals, with a fresh 0..n-1 index.
selected_blocks = pd.concat([anomaly_blocks, normal_blocks], ignore_index=True)

print(selected_blocks["Label"].value_counts())

# Persist the selection (without the index column) for later use.
selected_blocks.to_csv("my_selected_blocks.csv", index=False)
print("Saved: my_selected_blocks.csv")


Label
0    70
1    30
Name: count, dtype: int64
Saved: my_selected_blocks.csv


In [None]:
# Block IDs chosen in the sampling step.
selected_ids = selected_blocks["BlockId"].tolist()

# Keep every log row that belongs to one of the selected blocks.
in_selection = logs_df["BlockId"].isin(selected_ids)
subset_logs = logs_df.loc[in_selection]

# Persist the matching log rows (without the index column).
subset_logs.to_csv("my_selected_logs.csv", index=False)
print("Saved: my_selected_logs.csv")


Saved: my_selected_logs.csv
