# HDFS Log Anomaly Detection (Logistic Regression)

In [1]:
import pandas as pd
import re
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

In [2]:
log_structured_path = "HDFS_parsed.csv"
event_occ_path = "HDFS_v1/preprocessed/Event_occurrence_matrix.csv"
labels_path = "HDFS_v1/preprocessed/anomaly_label.csv"

logs = pd.read_csv(log_structured_path)
event_occ = pd.read_csv(event_occ_path)
labels = pd.read_csv(labels_path)

print("logs shape:", logs.shape)
print("event_occ shape:", event_occ.shape)
print("labels shape:", labels.shape)
logs.head()

logs shape: (11175629, 9)
event_occ shape: (575061, 32)
labels shape: (575061, 2)


Unnamed: 0,LineId,Date,Time,Pid,Level,Component,Content,EventId,EventTemplate
0,1,81109,203518,143,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,E5,[*]Receiving block[*]src:[*]dest:[*]
1,2,81109,203518,35,INFO,dfs.FSNamesystem,BLOCK* NameSystem.allocateBlock: /mnt/hadoop/m...,E22,[*]BLOCK* NameSystem[*]allocateBlock:[*]
2,3,81109,203519,143,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,E5,[*]Receiving block[*]src:[*]dest:[*]
3,4,81109,203519,145,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,E5,[*]Receiving block[*]src:[*]dest:[*]
4,5,81109,203519,145,INFO,dfs.DataNode$PacketResponder,PacketResponder 1 for block blk_-1608999687919...,E11,[*]PacketResponder[*]for block[*]terminating[*]


In [3]:
# Извлечение `BlockId` из поля `Content`

def extract_block_id(content: str):
    """Извлекает BlockId из текстового поля Content.
    Возвращает строку вида 'blk_...' или None, если BlockId не найден.
    """
    m = re.search(r"blk_[0-9-]+", str(content))
    return m.group(0) if m else None

logs["BlockId"] = logs["Content"].apply(extract_block_id)

logs = logs.dropna(subset=["BlockId"]).reset_index(drop=True)

print("logs with BlockId shape:", logs.shape)
logs[["LineId", "Level", "EventId", "BlockId"]].head()

logs with BlockId shape: (11175629, 10)


Unnamed: 0,LineId,Level,EventId,BlockId
0,1,INFO,E5,blk_-1608999687919862906
1,2,INFO,E22,blk_-1608999687919862906
2,3,INFO,E5,blk_-1608999687919862906
3,4,INFO,E5,blk_-1608999687919862906
4,5,INFO,E11,blk_-1608999687919862906


**Формирование оконных признаков по `BlockId`**

Каждый `BlockId` рассматривается как отдельное окно.

Для каждого блока считаем:

- `log_count` — количество строк логов в блоке
- `error_ratio` — доля строк с уровнем `ERROR`
- `warn_ratio` — доля строк с уровнем `WARN`
- `unique_templates` — число различных `EventId` (шаблонов)
- `E*_freq_win` — частоты появления каждого `EventId` внутри блока

In [4]:
features = []

for block_id, g in tqdm(logs.groupby("BlockId"), desc="Building window features"):
    total = len(g)

    row = {
        "BlockId": block_id,
        "log_count": total,
        "error_ratio": (g["Level"] == "ERROR").mean(),
        "warn_ratio": (g["Level"] == "WARN").mean(),
        "unique_templates": g["EventId"].nunique(),
    }

    # Частоты EventId внутри блока
    counts = g["EventId"].value_counts()
    for eid, cnt in counts.items():
        row[f"{eid}_freq"] = cnt / total

    features.append(row)

window_features = pd.DataFrame(features)
print("window_features shape:", window_features.shape)
window_features.head()

Building window features: 100%|██████████| 575061/575061 [02:47<00:00, 3430.90it/s]


window_features shape: (575061, 34)


Unnamed: 0,BlockId,log_count,error_ratio,warn_ratio,unique_templates,E5_freq,E11_freq,E9_freq,E26_freq,E22_freq,...,E29_freq,E17_freq,E15_freq,E1_freq,E10_freq,E14_freq,E8_freq,E12_freq,E24_freq,E19_freq
0,blk_-1000002529962039464,13,0.0,0.0,5,0.230769,0.230769,0.230769,0.230769,0.076923,...,,,,,,,,,,
1,blk_-100000266894974466,28,0.0,0.107143,9,0.107143,0.107143,0.107143,0.107143,0.035714,...,,,,,,,,,,
2,blk_-1000007292892887521,13,0.0,0.0,5,0.230769,0.230769,0.230769,0.230769,0.076923,...,,,,,,,,,,
3,blk_-1000014584150379967,29,0.0,0.103448,10,0.103448,0.103448,0.103448,0.103448,0.034483,...,,,,,,,,,,
4,blk_-1000028658773048709,19,0.0,0.0,7,0.157895,0.157895,0.157895,0.157895,0.052632,...,,,,,,,,,,


In [5]:
# объединяем все источники по `BlockId`
if "Label" in event_occ.columns:
    event_occ = event_occ.drop(columns=["Label"])

df = (
    event_occ
    .merge(window_features, on="BlockId", how="left")
    .merge(labels, on="BlockId", how="left")
)

print("merged df shape:", df.shape)
df.head()

merged df shape: (575061, 65)


Unnamed: 0,BlockId,Type,E1,E2,E3,E4,E5,E6,E7,E8,...,E17_freq,E15_freq,E1_freq,E10_freq,E14_freq,E8_freq,E12_freq,E24_freq,E19_freq,Label
0,blk_-1608999687919862906,,0,0,203,0,10,7,0,0,...,,,,,,,,,,Normal
1,blk_7503483334202473044,,0,2,1,0,3,0,0,0,...,,,,,,,,,,Normal
2,blk_-3544583377289625738,21.0,0,0,203,0,3,0,0,0,...,,,,,,,,,,Anomaly
3,blk_-9073992586687739851,,0,3,0,0,3,0,0,0,...,,,,,,,,,,Normal
4,blk_7854771516489510256,,0,3,1,15,3,0,0,0,...,,,,,,,,,,Normal


In [6]:
# y: 1 = Anomaly, 0 = Normal
df["y"] = (df["Label"] == "Anomaly").astype(int)

# Удаляем служебные поля
drop_cols = ["BlockId", "Label", "Type"]
for c in drop_cols:
    if c in df.columns:
        df = df.drop(columns=[c])

X = df.drop(columns=["y"])
X["E1"] = X["E1"].astype('str').str.extract(r'(\d+)',).astype('int')
y = df["y"]

print("X shape:", X.shape)
print("y distribution:\n", y.value_counts())

X shape: (575061, 62)
y distribution:
 y
0    558223
1     16838
Name: count, dtype: int64


In [7]:
# Заполняем нулями freq = Nan 
missing_columns = list(X.columns[X.isnull().any()])
values = {}
for col in missing_columns:
    values[col] = 0
X.fillna(value=values, inplace=True)

In [8]:
# Масштабирование признаков и train/test split

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

X_train.shape, X_test.shape

((460048, 62), (115013, 62))

In [9]:
# Обучение Logistic Regression и оценка качества

clf = LogisticRegression(
    max_iter=5000,
    class_weight="balanced",
)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    111645
           1       0.98      1.00      0.99      3368

    accuracy                           1.00    115013
   macro avg       0.99      1.00      0.99    115013
weighted avg       1.00      1.00      1.00    115013

ROC-AUC: 0.9994549377592215


In [10]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[111581,     64],
       [     3,   3365]])

In [11]:
X.columns

Index(['E1', 'E2', 'E3', 'E4', 'E5', 'E6', 'E7', 'E8', 'E9', 'E10', 'E11',
       'E12', 'E13', 'E14', 'E15', 'E16', 'E17', 'E18', 'E19', 'E20', 'E21',
       'E22', 'E23', 'E24', 'E25', 'E26', 'E27', 'E28', 'E29', 'log_count',
       'error_ratio', 'warn_ratio', 'unique_templates', 'E5_freq', 'E11_freq',
       'E9_freq', 'E26_freq', 'E22_freq', 'E3_freq', 'E4_freq', 'E23_freq',
       'E21_freq', 'E2_freq', 'E7_freq', 'E28_freq', 'E25_freq', 'E18_freq',
       'E16_freq', 'E6_freq', 'E27_freq', 'E13_freq', 'E20_freq', 'E29_freq',
       'E17_freq', 'E15_freq', 'E1_freq', 'E10_freq', 'E14_freq', 'E8_freq',
       'E12_freq', 'E24_freq', 'E19_freq'],
      dtype='object')

In [12]:
import numpy as np

importance = np.abs(clf.coef_)[0]
idx = np.argsort(-importance)

for i in idx[:20]:
    print(X.columns[i], importance[i])

unique_templates 4.935610837037359
E21 3.109472850734214
E3_freq 2.8217605885056156
E16_freq 2.6687513515332095
E20_freq 2.186551943052163
E5_freq 1.9761736403081407
E9 1.9439317534994476
E11 1.9287959915411006
E22_freq 1.9105919223717707
E6_freq 1.8567741345369222
E5 1.755620695371387
E7_freq 1.4051955267819842
E7 1.3666724048699823
E2_freq 1.3253723115340197
E28_freq 1.324182528555198
E27_freq 1.2416860447065494
E4_freq 0.9502333887287843
E21_freq 0.9006803372884726
E25 0.8836759188493789
E18 0.8836759188493789


In [13]:
df.groupby("y")[["log_count", "unique_templates"]].describe()

Unnamed: 0_level_0,log_count,log_count,log_count,log_count,log_count,log_count,log_count,log_count,unique_templates,unique_templates,unique_templates,unique_templates,unique_templates,unique_templates,unique_templates,unique_templates
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
y,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,558223.0,19.503637,4.775583,13.0,19.0,19.0,20.0,298.0,558223.0,7.255108,1.451071,5.0,7.0,7.0,8.0,14.0
1,16838.0,17.119017,12.409644,2.0,4.0,20.0,26.0,284.0,16838.0,7.158986,3.908326,2.0,3.0,8.0,11.0,20.0


In [14]:
df.groupby("y")[["error_ratio", "warn_ratio"]].describe()

Unnamed: 0_level_0,error_ratio,error_ratio,error_ratio,error_ratio,error_ratio,error_ratio,error_ratio,error_ratio,warn_ratio,warn_ratio,warn_ratio,warn_ratio,warn_ratio,warn_ratio,warn_ratio,warn_ratio
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
y,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,558223.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,558223.0,0.023124,0.051608,0.0,0.0,0.0,0.0,0.583333
1,16838.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16838.0,0.033246,0.046162,0.0,0.0,0.0,0.05,0.459459


In [15]:
corr = df.corr(numeric_only=True)["y"].abs().sort_values(ascending=False)
corr.head(20)

y           1.000000
E9          0.602635
E11         0.597808
E20         0.525625
E16_freq    0.517569
E6_freq     0.498084
E18_freq    0.495429
E25_freq    0.495429
E22_freq    0.492321
E7          0.424132
E5_freq     0.368221
E18         0.312341
E25         0.312341
E16         0.309967
E20_freq    0.303362
E6          0.299926
E28         0.261382
E26         0.254904
E27         0.236291
E5          0.210662
Name: y, dtype: float64

In [16]:
corr = df.corr(numeric_only=True)["y"].abs().sort_values(ascending=False)
print(corr.head(20))


y           1.000000
E9          0.602635
E11         0.597808
E20         0.525625
E16_freq    0.517569
E6_freq     0.498084
E18_freq    0.495429
E25_freq    0.495429
E22_freq    0.492321
E7          0.424132
E5_freq     0.368221
E18         0.312341
E25         0.312341
E16         0.309967
E20_freq    0.303362
E6          0.299926
E28         0.261382
E26         0.254904
E27         0.236291
E5          0.210662
Name: y, dtype: float64
