### 🚀 **套件安裝**

In [4]:
!pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
import pandas as pd
import re
from datasets import load_dataset

### 🚀 **Read Data**

In [50]:
ds = load_dataset("Ajayk/Alerts-data-desc")

# Alternative kept for reference: handle the train and test splits separately
# train_ds = ds['train']
# test_ds = ds['test']
# train_df = ds.to_pandas()

# Merge every split (train and test) into one frame. ignore_index=True renumbers the
# rows 0..n-1 so the merged frame does not carry the same row labels once per split.
total_df = pd.concat(
    [pd.DataFrame(ds[split]) for split in ds.keys()],
    ignore_index=True,
)

### 🚀 **Data Overview**

In [7]:
# Inspect the structure of the "Ajayk/Alerts-data-desc" dataset
print(ds)

DatasetDict({
    train: Dataset({
        features: ['log', 'target', 'description'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['log', 'target', 'description'],
        num_rows: 2000
    })
})


In [8]:
# Preview the first rows of the merged frame
total_df.head()

Unnamed: 0,log,target,description
0,full_log:Apr 11 14:04:27 ssdnodes-652a982af37b...,False Positive,The input alert log data indicates a failed lo...
1,full_log:Apr 11 20:21:42 ssdnodes-652a982af37b...,Not-False Positive,The input log data is an alert log from a secu...
2,full_log:Apr 11 13:14:02 ssdnodes-652a982af37b...,False Positive,The alert log data indicates an invalid user a...
3,full_log:Apr 11 11:45:01 ssdnodes-652a982af37b...,Not-False Positive,The input alert log data indicates a failed pa...
4,full_log:Apr 11 14:17:02 ssdnodes-652a982af37b...,False Positive,The alert log data indicates an invalid user a...


In [9]:
# Show the (rows, columns) shape of total_df
print(total_df.shape)

(10000, 3)


### 🚀 **Log extraction**

In [37]:
# 通用的 Extract function
def extract_with_pattern(log, pattern, group_index=1):
    """Search ``log`` for ``pattern`` and return one group of the first match.

    Args:
        log: Text to search.
        pattern: Regular expression handed to ``re.search``.
        group_index: Which match group to return (defaults to the first
            capturing group).

    Returns:
        The text of the requested group, or None when the pattern does not
        occur in ``log``.
    """
    found = re.search(pattern, log)
    if found is None:
        return None
    return found.group(group_index)

In [51]:
# Extract IP
def extract_ip(log):
    """Return the first IPv4-looking address found in ``log``, or None.

    Only the first dotted-quad candidate is examined: if it fails the 0-255
    range check the result is None even when a later candidate would pass.
    A missing address is reported as None (not False) so that pandas
    ``isna()`` counts it as a missing value, consistent with the
    out-of-range case.
    """
    match = re.search(r'\d{1,3}(?:\.\d{1,3}){3}', log)
    if not match:
        return None
    ip = match.group()
    return ip if is_valid_ip(ip) else None


def is_valid_ip(ip):
    """Return True when every dot-separated part of ``ip`` is an integer in 0..255."""
    return all(0 <= int(part) <= 255 for part in ip.split('.'))

# Extract time
def extract_timestamp(log):
    """Return the date and time found in ``log`` as 'Mon D, YYYY HH:MM:SS'.

    The text must look like 'Apr 11, 2024 @ 14:04:29.123'; the '@' separator
    and the millisecond part are required for a match but are left out of
    the returned string. Returns None when no such text is present.
    """
    found = re.search(
        r'(\w{3}\s\d{1,2},\s\d{4})\s@\s(\d{2}:\d{2}:\d{2})\.(\d{3})',
        log,
    )
    if found is None:
        return None
    date_part, time_part = found.group(1, 2)
    return f'{date_part} {time_part}'

# Extract port
def extract_port(log):
    """Return the digits that follow the word 'port' in ``log``, or None.

    Matches 'port', exactly one whitespace character, then 1 to 5 digits.
    The whitespace sits outside the capturing group, so the result is the
    bare digit string (e.g. '57684') with no leading blank.
    """
    pattern = r'port\s(\d{1,5})'
    return extract_with_pattern(log, pattern)

# Extract Country
def extract_country(log):
    """Return the text that follows 'country_name:' in ``log``, or None.

    The captured value is the run of characters up to (not including) the
    next ',' or '['.
    """
    return extract_with_pattern(log, r'country_name:\s*(?!\[)([^,\[]+)')


# Extract Longitude(經度)
def extract_longitude(log):
    """Return the decimal number that follows 'lon:' in ``log`` as a string, or None.

    An optional leading minus sign is kept; digits are required on both sides
    of the decimal point.
    """
    return extract_with_pattern(log, r'lon:(-?\d+\.\d+)')

# Extract Latitude(緯度)
def extract_latitude(log):
    """Return the decimal number that follows 'lat:' in ``log`` as a string, or None.

    An optional leading minus sign is kept; digits are required on both sides
    of the decimal point.
    """
    return extract_with_pattern(log, r'lat:(-?\d+\.\d+)')


################## Rule Information ##################

# Extract rule ID -> 事件唯一識別碼？
def extract_rule_id(log):
    """Return the digits of the rule id field in ``log`` as a string, or None.

    The key is matched as 'rule', a literal backslash, any single character,
    then 'id:'.
    """
    return extract_with_pattern(log, r'rule\\.id:(\d+)')

# Extract rule level -> 事件嚴重性
def extract_rule_level(log):
    """Return the digits of the rule level field in ``log`` as a string, or None.

    The key is matched as 'rule', a literal backslash, any single character,
    then 'level:'.
    """
    return extract_with_pattern(log, r'rule\\.level:(\d+)')

# Extract rule mail -> 是否有用 mail 通知
def extract_rule_mail(log):
    """Return 'True' or 'False' (as text) from the rule mail field, or None.

    The key is matched as 'rule', a literal backslash, any single character,
    then 'mail:'; any other value after the key yields None.
    """
    return extract_with_pattern(log, r'rule\\.mail:(True|False)')

# Extract rule description
def extract_rule_description(log):
    """Return the rule description text that follows the 'sshd:' prefix, or None.

    Only descriptions written as '...description:sshd: <text>' match; the
    captured text runs up to the next comma and may be empty.
    """
    return extract_with_pattern(log, r'rule\\.description:sshd:\s*([^,]*)')

# Extract rule group
def extract_rule_groups(log):
    """Return the bracketed list text of the rule groups field, or None.

    The result includes the surrounding '[' and ']' and stops at the first
    closing bracket after the key.
    """
    return extract_with_pattern(log, r'rule\\.groups:(\[\s*.*?\s*\])')

# Extract rule firedtimes -> 規則觸發次數
def extract_ruel_firedtimes(log):
    """Return the text that follows 'firedtimes:' in ``log``, or None.

    Leading whitespace after the key is skipped; the captured text runs up
    to the next comma and may be empty.
    """
    return extract_with_pattern(log, r'firedtimes:\s*([^,]*)')

#######################################################

# MITRE ATT&CK

def extract_mitre_technique(log):
    """Return the bracketed list text of the MITRE technique field, or None.

    The result includes the surrounding '[' and ']' and stops at the first
    closing bracket after the key.
    """
    return extract_with_pattern(log, r'mitre\\.technique:(\[\s*.*?\s*\])')

def extract_mitre_id(log):
    """Return the bracketed list text of the MITRE id field, or None.

    The result includes the surrounding '[' and ']' and stops at the first
    closing bracket after the key.
    """
    return extract_with_pattern(log, r'mitre\\.id:(\[\s*.*?\s*\])')

def extract_mitre_tactic(log):
    """Return the bracketed list text of the MITRE tactic field, or None.

    The result includes the surrounding '[' and ']' and stops at the first
    closing bracket after the key.
    """
    return extract_with_pattern(log, r'mitre\\.tactic:(\[\s*.*?\s*\])')

# 美國 NIST 資安法規
def extract_nist(log):
    """Return the bracketed list text that follows 'nist_800_53:', or None.

    The result includes the surrounding '[' and ']' and stops at the first
    closing bracket after the key.
    """
    return extract_with_pattern(log, r'nist_800_53:(\[\s*.*?\s*\])')

# GDPR(一般資料保護規則)
def extract_gdpr(log):
    """Return the bracketed list text that follows 'gdpr:', or None.

    The result includes the surrounding '[' and ']' and stops at the first
    closing bracket after the key.
    """
    return extract_with_pattern(log, r'gdpr:(\[\s*.*?\s*\])')

# GPG13(UK Government's Good Practice Guide 13) 
def extract_gpg13(log):
    """Return the bracketed list text that follows 'gpg13:', or None.

    The result includes the surrounding '[' and ']' and stops at the first
    closing bracket after the key.
    """
    return extract_with_pattern(log, r'gpg13:(\[\s*.*?\s*\])')

# HIPAA
def extract_hipaa(log):
    """Return the bracketed list text that follows 'hipaa:', or None.

    The result includes the surrounding '[' and ']' and stops at the first
    closing bracket after the key.
    """
    return extract_with_pattern(log, r'hipaa:(\[\s*.*?\s*\])')

# TSC
def extract_tsc(log):
    """Return the bracketed list text that follows 'tsc:', or None.

    The result includes the surrounding '[' and ']' and stops at the first
    closing bracket after the key.
    """
    return extract_with_pattern(log, r'tsc:(\[\s*.*?\s*\])')

# PCI DSS
def extract_pci_dss(log):
    """Return the bracketed list text that follows 'pci_dss:', or None.

    The result includes the surrounding '[' and ']' and stops at the first
    closing bracket after the key.
    """
    return extract_with_pattern(log, r'pci_dss:(\[\s*.*?\s*\])')

# # Extract synopsis
# def extract_synopsis(log):
#         pattern = r'sshd\[\d+\]:\s*(.*)(?=,)'
#         return extract_with_pattern(log, pattern)

In [52]:
def extract_features(log):
    """Run every field extractor on ``log`` and collect the results in a dict.

    Args:
        log: One raw alert log string.

    Returns:
        A dict mapping each column label to whatever the matching extractor
        returned for ``log``; keys appear in the order listed below.
    """
    # (column label, extractor) pairs; the order defines the column order.
    field_extractors = (
        ('Source IP', extract_ip),
        ('Timestamp', extract_timestamp),
        ('Port', extract_port),
        ('Country', extract_country),
        ('Longitude', extract_longitude),
        ('Latitude', extract_latitude),
        ('Rule ID', extract_rule_id),
        ('Rule Level', extract_rule_level),
        ('Rule Description', extract_rule_description),
        ('Rule Groups', extract_rule_groups),
        ('Rule Mail', extract_rule_mail),
        ('MITRE Technique', extract_mitre_technique),
        ('MITRE ID', extract_mitre_id),
        ('MITRE Tactic', extract_mitre_tactic),
        ('Nist 800-53', extract_nist),
        ('GDPR', extract_gdpr),
        ('GPG13', extract_gpg13),
        ('HIPAA', extract_hipaa),
        ('TSC', extract_tsc),
        ('PCI DSS', extract_pci_dss),
        ('Firedtimes', extract_ruel_firedtimes),
    )
    return {label: extractor(log) for label, extractor in field_extractors}

# Extract the features of every log line. Building the frame once from the list of
# dicts avoids the slow per-row ``.apply(pd.Series)`` expansion.
feature_records = total_df['log'].map(extract_features).tolist()
features_df = pd.DataFrame(feature_records, index=total_df.index)

# Keep only the columns that are not about to be re-added, so re-running this cell
# refreshes the feature columns instead of appending duplicates of them.
base_columns = [col for col in total_df.columns if col not in features_df.columns]
total_df = pd.concat([total_df[base_columns], features_df], axis=1)
total_df.head()


Unnamed: 0,log,target,description,Source IP,Timestamp,Port,Country,Longitude,Latitude,Rule ID,...,MITRE Technique,MITRE ID,MITRE Tactic,Nist 800-53,GDPR,GPG13,HIPAA,TSC,PCI DSS,Firedtimes
0,full_log:Apr 11 14:04:27 ssdnodes-652a982af37b...,False Positive,The input alert log data indicates a failed lo...,170.64.161.109,"Apr 11, 2024 14:04:29",57684,United States,-97.822,37.751,5710.0,...,"[""Password Guessing"",""SSH""]","[""T1110.001"",""T1021.004""]","[""Credential Access"",""Lateral Movement""]","[""AU.14"",""AC.7"",""AU.6""]","[""IV_35.7.d"",""IV_32.2""]","[""7.1""]","[""164.312.b""]","[""CC6.1"",""CC6.8"",""CC7.2"",""CC7.3""]","[""10.2.4"",""10.2.5"",""10.6.1""]",349
1,full_log:Apr 11 20:21:42 ssdnodes-652a982af37b...,Not-False Positive,The input log data is an alert log from a secu...,170.64.233.19,"Apr 11, 2024 20:21:42",35662,,,,,...,,"[""7.1""]","[""syslog"",""sshd"",""authentication_failed"",""inva...",,,,,,"[""164.312.b""]",log
2,full_log:Apr 11 13:14:02 ssdnodes-652a982af37b...,False Positive,The alert log data indicates an invalid user a...,104.225.217.122,"Apr 11, 2024 13:14:02",35482,United States,-122.3447,47.6144,5710.0,...,"[""Password Guessing"",""SSH""]","[""T1110.001"",""T1021.004""]","[""Credential Access"",""Lateral Movement""]","[""AU.14"",""AC.7"",""AU.6""]","[""IV_35.7.d"",""IV_32.2""]","[""7.1""]","[""164.312.b""]","[""CC6.1"",""CC6.8"",""CC7.2"",""CC7.3""]","[""10.2.4"",""10.2.5"",""10.6.1""]",318
3,full_log:Apr 11 11:45:01 ssdnodes-652a982af37b...,Not-False Positive,The input alert log data indicates a failed pa...,180.101.88.229,"Apr 11, 2024 11:45:03",49145,China,121.4012,31.0449,5760.0,...,"[""Password Guessing"",""SSH""]","[""T1110.001"",""T1021.004""]","[""Credential Access"",""Lateral Movement""]","[""AU.14"",""AC.7""]","[""IV_35.7.d"",""IV_32.2""]","[""7.1""]","[""164.312.b""]","[""CC6.1"",""CC6.8"",""CC7.2"",""CC7.3""]","[""10.2.4"",""10.2.5""]",457
4,full_log:Apr 11 14:17:02 ssdnodes-652a982af37b...,False Positive,The alert log data indicates an invalid user a...,104.225.217.122,"Apr 11, 2024 14:17:04",59630,United States,-122.3447,47.6144,5710.0,...,"[""Password Guessing"",""SSH""]","[""T1110.001"",""T1021.004""]","[""Credential Access"",""Lateral Movement""]","[""AU.14"",""AC.7"",""AU.6""]","[""IV_35.7.d"",""IV_32.2""]","[""7.1""]","[""164.312.b""]","[""CC6.1"",""CC6.8"",""CC7.2"",""CC7.3""]","[""10.2.4"",""10.2.5"",""10.6.1""]",1058


### 🚀 **確認空值**

In [53]:
# Rows in which each extracted field is missing, one selection per field.
empty_source_ip_rows = total_df[total_df['Source IP'].isna()]
empty_timestamp_rows = total_df[total_df['Timestamp'].isna()]
empty_port_rows = total_df[total_df['Port'].isna()]
# Filter on 'Rule ID' here; this selection previously repeated the 'Port' filter.
empty_rule_id_rows = total_df[total_df['Rule ID'].isna()]

In [54]:
# Count the missing values in each column
total_df.isna().sum()

log                    0
target                 0
description            0
Source IP              3
Timestamp              0
Port                3832
Country             3643
Longitude           4165
Latitude            4165
Rule ID             3650
Rule Level             0
Rule Description    6080
Rule Groups         3650
Rule Mail           5000
MITRE Technique     3816
MITRE ID             219
MITRE Tactic         166
Nist 800-53         3680
GDPR                3665
GPG13               3770
HIPAA               3680
TSC                 3680
PCI DSS               19
Firedtimes             0
dtype: int64

### 🚀 **數據預處理**

In [55]:
# Discard the rows whose 'Firedtimes' value is the literal string 'log', then
# renumber the remaining rows from 0.
total_df = (
    total_df
    .loc[total_df['Firedtimes'].ne('log')]
    .reset_index(drop=True)
)
total_df.head()

Unnamed: 0,log,target,description,Source IP,Timestamp,Port,Country,Longitude,Latitude,Rule ID,...,MITRE Technique,MITRE ID,MITRE Tactic,Nist 800-53,GDPR,GPG13,HIPAA,TSC,PCI DSS,Firedtimes
0,full_log:Apr 11 14:04:27 ssdnodes-652a982af37b...,False Positive,The input alert log data indicates a failed lo...,170.64.161.109,"Apr 11, 2024 14:04:29",57684,United States,-97.822,37.751,5710,...,"[""Password Guessing"",""SSH""]","[""T1110.001"",""T1021.004""]","[""Credential Access"",""Lateral Movement""]","[""AU.14"",""AC.7"",""AU.6""]","[""IV_35.7.d"",""IV_32.2""]","[""7.1""]","[""164.312.b""]","[""CC6.1"",""CC6.8"",""CC7.2"",""CC7.3""]","[""10.2.4"",""10.2.5"",""10.6.1""]",349
1,full_log:Apr 11 13:14:02 ssdnodes-652a982af37b...,False Positive,The alert log data indicates an invalid user a...,104.225.217.122,"Apr 11, 2024 13:14:02",35482,United States,-122.3447,47.6144,5710,...,"[""Password Guessing"",""SSH""]","[""T1110.001"",""T1021.004""]","[""Credential Access"",""Lateral Movement""]","[""AU.14"",""AC.7"",""AU.6""]","[""IV_35.7.d"",""IV_32.2""]","[""7.1""]","[""164.312.b""]","[""CC6.1"",""CC6.8"",""CC7.2"",""CC7.3""]","[""10.2.4"",""10.2.5"",""10.6.1""]",318
2,full_log:Apr 11 11:45:01 ssdnodes-652a982af37b...,Not-False Positive,The input alert log data indicates a failed pa...,180.101.88.229,"Apr 11, 2024 11:45:03",49145,China,121.4012,31.0449,5760,...,"[""Password Guessing"",""SSH""]","[""T1110.001"",""T1021.004""]","[""Credential Access"",""Lateral Movement""]","[""AU.14"",""AC.7""]","[""IV_35.7.d"",""IV_32.2""]","[""7.1""]","[""164.312.b""]","[""CC6.1"",""CC6.8"",""CC7.2"",""CC7.3""]","[""10.2.4"",""10.2.5""]",457
3,full_log:Apr 11 14:17:02 ssdnodes-652a982af37b...,False Positive,The alert log data indicates an invalid user a...,104.225.217.122,"Apr 11, 2024 14:17:04",59630,United States,-122.3447,47.6144,5710,...,"[""Password Guessing"",""SSH""]","[""T1110.001"",""T1021.004""]","[""Credential Access"",""Lateral Movement""]","[""AU.14"",""AC.7"",""AU.6""]","[""IV_35.7.d"",""IV_32.2""]","[""7.1""]","[""164.312.b""]","[""CC6.1"",""CC6.8"",""CC7.2"",""CC7.3""]","[""10.2.4"",""10.2.5"",""10.6.1""]",1058
4,full_log:Apr 11 12:35:51 ssdnodes-652a982af37b...,Not-False Positive,The alert log data indicates a failed password...,180.101.88.229,"Apr 11, 2024 12:35:53",60218,China,121.4012,31.0449,5760,...,"[""Password Guessing"",""SSH""]","[""T1110.001"",""T1021.004""]","[""Credential Access"",""Lateral Movement""]","[""AU.14"",""AC.7""]","[""IV_35.7.d"",""IV_32.2""]","[""7.1""]","[""164.312.b""]","[""CC6.1"",""CC6.8"",""CC7.2"",""CC7.3""]","[""10.2.4"",""10.2.5""]",328


In [56]:
# Save the frame as a CSV file (index=True also writes the row index as the first column)
total_df.to_csv('alert_data.csv', index=True)

In [57]:
# Total number of rows and columns
total_df.shape

(6350, 24)