#### Parsing datasets from Loghub Datasets: https://github.com/logpai/loghub

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from tqdm import tqdm
from datetime import datetime

## Importing datasets

Create the `datasets` dictionary, which maps each dataset name to the path of its raw log file.

In [2]:
# Dataset name -> path of the raw file, relative to this notebook.
# The name selects the line format in parse_log_file; the file extension
# (.log / .csv) selects which parser the processing cell calls.
datasets = dict(
    Apache="../datasets/Apache.log",
    BGL="../datasets/BGL/BGL.log",
    HDFS="../datasets/HDFS_V1/HDFS.log",
    Linux="../datasets/Linux.log",
    Mac="../datasets/Mac.log",
    # The three OpenStack files share a single line format.
    OpenStack_Normal1="../datasets/OpenStack/openstack_normal1.log",
    OpenStack_Normal2="../datasets/OpenStack/openstack_normal2.log",
    OpenStack_Abnormal="../datasets/OpenStack/openstack_abnormal.log",
    # The only CSV input.
    Synthetic_Logs="../datasets/synthetic_logs.csv",
)

The following function extracts four fields from each log entry:
1. Timestamp
2. Log level
3. Message
4. Source

In [3]:
def _group_one_timestamp(match):
    """Return the timestamp exactly as captured by group 1 (kept as a string)."""
    return match.group(1)


def _bgl_timestamp(match):
    """Convert the detailed BGL timestamp captured by group 1 into a ``datetime``."""
    return datetime.strptime(match.group(1), "%Y-%m-%d-%H.%M.%S.%f")


def _hdfs_timestamp(match):
    """Join the HDFS date (group 1, e.g. 081109) and time (group 2, e.g. 203518) into a ``datetime``."""
    return datetime.strptime(match.group(1) + match.group(2), "%y%m%d%H%M%S")


# The three OpenStack files share one line format.
# NOTE(review): only INFO/WARN/ERROR/CRITICAL are accepted; if the files spell
# the level as "WARNING" those lines are dropped -- confirm against the data.
_OPENSTACK_FORMAT = (
    re.compile(r"^\S+\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d+)\s+\d+\s+(INFO|WARN|ERROR|CRITICAL)\s+\S+\s+(.*)"),
    _group_one_timestamp,
    2,
    3,
)

# source name -> (compiled pattern, timestamp extractor, level group, message group)
# Patterns are compiled once here instead of once per log line.
_LOG_FORMATS = {
    "Apache": (
        re.compile(r"\[(.*?)\]\s+\[(.*?)\]\s+(.*)"),
        _group_one_timestamp,
        2,
        3,
    ),
    # Only "RAS KERNEL" entries match this pattern; other BGL lines are skipped.
    "BGL": (
        re.compile(r"- \d+ \d{4}\.\d{2}\.\d{2} \S+ (\d{4}-\d{2}-\d{2}-\d{2}\.\d{2}\.\d{2}\.\d+) \S+ RAS KERNEL (\w+) (.+)"),
        _bgl_timestamp,
        2,
        3,
    ),
    # Group 4 (the token before the colon) is matched but not stored.
    "HDFS": (
        re.compile(r"(\d{6})\s+(\d{6})\s+\d+\s+(INFO|WARN|ERROR|DEBUG)\s+([\w\.\$\*]+):\s+(.*)"),
        _hdfs_timestamp,
        3,
        5,
    ),
    # For Linux and Mac the value stored under "level" is the token right
    # before the first ": " -- presumably a process/component name rather
    # than a severity; verify against the data.
    "Linux": (
        re.compile(r"^(\w{3}\s+\d+\s+\d{2}:\d{2}:\d{2})\s+\S+\s+([\w\-\.]+):\s+(.*)$"),
        _group_one_timestamp,
        2,
        3,
    ),
    "Mac": (
        re.compile(r"^(\w{3}\s+\d+\s+\d{2}:\d{2}:\d{2})\s+\S+\s+([\w\[\]]+):\s+(.*)"),
        _group_one_timestamp,
        2,
        3,
    ),
    "OpenStack_Normal1": _OPENSTACK_FORMAT,
    "OpenStack_Normal2": _OPENSTACK_FORMAT,
    "OpenStack_Abnormal": _OPENSTACK_FORMAT,
}


def parse_log_file(file_path, source_name, max_lines=3_000_000):
    """Parse a raw log file into a DataFrame of structured entries.

    Each line is matched against the regular expression registered for
    ``source_name``; lines that do not match are skipped.

    Args:
        file_path: Path of the log file. It is read as UTF-8 and undecodable
            bytes are ignored.
        source_name: Key selecting the line format. Supported values are
            "Apache", "BGL", "HDFS", "Linux", "Mac", "OpenStack_Normal1",
            "OpenStack_Normal2" and "OpenStack_Abnormal".
        max_lines: Maximum number of lines read from the file (matching or
            not). ``None`` reads the whole file.

    Returns:
        pandas.DataFrame with one row per matched line and the columns
        ``timestamp``, ``level``, ``message`` and ``source``. ``timestamp``
        is a ``datetime`` for BGL and HDFS and the raw captured string for
        the other sources. The frame is empty (no columns) when nothing
        matched or when ``source_name`` is not supported.
    """
    logs = []

    log_format = _LOG_FORMATS.get(source_name)
    if log_format is None:
        # Previously an always-true `or` chain sent every unrecognised source
        # through the OpenStack pattern; unknown sources now yield no rows.
        print(f"No log format registered for source '{source_name}'; skipping {file_path}")
        return pd.DataFrame(logs)

    pattern, extract_timestamp, level_group, message_group = log_format

    with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
        for line_index, log_line in enumerate(file):
            if max_lines is not None and line_index >= max_lines:
                break

            match = pattern.match(log_line)
            if match is None:
                continue

            try:
                timestamp = extract_timestamp(match)
            except ValueError as error:
                # strptime rejects timestamps that fit the regex but are not
                # valid dates; report the line and keep going.
                print(f"{source_name} line {line_index + 1}: {error}")
                continue

            logs.append({
                "timestamp": timestamp,
                "level": match.group(level_group),
                "message": match.group(message_group),
                "source": source_name
            })

    return pd.DataFrame(logs)


In [4]:
def parse_csv_file(file_path, source_name):
    """Parse a single-column CSV of log lines into a DataFrame.

    Every row is expected to look like ``[<timestamp>] <LEVEL> <source>: <message>``
    with LEVEL one of INFO, WARN, ERROR or CRITICAL. Rows that do not match
    are skipped.

    Args:
        file_path: Path of the CSV file; it is read without a header row into
            a single ``log_line`` column.
        source_name: Accepted for symmetry with ``parse_log_file`` but not
            used; the ``source`` value comes from each log line itself.

    Returns:
        pandas.DataFrame with the columns ``level``, ``message`` and
        ``source``, one row per matched line. The bracketed timestamp is
        matched but not kept.
    """
    logs_df = pd.read_csv(file_path, header=None, names=["log_line"])
    # Compile once rather than re-resolving the pattern for every row.
    log_pattern = re.compile(r"^\[(.*?)\]\s+(INFO|WARN|ERROR|CRITICAL)\s+(\w+):\s+(.*)$")

    parsed_logs = []
    for log in logs_df["log_line"]:
        # read_csv turns cells such as "NA" or "null" into NaN (a float) and
        # may parse numeric cells as numbers; matching those would raise
        # TypeError and abort the whole file, so skip anything that is not text.
        if not isinstance(log, str):
            continue
        match = log_pattern.match(log)
        if match:
            parsed_logs.append({
                "level": match.group(2),      # Log level
                "message": match.group(4),    # Message
                "source": match.group(3)      # Source
            })
    return pd.DataFrame(parsed_logs)

In [5]:
# Parse every configured dataset, then merge the per-dataset frames into one CSV.
merged_logs = []

with tqdm(total=len(datasets), desc="Processing datastets", unit='file') as pbar:
    for source, file_path in datasets.items():
        # The file extension decides which parser handles the dataset;
        # paths with any other extension are not parsed.
        if file_path.endswith(".log"):
            parser = parse_log_file
        elif file_path.endswith(".csv"):
            parser = parse_csv_file
        else:
            parser = None

        if parser is not None:
            try:
                df = parser(file_path, source)
                merged_logs.append(df)
            except Exception as e:
                # A dataset that fails to parse is reported and left out of the merge.
                print(f"Error parsing {file_path}: {e}")

        # Advance the bar once per dataset, whether or not it was parsed.
        pbar.update(1)

# Stack all parsed datasets with a fresh 0..n-1 index.
final_df = pd.concat(merged_logs, ignore_index=True)

final_df.to_csv("processed_logs.csv", index=False)
print(f"Processed logs saved to 'processed_logs.csv'. Total rows: {len(final_df)}")

Processing datastets:  11%|█         | 1/9 [00:11<01:35, 11.96s/file]


KeyboardInterrupt: 