#### Exploratory Data Analysis for Loghub Datasets: https://github.com/logpai/loghub?tab=readme-ov-file

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re

## Importing datasets

Create a `datasets` dictionary that maps each dataset name to the path of its log file.

In [2]:
# Dataset label -> path of the raw log file, relative to this notebook.
# Insertion order is the order in which the files are parsed later on.
datasets = dict(
    Apache="../datasets/Apache.log",
    BGL="../datasets/BGL/BGL.log",
    HDFS="../datasets/HDFS_V1/HDFS.log",
    Linux="../datasets/Linux.log",
    Mac="../datasets/Mac.log",
    OpenStack_Normal1="../datasets/OpenStack/openstack_normal1.log",
    OpenStack_Normal2="../datasets/OpenStack/openstack_normal2.log",
    OpenStack_Abnormal="../datasets/OpenStack/openstack_abnormal.log",
)

The following function extracts three fields from each log entry:
1. Log level
2. Message
3. Source

In [3]:
# Parsing rule shared by the three OpenStack files:
# (compiled line pattern, capture group stored as "level", capture group stored as "message").
_OPENSTACK_RULE = (
    re.compile(
        r"^\S+\s+\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d+\s+\d+\s+(INFO|ERROR|WARN|DEBUG)\s+([\w\.\[\]]+)\s+(?:\[.*?\])?\s+(.*)$"
    ),
    1,
    3,
)

# One rule per supported source. Patterns are compiled once here instead of
# being re-selected for every line of a multi-million-line file.
_LOG_RULES = {
    "Apache": (re.compile(r"\[(.*?)\] \[(.*?)\] (.*)"), 2, 3),
    "BGL": (
        re.compile(r"- \d+ \d{4}\.\d{2}\.\d{2} ([\w:-]+) [\d\-:.]+ [\w:-]+ ([\w\s]+) (.*)"),
        2,
        3,
    ),
    "HDFS": (re.compile(r"\d{6} \d{6} \d+ (INFO|WARN|ERROR) ([\w\.$]+): (.*)"), 1, 3),
    # For Linux and Mac the token matched immediately before the colon
    # (capture group 3) is what gets stored under "level".
    "Linux": (
        re.compile(r"^(\w{3}\s+\d+\s+\d+:\d+:\d+)\s+([\w-]+)\s+([\w.$]+):\s+(.*)$"),
        3,
        4,
    ),
    "Mac": (
        re.compile(r"^(\w{3}\s+\d+\s+\d+:\d+:\d+)\s+([\w-]+)\s+([\w\[\]0-9]+):\s+(.*)$"),
        3,
        4,
    ),
    "OpenStack_Normal1": _OPENSTACK_RULE,
    "OpenStack_Normal2": _OPENSTACK_RULE,
    "OpenStack_Abnormal": _OPENSTACK_RULE,
}


def parse_log_file(file_path, source_name, max_lines=2_000_000):
    """Parse a raw log file into a DataFrame of level / message / source rows.

    Each line is matched from its start against the pattern registered for
    ``source_name``; lines that do not match are skipped.

    Parameters
    ----------
    file_path : str
        Path of the log file. It is read as UTF-8 and undecodable bytes are
        ignored.
    source_name : str
        Key selecting the parsing rule (one of the keys of ``_LOG_RULES``).
        It is also written to the ``source`` column of every row.
    max_lines : int or None, default 2_000_000
        Only the first ``max_lines`` lines of the file are inspected.
        ``None`` disables the limit.

    Returns
    -------
    pandas.DataFrame
        Columns ``level``, ``message`` and ``source``, one row per matching
        line. The columns are present even when no line matched.

    Raises
    ------
    ValueError
        If ``source_name`` has no registered parsing rule. (Previously the
        last ``elif`` used ``x == "a" or "b" or "c"``, which is always true,
        so unknown sources were silently parsed with the OpenStack pattern.)
    """
    if source_name not in _LOG_RULES:
        supported = ", ".join(sorted(_LOG_RULES))
        raise ValueError(
            f"Unsupported log source {source_name!r}; expected one of: {supported}"
        )
    pattern, level_group, message_group = _LOG_RULES[source_name]

    logs = []
    with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
        for i, log_line in enumerate(file):
            # Cap the amount of data read so very large files stay manageable.
            if max_lines is not None and i >= max_lines:
                break

            match = pattern.match(log_line)
            if match is None:
                continue

            logs.append({
                "level": match.group(level_group),
                "message": match.group(message_group),
                "source": source_name
            })

    # Explicit columns keep the schema stable when nothing matched.
    return pd.DataFrame(logs, columns=["level", "message", "source"])


In [4]:
def parse_csv_file(file_path, source_name):
    """Load a CSV file and tag every row with the dataset it came from.

    Parameters
    ----------
    file_path : str
        Path of the CSV file, passed straight to ``pd.read_csv``.
    source_name : str
        Value written to the ``source`` column of every row.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with a ``source`` column added (or overwritten).
    """
    return pd.read_csv(file_path).assign(source=source_name)

In [5]:
# Parse every configured dataset and merge the results into one DataFrame.
merged_logs = []

for source, file_path in datasets.items():
    # Choose the parser from the file extension; other extensions are
    # reported instead of being skipped silently.
    if file_path.endswith(".log"):
        file_kind, parser = "log", parse_log_file
    elif file_path.endswith(".csv"):
        file_kind, parser = "CSV", parse_csv_file
    else:
        print(f"Skipping unsupported file type: {file_path}")
        continue

    print(f"Parsing {file_kind} file: {file_path}")
    try:
        df = parser(file_path, source)
        merged_logs.append(df)
    except Exception as e:
        # Best effort: one unreadable dataset should not abort the others.
        print(f"Error parsing {file_path}: {e}")

# pd.concat raises an opaque "No objects to concatenate" on an empty list,
# so fail with a message that points at the actual cause.
if not merged_logs:
    raise ValueError("No dataset could be parsed; check the paths in `datasets`.")

final_df = pd.concat(merged_logs, ignore_index=True)

final_df.to_csv("dataframe_full_view.csv", index=False)
print(f"Merged logs saved to 'dataframe_full_view.csv'. Total rows: {len(final_df)}")

Parsing log file: ../datasets/Apache.log
Parsing log file: ../datasets/BGL/BGL.log


KeyboardInterrupt: 

In [11]:
# Print a summary of the merged frame: index range, column dtypes and memory usage.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4121576 entries, 0 to 4121575
Data columns (total 3 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   level    object
 1   message  object
 2   source   object
dtypes: object(3)
memory usage: 94.3+ MB
