In [1]:
import os
import json
import random
from datetime import datetime, timedelta
from pyspark.sql import SparkSession
from pyspark.sql.types import TimestampType
import pyspark.sql.functions as F

# Reuse the active Spark session if one exists; otherwise start a new one
# registered under the application name "Log Analysis".
spark = (
    SparkSession.builder
    .appName("Log Analysis")
    .getOrCreate()
)

In [2]:
# Catalogue of sample messages keyed by log level. The keys double as the
# set of level names recognised when the generated log file is parsed.
log_data = {
    "INFO": ["User logged in", "New user registered", "Data successfully processed"],
    "ERROR": ["Page not found", "Database connection lost", "File not found"],
    "WARNING": ["High memory usage detected", "Slow API response", "Deprecated function used"],
    "DEBUG": ["Function X executed", "Cache cleared", "Config file loaded"],
    "CRITICAL": ["System out of memory! Shutting down", "Power failure detected"],
    "FATAL": ["Kernel panic", "Unrecoverable database corruption"],
    "TRACE": ["Entering function process_data()", "Exiting function process_data()"],
    "SECURITY": ["Unauthorized login attempt", "Suspicious IP detected"],
    "ACCOUNT": ["User X changed password", "Account settings updated"]
}

# Persist the catalogue as pretty-printed JSON in the current working directory.
json_file = os.path.join(os.getcwd(), "log_messages.json")
with open(json_file, "w") as f:
    f.write(json.dumps(log_data, indent=4))

# Regex alternation of every level name, in dictionary (insertion) order.
log_levels = "|".join(log_data)

In [4]:
def generate_log_entry(now=None, rng=None):
    """Build one synthetic log line of the form ``"<timestamp> <LEVEL> <message>"``.

    The level is drawn uniformly from the keys of ``log_data`` and the message
    uniformly from that level's list. The timestamp is ``now`` minus a random
    offset of 0-30 days, 0-23 hours and 0-59 minutes.

    Args:
        now: Reference ``datetime`` the random offset is subtracted from.
            Defaults to ``datetime.now()`` evaluated at call time.
        rng: Object providing ``choice`` and ``randint`` (for example a seeded
            ``random.Random`` instance) so output can be made reproducible.
            Defaults to the global ``random`` module.

    Returns:
        str: The formatted log line, without a trailing newline.
    """
    if now is None:
        now = datetime.now()
    if rng is None:
        rng = random

    # Draw order (level, message, days, hours, minutes) matches the original
    # so a seeded generator yields the same sequence of choices.
    log_level = rng.choice(list(log_data.keys()))
    log_message = rng.choice(log_data[log_level]).strip()
    offset = timedelta(
        days=rng.randint(0, 30),
        hours=rng.randint(0, 23),
        minutes=rng.randint(0, 59),
    )
    timestamp = now - offset

    # str(datetime) omits the fractional part when microsecond == 0, which
    # would make the timestamp width vary; always emit microseconds instead.
    stamp = timestamp.isoformat(sep=" ", timespec="microseconds")
    return f"{stamp} {log_level} {log_message}"

# Write the synthetic dataset to the current working directory,
# one generated entry per line.
log_file = os.path.join(os.getcwd(), "log_file.txt")
num_entries = 100000
with open(log_file, "w") as f:
    f.writelines(generate_log_entry() + "\n" for _ in range(num_entries))

In [5]:
# Load the raw log file: one row per line, in a single string column "value".
raw_logs_df = spark.read.text(log_file)

# One anchored pattern for the whole line, so the level is only recognised in
# its expected position (right after the timestamp) and never inside a message.
#   group 1: timestamp to second precision (optional fractional part is skipped)
#   group 2: log level (one of the known level names)
#   group 3: message (rest of the line)
timestamp_pattern = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}"
log_line_pattern = rf"^({timestamp_pattern})(?:\.\d+)?\s+({log_levels})\s+(.*)$"


def _extract_or_null(group_idx):
    """Return capture group `group_idx` of the line pattern, or NULL if it is empty."""
    # regexp_extract yields "" (not NULL) when the pattern does not match, so
    # map "" to NULL explicitly; otherwise na.drop() would keep malformed lines.
    extracted = F.regexp_extract(F.col("value"), log_line_pattern, group_idx)
    return F.when(extracted != "", extracted)


# Build the parsed frame in a single projection from the raw frame, so
# re-running the cell always starts from the same input. The timestamp is
# cast before na.drop() so rows without a usable timestamp are dropped too.
log_df = raw_logs_df.select(
    _extract_or_null(1).cast(TimestampType()).alias("Timestamp"),
    _extract_or_null(2).alias("LogLevel"),
    F.trim(F.regexp_extract(F.col("value"), log_line_pattern, 3)).alias("Message"),
).na.drop()

log_df.show(10, truncate=False)

+-------------------+--------+---------------------------------+
|Timestamp          |LogLevel|Message                          |
+-------------------+--------+---------------------------------+
|2025-02-14 01:31:21|TRACE   |Entering function process_data() |
|2025-02-13 04:51:21|TRACE   |Exiting function process_data()  |
|2025-03-09 08:49:21|ERROR   |Database connection lost         |
|2025-02-21 22:15:21|TRACE   |Exiting function process_data()  |
|2025-02-23 03:41:21|INFO    |Data successfully processed      |
|2025-03-01 17:25:21|FATAL   |Unrecoverable database corruption|
|2025-03-07 23:02:21|CRITICAL|Power failure detected           |
+-------------------+--------+---------------------------------+
only showing top 10 rows



In [6]:
# Number of log entries per level (counts non-null LogLevel values in each group).
entries_by_level = log_df.groupBy("LogLevel")
df_count = entries_by_level.agg(
    F.count(F.col("LogLevel")).alias("Count")
)
df_count.show()


+--------+-----+
|LogLevel|Count|
+--------+-----+
|    INFO|10887|
| ACCOUNT|11063|
|   ERROR|11275|
|   FATAL|11106|
|   DEBUG|11109|
|   TRACE|11343|
|CRITICAL|11112|
|SECURITY|11135|
+--------+-----+



In [7]:
# Keep only the entries whose level is exactly "ERROR" and preview the first ten.
is_error = log_df["LogLevel"] == "ERROR"
error_logs = log_df.filter(is_error)
error_logs.show(10, truncate=False)


+-------------------+--------+------------------------+
|Timestamp          |LogLevel|Message                 |
+-------------------+--------+------------------------+
|2025-03-09 08:49:21|ERROR   |Database connection lost|
|2025-02-07 15:08:21|ERROR   |Database connection lost|
|2025-03-02 22:14:21|ERROR   |Database connection lost|
|2025-02-27 05:42:21|ERROR   |Database connection lost|
|2025-02-14 14:17:21|ERROR   |File not found          |
|2025-02-22 18:02:21|ERROR   |Page not found          |
|2025-02-24 01:30:21|ERROR   |Database connection lost|
|2025-02-10 20:56:21|ERROR   |Database connection lost|
|2025-02-13 07:00:21|ERROR   |Database connection lost|
|2025-02-06 23:22:21|ERROR   |File not found          |
+-------------------+--------+------------------------+
only showing top 10 rows



In [8]:
# Distribution of entries over the hour of day (0-23), sorted by hour.
# show() prints only the first 20 rows by default.
logs_with_hour = log_df.withColumn("Hour", F.hour(F.col("Timestamp")))
logs_by_hour = (
    logs_with_hour
    .groupBy("Hour")
    .count()
    .orderBy("Hour")
)
logs_by_hour.show()


+----+-----+
|Hour|count|
+----+-----+
|   0| 4112|
|   1| 4169|
|   2| 4209|
|   3| 4164|
|   4| 4141|
|   5| 4212|
|   6| 4218|
|   7| 4194|
|   8| 4256|
|   9| 4169|
|  10| 4123|
|  11| 4104|
|  12| 4166|
|  13| 4211|
|  14| 4193|
|  15| 4153|
|  16| 4145|
|  17| 4081|
|  18| 4176|
|  19| 4184|
+----+-----+
only showing top 20 rows

