In [7]:
import sys
import os
import glob
import math

# ====================================================
# SETUP: AUTO-DETECT SPARK & PROJECT PATH
# ====================================================

# 1. Detect Spark on the VM (Fix for "ModuleNotFoundError")
# We look in common installation directories
candidate_paths = [
    os.environ.get("SPARK_HOME"), 
    "/usr/local/spark",
    "/usr/lib/spark",
    "/home/talentum/spark",
    "/opt/spark"
]

SPARK_HOME = None
for path in candidate_paths:
    if path and os.path.exists(path) and os.path.exists(os.path.join(path, "python")):
        SPARK_HOME = path
        break

if SPARK_HOME:
    # Add spark/python to the system path so we can import pyspark
    sys.path.insert(0, os.path.join(SPARK_HOME, "python"))
    
    # Add py4j (required for Java communication)
    py4j_files = glob.glob(os.path.join(SPARK_HOME, "python", "lib", "py4j-*-src.zip"))
    if py4j_files:
        sys.path.insert(0, py4j_files[0])
else:
    print("Warning: Spark folder not found. Relying on default environment.")

# 2. Add Project Root to Path (Fix for "config" import)
PROJECT_ROOT = "/home/talentum/Distributed-log-analyzer"
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# 3. Test Imports
try:
    from pyspark.sql.functions import col, mean, stddev, abs as spark_abs, when, lit
    from spark_jobs.common.spark_utils import get_spark_session, load_config
    print("SUCCESS: Environment is ready. You can now run the next cell.")
except ImportError as e:
    print(f"FAILURE: {e}")

SUCCESS: Environment is ready. You can now run the next cell.


In [8]:
# --- CONFIGURATION ---
SOURCE_NAME = "Hadoop" # Change this to analyze other logs later

# 1. Imports
from pyspark.sql.functions import col, mean, stddev, abs as spark_abs, when, lit
from spark_jobs.common.spark_utils import get_spark_session, load_config

# 2. Initialize
conf = load_config()
spark = get_spark_session(f"Notebook_Anomaly_{SOURCE_NAME}")

# 3. Define Paths
gold_path = f"{conf['storage']['curated']}/{SOURCE_NAME}_trends"
alert_output_path = f"{conf['storage']['curated']}/{SOURCE_NAME}_anomalies"

print(f"Reading Gold Data from: {gold_path}")

# 4. Load the Trend Data
try:
    df = spark.read.parquet(gold_path)
    # Cache it because we will use it multiple times (stats + filtering)
    df.cache()
    print(f"Loaded {df.count()} hourly records.")
except Exception as e:
    print(f"Error: Could not read Gold data. Did you run the Trend Analysis step? {e}")
    raise e
    

# 5. CALCULATE STATISTICS
stats = df.select(
    mean(col("error_count")).alias("avg_error"),
    stddev(col("error_count")).alias("std_error")
).collect()[0]

avg_error = stats["avg_error"]
std_error = stats["std_error"]

# --- FIX: Handle NaN (Not a Number) ---
# If we only have 1 row, std_error is None/NaN. We treat it as 0.
if std_error is None or math.isnan(std_error):
    std_error = 0

# Prevent division by zero if std_dev is 0 (extremely stable system)
if std_error == 0:
    # If std_dev is 0, we use a fallback logic:
    # If error > avg, it's an infinite anomaly, otherwise it's 0.
    df_analyzed = df.withColumn("z_score", 
        when(col("error_count") > avg_error, 99.0) # Artificial high score
        .otherwise(0.0)
    )
else:
    # Standard Z-Score Formula
    df_analyzed = df.withColumn("z_score", (col("error_count") - avg_error) / std_error)

# Set Threshold
df_analyzed = df_analyzed.withColumn("is_anomaly", when(col("z_score") > 3.0, True).otherwise(False))

# stats = df.select(
#     mean(col("error_count")).alias("avg_error"),
#     stddev(col("error_count")).alias("std_error")
# ).collect()[0]

# avg_error = stats["avg_error"]
# std_error = stats["std_error"]

# # Handle case where standard deviation is 0 (e.g., all errors are 0)
# if std_error is None or std_error == 0:
#     std_error = 1  # Avoid division by zero

# print(f"--- Baseline Stats ---")
# print(f"Average Errors per Hour: {avg_error:.2f}")
# print(f"Standard Deviation:      {std_error:.2f}")

# 6. DETECT ANOMALIES (Z-Score Method)
# Formula: Z = (Current_Value - Average) / StdDev
# If Z > 3, it means the value is in the top 0.3% of weirdness (Anomaly)
THRESHOLD = 3.0

df_analyzed = df.withColumn("z_score", (col("error_count") - avg_error) / std_error) \
                .withColumn("is_anomaly", when(col("z_score") > THRESHOLD, True).otherwise(False))

# 7. FILTER & DISPLAY ANOMALIES
anomalies_df = df_analyzed.filter(col("is_anomaly") == True) \
    .select(
        col("window_start"), 
        col("total_logs"), 
        col("error_count"), 
        col("z_score"),
        lit(f"Error count is {THRESHOLD}x higher than normal").alias("alert_msg")
    ) \
    .orderBy(col("z_score").desc())

print("\n--- DETECTED ANOMALIES (Spikes) ---")
if anomalies_df.count() > 0:
    anomalies_df.show(truncate=False)
else:
    print("Good news! No anomalies detected. System is stable.")

# 8. OPTIONAL: VISUALIZE "NORMAL" DATA vs ANOMALIES
print("\n--- Recent Data (Top 5 Normal Hours) ---")
df_analyzed.filter(col("is_anomaly") == False).orderBy("window_start").show(5)

# 9. Save Results
print(f"Saving anomalies to {alert_output_path}...")
anomalies_df.write.mode("overwrite").parquet(alert_output_path)
print("Success!")

Reading Gold Data from: /user/talentum/project_logs/curated/Hadoop_trends
Loaded 1 hourly records.

--- DETECTED ANOMALIES (Spikes) ---
Good news! No anomalies detected. System is stable.

--- Recent Data (Top 5 Normal Hours) ---
+-------------------+----------+-----------+----------+-------+----------+
|       window_start|total_logs|error_count|warn_count|z_score|is_anomaly|
+-------------------+----------+-----------+----------+-------+----------+
|2015-10-18 17:30:00|      2000|        152|       808|   null|     false|
+-------------------+----------+-----------+----------+-------+----------+

Saving anomalies to /user/talentum/project_logs/curated/Hadoop_anomalies...
Success!
