In [3]:
import sys
import os
import glob

# ====================================================
# FIX: AUTO-DETECT SPARK ON LOCAL MACHINE (No PIP needed)
# ====================================================

# Bootstrap cell: locate a pre-installed Spark distribution, expose its Python
# bindings (pyspark + py4j) and the project root on sys.path, then import the
# project helpers used by the rest of the notebook.
#
# The cell is safe to re-run: Spark entries are moved to the front of sys.path
# instead of being inserted again, so repeated execution does not pile up
# duplicate entries.


def find_spark_home(candidates):
    """Return the first candidate directory that contains a ``python`` subfolder.

    Args:
        candidates: Iterable of directory paths. ``None`` and empty strings
            are skipped (e.g. when the SPARK_HOME environment variable is unset).

    Returns:
        The first path for which ``<path>/python`` is a directory, or ``None``
        when no candidate qualifies.
    """
    for candidate in candidates:
        if candidate and os.path.isdir(os.path.join(candidate, "python")):
            return candidate
    return None


def prepend_to_sys_path(entry):
    """Place ``entry`` at the front of ``sys.path`` exactly once.

    Any existing occurrences are removed first, so calling this repeatedly
    (for instance by re-running the cell) never creates duplicates.
    """
    while entry in sys.path:
        sys.path.remove(entry)
    sys.path.insert(0, entry)


# 1. Unset the conflicting variable inside Python to prevent future crashes.
#    This only changes the environment of this process (and of processes it
#    spawns later); the already-computed sys.path is not affected.
os.environ.pop("PYTHONPATH", None)

# 2. Define common places where Spark lives in training VMs.
#    An explicitly configured SPARK_HOME wins because it is listed first.
candidate_paths = [
    os.environ.get("SPARK_HOME"),
    "/usr/local/spark",
    "/usr/lib/spark",
    "/home/talentum/spark",
    "/opt/spark",
    "/usr/hdp/current/spark2-client",
]

# 3. Search for the folder
print("Searching for pre-installed Spark...")
SPARK_HOME = find_spark_home(candidate_paths)

# 4. Add to Python Path if found
if SPARK_HOME:
    print(f"--> Found Spark at: {SPARK_HOME}")

    # Add the python folder (where pyspark lives)
    prepend_to_sys_path(os.path.join(SPARK_HOME, "python"))

    # Add the py4j zip file (required for Java communication).
    # glob order is arbitrary, so sort to make the pick deterministic when
    # more than one archive is present.
    py4j_files = sorted(
        glob.glob(os.path.join(SPARK_HOME, "python", "lib", "py4j-*-src.zip"))
    )
    if py4j_files:
        prepend_to_sys_path(py4j_files[0])
        print(f"--> Added Py4J: {py4j_files[0]}")
    else:
        print("--> Warning: Could not find py4j zip file.")
else:
    print("--> Error: Could not find Spark folder.")

# Make the project packages (spark_jobs, config) importable.
PROJECT_ROOT = "/home/talentum/Distributed-log-analyzer"
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Now try the imports.
# Best-effort: a failure is reported, not raised. If it happens, the names
# imported below stay undefined and any later code that uses them will fail.
try:
    from spark_jobs.common.spark_utils import get_spark_session, load_config
    import config.schemas as schemas
    from pyspark.sql.functions import input_file_name, current_timestamp, regexp_extract, col
    print("\nSUCCESS: All modules loaded correctly! You can now run the next cell.")
except ImportError as e:
    print(f"\nFAILURE: {e}")

Searching for pre-installed Spark...
--> Found Spark at: /home/talentum/spark
--> Added Py4J: /home/talentum/spark/python/lib/py4j-0.10.7-src.zip

SUCCESS: All modules loaded correctly! You can now run the next cell.


In [4]:
# Ingest cell: read the raw text logs for one source, split each line into
# columns with that source's regex, add audit columns, preview the result and
# write it as Parquet to the refined location.

# --- CONFIGURATION FOR THIS RUN ---
# Change this to: "Android", "Linux", "Apache", etc. to test different logs
TARGET_SOURCE = "Hadoop"

# 1. Initialize Spark & Config
conf = load_config()
spark = get_spark_session(f"Notebook_Ingest_{TARGET_SOURCE}")

print(f"--- Processing Source: {TARGET_SOURCE} ---")

# 2. Get Paths
raw_path = f"{conf['storage']['raw']}/{TARGET_SOURCE}"
refined_path = f"{conf['storage']['refined']}/{TARGET_SOURCE}"

# 3. Get Schema (Regex)
if TARGET_SOURCE not in schemas.LOG_PATTERNS:
    raise ValueError(f"Error: No Regex found for {TARGET_SOURCE} in schemas.py")

log_def = schemas.LOG_PATTERNS[TARGET_SOURCE]
pattern = log_def["pattern"]   # regex with one capture group per output column
columns = log_def["columns"]   # column names, in capture-group order

print(f"Reading from: {raw_path}")
print(f"Pattern: {pattern}")

# 4. Read Data
try:
    raw_df = spark.read.text(raw_path)
    # count() forces the read, so a missing/unreadable path fails here
    print(f"Raw Count: {raw_df.count()} rows")
except Exception as e:
    print(f"Error reading file: {e}")
    # Stop execution of this cell if file not found; bare `raise` keeps the
    # original traceback intact.
    raise

# 5. Apply Regex & Parse
# Capture group i+1 feeds columns[i]. A single select adds every extracted
# column in one projection instead of stacking one projection per column.
parsed_df = raw_df.select(
    "*",
    *[
        regexp_extract(col("value"), pattern, group_no).alias(col_name)
        for group_no, col_name in enumerate(columns, start=1)
    ],
)

# Keep only the lines the regex actually matched. Testing the match itself
# (rather than "first extracted column is non-empty") keeps rows whose first
# capture group is legitimately empty and does not depend on columns[0].
clean_df = parsed_df.filter(col("value").rlike(pattern))

# 6. Add Audit Columns
final_df = (
    clean_df
    .withColumn("ingest_time", current_timestamp())
    .withColumn("source_file", input_file_name())
    .drop("value")
)

# 7. VISUALIZE (The benefit of Notebooks!)
print("--- Data Preview ---")
final_df.show(5, truncate=False)

# 8. Write to Disk
# mode("overwrite") replaces whatever is already stored at refined_path.
print(f"Writing to {refined_path}...")
final_df.write.mode("overwrite").parquet(refined_path)
print("Success!")

--- Processing Source: Hadoop ---
Reading from: /user/talentum/project_logs/raw/Hadoop
Pattern: ^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) (\S+) \[(.*?)\] (.*?): (.*)
Raw Count: 2000 rows
--- Data Preview ---
+-----------------------+-----+------+----------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+-------------------------------------------------------------------------+
|timestamp              |level|thread|component                                     |message                                                                                                                                               |ingest_time            |source_file                                                              |
+-----------------------+-----+------+----------------------------------------------+----------------------------------