In [0]:
import os
import zipfile
import urllib.request
import shutil
from pyspark.sql.functions import input_file_name, col, to_timestamp, year

USE_OFFLINE = False
TARGET_YEAR = 2024

zip_destination = "/tmp/cve/cvelistV5.zip"
extraction_folder = "/tmp/cve/cvelistV5-main"

os.makedirs("/tmp/cve", exist_ok=True)

if not USE_OFFLINE:
    github_url = "https://github.com/CVEProject/cvelistV5/archive/refs/heads/main.zip"
    
    print(f"🌐 Downloading CVE repository from GitHub...")
    print(f"   {github_url}")
    print("   This data updates every 7 minutes - you're getting fresh intel!")
    
    with urllib.request.urlopen(github_url) as response:
        downloaded_data = response.read()
    
    with open(zip_destination, "wb") as file:
        file.write(downloaded_data)
    
    file_size_mb = len(downloaded_data) / (1024 * 1024)
    print(f"✅ Downloaded {len(downloaded_data):,} bytes ({file_size_mb:.1f} MB)")
    
else:
    zip_destination = "/dbfs/FileStore/cvelistV5.zip"
    assert os.path.exists(zip_destination), f"Missing file: {zip_destination}"
    print(f"📁 Using pre-downloaded ZIP from {zip_destination}")

if os.path.exists(extraction_folder):
    print("🧹 Removing old extraction...")
    shutil.rmtree(extraction_folder, ignore_errors=True)

print("📦 Extracting ZIP archive (this has 200,000+ files, takes a minute)...")
with zipfile.ZipFile(zip_destination) as archive:
    archive.extractall("/tmp/cve/")

cve_json_directory = "/tmp/cve/cvelistV5-main/cves"
print(f"✅ Ready! CVE data extracted to: {cve_json_directory}")

🌐 Downloading CVE repository from GitHub...
   https://github.com/CVEProject/cvelistV5/archive/refs/heads/main.zip
   This data updates every 7 minutes - you're getting fresh intel!
✅ Downloaded 524,601,679 bytes (500.3 MB)
🧹 Removing old extraction...


In [0]:
import json
import os
import pandas as pd
from pyspark.sql.functions import from_json, schema_of_json, col

TARGET_YEAR = 2024
year_folder = f"/tmp/cve/cvelistV5-main/cves/{TARGET_YEAR}"

print(f"📄 Loading all {TARGET_YEAR} CVE records...")
print(f"   Reading from: {year_folder}")

json_data = []
files_processed = 0

for root, directories, files in os.walk(year_folder):
    for filename in files:
        if filename.endswith('.json'):
            full_path = os.path.join(root, filename)
            
            with open(full_path, 'r') as json_file:
                json_data.append(json_file.read())
                files_processed += 1
            
            if files_processed % 5000 == 0:
                print(f"   Progress: {files_processed:,} files loaded...")

print(f"✅ Successfully loaded {files_processed:,} JSON files")
print(f"🔄 Converting to Spark DataFrame...")

pandas_df = pd.DataFrame({'json_string': json_data})
spark_temp = spark.createDataFrame(pandas_df)

print(f"🔍 Inferring schema from sample record...")
sample_record = json_data[0]
inferred_schema = schema_of_json(sample_record)

print(f"📦 Parsing all JSON records with inferred schema...")
df_raw = spark_temp.select(
    from_json(col("json_string"), inferred_schema).alias("data")
).select("data.*")

record_count = df_raw.count()
print(f"📊 Total CVE records in DataFrame: {record_count:,}")
print(f"\n🔍 Schema structure:")
df_raw.printSchema()

📄 Loading all 2024 CVE records...
   Reading from: /tmp/cve/cvelistV5-main/cves/2024
   Progress: 5,000 files loaded...
   Progress: 10,000 files loaded...
   Progress: 15,000 files loaded...
   Progress: 20,000 files loaded...
   Progress: 25,000 files loaded...
   Progress: 30,000 files loaded...
   Progress: 35,000 files loaded...
✅ Successfully loaded 38,753 JSON files
🔄 Converting to Spark DataFrame...
🔍 Inferring schema from sample record...
📦 Parsing all JSON records with inferred schema...
📊 Total CVE records in DataFrame: 38,753

🔍 Schema structure:
root
 |-- containers: struct (nullable = true)
 |    |-- adp: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- metrics: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- cvssV3_1: struct (nullable = true)
 |    |    |    |    |    |    |-- attackComplexity: string (nullable = true)
 |    |    |    |    |    |    |-- at

In [0]:
from pyspark.sql import functions as F

bronze_table_name = "cve_bronze_records"

spark.conf.set("spark.sql.shuffle.partitions", "8")

print("🔧 Processing Bronze Layer - Filtering and Validating Data...")
print(f"   Target: 2024 CVE publications only")

publication_date = F.col("cveMetadata.datePublished").cast("string")
df_raw = df_raw.withColumn("_datePublished_ts", F.to_timestamp(publication_date))
df_2024 = df_raw.filter(F.year(F.col("_datePublished_ts")) == 2024)

print("\n📊 Data Quality Validation:")

total_records = df_raw.count()
records_2024 = df_2024.count()
null_cve_ids = df_2024.filter(F.col("cveMetadata.cveId").isNull()).count()
unique_cve_ids = df_2024.select("cveMetadata.cveId").distinct().count()

print(f"   Total records loaded: {total_records:,}")
print(f"   Records from 2024: {records_2024:,}")
print(f"   Null CVE IDs found: {null_cve_ids}")
print(f"   Unique CVE IDs: {unique_cve_ids:,}")

print("\n🔍 Running quality assertions...")
assert records_2024 >= 30000, f"Expected at least 30,000 records, got {records_2024:,}"
assert null_cve_ids == 0, f"Found {null_cve_ids} null CVE IDs - data integrity issue"
assert unique_cve_ids == records_2024, f"Duplicate CVE IDs detected: {records_2024 - unique_cve_ids} duplicates"


df_2024.createOrReplaceTempView(bronze_table_name)

print(f"\nBronze Table: {bronze_table_name}")
print(f"Total Records: {records_2024:,}")
print(f"\nData Schema:")
df_2024.printSchema()

print("\nSample Data (First 5 Records):")
spark.sql(f"""
    SELECT cveMetadata.cveId, cveMetadata.datePublished 
    FROM {bronze_table_name} 
    LIMIT 5
""").show(truncate=False)


🔧 Processing Bronze Layer - Filtering and Validating Data...
   Target: 2024 CVE publications only

📊 Data Quality Validation:
   Total records loaded: 38,753
   Records from 2024: 32,924
   Null CVE IDs found: 0
   Unique CVE IDs: 32,924

🔍 Running quality assertions...

Bronze Table: cve_bronze_records
Total Records: 32,924

Data Schema:
root
 |-- containers: struct (nullable = true)
 |    |-- adp: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- metrics: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- cvssV3_1: struct (nullable = true)
 |    |    |    |    |    |    |-- attackComplexity: string (nullable = true)
 |    |    |    |    |    |    |-- attackVector: string (nullable = true)
 |    |    |    |    |    |    |-- availabilityImpact: string (nullable = true)
 |    |    |    |    |    |    |-- baseScore: double (nullable = true)
 |    |    |    |    |    |    |-- 

In [0]:
import json


bronze_data = spark.table("cve_bronze_records")

print("\n" + "="*70)
print("TOP-LEVEL SCHEMA STRUCTURE")
print("="*70)
bronze_data.printSchema()

print("\n" + "="*70)
print("SAMPLE CVE RECORD (First 5000 characters)")
print("="*70)
sample_record = bronze_data.limit(1).collect()[0].asDict()
formatted_json = json.dumps(sample_record, indent=2, default=str)
print(formatted_json[:5000])

print("\n" + "="*70)
print("CONTAINERS SECTION SCHEMA")
print("="*70)
bronze_data.select("containers.*").printSchema()

print("\n✅ Schema exploration complete")


TOP-LEVEL SCHEMA STRUCTURE
root
 |-- containers: struct (nullable = true)
 |    |-- adp: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- metrics: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- cvssV3_1: struct (nullable = true)
 |    |    |    |    |    |    |-- attackComplexity: string (nullable = true)
 |    |    |    |    |    |    |-- attackVector: string (nullable = true)
 |    |    |    |    |    |    |-- availabilityImpact: string (nullable = true)
 |    |    |    |    |    |    |-- baseScore: double (nullable = true)
 |    |    |    |    |    |    |-- baseSeverity: string (nullable = true)
 |    |    |    |    |    |    |-- confidentialityImpact: string (nullable = true)
 |    |    |    |    |    |    |-- integrityImpact: string (nullable = true)
 |    |    |    |    |    |    |-- privilegesRequired: string (nullable = true)
 |    |    |    |    |    |    |-- 

In [0]:
from pyspark.sql import functions as F

print("🔄 Building Silver Layer - Normalizing Bronze data into structured tables...")

bronze_data = spark.table("cve_bronze_records")

print("\n📋 Step 1: Creating Core CVE Table")
print("   Extracting key fields: IDs, dates, CVSS scores, descriptions")

core_cve_table = bronze_data.select(
    F.col("cveMetadata.cveId").alias("cve_id"),
    F.to_timestamp(F.col("cveMetadata.datePublished")).alias("date_published"),
    F.to_timestamp(F.col("cveMetadata.dateReserved")).alias("date_reserved"),
    F.to_timestamp(F.col("cveMetadata.dateUpdated")).alias("date_updated"),
    F.col("cveMetadata.state").alias("state"),
    
    F.col("containers.adp")[0]["metrics"][0]["cvssV3_1"]["baseScore"].alias("cvss_base_score"),
    F.col("containers.adp")[0]["metrics"][0]["cvssV3_1"]["baseSeverity"].alias("cvss_severity"),
    F.col("containers.adp")[0]["metrics"][0]["cvssV3_1"]["vectorString"].alias("cvss_vector"),
    
    F.col("containers.cna.descriptions")[0]["value"].alias("description"),
    F.col("containers.cna.title").alias("title")
)

core_cve_table.createOrReplaceTempView("cve_silver_core")
core_record_count = core_cve_table.count()
print(f"✅ Core CVE table ready: {core_record_count:,} records")

print("\n📋 Step 2: Creating Affected Products Table")
print("   Using explode to flatten vendor/product relationships")

affected_products = bronze_data.select(
    F.col("cveMetadata.cveId").alias("cve_id"),
    F.explode_outer(F.col("containers.cna.affected")).alias("affected_item")
).select(
    F.col("cve_id"),
    F.col("affected_item.vendor").alias("vendor"),
    F.col("affected_item.product").alias("product"),
    F.col("affected_item.defaultStatus").alias("default_status"),
    F.col("affected_item.repo").alias("repo")
)

affected_products_clean = affected_products.filter(
    F.col("vendor").isNotNull() & F.col("product").isNotNull()
)

affected_products_clean.createOrReplaceTempView("cve_silver_affected_products")
affected_record_count = affected_products_clean.count()
print(f"✅ Affected Products table ready: {affected_record_count:,} records")



print("\nCore CVE Table Schema:")
core_cve_table.printSchema()

print("\nCore CVE Sample Data (with CVSS scores):")
spark.sql("""
    SELECT cve_id, date_published, cvss_base_score, cvss_severity, title
    FROM cve_silver_core 
    WHERE cvss_base_score IS NOT NULL 
    LIMIT 5
""").show(truncate=False)

print("\nAffected Products Table Schema:")
affected_products_clean.printSchema()

print("\nAffected Products Sample Data:")
spark.sql("""
    SELECT cve_id, vendor, product, default_status
    FROM cve_silver_affected_products 
    LIMIT 10
""").show(truncate=False)

print("\n📊 Data Quality Metrics:")
print(f"   Core CVE records: {core_record_count:,}")
print(f"   Affected Products records: {affected_record_count:,}")
print(f"   Average products per CVE: {affected_record_count/core_record_count:.2f}")

cves_without_scores = core_cve_table.filter(F.col("cvss_base_score").isNull()).count()
percentage_unscored = (cves_without_scores/core_record_count) * 100
print(f"   CVEs without CVSS scores: {cves_without_scores:,} ({percentage_unscored:.1f}%)")



🔄 Building Silver Layer - Normalizing Bronze data into structured tables...

📋 Step 1: Creating Core CVE Table
   Extracting key fields: IDs, dates, CVSS scores, descriptions
✅ Core CVE table ready: 32,924 records

📋 Step 2: Creating Affected Products Table
   Using explode to flatten vendor/product relationships
✅ Affected Products table ready: 61,238 records

Core CVE Table Schema:
root
 |-- cve_id: string (nullable = true)
 |-- date_published: timestamp (nullable = true)
 |-- date_reserved: timestamp (nullable = true)
 |-- date_updated: timestamp (nullable = true)
 |-- state: string (nullable = true)
 |-- cvss_base_score: double (nullable = true)
 |-- cvss_severity: string (nullable = true)
 |-- cvss_vector: string (nullable = true)
 |-- description: string (nullable = true)
 |-- title: string (nullable = true)


Core CVE Sample Data (with CVSS scores):
+--------------+-----------------------+---------------+-------------+-------------------------------------------------------------

In [0]:
print("📊 Starting CVE 2024 Data Analysis")
print("   Let's dig into the numbers and find some insights...\n")


print("\nMonthly Publication Trends:")
spark.sql("""
    SELECT 
        MONTH(date_published) as month,
        COUNT(*) as vulnerability_count,
        ROUND(AVG(cvss_base_score), 2) as avg_cvss_score
    FROM cve_silver_core
    WHERE date_published IS NOT NULL
    GROUP BY MONTH(date_published)
    ORDER BY month
""").show()

print("\nPublication Latency (Reserved → Published):")
spark.sql("""
    SELECT 
        ROUND(AVG(DATEDIFF(date_published, date_reserved)), 1) as avg_days_to_publish,
        MIN(DATEDIFF(date_published, date_reserved)) as min_days,
        MAX(DATEDIFF(date_published, date_reserved)) as max_days,
        PERCENTILE_APPROX(DATEDIFF(date_published, date_reserved), 0.5) as median_days
    FROM cve_silver_core
    WHERE date_reserved IS NOT NULL AND date_published IS NOT NULL
""").show()

print("\n" + "=" * 70)
print("2️⃣ RISK DISTRIBUTION - How severe are these vulnerabilities?")
print("=" * 70)

print("\nCVSS Severity Distribution:")
spark.sql("""
    SELECT 
        CASE 
            WHEN cvss_base_score >= 9.0 THEN 'CRITICAL'
            WHEN cvss_base_score >= 7.0 THEN 'HIGH'
            WHEN cvss_base_score >= 4.0 THEN 'MEDIUM'
            WHEN cvss_base_score > 0.0 THEN 'LOW'
            ELSE 'UNKNOWN'
        END as severity_level,
        COUNT(*) as count,
        ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
    FROM cve_silver_core
    GROUP BY 
        CASE 
            WHEN cvss_base_score >= 9.0 THEN 'CRITICAL'
            WHEN cvss_base_score >= 7.0 THEN 'HIGH'
            WHEN cvss_base_score >= 4.0 THEN 'MEDIUM'
            WHEN cvss_base_score > 0.0 THEN 'LOW'
            ELSE 'UNKNOWN'
        END
    ORDER BY 
        CASE severity_level
            WHEN 'CRITICAL' THEN 1
            WHEN 'HIGH' THEN 2
            WHEN 'MEDIUM' THEN 3
            WHEN 'LOW' THEN 4
            ELSE 5
        END
""").show()

print("\nMonthly Severity Trends:")
spark.sql("""
    SELECT 
        MONTH(date_published) as month,
        ROUND(AVG(cvss_base_score), 2) as avg_severity,
        COUNT(CASE WHEN cvss_base_score >= 9.0 THEN 1 END) as critical_count,
        COUNT(CASE WHEN cvss_base_score >= 7.0 AND cvss_base_score < 9.0 THEN 1 END) as high_count
    FROM cve_silver_core
    WHERE date_published IS NOT NULL
    GROUP BY MONTH(date_published)
    ORDER BY month
""").show()

print("\n" + "=" * 70)
print("3️⃣ VENDOR INTELLIGENCE - Who's affected the most?")
print("=" * 70)

print("\nTop 25 Vendors by Vulnerability Count:")
spark.sql("""
    SELECT 
        vendor,
        COUNT(DISTINCT cve_id) as vulnerability_count,
        COUNT(DISTINCT product) as product_count
    FROM cve_silver_affected_products
    WHERE vendor IS NOT NULL
    GROUP BY vendor
    ORDER BY vulnerability_count DESC
    LIMIT 25
""").show(25, truncate=False)

print("\nMarket Concentration Analysis (Top 10):")
spark.sql("""
    WITH vendor_counts AS (
        SELECT 
            vendor,
            COUNT(DISTINCT cve_id) as vuln_count
        FROM cve_silver_affected_products
        GROUP BY vendor
    ),
    total_vulns AS (
        SELECT COUNT(DISTINCT cve_id) as total FROM cve_silver_affected_products
    )
    SELECT 
        vendor,
        vuln_count,
        ROUND(vuln_count * 100.0 / (SELECT total FROM total_vulns), 2) as market_share_pct,
        ROUND(SUM(vuln_count * 100.0 / (SELECT total FROM total_vulns)) OVER (ORDER BY vuln_count DESC), 2) as cumulative_pct
    FROM vendor_counts
    ORDER BY vuln_count DESC
    LIMIT 10
""").show(10, truncate=False)

print("\nVendor Risk Profiles (Top 10 by Average CVSS Score):")
spark.sql("""
    SELECT 
        p.vendor,
        COUNT(DISTINCT p.cve_id) as total_cves,
        ROUND(AVG(c.cvss_base_score), 2) as avg_cvss_score,
        MAX(c.cvss_base_score) as max_cvss_score,
        COUNT(CASE WHEN c.cvss_base_score >= 9.0 THEN 1 END) as critical_count
    FROM cve_silver_affected_products p
    JOIN cve_silver_core c ON p.cve_id = c.cve_id
    WHERE p.vendor IS NOT NULL AND c.cvss_base_score IS NOT NULL
    GROUP BY p.vendor
    HAVING COUNT(DISTINCT p.cve_id) >= 10
    ORDER BY avg_cvss_score DESC
    LIMIT 10
""").show(10, truncate=False)

print("\n" + "=" * 70)
print("4️⃣ SUMMARY STATISTICS - The big picture")
print("=" * 70)

spark.sql("""
    SELECT 
        COUNT(DISTINCT cve_id) as total_cves_2024,
        COUNT(DISTINCT CASE WHEN cvss_base_score IS NOT NULL THEN cve_id END) as scored_cves,
        ROUND(AVG(cvss_base_score), 2) as avg_cvss_score,
        MAX(cvss_base_score) as max_cvss_score,
        MIN(date_published) as first_published,
        MAX(date_published) as last_published
    FROM cve_silver_core
""").show(truncate=False)


📊 Starting CVE 2024 Data Analysis
   Let's dig into the numbers and find some insights...


Monthly Publication Trends:
+-----+-------------------+--------------+
|month|vulnerability_count|avg_cvss_score|
+-----+-------------------+--------------+
|    1|               1134|          8.37|
|    2|               1769|          7.06|
|    3|               2616|          6.65|
|    4|               3218|          6.74|
|    5|               3348|          7.02|
|    6|               2707|           7.0|
|    7|               2877|          7.11|
|    8|               2692|          7.57|
|    9|               2408|          7.16|
|   10|               3373|          7.24|
|   11|               3760|          6.72|
|   12|               3022|          6.93|
+-----+-------------------+--------------+


Publication Latency (Reserved → Published):
+-------------------+--------+--------+-----------+
|avg_days_to_publish|min_days|max_days|median_days|
+-------------------+--------+--------+---