# Week 1: Data Ingestion & Foundation Setup

This notebook handles:
- Databricks workspace setup
- SEC EDGAR data downloading
- Bronze layer ingestion into Delta tables
- Metadata logging


In [None]:
# Install required packages
# Uses the %pip magic to install the three packages listed below.
# No versions are pinned, so each run resolves to whatever release is current.
# NOTE(review): pyspark and delta-spark may already be provided by the Databricks
# runtime; reinstalling them could conflict with the cluster's versions — confirm.
%pip install sec-edgar-downloader pyspark delta-spark


In [None]:
# Import modules
import sys

# The project imports below use the `src.` package prefix, so the directory that
# CONTAINS `src` (the repo root) has to be on sys.path for them to resolve.
# The original `<repo>/src` entry is kept as well so that any flat imports
# relying on it continue to work.
REPO_ROOT = '/Workspace/Repos/genai-legal-doc-poc'
for _path in (REPO_ROOT, f'{REPO_ROOT}/src'):
    # Skip paths that are already present so re-running this cell does not
    # keep growing sys.path with duplicates.
    if _path not in sys.path:
        sys.path.append(_path)

from pyspark.sql import SparkSession
from src.utils.delta_helpers import (
    get_spark_session,
    initialize_all_tables
)
from src.ingest.sec_ingest import SECIngestor
from src.utils.logger import logger


In [None]:
# Initialize Spark and Delta tables
# `spark` is a notebook-level name: the ingestion cell and the verification
# queries further down both use it.
spark = get_spark_session()
# Run the project's table-initialisation helper against the "default" database,
# the same database name passed to the ingestion run and queried during verification.
# NOTE(review): presumably safe to re-run (create-if-missing) — confirm in delta_helpers.
initialize_all_tables(spark, database="default")


In [None]:
# Ingestion parameters: edit this cell to target a different company or period.

# Ticker symbol to ingest; change to your desired ticker.
TICKER = "AAPL"

# Types of filings to download.
FILING_TYPES = [
    "10-K",
    "10-Q",
]

# Date range for filings, given as two date strings.
DATE_RANGE = (
    "2023-01-01",
    "2023-12-31",
)


In [None]:
# Build the ingestor on the shared Spark session, then run the full pipeline
ingestor = SECIngestor(spark=spark)

# Keyword arguments for the download-and-ingest run, taken from the
# configuration cell plus the target database name.
ingestion_args = {
    "ticker": TICKER,
    "filing_types": FILING_TYPES,
    "date_range": DATE_RANGE,
    "database": "default",
}
count = ingestor.run_full_ingestion(**ingestion_args)

print(f"Successfully ingested {count} files")


In [None]:
# Verify ingestion
# Name the bronze table once so both checks are guaranteed to query the same
# table (the f-strings previously had nothing to interpolate).
BRONZE_TABLE = "default.bronze_legal_docs"

# Total number of rows currently in the bronze table
spark.sql(f"SELECT COUNT(*) as total_docs FROM {BRONZE_TABLE}").show()
# Preview up to five rows; truncate=False prints full column values
spark.sql(f"SELECT * FROM {BRONZE_TABLE} LIMIT 5").show(truncate=False)
