# Data Preparation for SEC Financial Filings
This notebook:
1. Creates a schema for the project  
2. Creates two Volumes: `datasets` for the raw datasets and `files` for checkpoints  
3. Copies the JSON dataset files from the repo into the `datasets` Volume  

### 1. Creates a schema for the project

In [0]:
# Unity Catalog location for the project. Both names are module-level on
# purpose: later cells reuse them to build volume names and paths.
catalog_name = "financial_data"
schema_name = "lakehouse"

# Create-then-select, first for the catalog and then for the schema.
# The statements run in this exact order: the schema name is unqualified,
# so it is resolved against the catalog selected by the preceding USE CATALOG.
setup_statements = [
    f"CREATE CATALOG IF NOT EXISTS {catalog_name}",
    f"USE CATALOG {catalog_name}",
    f"CREATE SCHEMA IF NOT EXISTS {schema_name}",
    f"USE SCHEMA {schema_name}",
]
for statement in setup_statements:
    spark.sql(statement)

print(f"Active schema: {catalog_name}.{schema_name}")

### 2. Creates the Volumes for raw datasets and checkpoints

In [0]:
# Names of the two volumes this notebook provisions.
# volume_name is reused by the copy step to build the destination path.
volume_name = "datasets"
# Volume named "files"; the variable name suggests it holds checkpoints.
checkpoints_name = "files"

# The volume names are unqualified, so this relies on the catalog and schema
# having been selected with USE CATALOG / USE SCHEMA earlier in the notebook.
# IF NOT EXISTS makes the cell safe to re-run.
for new_volume in (volume_name, checkpoints_name):
    spark.sql(f"CREATE VOLUME IF NOT EXISTS {new_volume}")
    print(f"Volume created: {catalog_name}.{schema_name}.{new_volume}")

### 3. Copies the JSON dataset files from the repo into the `datasets` Volume

In [0]:
import shutil, os

# Source: sample data folder inside the workspace repo checkout.
# NOTE: "yourname@databricks.com" is a placeholder; replace it with the
# workspace user folder that actually contains the repo before running.
repo_dataset_path = "/Workspace/Repos/yourname@databricks.com/databricks-financial-data-lakehouse/data-sample"
# Destination: a "sec_filings" folder inside the datasets volume.
volume_path = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}/sec_filings"

# List the source BEFORE creating the destination: if the source path is wrong,
# os.listdir raises here and no empty target folder is left behind.
# Sorted because os.listdir returns entries in arbitrary order; sorting keeps
# the copy order and the printed log identical on every run.
json_files = sorted(
    name for name in os.listdir(repo_dataset_path) if name.endswith(".json")
)

os.makedirs(volume_path, exist_ok=True)

for file in json_files:
    shutil.copy(os.path.join(repo_dataset_path, file), volume_path)
    print(f"Copied: {file} -> {volume_path}")

# Only claim success when something was actually copied; an empty source
# folder previously still printed the success message.
if json_files:
    print("All dataset files copied successfully.")
else:
    print(f"WARNING: no .json files found in {repo_dataset_path}; nothing was copied.")