# In [0]:
# Import libraries
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import matplotlib.pyplot as plt
import seaborn as sns

# COMMAND ----------

# Configuration - Data Paths
# Every location below is derived from BASE_PATH, so relocating the project
# only requires changing this one value.
BASE_PATH = "/Volumes/workspace/default/file_store"
# Raw input files are read directly from the base directory.
RAW_DATA_PATH = BASE_PATH
PROCESSED_DATA_PATH = f"{BASE_PATH}/processed_data"
FEATURE_DATA_PATH = f"{BASE_PATH}/feature_data"
MODEL_PATH = f"{BASE_PATH}/models"

# Specific data files
BENEFICIARY_FILE = f"{RAW_DATA_PATH}/DE1_0_2008_Beneficiary_Summary_File_Sample_1.csv"
INPATIENT_CLAIMS_FILE = f"{RAW_DATA_PATH}/DE1_0_2008_to_2010_Inpatient_Claims_Sample_1.csv"

# Delta Lake paths (bronze / silver / gold sub-directories under one root)
DELTA_BASE_PATH = f"{BASE_PATH}/delta"
DELTA_BRONZE_PATH = f"{DELTA_BASE_PATH}/bronze"
DELTA_SILVER_PATH = f"{DELTA_BASE_PATH}/silver"
DELTA_GOLD_PATH = f"{DELTA_BASE_PATH}/gold"

# The check mark is written as an escape (U+2713) so the message survives any
# re-encoding of this source file; the previous literal had been corrupted into
# mojibake by a UTF-8 / cp1252 round trip.
print("\u2713 Configuration loaded")

# COMMAND ----------

# Echo the resolved input locations, one per line, so a wrong path is visible
# before any load is attempted.
print(
    f"Loading data from: {RAW_DATA_PATH}",
    f"Beneficiary file: {BENEFICIARY_FILE}",
    f"Inpatient claims file: {INPATIENT_CLAIMS_FILE}",
    sep="\n",
)

# COMMAND ----------

# MAGIC %md
# MAGIC ## 3. MLflow Configuration

# COMMAND ----------

# Tracking URI handed to MLflow.
MLFLOW_TRACKING_URI = "databricks"
# Workspace path of the experiment that runs are recorded under.
MLFLOW_EXPERIMENT_NAME = "/Users/shahan24h@gmail.com/oncology-treatment-prediction"

# COMMAND ----------

# MAGIC %md
# MAGIC ## 4. Model Parameters

# COMMAND ----------

# Single seed shared by every model below so runs are repeatable.
RANDOM_SEED = 42

# Fractions used for the train/test split and the validation split.
TEST_SIZE = 0.2
VALIDATION_SIZE = 0.2

# Default hyper-parameters, keyed by model name. Each inner mapping holds the
# keyword arguments for that model and reuses RANDOM_SEED.
MODEL_PARAMS = {
    "random_forest": dict(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        random_state=RANDOM_SEED,
    ),
    "xgboost": dict(
        max_depth=6,
        learning_rate=0.1,
        n_estimators=100,
        random_state=RANDOM_SEED,
    ),
    "logistic_regression": dict(
        max_iter=1000,
        random_state=RANDOM_SEED,
    ),
}

# COMMAND ----------

# MAGIC %md
# MAGIC ## 5. Feature Engineering Parameters

# COMMAND ----------

# Name of the column to predict. This is a placeholder until the target is
# defined from the actual data.
TARGET_VARIABLE = "treatment_type"

# Feature column names, grouped by kind.
DEMOGRAPHIC_FEATURES = [
    "age",
    "gender",
    "race",
]
CLINICAL_FEATURES = [
    "cancer_type",
    "cancer_stage",
    "comorbidity_count",
]
TEMPORAL_FEATURES = [
    "days_since_diagnosis",
    "treatment_year",
    "treatment_month",
]

# COMMAND ----------

# MAGIC %md
# MAGIC ## 6. Data Quality Thresholds

# COMMAND ----------

# Smallest number of samples required for each class.
MIN_SAMPLES_PER_CLASS = 100

# Largest tolerated fraction of missing values (0.5 means 50%).
MAX_MISSING_THRESHOLD = 0.5

# COMMAND ----------

# MAGIC %md
# MAGIC ## 7. Visualization Settings

# COMMAND ----------

# These two imports duplicate the ones at the top of the file.
import matplotlib.pyplot as plt
import seaborn as sns

# Set default plot style
# NOTE(review): the 'seaborn-v0_8-*' style names appear to exist only in newer
# matplotlib releases; confirm the runtime's matplotlib version provides this one.
# NOTE(review): both calls below change global plotting defaults and may touch
# the same colour-cycle setting, so keep them in this order unless verified.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Default figure dimensions; presumably (width, height) for matplotlib's
# `figsize` argument; confirm against the plotting code that uses them.
FIGURE_SIZE = (12, 6)
LARGE_FIGURE_SIZE = (15, 8)

# COMMAND ----------

# MAGIC %md
# MAGIC ## 8. Export Configuration as Dictionary

# COMMAND ----------

def get_config():
    """
    Return all configuration values as one nested dictionary.

    The result has the top-level keys "project", "paths", "mlflow", "model",
    "features" and "quality", each mapping to a dictionary built from the
    module-level configuration constants.

    The project metadata constants (PROJECT_NAME, PROJECT_VERSION, AUTHOR,
    DESCRIPTION) are resolved from the module globals at call time. When one
    of them has not been defined, its entry is None instead of the whole call
    failing with a NameError; when it is defined, its value is used as-is.

    Returns:
        dict: The nested configuration. Values are the module-level objects
        themselves, not copies, so mutating e.g. ``config["model"]["params"]``
        also mutates MODEL_PARAMS.
    """
    # Looked up by name because these constants are not assigned alongside the
    # other settings; a direct reference would raise NameError when they are
    # missing.
    module_scope = globals()
    project = {
        "name": module_scope.get("PROJECT_NAME"),
        "version": module_scope.get("PROJECT_VERSION"),
        "author": module_scope.get("AUTHOR"),
        "description": module_scope.get("DESCRIPTION"),
    }

    paths = {
        "base": BASE_PATH,
        "raw_data": RAW_DATA_PATH,
        "processed_data": PROCESSED_DATA_PATH,
        "feature_data": FEATURE_DATA_PATH,
        "model": MODEL_PATH,
        "delta": {
            "bronze": DELTA_BRONZE_PATH,
            "silver": DELTA_SILVER_PATH,
            "gold": DELTA_GOLD_PATH,
        },
    }

    return {
        "project": project,
        "paths": paths,
        "mlflow": {
            "experiment_name": MLFLOW_EXPERIMENT_NAME,
            "tracking_uri": MLFLOW_TRACKING_URI,
        },
        "model": {
            "random_seed": RANDOM_SEED,
            "test_size": TEST_SIZE,
            "validation_size": VALIDATION_SIZE,
            "params": MODEL_PARAMS,
        },
        "features": {
            "target": TARGET_VARIABLE,
            "demographic": DEMOGRAPHIC_FEATURES,
            "clinical": CLINICAL_FEATURES,
            "temporal": TEMPORAL_FEATURES,
        },
        "quality": {
            "max_missing": MAX_MISSING_THRESHOLD,
            "min_samples": MIN_SAMPLES_PER_CLASS,
        },
    }

# Build the configuration once and echo two of its fields as a smoke test.
config = get_config()
print(
    f"Configuration loaded for project: {config['project']['name']}",
    f"MLflow Experiment: {config['mlflow']['experiment_name']}",
    sep="\n",
)