# Kaggle Data Loader - Databricks Notebook

This notebook downloads a Kaggle dataset, uploads it to a Databricks Volume, and loads it into a Spark DataFrame for a quick inspection.

In [None]:
%pip install kaggle pyspark

In [None]:
import os

# Expose the Kaggle credentials through the KAGGLE_USERNAME / KAGGLE_KEY
# environment variables for the rest of the notebook session.
# Option 1: Use Databricks Secrets (Recommended)
try:
    # Both secrets are fetched before either variable is written, so a failed
    # lookup never leaves only one of the two variables set.
    username = dbutils.secrets.get("kaggle-scope", "username")
    api_key = dbutils.secrets.get("kaggle-scope", "api-key")
    os.environ["KAGGLE_USERNAME"] = username
    os.environ["KAGGLE_KEY"] = api_key
    print("✓ Kaggle credentials configured from secrets")
except Exception as e:
    # Best effort: report the problem and let the notebook continue.
    print(f"⚠ Could not load from secrets: {e}")
    # dbutils.secrets cannot create scopes from a notebook; point the user at
    # the Databricks CLI instead.
    print("Create the scope with the Databricks CLI: databricks secrets create-scope kaggle-scope")
    print("Then add the 'username' and 'api-key' secrets to that scope.")

In [None]:
import sys

# Directory added to the import path so that `basics.file_read` resolves.
workspace_module_dir = "/Workspace/databricks"
# Guarded so that re-running this cell does not keep appending the same entry.
if workspace_module_dir not in sys.path:
    sys.path.append(workspace_module_dir)

from basics.file_read import DatabricksKaggleLoader

print("✓ Module imported successfully")

In [None]:
# Create the loader, passing the volume directory as its Databricks path.
volume_root = "/Volumes/workspace/default/kaggle"
loader = DatabricksKaggleLoader(databricks_path=volume_root)

# Request a single file of the Kaggle dataset; the call returns the path that
# later cells use to locate the CSV.
print("Starting download from Kaggle...")
download_args = {
    "dataset_name": "aekundayo/health-insurance-data",
    "file_name": "BenefitsCostSharing.csv",
}
dataset_path = loader.download_and_upload_dataset(**download_args)

print(f"✓ Dataset uploaded to: {dataset_path}")

In [None]:
import os
from pyspark.sql import functions as F

# Location of the CSV inside the directory returned by the download step.
csv_path = f"{dataset_path}/BenefitsCostSharing.csv"

# Confirm the file is present, then read it. Any failure in either step is
# reported and leaves df set to None.
try:
    # Listing the path serves as the existence check: an error here is
    # reported below as "file not found".
    dbutils.fs.ls(csv_path)
    print(f"✓ File verified at: {csv_path}")

    loaded_df = loader.load_csv_with_spark(csv_path, num_rows=10)
    print("✓ Dataset loaded successfully")
except Exception as err:
    print(f"✗ File not found or error loading: {err}")
    df = None
else:
    df = loaded_df

In [None]:
# Build the CSV location from the uploaded dataset directory and read it.
csv_name = "BenefitsCostSharing.csv"
csv_path = f"{dataset_path}/{csv_name}"
df = loader.load_csv_with_spark(csv_path, num_rows=10)

# Render the DataFrame in the notebook output.
display(df)

In [None]:
# Summarise the loaded DataFrame: size, schema, and share of nulls per column.
# Uses `df` and `F` (pyspark.sql.functions) defined in earlier cells.
row_count = df.count()
print(f"Dataset shape: {row_count} rows, {len(df.columns)} columns")
print("\nColumn types:")
df.printSchema()
# The figures shown are ratios between 0 and 1, not counts, so the heading
# says so explicitly.
print("\nNull fraction per column:")
# The average of the 0/1 null indicator equals nulls / rows. Unlike an explicit
# count/count division it yields null, not a divide-by-zero, when the
# DataFrame has no rows.
null_fractions = [
    F.avg(F.col(c).isNull().cast("int")).alias(c) for c in df.columns
]
df.select(null_fractions).show()

In [None]:
# Show what is stored in the dataset directory of the volume.
import os

dataset_dir = "/Volumes/workspace/default/kaggle/health-insurance-data"
print("Files in Databricks Volumes:")
# Kept as the final expression so the notebook renders the listing.
dbutils.fs.ls(dataset_dir)