# Ingest CSV Files from GitHub

**Purpose:** Load all 36 monthly CSV files (January 2023 through December 2025) from GitHub into individual bronze Delta tables, one table per file

**Source:** https://github.com/sulaiman013/sales-analytics-data-dbt-jobs-fabric

**Target:** 36 individual Delta tables in the `dbo` schema, named `dbo.bronze_YYYY_MM`

**Lakehouse:** sales_LH (schema-enabled)

In [1]:
# Configuration
# Base URL of the raw CSV files; the ingestion loop requests
# {GITHUB_BASE_URL}/sales_YYYY_MM.csv for every year/month combination.
GITHUB_BASE_URL = "https://raw.githubusercontent.com/sulaiman013/sales-analytics-data-dbt-jobs-fabric/master/data"
YEARS = [2023, 2024, 2025]
MONTHS = range(1, 13)
# Schema that receives the bronze tables (tables are named bronze_YYYY_MM).
TARGET_SCHEMA = "dbo"

# Derive the file count from the configuration instead of hard-coding it, so
# the banner stays correct when YEARS or MONTHS are edited.
print("="*60)
print(f"BRONZE LAYER: Ingest {len(YEARS) * len(MONTHS)} CSV files from GitHub")
print("="*60)
print(f"Source: {GITHUB_BASE_URL}")
print(f"Target Schema: {TARGET_SCHEMA}")
print(f"Total files to load: {len(YEARS) * len(MONTHS)}")
print("="*60)

StatementMeta(, db8bcbd0-8509-448b-a3b3-27dcefcbc2ab, 3, Finished, Available, Finished)

BRONZE LAYER: Ingest 36 CSV files from GitHub
Source: https://raw.githubusercontent.com/sulaiman013/sales-analytics-data-dbt-jobs-fabric/master/data
Target Schema: dbo
Total files to load: 36


In [2]:
# Ingest each CSV file from GitHub
# Note: Fabric Spark can't read HTTP URLs directly, so we use pandas + requests

import time
import pandas as pd
import requests
from io import StringIO

# Upper bound (seconds) for each HTTP request so that a stalled connection
# cannot hang the whole notebook run.
REQUEST_TIMEOUT_SECONDS = 60

# Number of files the loop will attempt; used in the progress messages.
expected_files = len(YEARS) * len(MONTHS)

start_time = time.time()
files_loaded = 0
total_rows = 0
# (table name, error text) for every file that could not be loaded.
failed_tables = []

for year in YEARS:
    for month in MONTHS:
        # Table name with schema: dbo.bronze_YYYY_MM
        table_name = f"{TARGET_SCHEMA}.bronze_{year}_{month:02d}"
        url = f"{GITHUB_BASE_URL}/sales_{year}_{month:02d}.csv"

        try:
            # Download CSV using requests; raise on HTTP 4xx/5xx responses
            response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
            response.raise_for_status()

            # Read into pandas DataFrame
            pdf = pd.read_csv(StringIO(response.text))
            # The pandas frame already holds every row, so count here rather
            # than launching a separate Spark job with df.count().
            row_count = len(pdf)

            # Convert to Spark DataFrame
            df = spark.createDataFrame(pdf)

            # Write to Delta table in dbo schema; "overwrite" replaces the
            # table contents, so re-running the cell does not duplicate rows.
            df.write.format("delta").mode("overwrite").saveAsTable(table_name)

            files_loaded += 1
            total_rows += row_count

            # Report progress every 6 successfully loaded files
            if files_loaded % 6 == 0:
                elapsed = time.time() - start_time
                print(f"  Loaded {files_loaded}/{expected_files} files... ({elapsed:.1f}s)")

        except Exception as e:
            # Best-effort ingestion: a failure on one file must not stop the
            # remaining files, but it is recorded so it is not lost in the log.
            failed_tables.append((table_name, f"{type(e).__name__}: {e}"))
            print(f"  ERROR loading {table_name}: {type(e).__name__}: {str(e)}")

total_time = time.time() - start_time

# Make partial failures visible at the end of the run.
if failed_tables:
    print(f"  WARNING: {len(failed_tables)} of {expected_files} files failed to load")

StatementMeta(, db8bcbd0-8509-448b-a3b3-27dcefcbc2ab, 4, Finished, Available, Finished)

  Loaded 6/36 files... (75.9s)
  Loaded 12/36 files... (129.8s)
  Loaded 18/36 files... (241.3s)
  Loaded 24/36 files... (294.6s)
  Loaded 30/36 files... (351.9s)
  Loaded 36/36 files... (410.9s)


In [3]:
# Summary
# Reports the counters produced by the ingestion loop
# (files_loaded, total_rows, total_time).
print("\n" + "="*60)
print("INGESTION COMPLETE")
print("="*60)
print(f"Target Schema:  {TARGET_SCHEMA}")
print(f"Files Loaded:   {files_loaded}")
print(f"Total Rows:     {total_rows:,}")
print(f"Total Time:     {total_time:.2f} seconds")
# The ingestion loop swallows per-file errors, so files_loaded can be 0;
# guard the average to avoid a ZeroDivisionError in that case.
if files_loaded:
    print(f"Avg per file:   {total_time/files_loaded:.2f} seconds")
else:
    print("Avg per file:   n/a (no files loaded)")
print("="*60)

StatementMeta(, db8bcbd0-8509-448b-a3b3-27dcefcbc2ab, 5, Finished, Available, Finished)


INGESTION COMPLETE
Target Schema:  dbo
Files Loaded:   36
Total Rows:     3,600,000
Total Time:     410.91 seconds
Avg per file:   11.41 seconds


In [4]:
# List all bronze tables in dbo schema
print(f"\nTables in {TARGET_SCHEMA} schema:")
# Scope the listing to TARGET_SCHEMA explicitly; without an argument,
# listTables() reports the session's current database, which is not
# guaranteed to be the schema the tables were written to.
tables = spark.catalog.listTables(TARGET_SCHEMA)
bronze_tables = [t for t in tables if t.name.startswith("bronze_")]
for t in sorted(bronze_tables, key=lambda x: x.name):
    print(f"  - {TARGET_SCHEMA}.{t.name}")
print(f"\nTotal: {len(bronze_tables)} tables")

StatementMeta(, db8bcbd0-8509-448b-a3b3-27dcefcbc2ab, 6, Finished, Available, Finished)


Tables in dbo schema:
  - dbo.bronze_2023_01
  - dbo.bronze_2023_02
  - dbo.bronze_2023_03
  - dbo.bronze_2023_04
  - dbo.bronze_2023_05
  - dbo.bronze_2023_06
  - dbo.bronze_2023_07
  - dbo.bronze_2023_08
  - dbo.bronze_2023_09
  - dbo.bronze_2023_10
  - dbo.bronze_2023_11
  - dbo.bronze_2023_12
  - dbo.bronze_2024_01
  - dbo.bronze_2024_02
  - dbo.bronze_2024_03
  - dbo.bronze_2024_04
  - dbo.bronze_2024_05
  - dbo.bronze_2024_06
  - dbo.bronze_2024_07
  - dbo.bronze_2024_08
  - dbo.bronze_2024_09
  - dbo.bronze_2024_10
  - dbo.bronze_2024_11
  - dbo.bronze_2024_12
  - dbo.bronze_2025_01
  - dbo.bronze_2025_02
  - dbo.bronze_2025_03
  - dbo.bronze_2025_04
  - dbo.bronze_2025_05
  - dbo.bronze_2025_06
  - dbo.bronze_2025_07
  - dbo.bronze_2025_08
  - dbo.bronze_2025_09
  - dbo.bronze_2025_10
  - dbo.bronze_2025_11
  - dbo.bronze_2025_12

Total: 36 tables


In [5]:
# Verify sample data from first table
# Build the name from the configuration (first year, first month) using the
# same pattern as the ingestion loop, so the preview follows config changes.
sample_table = f"{TARGET_SCHEMA}.bronze_{YEARS[0]}_{MONTHS[0]:02d}"
print(f"\nSample data from {sample_table}:")
# truncate=False prints full column values instead of cutting them off.
spark.sql(f"SELECT * FROM {sample_table} LIMIT 5").show(truncate=False)

StatementMeta(, db8bcbd0-8509-448b-a3b3-27dcefcbc2ab, 7, Finished, Available, Finished)


Sample data from dbo.bronze_2023_01:
+--------------+-----------+---------+-------------+----------+----------+----------+-----------+---------+----------+-----------------+----------+--------+-----+-------+---------------+----------+--------+------------+-----------+----------+----------+---------------+------------+-----------------+---------------------+------------------+-----------------------+--------------+------------+-----------+----------+---------------+-------------+--------------+----------+
|order_id      |customer_id|driver_id|restaurant_id|order_date|order_time|order_year|order_month|order_day|order_hour|order_day_of_week|is_weekend|city    |state|country|restaurant_type|item_count|subtotal|delivery_fee|service_fee|tax_amount|tip_amount|discount_amount|total_amount|prep_time_minutes|delivery_time_minutes|total_time_minutes|delivery_distance_miles|payment_method|order_status|device_type|promo_code|customer_rating|driver_rating|is_first_order|is_reorder|
+--------------+

In [11]:
# Show every table in the default database whose name begins with
# "source_bronze_", in alphabetical order, followed by the total count.
print("\nTables with 'source_bronze_' prefix:")
tables = spark.catalog.listTables()
source_tables = list(
    filter(lambda entry: entry.name.startswith("source_bronze_"), tables)
)
# Only the names are printed, so sort the names directly.
for source_name in sorted(entry.name for entry in source_tables):
    print(f"  - {source_name}")
print(f"\nTotal: {len(source_tables)} tables")


StatementMeta(, 41f7a560-ee30-4f18-960d-89906a2f6c02, 13, Finished, Available, Finished)


Tables with 'source_bronze_' prefix:
  - source_bronze_2023_01
  - source_bronze_2023_02
  - source_bronze_2023_03
  - source_bronze_2023_04
  - source_bronze_2023_05
  - source_bronze_2023_06
  - source_bronze_2023_07
  - source_bronze_2023_08
  - source_bronze_2023_09
  - source_bronze_2023_10
  - source_bronze_2023_11
  - source_bronze_2023_12
  - source_bronze_2024_01
  - source_bronze_2024_02
  - source_bronze_2024_03
  - source_bronze_2024_04
  - source_bronze_2024_05
  - source_bronze_2024_06
  - source_bronze_2024_07
  - source_bronze_2024_08
  - source_bronze_2024_09
  - source_bronze_2024_10
  - source_bronze_2024_11
  - source_bronze_2024_12
  - source_bronze_2025_01
  - source_bronze_2025_02
  - source_bronze_2025_03
  - source_bronze_2025_04
  - source_bronze_2025_05
  - source_bronze_2025_06
  - source_bronze_2025_07
  - source_bronze_2025_08
  - source_bronze_2025_09
  - source_bronze_2025_10
  - source_bronze_2025_11
  - source_bronze_2025_12

Total: 36 tables
