In [38]:
# Import necessary libraries
import getpass
import glob
import os

import snowflake.connector
from tqdm import tqdm  # Import tqdm for progress bar

In [None]:
# Snowflake connection parameters.
#
# Credentials are read from the environment so that no secret is stored in the
# notebook or its version history. The account identifier and username fall
# back to the values previously used here; the password has no default and is
# prompted for (without echo) when SNOWFLAKE_PASSWORD is not set.
#
# NOTE: a password was previously hard-coded in this cell; treat it as
# compromised and rotate it.
ACCOUNT = os.environ.get("SNOWFLAKE_ACCOUNT", "voucimo-atb07951")
USER = os.environ.get("SNOWFLAKE_USER", "DNIERMAN")  # or set SNOWFLAKE_USER
PASSWORD = os.environ.get("SNOWFLAKE_PASSWORD") or getpass.getpass(
    "Snowflake password: "
)

# Snowflake objects created and loaded by this notebook
WAREHOUSE = "purchase_orders_wh"
DATABASE = "purchase_orders_db"
SCHEMA = "purchase_orders_schema"
TABLE = "purchase_orders"
STAGE = "monthly_po_stage"
FILE_FMT = "MONTHLY_PO_CSV_FMT"

# Local directory holding the monthly purchase-order CSV files
CSV_DIR = "./Data/Monthly_PO_Data"


In [40]:
# Open a Snowflake session with the configured credentials and keep a cursor
# around for the SQL statements issued by the following cells.
conn = snowflake.connector.connect(
    account=ACCOUNT,
    user=USER,
    password=PASSWORD,
)
cs = conn.cursor()

In [41]:
# Create Snowflake warehouse, database, schema and staging area.
#
# Every step is attempted independently: a failure is reported and the
# remaining steps still run, as before.
#
# NOTE: CREATE OR REPLACE DATABASE / SCHEMA / STAGE replace any existing object
# of the same name, so re-running this cell discards previously loaded data
# and previously staged files.
setup_steps = [
    (
        f"CREATE WAREHOUSE IF NOT EXISTS {WAREHOUSE};",
        f"✅ Warehouse '{WAREHOUSE}' created or replaced.",
        f"❌ Failed to create warehouse '{WAREHOUSE}'",
    ),
    (
        # IF NOT EXISTS does nothing when the warehouse is already there, so
        # select it explicitly; the later COPY INTO needs an active warehouse.
        f"USE WAREHOUSE {WAREHOUSE};",
        f"✅ Warehouse '{WAREHOUSE}' set as the active warehouse for this session.",
        f"❌ Failed to activate warehouse '{WAREHOUSE}'",
    ),
    (
        f"CREATE OR REPLACE DATABASE {DATABASE};",
        f"✅ Database '{DATABASE}' created or replaced.",
        f"❌ Failed to create database '{DATABASE}'",
    ),
    (
        f"CREATE OR REPLACE SCHEMA {DATABASE}.{SCHEMA};",
        f"✅ Schema '{SCHEMA}' created or replaced in database '{DATABASE}'.",
        f"❌ Failed to create schema '{SCHEMA}' in database '{DATABASE}'",
    ),
    (
        f"CREATE OR REPLACE STAGE {DATABASE}.{SCHEMA}.{STAGE};",
        f"Stage '{STAGE}' created or replaced in schema '{SCHEMA}' of database '{DATABASE}'.",
        f"❌ Failed to create stage '{STAGE}' in schema '{SCHEMA}' of database '{DATABASE}'",
    ),
]

for statement, success_message, failure_prefix in setup_steps:
    try:
        cs.execute(statement)
        print(success_message)
    except Exception as e:
        print(f"{failure_prefix}: {e}")


✅ Warehouse 'purchase_orders_wh' created or replaced.
✅ Database 'purchase_orders_db' created or replaced.
✅ Schema 'purchase_orders_schema' created or replaced in database 'purchase_orders_db'.
Stage 'monthly_po_stage' created or replaced in schema 'purchase_orders_schema' of database 'purchase_orders_db'.


In [42]:
# Upload every local monthly CSV file to the Snowflake internal stage.

# Sorted so the upload order (and any failure report) is deterministic
csv_files = sorted(glob.glob(f"{CSV_DIR}/*.csv"))

# Check if any CSV files were found
if not csv_files:
    print("No CSV files found in the specified directory.")
else:
    stage_ref = f"@{DATABASE}.{SCHEMA}.{STAGE}"
    failed_uploads = []  # (path, error message) for each file that failed

    # Use tqdm to create a progress bar
    for file_path in tqdm(csv_files, desc="Uploading files", unit="file"):
        # A quoted PUT file URI must use forward slashes, including on Windows
        # where glob returns backslash-separated paths. An absolute path also
        # removes any dependence on how a relative URI is resolved.
        local_uri = "file://" + os.path.abspath(file_path).replace(os.sep, "/")
        put_command = f"PUT '{local_uri}' {stage_ref} OVERWRITE = TRUE"
        try:
            cs.execute(put_command)
        except Exception as upload_error:
            # Keep going: one bad file should not block the remaining uploads
            failed_uploads.append((file_path, str(upload_error)))

    # Print summary of failed uploads
    if failed_uploads:
        print("\nThe following files failed to upload:")
        for file_path, error in failed_uploads:
            print(f"  - {file_path}: {error}")
    else:
        print("\nAll files uploaded successfully.")


Uploading files:   0%|          | 0/41 [00:00<?, ?file/s]

Uploading files: 100%|██████████| 41/41 [00:15<00:00,  2.67file/s]


All files uploaded successfully.





In [61]:
# (Re)create the target table that the staged purchase-order CSVs are loaded
# into. CREATE OR REPLACE discards any existing table of the same name.
try:
    create_table_sql = f"""
        CREATE OR REPLACE TABLE {DATABASE}.{SCHEMA}.{TABLE} (
            PurchaseOrderID INTEGER,
            SupplierID INTEGER,
            OrderDate DATE,
            DeliveryMethodID INTEGER,
            ContactPersonID INTEGER,
            ExpectedDeliveryDate DATE,
            SupplierReference STRING,
            IsOrderFinalized BOOLEAN,
            Comments STRING,
            InternalComments STRING,
            LastEditedBy INTEGER,
            LastEditedWhen TIMESTAMP,
            PurchaseOrderLineID INTEGER,
            StockItemID INTEGER,
            OrderedOuters INTEGER,
            Description STRING,
            ReceivedOuters INTEGER,
            PackageTypeID INTEGER,
            ExpectedUnitPricePerOuter FLOAT,
            LastReceiptDate DATE,
            IsOrderLineFinalized BOOLEAN,
            Right_LastEditedBy INTEGER,
            Right_LastEditedWhen TIMESTAMP
        )
    """
    cs.execute(create_table_sql)
    print("Table created successfully!")
except Exception as table_error:
    # Report and carry on so the rest of the notebook can still be inspected
    print(f"Error creating table: {table_error}")


Table created successfully!


In [64]:
# Define the named CSV file format referenced by the COPY INTO statements.
# PARSE_HEADER = TRUE is what lets COPY match columns by header name.
try:
    file_format_sql = f"""
        CREATE OR REPLACE FILE FORMAT {DATABASE}.{SCHEMA}.{FILE_FMT}
        TYPE = 'CSV'
        FIELD_DELIMITER = ','
        FIELD_OPTIONALLY_ENCLOSED_BY = '"'
        PARSE_HEADER = TRUE
        NULL_IF = ('NULL', 'null')
        EMPTY_FIELD_AS_NULL = TRUE
        COMPRESSION = 'AUTO'
        DATE_FORMAT = 'MM/DD/YYYY'
        TIMESTAMP_FORMAT = 'MM/DD/YYYY HH24:MI'
    """
    cs.execute(file_format_sql)
    print("File format updated with PARSE_HEADER=TRUE.")
except Exception as format_error:
    print(f"Error creating file format: {format_error}")


File format updated with PARSE_HEADER=TRUE.


In [65]:
# Load every staged CSV (optionally gzip-compressed) into the target table,
# matching CSV headers to table columns case-insensitively.
# ON_ERROR = 'CONTINUE' means bad rows are skipped rather than aborting the
# load, so the success message below only says the statement itself ran.
try:
    copy_sql = f"""
        COPY INTO {DATABASE}.{SCHEMA}.{TABLE}
        FROM @{DATABASE}.{SCHEMA}.{STAGE}
        FILE_FORMAT = (FORMAT_NAME = '{DATABASE}.{SCHEMA}.{FILE_FMT}')
        PATTERN = '.*\\.csv(\\.gz)?$'
        MATCH_BY_COLUMN_NAME = CASE_INSENSITIVE
        ON_ERROR = 'CONTINUE'
    """
    cs.execute(copy_sql)
    print("COPY INTO executed successfully!")
except Exception as copy_error:
    print(f"Error running COPY INTO: {copy_error}")


COPY INTO executed successfully!


In [70]:
from tqdm import tqdm

# Reload the table from scratch and verify the outcome for every staged file.

# Clear table first
cs.execute(f"TRUNCATE TABLE {DATABASE}.{SCHEMA}.{TABLE}")

# Count CSV files in stage
cs.execute(f"LIST @{DATABASE}.{SCHEMA}.{STAGE}")
stage_files = [
    row[0] for row in cs.fetchall() if row[0].lower().endswith((".csv", ".csv.gz"))
]
stage_count = len(stage_files)

# Run COPY; the statement has already finished when execute() returns, so the
# progress bar below only tracks reading the per-file result rows.
cs.execute(f"""
    COPY INTO {DATABASE}.{SCHEMA}.{TABLE}
    FROM @{DATABASE}.{SCHEMA}.{STAGE}
    FILE_FORMAT = (FORMAT_NAME = '{DATABASE}.{SCHEMA}.{FILE_FMT}')
    PATTERN = '.*\\.csv(\\.gz)?$'
    MATCH_BY_COLUMN_NAME = CASE_INSENSITIVE
    ON_ERROR = 'CONTINUE'
""")

copy_results = []
with tqdm(total=stage_count, desc="Loading CSVs", unit="file") as pbar:
    for row in cs.fetchall():
        copy_results.append(row)
        pbar.update(1)

# Per-file result rows carry the file name in column 0 and the load status in
# column 1. A COPY that found nothing to load returns a single one-column
# message row instead, which must not be counted as a loaded file.
file_rows = [row for row in copy_results if len(row) > 1]

# With ON_ERROR = 'CONTINUE' a file can come back as PARTIALLY_LOADED or
# LOAD_FAILED; only rows whose status is LOADED count as successfully loaded.
problem_rows = [row for row in file_rows if str(row[1]).upper() != "LOADED"]
load_count = len(file_rows) - len(problem_rows)

# Simple check
if load_count == stage_count:
    print(f"\n✅ Success: {load_count} files loaded, matches {stage_count} in stage.")
else:
    print(f"\n⚠️ Mismatch: {load_count} loaded vs {stage_count} files in stage.")
    for row in problem_rows:
        # Column 6 holds the first error message when the row is wide enough
        first_error = row[6] if len(row) > 6 and row[6] else "no error detail"
        print(f"  - {row[0]}: {row[1]} ({first_error})")


Loading CSVs: 100%|██████████| 41/41 [00:00<00:00, 260161.07file/s]


✅ Success: 41 files loaded, matches 41 in stage.



