In [0]:
import pandas as pd
import logging


In [0]:
# Root-logger setup for the pipeline: emit INFO and above, with each record
# rendered as "<timestamp> - <level> - <message>". The functions below log
# their progress and errors through this configuration.
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)

In [0]:
# Location of the raw SuperStore CSV file inside the shopz_catalog volume.
RAW_DATA_PATH = (
    "/Volumes/shopz_catalog/default/superstore_volume/"
    "Sample - Superstore.csv"
)


In [0]:
# Columns the incoming dataset must contain; used for schema validation.
EXPECTED_COLUMNS = [
    # Order and shipping
    "Order ID",
    "Order Date",
    "Ship Date",
    "Ship Mode",
    # Customer
    "Customer ID",
    "Customer Name",
    "Segment",
    # Geography
    "Country",
    "City",
    "State",
    "Postal Code",
    "Region",
    # Product
    "Product ID",
    "Category",
    "Sub-Category",
    "Product Name",
    # Measures
    "Sales",
    "Quantity",
    "Discount",
    "Profit",
]


In [0]:
# Schema expectations used to validate incoming data.

# Columns converted to datetimes during validation.
DATE_COLUMNS = [
    "Order Date",
    "Ship Date",
]

# Columns converted to numbers during validation.
NUMERIC_COLUMNS = [
    "Sales",
    "Quantity",
    "Discount",
    "Profit",
]


In [0]:
def read_raw_data(file_path):
    """Load the raw SuperStore CSV at ``file_path`` into a DataFrame.

    The file is decoded as latin1. Any failure is logged and then re-raised
    unchanged, with a dedicated log message when the file does not exist.
    """
    logging.info("Reading raw SuperStore data")
    try:
        raw_frame = pd.read_csv(file_path, encoding="latin1")
    except FileNotFoundError:
        logging.error("Raw data file not found")
        raise
    except Exception as read_error:
        logging.error(f"Error while reading raw data: {read_error}")
        raise
    logging.info("Raw data loaded successfully")
    return raw_frame

# This function loads raw CSV data using Pandas.

In [0]:
def validate_schema(df, expected_columns=None):
    """Check that every expected column is present in ``df``.

    Extra columns in ``df`` are ignored; only missing ones are an error.

    Args:
        df: DataFrame whose ``columns`` are checked.
        expected_columns: Iterable of required column names. Defaults to the
            module-level ``EXPECTED_COLUMNS``.

    Raises:
        ValueError: If one or more expected columns are absent. The message
            lists the missing columns in the order they were expected.
    """
    if expected_columns is None:
        expected_columns = EXPECTED_COLUMNS

    present = set(df.columns)
    # Build a list in schema order (not a set) so the log line and the
    # exception message are deterministic across runs.
    missing_cols = [col for col in expected_columns if col not in present]
    if missing_cols:
        message = f"Schema validation failed. Missing columns: {missing_cols}"
        logging.error(message)
        # Carry the missing columns in the exception too, so a caller that
        # only reports the exception still says what is wrong.
        raise ValueError(message)
    logging.info("Schema validation passed")

# This ensures the incoming dataset structure is correct.

In [0]:
def validate_date_columns(df, date_columns=None):
    """Convert date columns to datetimes in place and count invalid values.

    Values that cannot be parsed are coerced to ``NaT``. A value counts as
    invalid only if it was present before conversion and is ``NaT``
    afterwards; values already missing in the input are not counted.

    Args:
        df: DataFrame to validate. The listed columns are replaced in place.
        date_columns: Column names to convert. Defaults to the module-level
            ``DATE_COLUMNS``.

    Returns:
        pandas.Series indexed by column name, holding the number of invalid
        (unparseable) values found in each column.
    """
    if date_columns is None:
        date_columns = DATE_COLUMNS

    invalid_counts = {}
    for col in date_columns:
        # Remember which cells had a value so pre-existing gaps are not
        # reported as parse failures.
        was_present = df[col].notna()
        df[col] = pd.to_datetime(df[col], errors="coerce")
        invalid_counts[col] = int((was_present & df[col].isna()).sum())

    invalid_dates = pd.Series(invalid_counts, dtype="int64")
    logging.info(f"Invalid date values count:\n{invalid_dates}")
    return invalid_dates

# Invalid dates are detected and converted safely.

In [0]:
def validate_numeric_columns(df, numeric_columns=None):
    """Convert numeric columns to numbers in place and count invalid values.

    Values that cannot be parsed are coerced to ``NaN``. A value counts as
    invalid only if it was present before conversion and is ``NaN``
    afterwards; values already missing in the input are not counted.

    Args:
        df: DataFrame to validate. The listed columns are replaced in place.
        numeric_columns: Column names to convert. Defaults to the
            module-level ``NUMERIC_COLUMNS``.

    Returns:
        pandas.Series indexed by column name, holding the number of invalid
        (unparseable) values found in each column.
    """
    if numeric_columns is None:
        numeric_columns = NUMERIC_COLUMNS

    invalid_counts = {}
    for col in numeric_columns:
        # Remember which cells had a value so pre-existing gaps are not
        # reported as parse failures.
        was_present = df[col].notna()
        df[col] = pd.to_numeric(df[col], errors="coerce")
        invalid_counts[col] = int((was_present & df[col].isna()).sum())

    invalid_numbers = pd.Series(invalid_counts, dtype="int64")
    logging.info(f"Invalid numeric values count:\n{invalid_numbers}")
    return invalid_numbers

# Numeric fields are validated to avoid calculation errors later.

In [0]:
# Run the ingestion step: load the raw file, log a profile of it, then
# validate schema, dates and numerics.
try:
    # Load raw dataset
    df_raw = read_raw_data(RAW_DATA_PATH)

    # Log dataset information
    logging.info(f"Dataset Shape: {df_raw.shape}")
    logging.info(f"Column Names: {df_raw.columns.tolist()}")
    logging.info(f"Missing Values Count:\n{df_raw.isnull().sum()}")

    # Perform validations. The date and numeric validators convert the
    # columns of df_raw in place, so the frame previewed below holds the
    # converted values.
    validate_schema(df_raw)
    validate_date_columns(df_raw)
    validate_numeric_columns(df_raw)

    logging.info("Data ingestion validation completed successfully")

    # Preview data
    display(df_raw.head())

except Exception as e:
    logging.critical(f"Ingestion pipeline failed: {e}")
    # Re-raise so the failure stops the run instead of letting later cells
    # continue with a missing or unvalidated df_raw.
    raise

# This is the execution of the bronze ingestion and validation step.

2025-12-29 07:52:05,435 - INFO - Reading raw SuperStore data
2025-12-29 07:52:06,601 - INFO - Raw data loaded successfully
2025-12-29 07:52:06,602 - INFO - Dataset Shape: (9994, 21)
2025-12-29 07:52:06,603 - INFO - Column Names: ['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode', 'Customer ID', 'Customer Name', 'Segment', 'Country', 'City', 'State', 'Postal Code', 'Region', 'Product ID', 'Category', 'Sub-Category', 'Product Name', 'Sales', 'Quantity', 'Discount', 'Profit']
2025-12-29 07:52:06,609 - INFO - Missing Values Count:
Row ID           0
Order ID         0
Order Date       0
Ship Date        0
Ship Mode        0
Customer ID      0
Customer Name    0
Segment          0
Country          0
City             0
State            0
Postal Code      0
Region           0
Product ID       0
Category         0
Sub-Category     0
Product Name     0
Sales            0
Quantity         0
Discount         0
Profit           0
dtype: int64
2025-12-29 07:52:06,610 - INFO - Schema vali

Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
1,CA-2016-152156,2016-11-08T00:00:00.000Z,2016-11-11T00:00:00.000Z,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
2,CA-2016-152156,2016-11-08T00:00:00.000Z,2016-11-11T00:00:00.000Z,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back",731.94,3,0.0,219.582
3,CA-2016-138688,2016-06-12T00:00:00.000Z,2016-06-16T00:00:00.000Z,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters by Universal,14.62,2,0.0,6.8714
4,US-2015-108966,2015-10-11T00:00:00.000Z,2015-10-18T00:00:00.000Z,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
5,US-2015-108966,2015-10-11T00:00:00.000Z,2015-10-18T00:00:00.000Z,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164
