In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import sys

spark = SparkSession.builder.appName("Stage1_Ingestion").getOrCreate()

#paths read from volumes
FACT_PATH = "/Volumes/capstone/default/datasets/global_export_fact_dataset.csv"
COUNTRY_PATH = "/Volumes/capstone/default/datasets/country_dataset.csv"
PRODUCT_PATH = "/Volumes/capstone/default/datasets/product_dataset.csv"

#target tables and their storage locations
BRONZE_FACT = "capstone.default.bronze_fact"
BRONZE_COUNTRY = "capstone.default.bronze_country"
BRONZE_PRODUCT = "capstone.default.bronze_product"


In [0]:
#columns of each table for validation purpose
FACT_COLS = ["Country_Name","Country_Code","Year","Month",
             "Product_Code","Product_Name","Product_Category",
             "Region","Export_Value_USD","Export_Units"]

COUNTRY_COLS = ["Country_Name","Country_Code","Region"]
PRODUCT_COLS = ["Product_Code","Product_Name","Product_Category"]

In [0]:
print("INFO:STAGE 1 INGESTION STARTED")

INFO:STAGE 1 INGESTION STARTED


In [0]:
#safe loading function to check if the tables exists or not
def load_csv(path, expected_cols, name):
    try:
        df = spark.read.option("header", True).csv(path)
        print(f"INFO: {name} loaded successfully with {df.count()} records")

        missing_cols = [c for c in expected_cols if c not in df.columns]
        if missing_cols:
            print(f"ERROR: {name} missing columns: {missing_cols}")
            raise Exception(f"{name} schema invalid")

        print(f"INFO: {name} schema validated successfully")
        return df

    except Exception as e:
        print(f"ERROR: Failed to load {name} â€” {str(e)}")
        raise

In [0]:
# VALIDATIONS
#Validating country codes
def validate_country_codes(fact_df, country_df):
    invalid = fact_df.join(country_df, "Country_Code", "left_anti")
    c = invalid.count()
    if c > 0:
        print(f"WARNING: {c} invalid Country_Code records found in fact dataset")
    else:
        print("INFO: All Country Codes valid")

In [0]:
#Validate the product codes
def validate_product_codes(fact_df, product_df):
    invalid = fact_df.join(product_df, "Product_Code", "left_anti")
    c = invalid.count()
    if c > 0:
        print(f"WARNING: {c} invalid Product_Code records found in fact dataset")
    else:
        print("INFO: All Product Codes valid")

In [0]:
# function that saves the tables
def save_table(df, name):
    (
        df.write
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(name)
    )
    print(f"INFO: Saved bronze table -> {name}")

In [0]:
#RUN PIPELINE 
fact_df = load_csv(FACT_PATH, FACT_COLS, "FACT DATASET")
country_df = load_csv(COUNTRY_PATH, COUNTRY_COLS, "COUNTRY REFERENCE")
product_df = load_csv(PRODUCT_PATH, PRODUCT_COLS, "PRODUCT REFERENCE")

validate_country_codes(fact_df, country_df)
validate_product_codes(fact_df, product_df)

save_table(fact_df, BRONZE_FACT)
save_table(country_df, BRONZE_COUNTRY)
save_table(product_df, BRONZE_PRODUCT)

print("INFO:STAGE 1 INGESTION COMPLETED SUCCESSFULLY")

INFO: FACT DATASET loaded successfully with 7000 records
INFO: FACT DATASET schema validated successfully
INFO: COUNTRY REFERENCE loaded successfully with 10 records
INFO: COUNTRY REFERENCE schema validated successfully
INFO: PRODUCT REFERENCE loaded successfully with 10 records
INFO: PRODUCT REFERENCE schema validated successfully
INFO: All Country Codes valid
INFO: All Product Codes valid
INFO: Saved bronze table -> capstone.default.bronze_fact
INFO: Saved bronze table -> capstone.default.bronze_country
INFO: Saved bronze table -> capstone.default.bronze_product
INFO:STAGE 1 INGESTION COMPLETED SUCCESSFULLY


In [0]:
%sql
DESCRIBE TABLE capstone.default.bronze_fact;


col_name,data_type,comment
Country_Name,string,
Country_Code,string,
Year,string,
Month,string,
Product_Code,string,
Product_Name,string,
Product_Category,string,
Region,string,
Export_Value_USD,string,
Export_Units,string,


In [0]:
%sql
DESCRIBE TABLE capstone.default.BRONZE_COUNTRY;


col_name,data_type,comment
Country_Name,string,
Country_Code,string,
Region,string,


In [0]:
%sql
DESCRIBE TABLE capstone.default.bronze_product;


col_name,data_type,comment
Product_Code,string,
Product_Name,string,
Product_Category,string,


In [0]:
# verifying exceptions
# Intentionally passing wrong column names
try:
    df = load_csv(
        "/Volumes/capstone/default/datasets/global_export_fact_dataset.csv",
        ["Invalid_Column"],            # WRONG on purpose
        "FACT DATASET"
    )
except Exception as e:
    print("EXCEPTION RAISED:", e)


EXCEPTION RAISED: name 'load_csv' is not defined
