Crypto Data Loading with Snowpark

#------------------------------------------------------------------------------
# Data Engineering with Snowpark for Cryptocurrency Data
# Script:       crypto_data_loader.py
# Last Updated: February 27, 2025
#------------------------------------------------------------------------------

In [None]:
# Map each crypto table name to the daily CSV file that feeds it.
# Every file follows the "<SYMBOL>_raw_daily.csv" naming pattern, so the
# mapping is derived from the list of symbols (order is preserved).
CRYPTO_TABLES = {
    symbol: f"{symbol}_raw_daily.csv"
    for symbol in ("BTC", "DOGE", "ETH")
}

In [None]:
# SNOWFLAKE ADVANTAGE: Schema detection
# SNOWFLAKE ADVANTAGE: Data ingestion with COPY
def load_raw_table(session, table_name, file_name, schema="RAW_CRYPTO"):
    """
    Copy one staged CSV file into its raw Snowflake table

    The file is read from @CRYPTO_RAW_STAGE/raw_data/ with the "header" and
    "infer_schema" reader options enabled. The "date" column is converted
    with to_date, the price/volume columns with to_double, and the result is
    passed to copy_into_table.

    Parameters:
    session: Snowpark session (its current schema is switched to `schema`)
    table_name: Target table name (BTC, DOGE, ETH)
    file_name: File name in the stage (e.g., BTC_raw_daily.csv)
    schema: Schema name that holds the target table

    Returns:
    Row count of `schema.table_name` after the load
    """
    print(f"Loading {table_name} from {file_name}")
    session.use_schema(schema)

    # Location of the file inside the stage
    stage_path = f"@CRYPTO_RAW_STAGE/raw_data/{file_name}"

    # Configure the reader first, then point it at the staged CSV file
    reader = session.read.option("header", True).option("infer_schema", True)
    df = reader.csv(stage_path)

    # Column name -> conversion function; "date" becomes a DATE and every
    # price/volume column becomes a double, applied in this order
    conversions = {"date": F.to_date}
    for numeric_column in ("open", "high", "low", "close", "volume", "adjclose"):
        conversions[numeric_column] = F.to_double

    for column_name, convert in conversions.items():
        df = df.with_column(column_name, convert(df[column_name]))

    # NOTE(review): copy_into_table is documented for DataFrames created
    # directly by a DataFrameReader - confirm it accepts the DataFrame
    # produced by the with_column calls above.
    target_table = f"{table_name}"
    df.copy_into_table(target_table)

    print(f"Successfully loaded {table_name}")

    # Report how many rows the fully qualified table now holds
    loaded_table = session.table(f"{schema}.{table_name}")
    return loaded_table.count()


In [None]:
# SNOWFLAKE ADVANTAGE: Warehouse elasticity (dynamic scaling)
def load_all_crypto_tables(
    session,
    warehouse_name="COMPUTE_WH",
    *,
    schema="RAW_CRYPTO",
    load_size="LARGE",
    idle_size="XSMALL",
):
    """
    Load all cryptocurrency tables with warehouse scaling

    The warehouse is resized to `load_size` before loading and is always set
    to `idle_size` afterwards (even when loading fails). Note that
    `idle_size` is applied unconditionally; the size the warehouse had
    before the call is not looked up or restored.

    Parameters:
    session: Snowpark session
    warehouse_name: The name of the Snowflake warehouse to use
    schema: Schema that holds the raw tables (created if missing)
    load_size: Warehouse size to use while loading
    idle_size: Warehouse size to set once loading has finished

    Returns:
    Dict mapping each table name in CRYPTO_TABLES to the row count reported
    by load_raw_table
    """
    # Scale up warehouse for faster loading; this happens outside the try
    # block so a failed scale-up does not trigger the scale-down below
    session.sql(f"ALTER WAREHOUSE {warehouse_name} SET WAREHOUSE_SIZE = {load_size} WAIT_FOR_COMPLETION = TRUE").collect()

    try:
        # Create schema if it doesn't exist
        session.sql(f"CREATE SCHEMA IF NOT EXISTS {schema}").collect()

        # Create tables if they don't exist
        for table_name in CRYPTO_TABLES:
            session.sql(f"""
            CREATE TABLE IF NOT EXISTS {schema}.{table_name} (
                date DATE PRIMARY KEY,
                open FLOAT,
                high FLOAT,
                low FLOAT,
                close FLOAT,
                volume FLOAT,
                adjclose FLOAT
            )
            """).collect()

        # Load data for each table, passing the schema explicitly so the
        # tables created above and the tables loaded here always match
        return {
            table_name: load_raw_table(session, table_name, file_name, schema=schema)
            for table_name, file_name in CRYPTO_TABLES.items()
        }

    finally:
        # Scale down warehouse when done
        session.sql(f"ALTER WAREHOUSE {warehouse_name} SET WAREHOUSE_SIZE = {idle_size}").collect()


In [None]:
def validate_crypto_tables(session):
    """
    Print a short validation report for every table in CRYPTO_TABLES

    For each table the report lists the column names, the row count, the
    minimum and maximum of the date column, and up to five rows fetched as a
    pandas DataFrame.

    Parameters:
    session: Snowpark session
    """
    print("Validating loaded tables:")
    for table_name in CRYPTO_TABLES:
        qualified_name = f'RAW_CRYPTO.{table_name}'
        table_ref = session.table(qualified_name)
        row_count = table_ref.count()

        # One aggregate query returns both ends of the date range
        range_query = f"SELECT MIN(date) as min_date, MAX(date) as max_date FROM {qualified_name}"
        date_range = session.sql(range_query).collect()[0]
        earliest = date_range["MIN_DATE"]
        latest = date_range["MAX_DATE"]

        print(f'{table_name}: \n\tColumns: {table_ref.columns}\n\tRow count: {row_count}\n\tDate range: {earliest} to {latest}\n')

        # Show sample data
        print(f"Sample data for {table_name}:")
        preview = table_ref.limit(5).to_pandas()
        print(preview)
        print("\n")


In [None]:
# Main execution function
def main(session):
    """
    Run the full load: switch role, load every table, then validate

    Parameters:
    session: Snowpark session
    """
    print("Starting cryptocurrency data loading process...")

    # Switch the session to the role used for the load
    role_statement = session.sql("USE ROLE ACCOUNTADMIN")
    role_statement.collect()

    # Load every table and report the per-table row counts
    rows_by_table = load_all_crypto_tables(session)
    print(f"Data loading complete! Rows loaded: {rows_by_table}")

    # Print the validation report for the freshly loaded tables
    validate_crypto_tables(session)

    print("Process completed successfully!")


# Entry point for executing from a Snowflake notebook (or as a script).
# NOTE(review): `Session` is not imported in the visible part of this file;
# it must be brought into scope before this block runs.
if __name__ == "__main__":
    # Get the existing Snowpark session or create a new one. The `with`
    # block exits the session's context manager once main() returns -
    # presumably closing the session; confirm that is desired when later
    # notebook cells still need it.
    with Session.builder.getOrCreate() as session:
        main(session)