In [0]:

%pip install snowflake-connector-python

Note: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import warnings
warnings.filterwarnings('ignore')

In [0]:
import os

# Connection settings for the Snowflake Spark connector.
#
# The password is never stored in the notebook: it is read from the
# SNOWFLAKE_PASSWORD environment variable. On Databricks this variable can be
# populated from a secret scope in the cluster configuration, e.g.
#   SNOWFLAKE_PASSWORD={{secrets/<scope>/<key>}}
# The remaining fields can be overridden through the environment as well and
# otherwise fall back to the values this notebook has always used.
#
# NOTE(review): a password was previously committed in this cell; it should be
# rotated in Snowflake.
_sf_password = os.environ.get("SNOWFLAKE_PASSWORD")
if not _sf_password:
    raise RuntimeError(
        "SNOWFLAKE_PASSWORD is not set. Provide the Snowflake password through "
        "the environment (for example from a Databricks secret scope) instead "
        "of hardcoding it in the notebook."
    )

sfOptions = {
    "sfUrl": os.environ.get("SNOWFLAKE_URL", "tymhdzv-pz92491.snowflakecomputing.com"),
    "sfUser": os.environ.get("SNOWFLAKE_USER", "RUTHRA"),
    "sfPassword": _sf_password,
    "sfDatabase": os.environ.get("SNOWFLAKE_DATABASE", "SNOWFLAKE_SAMPLE_DATA"),
    "sfSchema": os.environ.get("SNOWFLAKE_SCHEMA", "TPCH_SF1"),
    "sfWarehouse": os.environ.get("SNOWFLAKE_WAREHOUSE", "COMPUTE_WH"),
    # NOTE(review): ACCOUNTADMIN is far broader than read-only sample queries
    # need; consider a least-privilege role via SNOWFLAKE_ROLE.
    "sfRole": os.environ.get("SNOWFLAKE_ROLE", "ACCOUNTADMIN"),
}

# Print configuration (the password is deliberately never printed)
print("Snowflake Configuration:")
print(f"URL: {sfOptions['sfUrl']}")
print(f"User: {sfOptions['sfUser']}")
print(f"Database: {sfOptions['sfDatabase']}")
print(f"Schema: {sfOptions['sfSchema']}")
print(f"Warehouse: {sfOptions['sfWarehouse']}")

Snowflake Configuration:
URL: tymhdzv-pz92491.snowflakecomputing.com
User: RUTHRA
Database: SNOWFLAKE_SAMPLE_DATA
Schema: TPCH_SF1
Warehouse: COMPUTE_WH


In [0]:
def test_snowflake_connection(sf_options):
    try:
        print("Testing Snowflake connection...")
        
        # Simple test query
        test_df = spark.read \
            .format("net.snowflake.spark.snowflake") \
            .options(**sf_options) \
            .option("query", "SELECT CURRENT_TIMESTAMP() as current_time, CURRENT_VERSION() as version") \
            .load()
        
        result = test_df.collect()[0]
        print(f" Connection successful!")
        print(f"   Current Time: {result['current_time']}")
        print(f"   Snowflake Version: {result['version']}")
        return True
        
    except Exception as e:
        print(f" Connection failed: {str(e)}")
        return False

# Execute connection test against the notebook-level sfOptions; as the cell's
# last expression, the returned bool is also rendered as the cell output.
test_snowflake_connection(sfOptions)

Testing Snowflake connection...
 Connection successful!
 Connection failed: current_time


False

In [0]:
def list_snowflake_tables(sf_options):
    """List the tables of the schema the connection is using.

    The query filters INFORMATION_SCHEMA.TABLES on ``CURRENT_SCHEMA()`` rather
    than a hardcoded schema name, so the listing follows whatever schema
    ``sf_options`` selects for the session.

    Args:
        sf_options: dict of Snowflake connector options; ``sfSchema`` (when
            present) is used only to label the printed heading.

    Returns:
        A DataFrame with TABLE_NAME, TABLE_TYPE and ROW_COUNT columns, or None
        if anything raised (the error is printed, not re-raised).
    """
    try:
        schema_label = sf_options.get("sfSchema", "current")

        tables_df = spark.read \
            .format("net.snowflake.spark.snowflake") \
            .options(**sf_options) \
            .option("query", """
                SELECT TABLE_NAME, TABLE_TYPE, ROW_COUNT
                FROM INFORMATION_SCHEMA.TABLES
                WHERE TABLE_SCHEMA = CURRENT_SCHEMA()
                ORDER BY TABLE_NAME
            """) \
            .load()

        print(f" Available Tables in {schema_label} schema:")
        display(tables_df)
        return tables_df

    except Exception as e:
        print(f"Error listing tables: {str(e)}")
        return None

# List the tables using the notebook-level sfOptions; as the cell's last
# expression, the returned DataFrame's repr is also rendered as cell output.
list_snowflake_tables(sfOptions)

 Available Tables in TPCH_SF1 schema:


TABLE_NAME,TABLE_TYPE,ROW_COUNT
CUSTOMER,BASE TABLE,150000
LINEITEM,BASE TABLE,6001215
NATION,BASE TABLE,25
ORDERS,BASE TABLE,1500000
PART,BASE TABLE,200000
PARTSUPP,BASE TABLE,800000
REGION,BASE TABLE,5
SUPPLIER,BASE TABLE,10000


DataFrame[TABLE_NAME: string, TABLE_TYPE: string, ROW_COUNT: decimal(38,0)]

In [0]:
def read_customer_data(sf_options):
    """Load the CUSTOMER table through the Snowflake Spark connector.

    Prints the row count and the schema of the loaded DataFrame.

    Args:
        sf_options: dict of Snowflake connector options.

    Returns:
        The CUSTOMER DataFrame, or None if any step raised (the error is
        printed, not re-raised).
    """
    try:
        print("👥 Reading customer data...")

        reader = spark.read.format("net.snowflake.spark.snowflake")
        reader = reader.options(**sf_options).option("dbtable", "CUSTOMER")
        # Append .limit(1000) to the loaded frame to cap rows while testing.
        customers = reader.load()

        print(f" Successfully loaded {customers.count()} customer records")
        print(" Schema:")
        customers.printSchema()
    except Exception as err:
        print(f" Error reading customer data: {str(err)}")
        return None

    return customers

customer_df = read_customer_data(sfOptions)

# Display sample data. read_customer_data returns None on failure, so test for
# None explicitly instead of relying on the truthiness of a DataFrame object.
if customer_df is not None:
    print("\n Sample Customer Data (first 10 rows):")
    display(customer_df.limit(10))

👥 Reading customer data...
 Successfully loaded 150000 customer records
 Schema:
root
 |-- C_CUSTKEY: decimal(38,0) (nullable = false)
 |-- C_NAME: string (nullable = false)
 |-- C_ADDRESS: string (nullable = false)
 |-- C_NATIONKEY: decimal(38,0) (nullable = false)
 |-- C_PHONE: string (nullable = false)
 |-- C_ACCTBAL: decimal(12,2) (nullable = false)
 |-- C_MKTSEGMENT: string (nullable = true)
 |-- C_COMMENT: string (nullable = true)


 Sample Customer Data (first 10 rows):


C_CUSTKEY,C_NAME,C_ADDRESS,C_NATIONKEY,C_PHONE,C_ACCTBAL,C_MKTSEGMENT,C_COMMENT
60001,Customer#000060001,9Ii4zQn9cX,14,24-678-784-9652,9957.56,HOUSEHOLD,l theodolites boost slyly at the platelets: permanently ironic packages wake slyly pend
60002,Customer#000060002,ThGBMjDwKzkoOxhz,15,25-782-500-8435,742.46,BUILDING,beans. fluffily regular packages
60003,Customer#000060003,"Ed hbPtTXMTAsgGhCr4HuTzK,Md2",16,26-859-847-7640,2526.92,BUILDING,fully pending deposits sleep quickly. blithely unusual accounts across the blithely bold requests are quickly
60004,Customer#000060004,"NivCT2RVaavl,yUnKwBjDyMvB42WayXCnky",10,20-573-674-7999,7975.22,AUTOMOBILE,furiously above the ironic packages. slyly brave ideas boost. final platelets detect according to the ironi
60005,Customer#000060005,"1F3KM3ccEXEtI, B22XmCMOWJMl",12,22-741-208-1316,2504.74,MACHINERY,express instructions sleep quickly. ironic braids cajole furiously fluffily p
60006,Customer#000060006,3isiXW651fa8p,22,32-618-195-8029,9051.4,MACHINERY,carefully quickly even theodolites. boldly
60007,Customer#000060007,"sp6KJmx,TiSWbMPvhkQwFwTuhSi4a5OLNImpcGI",12,22-491-919-9470,6017.17,FURNITURE,bold packages. regular sheaves mold. blit
60008,Customer#000060008,3VteHZYOfbgQioA96tUeL0R7i,2,12-693-562-7122,5621.44,AUTOMOBILE,"nal courts. carefully regular Tiresias lose quickly unusual packages. regular, bold i"
60009,Customer#000060009,S60sNpR6wnacPBLeOxjxhvehf,9,19-578-776-2699,9548.01,FURNITURE,efully even dependencies haggle furiously along the express packages. final requests boost
60010,Customer#000060010,c4vEEaV1tdqLdw2oVuXp BN,21,31-677-809-6961,3497.91,HOUSEHOLD,"fter the quickly silent requests. slyly special theodolites along the even, even requests boos"


In [0]:
def read_multiple_tables(sf_options):
    """Load the ORDERS and NATION tables and print their row counts.

    Args:
        sf_options: dict of Snowflake connector options.

    Returns:
        A tuple ``(orders_df, nation_df)``, or ``(None, None)`` if any step
        raised (the error is printed, not re-raised).
    """
    def _load(table_name):
        # One connector read per table; only the table name varies.
        return (
            spark.read
            .format("net.snowflake.spark.snowflake")
            .options(**sf_options)
            .option("dbtable", table_name)
            .load()
        )

    try:
        orders = _load("ORDERS")
        nations = _load("NATION")

        print(f" Orders count: {orders.count()}")
        print(f" Nations count: {nations.count()}")
    except Exception as err:
        print(f" Error reading tables: {str(err)}")
        return None, None

    return orders, nations

orders_df, nation_df = read_multiple_tables(sfOptions)

# Display sample data. read_multiple_tables returns (None, None) on failure, so
# test for None explicitly instead of relying on DataFrame truthiness.
if orders_df is not None and nation_df is not None:
    print("\n Sample Orders Data:")
    display(orders_df.limit(5))

    print("\n Sample Nation Data:")
    display(nation_df.limit(5))

 Orders count: 1500000
 Nations count: 25

 Sample Orders Data:


O_ORDERKEY,O_CUSTKEY,O_ORDERSTATUS,O_TOTALPRICE,O_ORDERDATE,O_ORDERPRIORITY,O_CLERK,O_SHIPPRIORITY,O_COMMENT
3000001,145618,F,30175.88,1992-12-17,4-NOT SPECIFIED,Clerk#000000141,0,l packages. furiously careful instructions grow furi
3000002,1481,O,297999.63,1995-07-28,1-URGENT,Clerk#000000547,0,carefully unusual dependencie
3000003,127432,O,345438.38,1997-11-04,5-LOW,Clerk#000000488,0,n packages boost slyly bold deposits. deposits around the ironic th
3000004,47423,O,135965.53,1996-06-13,4-NOT SPECIFIED,Clerk#000000004,0,nts wake carefully final decoys. quickly final accounts wake
3000005,84973,F,209937.09,1992-09-12,5-LOW,Clerk#000000030,0,yly after the quickly unusual ide



 Sample Nation Data:


N_NATIONKEY,N_NAME,N_REGIONKEY,N_COMMENT
0,ALGERIA,0,haggle. carefully final deposits detect slyly agai
1,ARGENTINA,1,al foxes promise slyly according to the regular accounts. bold requests alon
2,BRAZIL,1,y alongside of the pending deposits. carefully special packages are about the ironic forges. slyly special
3,CANADA,1,"eas hang ironic, silent packages. slyly regular packages are furiously over the tithes. fluffily bold"
4,EGYPT,4,y above the carefully unusual theodolites. final dugouts are quickly across the furiously regular d


In [0]:
def execute_complex_query(sf_options):
    """Run a customer/nation/orders join in Snowflake and display the result.

    The SQL aggregates order count and total spend per customer and keeps the
    20 highest spenders.

    Args:
        sf_options: dict of Snowflake connector options.

    Returns:
        The result DataFrame, or None if any step raised (the error is
        printed, not re-raised).
    """
    complex_query = """
        SELECT 
            c.C_CUSTKEY,
            c.C_NAME,
            c.C_NATIONKEY,
            n.N_NAME as NATION_NAME,
            COUNT(o.O_ORDERKEY) as ORDER_COUNT,
            SUM(o.O_TOTALPRICE) as TOTAL_SPENT
        FROM CUSTOMER c
        JOIN NATION n ON c.C_NATIONKEY = n.N_NATIONKEY
        JOIN ORDERS o ON c.C_CUSTKEY = o.O_CUSTKEY
        GROUP BY c.C_CUSTKEY, c.C_NAME, c.C_NATIONKEY, n.N_NAME
        ORDER BY TOTAL_SPENT DESC
        LIMIT 20
        """

    try:
        print("🔗 Executing complex query with joins...")

        reader = spark.read.format("net.snowflake.spark.snowflake")
        reader = reader.options(**sf_options)
        top_spenders = reader.option("query", complex_query).load()

        row_total = top_spenders.count()
        print(f" Complex query executed successfully. Returned {row_total} rows")
        display(top_spenders)
    except Exception as err:
        print(f" Error in complex query: {str(err)}")
        return None

    return top_spenders

# Run the join/aggregation query with the notebook-level sfOptions; the result
# is None if execute_complex_query caught an error.
complex_result = execute_complex_query(sfOptions)

🔗 Executing complex query with joins...
 Complex query executed successfully. Returned 20 rows


C_CUSTKEY,C_NAME,C_NATIONKEY,NATION_NAME,ORDER_COUNT,TOTAL_SPENT
143500,Customer#000143500,10,IRAN,39,7012696.48
95257,Customer#000095257,2,BRAZIL,36,6563511.23
87115,Customer#000087115,14,KENYA,34,6457526.26
131113,Customer#000131113,5,ETHIOPIA,37,6311428.86
103834,Customer#000103834,11,IRAQ,31,6306524.23
134380,Customer#000134380,0,ALGERIA,37,6291610.15
69682,Customer#000069682,16,MOZAMBIQUE,39,6287149.42
102022,Customer#000102022,9,INDONESIA,41,6273788.41
98587,Customer#000098587,18,CHINA,37,6265089.35
85102,Customer#000085102,15,MOROCCO,34,6135483.63


In [0]:

def create_and_write_table(sf_options):
    """Write a five-row sample DataFrame to Snowflake and read it back.

    The DataFrame is saved to the DATABRICKS_SAMPLE table in overwrite mode,
    then the table is loaded again and displayed as a verification step.

    Args:
        sf_options: dict of Snowflake connector options; used for both the
            write and the verification read.

    Returns:
        The sample DataFrame that was written, or None if any step raised
        (the error is printed, not re-raised).
    """
    connector = "net.snowflake.spark.snowflake"
    target_table = "DATABRICKS_SAMPLE"

    try:
        print("🔄 Creating sample data and writing to Snowflake...")

        # Build the sample DataFrame
        field_names = ["id", "first_name", "last_name", "email", "created_date"]
        people = [
            (1, "Alice", "Johnson", "alice@email.com", "2024-01-15"),
            (2, "Bob", "Smith", "bob@email.com", "2024-01-16"),
            (3, "Charlie", "Brown", "charlie@email.com", "2024-01-17"),
            (4, "Diana", "Prince", "diana@email.com", "2024-01-18"),
            (5, "Edward", "Wilson", "edward@email.com", "2024-01-19"),
        ]
        sample_df = spark.createDataFrame(people, field_names)

        print(" Sample DataFrame to write:")
        display(sample_df)

        # Save modes accepted here: "overwrite", "append", "error", "ignore"
        writer = sample_df.write.format(connector).options(**sf_options)
        writer.option("dbtable", target_table).mode("overwrite").save()
        print("✅ Successfully wrote data to Snowflake table 'DATABRICKS_SAMPLE'")

        # Read the table back to confirm the write
        reader = spark.read.format(connector).options(**sf_options)
        written_back = reader.option("dbtable", target_table).load()

        print(" Verification - Reading back the written data:")
        display(written_back)
    except Exception as err:
        print(f" Error writing to Snowflake: {str(err)}")
        return None

    return sample_df

# Write the sample table using the notebook-level sfOptions; the result is None
# if create_and_write_table caught an error.
# NOTE(review): the recorded output of this cell shows the write failing with
# "Creating stage on shared database 'SNOWFLAKE_SAMPLE_DATA' is not allowed".
# sfOptions selects that database, so this call presumably needs options that
# point at a writable database/schema - TODO confirm the intended target.
created_df = create_and_write_table(sfOptions)

🔄 Creating sample data and writing to Snowflake...
 Sample DataFrame to write:


id,first_name,last_name,email,created_date
1,Alice,Johnson,alice@email.com,2024-01-15
2,Bob,Smith,bob@email.com,2024-01-16
3,Charlie,Brown,charlie@email.com,2024-01-17
4,Diana,Prince,diana@email.com,2024-01-18
5,Edward,Wilson,edward@email.com,2024-01-19


 Error writing to Snowflake: An error occurred while calling o663.save.
: net.snowflake.client.jdbc.SnowflakeSQLException: SQL execution error: Creating stage on shared database 'SNOWFLAKE_SAMPLE_DATA' is not allowed.
	at net.snowflake.client.jdbc.SnowflakeUtil.checkErrorAndThrowExceptionSub(SnowflakeUtil.java:170)
	at net.snowflake.client.jdbc.SnowflakeUtil.checkErrorAndThrowException(SnowflakeUtil.java:103)
	at net.snowflake.client.core.StmtUtil.pollForOutput(StmtUtil.java:501)
	at net.snowflake.client.core.StmtUtil.execute(StmtUtil.java:407)
	at net.snowflake.client.core.SFStatement.executeHelper(SFStatement.java:498)
	at net.snowflake.client.core.SFStatement.executeQueryInternal(SFStatement.java:215)
	at net.snowflake.client.core.SFStatement.executeQuery(SFStatement.java:149)
	at net.snowflake.client.core.SFStatement.execute(SFStatement.java:785)
	at net.snowflake.client.core.SFStatement.execute(SFStatement.java:693)
	at net.snowflake.client.jdbc.SnowflakeStatementV1.executeQueryIn

In [0]:
# Cell 10: Advanced Operations with Comprehensive Error Handling
def advanced_snowflake_operations(sf_options):
    """Read LINEITEM, aggregate it per ship date, and write the result back.

    Steps: load LINEITEM with extra connector options, print its row count,
    keep rows with L_SHIPDATE >= "1998-01-01", aggregate quantity / average
    price / row count per ship date, display the first 10 rows, and overwrite
    the LINEITEM_AGGREGATED table with the aggregate.

    Args:
        sf_options: dict of Snowflake connector options; it is copied, not
            mutated, before the extra options are added.

    Returns:
        The aggregated DataFrame, or None if any step raised (the error and
        its traceback are printed, not re-raised).
    """
    import traceback

    connector = "net.snowflake.spark.snowflake"

    try:
        print(" Starting advanced operations...")

        # Extra connector settings layered on a copy of the caller's options
        tuned_options = sf_options.copy()
        tuned_options["parallelism"] = "8"
        tuned_options["usestagingtable"] = "on"
        tuned_options["autopushdown"] = "on"

        # Append .limit(50000) to the loaded frame to cap rows while testing.
        reader = spark.read.format(connector).options(**tuned_options)
        large_df = reader.option("dbtable", "LINEITEM").load()

        print(f" Loaded LINEITEM table with {large_df.count()} rows")

        # Filter, then aggregate per ship date, then sort by date
        recent_rows = large_df.filter(col("L_SHIPDATE") >= "1998-01-01")
        per_ship_date = recent_rows.groupBy("L_SHIPDATE").agg(
            sum("L_QUANTITY").alias("total_quantity"),
            avg("L_EXTENDEDPRICE").alias("avg_price"),
            count("*").alias("transaction_count"),
        )
        transformed_df = per_ship_date.orderBy("L_SHIPDATE")

        print("Transformed data (aggregated by ship date):")
        display(transformed_df.limit(10))

        # Persist the aggregate, replacing any existing table contents
        writer = transformed_df.write.format(connector).options(**tuned_options)
        writer.option("dbtable", "LINEITEM_AGGREGATED").mode("overwrite").save()

        print(" Successfully wrote aggregated data to 'LINEITEM_AGGREGATED'")
    except Exception as err:
        print(f" Error in advanced operations: {str(err)}")
        traceback.print_exc()
        return None

    return transformed_df

# Run the read/aggregate/write pipeline with the notebook-level sfOptions; the
# result is None if advanced_snowflake_operations caught an error.
# NOTE(review): the recorded output of this cell shows the final write failing
# with "Creating stage on shared database 'SNOWFLAKE_SAMPLE_DATA' is not
# allowed". sfOptions selects that database, so the write presumably needs
# options pointing at a writable database/schema - TODO confirm the target.
advanced_result = advanced_snowflake_operations(sfOptions)

 Starting advanced operations...
 Loaded LINEITEM table with 6001215 rows
Transformed data (aggregated by ship date):


L_SHIPDATE,total_quantity,avg_price,transaction_count
1998-01-01,64078.0,38003.915991,2537
1998-01-02,64562.0,38277.023458,2519
1998-01-03,64775.0,38753.180341,2524
1998-01-04,61569.0,38019.871487,2435
1998-01-05,64343.0,37913.042836,2556
1998-01-06,62609.0,38126.159853,2453
1998-01-07,60907.0,37955.183225,2409
1998-01-08,62822.0,37974.876689,2489
1998-01-09,62786.0,37555.015118,2503
1998-01-10,65427.0,37899.491846,2606


 Error in advanced operations: An error occurred while calling o624.save.
: net.snowflake.client.jdbc.SnowflakeSQLException: SQL execution error: Creating stage on shared database 'SNOWFLAKE_SAMPLE_DATA' is not allowed.
	at net.snowflake.client.jdbc.SnowflakeUtil.checkErrorAndThrowExceptionSub(SnowflakeUtil.java:170)
	at net.snowflake.client.jdbc.SnowflakeUtil.checkErrorAndThrowException(SnowflakeUtil.java:103)
	at net.snowflake.client.core.StmtUtil.pollForOutput(StmtUtil.java:501)
	at net.snowflake.client.core.StmtUtil.execute(StmtUtil.java:407)
	at net.snowflake.client.core.SFStatement.executeHelper(SFStatement.java:498)
	at net.snowflake.client.core.SFStatement.executeQueryInternal(SFStatement.java:215)
	at net.snowflake.client.core.SFStatement.executeQuery(SFStatement.java:149)
	at net.snowflake.client.core.SFStatement.execute(SFStatement.java:785)
	at net.snowflake.client.core.SFStatement.execute(SFStatement.java:693)
	at net.snowflake.client.jdbc.SnowflakeStatementV1.executeQuery

Traceback (most recent call last):
  File "/root/.ipykernel/1099/command-6923127474726906-1897863283", line 44, in advanced_snowflake_operations
    .save()
     ^^^^^^
  File "/databricks/spark/python/pyspark/instrumentation_utils.py", line 47, in wrapper
    res = func(*args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^
  File "/databricks/spark/python/pyspark/sql/readwriter.py", line 1732, in save
    self._jwrite.save()
  File "/databricks/spark/python/lib/py4j-0.10.9.9-src.zip/py4j/java_gateway.py", line 1362, in __call__
    return_value = get_return_value(
                   ^^^^^^^^^^^^^^^^^
  File "/databricks/spark/python/pyspark/errors/exceptions/captured.py", line 269, in deco
    return f(*a, **kw)
           ^^^^^^^^^^^
  File "/databricks/spark/python/lib/py4j-0.10.9.9-src.zip/py4j/protocol.py", line 327, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o624.save.
: net.snowflake.client.jdbc.SnowflakeSQLException: SQL