# Install Snowpark

In [None]:
!pip install snowflake-snowpark-python



---



# Connect to Snowflake via Snowpark

In [None]:
import time
import json


# --->  PYSPARK

# import pyspark.sql.functions as f
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import udf,col
# from pyspark.sql.types import IntegerType
# spark = SparkSession.builder.appName("DataEngeering1").getOrCreate()

# <---  PYSPARK

import snowflake.snowpark.functions as f
from snowflake.snowpark import Session, DataFrame
from snowflake.snowpark.functions import udf, col
from snowflake.snowpark.types import IntegerType
from snowflake.snowpark.functions import call_udf


# ----- Edit before running the notebook --------------------------------------
# 1. Adjust the connection settings (and the contents of creds.json) so that
#    they match your own Snowflake environment.
# -----------------------------------------------------------------------------

Warehouse_Name = 'MY_ETL_WH'
DB_NAME = 'DEMO_SNOWPARK'

# Placeholder template of an inline connection dict. It is not used to open
# the session below; CONNECTION_PARAMETERS (built from creds.json) is.
CONNECTION_PARAMETERS1 = dict(
    host="<YourAccount>.snowflakecomputing.com",
    account='<YourAccount>',
    user='<Your_UserID>',
    password='<Your_Password>',
    role='SYSADMIN',
)

# Credentials come from a local JSON file that must provide the keys
# "username", "password" and "account".
with open('creds.json') as credsfile:
    data = json.load(credsfile)

username = data['username']
password = data['password']
account = data["account"]

CONNECTION_PARAMETERS = {
    'account': account,
    'user': username,
    'password': password,
    'role': 'SYSADMIN',
}

print("Connecting to Snowflake.....\n")
session = Session.builder.configs(CONNECTION_PARAMETERS).create()
print("Connected Successfully!...\n\n")

# Provision the demo warehouse and database, then make them current for the
# session. Order matters: the objects must exist before the USE statements.
# NOTE: CREATE OR REPLACE replaces any existing object with the same name.
setup_statements = (
    "CREATE OR REPLACE WAREHOUSE {} WAREHOUSE_SIZE = 'X-Small' ".format(Warehouse_Name),
    "CREATE OR REPLACE DATABASE {}".format(DB_NAME),
    "USE SCHEMA {}.PUBLIC".format(DB_NAME),
    "USE WAREHOUSE {}".format(Warehouse_Name),
)
for sql_cmd in setup_statements:
    session.sql(sql_cmd).collect()


## Start Data Engineering Process

In [None]:


# 1 - SCALE UP COMPUTE FROM X-SMALL (1 NODE) TO LARGE (8 NODES)
# The target size is held in one variable so the progress message and the
# ALTER WAREHOUSE statement cannot disagree with each other.
scale_up_size = 'LARGE'
print("Resizing from XS(1 Node) to {}(8 Nodes) ..\n".format(scale_up_size))

# WAIT_FOR_COMPLETION = TRUE blocks until the resize has finished, so the
# following steps start on the larger warehouse.
sql_cmd = "ALTER WAREHOUSE {} SET WAREHOUSE_SIZE = '{}' WAIT_FOR_COMPLETION = TRUE".format(
    Warehouse_Name, scale_up_size)
session.sql(sql_cmd).collect()

print("Completed!...\n\n")


# 2 - READ & JOIN 2 LARGE TABLES (600M & 1M rows)
print("Joining, Aggregating with 2 large tables(600M & 1M rows) & Writing results to new table(80M rows) ..\n")

# session.table() only creates a DataFrame handle that refers to the table.
dfLineItems = session.table("SFC_SAMPLES_SAMPLE_DATA.TPCH_SF100.LINEITEM")  # 600 Million Rows
dfSuppliers = session.table("SFC_SAMPLES_SAMPLE_DATA.TPCH_SF100.SUPPLIER")  # 1 Million Rows

# count() is an action: each call sends a query to Snowflake.
print('Lineitems Table: %s rows' % "{:,}".format(dfLineItems.count()))
print('Suppliers Table: %s rows' % "{:,}".format(dfSuppliers.count()))

# 3 - JOIN TABLES
# Join line items to suppliers on the supplier key (definition only).
dfJoinTables = dfLineItems.join(dfSuppliers,
                                dfLineItems.col("L_SUPPKEY") == dfSuppliers.col("S_SUPPKEY"))

# 4 - SUMMARIZE THE DATA BY SUPPLIER, PART, SUM, MIN & MAX
# One output row per (supplier name, part key) with total/min/max quantity.
dfSummary = dfJoinTables.groupBy("S_NAME", "L_PARTKEY").agg([
     f.sum("L_QUANTITY").alias("TOTAL_QTY"),
     f.min("L_QUANTITY").alias("MIN_QTY"),
     f.max("L_QUANTITY").alias("MAX_QTY")
])


### **↑ Compute is NOT used** for the join and aggregation up to this point — they are only defined, not executed. (Lazy Execution Model) !!!

## 3. Storing the results in a table or showing them triggers the compute for all previous steps.

In [None]:
start_time = time.time()
try:
    # 5 - WRITE THE RESULTS TO A NEW TABLE ( 80 Million Rows)
    # <-- This is when all the previous operations are compiled & executed as a single job
    dfSummary.write.mode("overwrite").saveAsTable("SALES_SUMMARY")
    print("Completed!...\n\n")

    # 6 - QUERY THE RESULTS (80 Million Rows)
    print("Query the results..\n")
    dfSales = session.table("SALES_SUMMARY")
    dfSales.show()
    # The measured interval covers the write above plus this preview query.
    end_time = time.time()

    print("Completed!...\n\n")
finally:
    # 7 - SCALE DOWN COMPUTE TO 1 NODE
    # Runs even when the write or the preview fails, so the warehouse is not
    # left at the larger size after an error.
    print("Reducing the warehouse to XS..\n")
    sql_cmd = "ALTER WAREHOUSE {} SET WAREHOUSE_SIZE = 'XSMALL'".format(Warehouse_Name)
    session.sql(sql_cmd).collect()

    print("Completed!...\n")

# Only reached when the write and the preview succeeded.
print("--- %s seconds to Join, Summarize & Write Results to a new Table --- \n" % int(end_time - start_time))
print("--- %s Rows Written to SALES_SUMMARY table" % "{:,}".format(dfSales.count()))

# That's all there is to it!

In [None]:


















# Clean it all up
# IF EXISTS keeps this cell safe to re-run: dropping an object that is
# already gone is a no-op instead of an error.
sql_cmd = "DROP WAREHOUSE IF EXISTS {}".format(Warehouse_Name)
session.sql(sql_cmd).collect()

sql_cmd = "DROP DATABASE IF EXISTS {}".format(DB_NAME)
session.sql(sql_cmd).collect()