In [1]:
import sys
import datetime
from decimal import Decimal
from snowflake.snowpark.session import Session
sys.path.append('./test')
from parameters import CONNECTION_PARAMETERS

### Connect to Snowflake and run SQL in Snowflake

In [2]:
# Open a Snowpark session from the shared test connection parameters.
session = Session.builder.configs(CONNECTION_PARAMETERS).create()

# Report the session's current database and schema, in that order.
for getter in (session.getCurrentDatabase, session.getCurrentSchema):
    print(getter())

# Run a plain SQL statement through the session and show the fetched rows.
tables = session.sql('show tables').collect()
print(tables)

"TESTDB_SNOWPARK_PYTHON"
"TESTSCHEMA_SNOWPARK_PYTHON"
[Row[2021-08-04 20:01:56.052000-07:00,DEMO_TABLE,TESTDB_SNOWPARK_PYTHON,TESTSCHEMA_SNOWPARK_PYTHON,TABLE,,,3,1536,SYSADMIN,1,OFF,OFF,N]]


### Create a dataframe in the local environment and save it as a view in Snowflake

In [3]:
# One sample row holding a value of each Python type exercised by this demo.
data = [
    0,
    "one",
    1.0,
    datetime.datetime.now(),
    True,
    bytearray("a", "utf-8"),
    [1, 2, 3],
    {'snow': 'flake'},
]
# Column names are col1..colN, one per element of the sample row.
col_names = [f"col{i+1}" for i, _ in enumerate(data)]

# Build a two-row dataframe (the same row twice) and name its columns.
rows = [data, data]
df = session.createDataFrame(rows).toDF(col_names)
df.show()

-----------------------------------------------------------------------------------------------------------------
|"COL1"  |"COL2"  |"COL3"  |"COL4"                      |"COL5"  |"COL6"           |"COL7"  |"COL8"             |
-----------------------------------------------------------------------------------------------------------------
|0       |one     |1.0     |2021-08-05 03:06:54.739224  |True    |bytearray(b'a')  |[       |{                  |
|        |        |        |                            |        |                 |  1,    |  "snow": "flake"  |
|        |        |        |                            |        |                 |  2,    |}                  |
|        |        |        |                            |        |                 |  3     |                   |
|        |        |        |                            |        |                 |]       |                   |
|0       |one     |1.0     |2021-08-05 03:06:54.739224  |True    |bytearray(b'a')  |[   

In [4]:
# Publish the dataframe as a view named demo_view, then read it back via SQL.
df.createOrReplaceView("demo_view")
view_rows = session.sql('select * from demo_view').collect()
print(view_rows)

[Row[0,one,1.0,2021-08-05 03:06:54.739224,True,bytearray(b'a'),[
  1,
  2,
  3
],{
  "snow": "flake"
}], Row[0,one,1.0,2021-08-05 03:06:54.739224,True,bytearray(b'a'),[
  1,
  2,
  3
],{
  "snow": "flake"
}]]


### Create a table in Snowflake and load it as a dataframe in the local environment

In [5]:
# (Re)create demo_table with three rows of (number, float, text).
create_demo_table_sql = (
    "create or replace table demo_table(A number, B float, C text) "
    "as select * from values(1, 1.0, 'one'), (2, 2.0, 'two'), (3, 3.0, 'three')"
)
session.sql(create_demo_table_sql).collect()

# Rebind df to the new table; later cells use this binding.
df = session.table("demo_table")
table_rows = df.collect()
print(table_rows)

[Row[1,1.0,one], Row[2,2.0,two], Row[3,3.0,three]]


### Select

In [6]:
# Select a single column object first, then every column via "*".
for projection in (df["A"], "*"):
    print(df.select(projection).collect())

[Row[1], Row[2], Row[3]]
[Row[1,1.0,one], Row[2,2.0,two], Row[3,3.0,three]]


In [7]:
# Column objects support Python operators; each expression below is selected
# and collected in turn: negation, equality, addition, exponentiation.
column_expressions = (
    -df["A"],
    df["A"] == df["B"],
    df["A"] + df["B"],
    df["A"] ** df["B"],
)
for expression in column_expressions:
    print(df.select(expression).collect())

[Row[-1], Row[-2], Row[-3]]
[Row[True], Row[True], Row[True]]
[Row[2.0], Row[4.0], Row[6.0]]
[Row[1.0], Row[4.0], Row[27.0]]


### Filter

In [8]:
# Keep the rows whose A is greater than 1.
rows_a_gt_1 = df.filter(df["A"] > 1).collect()
print(rows_a_gt_1)

# Same kind of predicate through where(), this time on the text column.
rows_c_is_one = df.where(df["C"] == "one").collect()
print(rows_c_is_one)

[Row[2,2.0,two], Row[3,3.0,three]]
[Row[1,1.0,one]]


### Drop

In [9]:
# Project away column C and show what remains.
without_c = df.drop("C")
print(without_c.collect())

[Row[1,1.0], Row[2,2.0], Row[3,3.0]]


### Join, Union and Intersect

In [10]:
# Two overlapping single-column ranges used by the join/set cells below.
df1, df2 = session.range(3, 8), session.range(5, 10)
for frame in (df1, df2):
    print(frame.collect())

[Row[3], Row[4], Row[5], Row[6], Row[7]]
[Row[5], Row[6], Row[7], Row[8], Row[9]]


In [11]:
# Join on "id" without a join type, then with an explicit "inner" type.
for join_args in (("id",), ("id", "inner")):
    print(df1.join(df2, *join_args).collect())

[Row[5], Row[6], Row[7]]
[Row[5], Row[6], Row[7]]


In [12]:
# Left anti join on "id".
left_anti_rows = df1.join(df2, "id", "left_anti").collect()
print(left_anti_rows)

[Row[4], Row[3]]


In [13]:
# Left outer join on "id".
left_outer_rows = df1.join(df2, "id", "left_outer").collect()
print(left_outer_rows)

[Row[5], Row[6], Row[7], Row[4], Row[3]]


In [14]:
# Right outer join on "id".
right_outer_rows = df1.join(df2, "id", "right_outer").collect()
print(right_outer_rows)

[Row[5], Row[6], Row[7], Row[9], Row[8]]


In [15]:
# Full outer join on "id".
outer_rows = df1.join(df2, "id", "outer").collect()
print(outer_rows)

[Row[5], Row[6], Row[7], Row[8], Row[9], Row[4], Row[3]]


In [16]:
# Cross join: every row of df1 paired with every row of df2.
cross_rows = df1.crossJoin(df2).collect()
print(cross_rows)

[Row[3,5], Row[3,6], Row[3,7], Row[3,8], Row[3,9], Row[4,5], Row[4,6], Row[4,7], Row[4,8], Row[4,9], Row[5,5], Row[5,6], Row[5,7], Row[5,8], Row[5,9], Row[6,5], Row[6,6], Row[6,7], Row[6,8], Row[6,9], Row[7,5], Row[7,6], Row[7,7], Row[7,8], Row[7,9]]


In [17]:
# Apply each set operation of df1 against df2 and show the collected rows.
for combine in (df1.union, df1.intersect):
    print(combine(df2).collect())

[Row[3], Row[4], Row[5], Row[6], Row[7], Row[5], Row[6], Row[7], Row[8], Row[9]]
[Row[7], Row[6], Row[5]]


In [18]:
# Two inline VALUES tables that share the column name c1.
df3 = session.sql(
    "select * from values(1, 2),(2, 3),(3, 4) as T(c1, c2)"
)
df4 = session.sql(
    "select * from values(1, 'one'),(2, 'two'),(3, 'three') as T(c1, c3)"
)

# Natural join the two frames and show the combined rows.
natural_rows = df3.naturalJoin(df4).collect()
print(natural_rows)

[Row[1,2,one], Row[2,3,two], Row[3,4,three]]


### Sort

In [19]:
# Show df3 as-is before sorting it in the next cell.
unsorted_rows = df3.collect()
print(unsorted_rows)

[Row[1,2], Row[2,3], Row[3,4]]


In [20]:
# Descending sort expressed with a column expression...
sorted_by_expression = df3.sort(df3["c1"].desc())
print(sorted_by_expression.collect())

# ...and with a column name plus the ascending keyword.
sorted_by_keyword = df3.sort("c1", ascending=False)
print(sorted_by_keyword.collect())

[Row[3,4], Row[2,3], Row[1,2]]
[Row[3,4], Row[2,3], Row[1,2]]


### Agg

In [21]:
df5 = session.createDataFrame([[1, 4], [1, 4], [2, 5], [2, 6]]).toDF(["first", "second"])

# Aggregate functions applied to column "first", in display order.
agg_funcs = ["min", "count", "max", "avg", "std"]

# One aggregate per query...
for func in agg_funcs:
    print(df5.agg([("first", func)]).collect())

# ...then all of them together in a single query.
print(df5.agg([("first", func) for func in agg_funcs]).collect())

[Row[1]]
[Row[4]]
[Row[2]]
[Row[1.500000]]
[Row[0.577349980514419]]
[Row[1,4,2,1.500000,0.577349980514419]]


### Convert a Snowflake table to a local pandas DataFrame

In [22]:
# Rebind df to demo_table (df was reused for other frames earlier).
df = session.table("demo_table")
demo_rows = df.collect()
print(demo_rows)

[Row[1,1.0,one], Row[2,2.0,two], Row[3,3.0,three]]


In [23]:
from snowflake.snowpark.types.sf_types import StringType, FloatType
# Derive four columns (two casts, a power, a comparison), rename them, and
# pull the result into pandas. The final bare expression is the cell's display.
derived = df.select(
    df['A'].cast(FloatType()),
    df['B'] ** 5,
    df['B'].cast(StringType()),
    df['C'] == 'two',
)
derived.toDF(["double(A)", "B^5", "string(B)", "C=='two'"]).toPandas().head()

Unnamed: 0,double(A),B^5,string(B),C=='two'
0,1.0,1.0,1,False
1,2.0,32.0,2,True
2,3.0,243.0,3,False


### UDF

In [24]:
from snowflake.snowpark.functions import udf
from snowflake.snowpark.types.sf_types import IntegerType

def square(x):
    """Return ``x`` multiplied by itself."""
    product = x * x
    return product

def cube(x):
    """Return ``x`` cubed, computed as ``square(x) * x``."""
    squared = square(x)
    return squared * x

# Register square as a UDF taking one integer and returning an integer.
square_udf = udf(
    square,
    return_type=IntegerType(),
    input_types=[IntegerType()],
)

# Register cube under the explicit name "cube"; a later cell calls it from SQL.
cube_udf = udf(
    cube,
    return_type=IntegerType(),
    input_types=[IntegerType()],
    name="cube",
)

# A lambda can be registered as well; this one adds two integers.
add_udf = udf(
    lambda x, y: x + y,
    return_type=IntegerType(),
    input_types=[IntegerType(), IntegerType()],
)

In [25]:
# Rebind df to a single-column frame; the import-dependency cell reuses it.
df = session.createDataFrame([1, 2, 3]).toDF("a")

# Column "a", its square, its cube, and the sum of the two UDF results.
projection = [
    "a",
    square_udf("a"),
    cube_udf("a"),
    add_udf(square_udf("a"), cube_udf("a")),
]
print(df.select(*projection).collect())

[Row[1,1,1,2], Row[2,4,8,12], Row[3,9,27,36]]


In [26]:
# Call the UDF registered under the name "cube" directly from SQL.
cube_rows = session.sql("select cube(8)").collect()
print(cube_rows)

[Row[512]]


### Import dependency

In [27]:
import math
from test_udf import factorial


def sqrt_factorial(n):
    """Return the square root of ``factorial(n)``."""
    n_factorial = factorial(n)
    return math.sqrt(n_factorial)

# Expected to fail: test_udf has not been added to the session's imports yet
# (the recorded output shows ModuleNotFoundError for 'test_udf').
# The error is the demonstration, so catch and display it instead of letting it
# propagate; an uncaught exception here would halt "Restart & Run All".
try:
    sqrt_factorial_udf = udf(sqrt_factorial, return_type=FloatType(), input_types=[IntegerType()])
except Exception as exc:  # broad on purpose: no connector error class is imported in this notebook
    print(f"{type(exc).__name__}: {exc}")

Failed to execute query 
CREATE TEMPORARY FUNCTION "TESTDB_SNOWPARK_PYTHON"."TESTSCHEMA_SNOWPARK_PYTHON".tempUDF_851669829(arg1 INT)
RETURNS FLOAT
LANGUAGE PYTHON
IMPORTS = ('@"TESTDB_SNOWPARK_PYTHON"."TESTSCHEMA_SNOWPARK_PYTHON".snowSession_158342692941058/cloudpickle.zip','@"TESTDB_SNOWPARK_PYTHON"."TESTSCHEMA_SNOWPARK_PYTHON".snowSession_158342692941058/TESTDB_SNOWPARK_PYTHONTESTSCHEMA_SNOWPARK_PYTHONtempUDF_851669829_7713512417826232701/udf_py_1651678294.zip')
HANDLER='udf_py_1651678294.compute'

100357 (P0000): Python Interpreter Error:
ModuleNotFoundError: No module named 'test_udf' in function TEMPUDF_851669829 with handler udf_py_1651678294.compute


ProgrammingError: 100357 (P0000): Python Interpreter Error:
ModuleNotFoundError: No module named 'test_udf' in function TEMPUDF_851669829 with handler udf_py_1651678294.compute

In [28]:
# Add test_udf.py to the session's imports, then register the UDF again.
session.addImports("test_udf.py")
sqrt_factorial_udf = udf(
    sqrt_factorial,
    return_type=FloatType(),
    input_types=[IntegerType()],
)

# Apply the UDF to column "a" and show the collected rows.
sqrt_rows = df.select("a", sqrt_factorial_udf("a")).collect()
print(sqrt_rows)

[Row[1,1.0], Row[2,1.4142135623730951], Row[3,2.449489742783178]]


----------------------------------------------------------

In [29]:
# Drop the demo objects created by this notebook. The session is closed in a
# finally clause so it is released even if one of the drop statements raises.
try:
    print(session.sql('drop view if exists demo_view').collect())
    print(session.sql('drop table if exists demo_table').collect())
finally:
    session.close()

[Row[DEMO_VIEW successfully dropped.]]
[Row[DEMO_TABLE successfully dropped.]]
