In [1]:
import os
import sys
import json
import math
import cachetools
import numpy as np
import pandas as pd
import configparser
from snowflake.snowpark import Session
from copy import copy
from snowflake.snowpark import Row
import snowflake.snowpark.functions as F
from snowflake.snowpark.functions import col, lit, sql_expr, get, get_path, udf, udtf, table_function, sproc, seq8, uniform, when_matched, when_not_matched, cast, try_cast, asc, asc_nulls_first, asc_nulls_last, collate, startswith, endswith, equal_nan, is_null, in_, when
from snowflake.snowpark.types import StructType, StructField, StringType, IntegerType, DecimalType, LongType, BooleanType, FloatType, PandasSeries, PandasSeriesType, PandasDataFrame, PandasDataFrameType
from snowflake.snowpark.exceptions import SnowparkJoinException, SnowparkSQLException
from snowflake.snowpark.files import SnowflakeFile
from snowflake.snowpark.column import METADATA_FILENAME, METADATA_FILE_ROW_NUMBER
from collections import Counter
from typing import Iterable, Tuple

# Read Snowflake credentials from a local config file kept outside the notebook
config = configparser.ConfigParser()

# ConfigParser.read() silently skips missing/unreadable files and returns the list of
# files it actually parsed, so fail fast here instead of hitting a confusing
# KeyError: 'SNOWPARKAWS' in the lookups below.
if not config.read('assets/credentials.cfg'):
    raise FileNotFoundError("Could not read Snowflake credentials file 'assets/credentials.cfg'")

# NOTE: role/warehouse/database/schema are optional connection parameters, but the
# lookups below still require every key to exist in the [SNOWPARKAWS] section.
connection_parameters = dict(
   account   =  config['SNOWPARKAWS']['SNOWFLAKE_ACCOUNT'],
   user      =  config['SNOWPARKAWS']['SNOWFLAKE_USER'],
   password  =  config['SNOWPARKAWS']['SNOWFLAKE_PASSWORD'],
   role      =  config['SNOWPARKAWS']['SNOWFLAKE_ROLE'],
   warehouse =  config['SNOWPARKAWS']['SNOWFLAKE_WAREHOUSE'],
   database  =  config['SNOWPARKAWS']['SNOWFLAKE_DATABASE'],
   schema    =  config['SNOWPARKAWS']['SNOWFLAKE_SCHEMA'],
)

# Pass this dictionary to the Session.builder.configs method to return a builder object that has these connection parameters.
# Call the create method of the builder to establish the session.
session = Session.builder.configs(connection_parameters).create()



#### `Getting Started with Snowflake Snowpark Dataframes`

snowflake.snowpark.DataFrame represents a lazily-evaluated relational dataset that contains a collection of Row objects with columns defined by a schema (column name and type).

- A DataFrame is considered lazy because it encapsulates the computation or query required to produce a relational dataset
- The computation is not performed until you call a method that performs an action 

There are multiple ways to create a dataframe using Snowpark:

1. Using `session.create_dataframe` to create a dataframe
1. Using `session.table()` to create a dataframe
1. Using `session.sql()` to create a dataframe
1. Using the `session.read` property (a `DataFrameReader`) and its `csv()`, `json()`, etc. methods to create a dataframe
1. Creating a new dataframe by applying a transformation to an existing dataframe

In [2]:
from snowflake.snowpark.types import StructField,StructType,StringType,IntegerType

# Upload the local CSV file to the named internal stage @SF_INT_STG,
# uncompressed and replacing any copy that is already staged
session.file.put(
    local_file_name='datasets/csv_dataset.csv',
    stage_location='@SF_INT_STG',
    auto_compress=False,
    overwrite=True,
)

[PutResult(source='csv_dataset.csv', target='csv_dataset.csv', source_size=153, target_size=160, source_compression='NONE', target_compression='NONE', status='UPLOADED', message='')]

In [6]:
# Build a one-column dataframe from in-memory values and display it
df = session.create_dataframe(
    data=["Snowflake", "Snowpark"],
    schema=["col1"],
)
df.show()


-------------
|"COL1"     |
-------------
|Snowflake  |
|Snowpark   |
-------------



In [5]:
# Create a dataframe that references an existing table by its fully qualified name
df_via_table = session.table('SNOWFLAKE.ACCOUNT_USAGE.WAREHOUSE_METERING_HISTORY')
# Limit to 10 rows before converting to pandas so only a small preview is fetched and displayed
df_via_table.limit(10).to_pandas()

Unnamed: 0,START_TIME,END_TIME,WAREHOUSE_ID,WAREHOUSE_NAME,CREDITS_USED,CREDITS_USED_COMPUTE,CREDITS_USED_CLOUD_SERVICES
0,2023-12-16 09:00:00-08:00,2023-12-16 10:00:00-08:00,40,SNOWPARK_WH,0.244249,0.242778,0.001471
1,2023-11-27 07:00:00-08:00,2023-11-27 08:00:00-08:00,5,TASTY_DS_WH,4.1e-05,0.0,4.1e-05
2,2023-12-12 19:00:00-08:00,2023-12-12 20:00:00-08:00,43,HOL_WH,0.000125,0.0,0.000125
3,2023-12-14 06:00:00-08:00,2023-12-14 07:00:00-08:00,0,CLOUD_SERVICES_ONLY,6e-06,0.0,6e-06
4,2023-12-11 15:00:00-08:00,2023-12-11 16:00:00-08:00,31,SNOWPARK_OPT_WH,0.43214,0.431667,0.000474
5,2023-11-30 13:00:00-08:00,2023-11-30 14:00:00-08:00,1,COMPUTE_WH,0.076943,0.076389,0.000554
6,2023-12-04 12:00:00-08:00,2023-12-04 13:00:00-08:00,1,COMPUTE_WH,9.1e-05,0.0,9.1e-05
7,2023-11-27 07:00:00-08:00,2023-11-27 08:00:00-08:00,10,TASTY_BI_WH,6.5e-05,0.0,6.5e-05
8,2023-12-12 06:00:00-08:00,2023-12-12 07:00:00-08:00,0,CLOUD_SERVICES_ONLY,1.1e-05,0.0,1.1e-05
9,2023-11-27 17:00:00-08:00,2023-11-27 18:00:00-08:00,0,CLOUD_SERVICES_ONLY,2.8e-05,0.0,2.8e-05


In [7]:
# Create a dataframe from a SQL query (only the listed columns are selected)
df_via_sql = session.sql(
    'SELECT START_TIME, END_TIME, WAREHOUSE_NAME, CREDITS_USED '
    'FROM SNOWFLAKE.ACCOUNT_USAGE.WAREHOUSE_METERING_HISTORY'
)
# Preview the first 10 rows as a pandas DataFrame
df_via_sql.limit(10).to_pandas()

Unnamed: 0,START_TIME,END_TIME,WAREHOUSE_NAME,CREDITS_USED
0,2023-12-13 08:00:00-08:00,2023-12-13 09:00:00-08:00,SNOWPARK_OPT_WH,9e-06
1,2023-11-27 18:00:00-08:00,2023-11-27 19:00:00-08:00,SNOWPARK_OPT_WH,5e-05
2,2023-12-18 08:00:00-08:00,2023-12-18 09:00:00-08:00,COMPUTE_WH,0.0375
3,2023-11-30 07:00:00-08:00,2023-11-30 08:00:00-08:00,COMPUTE_WH,0.043889
4,2023-11-28 09:00:00-08:00,2023-11-28 10:00:00-08:00,CLOUD_SERVICES_ONLY,1.8e-05
5,2023-12-18 06:00:00-08:00,2023-12-18 07:00:00-08:00,COMPUTE_WH,0.017222
6,2023-12-18 06:00:00-08:00,2023-12-18 07:00:00-08:00,CLOUD_SERVICES_ONLY,1.4e-05
7,2023-12-14 13:00:00-08:00,2023-12-14 14:00:00-08:00,SNOWPARK_WH,0.101283
8,2023-12-11 11:00:00-08:00,2023-12-11 12:00:00-08:00,COMPUTE_WH,1.6e-05
9,2023-12-13 08:00:00-08:00,2023-12-13 09:00:00-08:00,COMPUTE_WH,0.017778


In [8]:
# Create a dataframe from a staged CSV file through the session.read property.
# The schema names and types the three columns to read from the file.
csv_schema = StructType(
    [
        StructField("Email No.", StringType()),
        StructField("the", IntegerType()),
        StructField("to", IntegerType()),
    ]
)

# Reader options: comma-delimited, skip the header row, match only *.csv files
df_csv_file = (
    session.read
    .options({"field_delimiter": ",", "skip_header": 1, "pattern": ".*[.]csv"})
    .schema(csv_schema)
    .csv("@SF_INT_STG/csv_dataset.csv")
)
df_csv_file.show()

------------------------------
|"Email No."  |"THE"  |"TO"  |
------------------------------
|Email 1      |0      |11    |
|Email 2      |8      |3     |
|Email 3      |0      |3     |
|Email 4      |0      |5     |
|Email 5      |7      |12    |
|Email 6      |4      |44    |
|Email 7      |5      |9     |
|Email 8      |0      |33    |
|Email 9      |2      |5     |
|Email 10     |4      |1     |
------------------------------



#### `Types of Operations on a Dataframe` 

The operations on DataFrame can be divided into two types:

- **Transformations** produce a new DataFrame from one or more existing DataFrames. Note that transformations are lazy and don’t cause the DataFrame to be evaluated.

- **Actions** cause the DataFrame to be evaluated. When you call a method that performs an action, Snowpark sends the SQL query for the DataFrame to the server for evaluation.

See the Snowpark for Python documentation for the available transformations and actions on a DataFrame, with usage examples for each - [Snowflake.Snowpark.DataFrame](https://docs.snowflake.com/en/developer-guide/snowpark/reference/python/latest/api/snowflake.snowpark.DataFrame)

---

Complete List of All DataFrame Methods - [Here](https://docs.snowflake.com/en/developer-guide/snowpark/reference/python/latest/api/snowflake.snowpark.DataFrame)

In [11]:
from snowflake.snowpark.types import StructType, StructField, IntegerType,StringType,FloatType

# Column layout for the sample data: name (string), age (integer), salary (float)
schema = StructType(
    [
        StructField("name", StringType()),
        StructField("age", IntegerType()),
        StructField("salary", FloatType()),
    ]
)

# Sample rows; they deliberately include None and NaN values
dataset = [
    ["person1", 25, 1000.00],
    ["person2", 30, 2000.00],
    ["person2", 25, 2300.00],
    ["person3", 40, 5000.00],
    ["person3", 25, 5400.00],
    ["person3", 30, None],
    [None, None, 6000.00],
    [None, None, float("nan")],
]

df = session.create_dataframe(data=dataset, schema=schema)
# Not the last expression of the cell, so the list of column names is not displayed
df.schema.names

df.show()

------------------------------
|"NAME"   |"AGE"  |"SALARY"  |
------------------------------
|person1  |25     |1000.0    |
|person2  |30     |2000.0    |
|person2  |25     |2300.0    |
|person3  |40     |5000.0    |
|person3  |25     |5400.0    |
|person3  |30     |NULL      |
|NULL     |NULL   |6000.0    |
|NULL     |NULL   |nan       |
------------------------------



In [17]:
# Aggregate functions on a DataFrame using DataFrame.agg()
# Import only names that do not collide with Python built-ins. Snowpark's `max`
# (and `min`) are reached through the `F` alias for snowflake.snowpark.functions
# imported in the first cell, so the built-in min()/max() keep working in the
# rest of the notebook.
from snowflake.snowpark.functions import col, stddev, stddev_pop, median

# Per AGE group: highest salary as "col1" and median salary as "col2"
(
    df.group_by(col("AGE"))
    .agg(
        F.max(col("SALARY")).as_("col1"),
        F.median(col("SALARY")).as_("col2"),
    )
    .show()
)

---------------------------
|"AGE"  |"COL1"  |"COL2"  |
---------------------------
|25     |5400.0  |2300.0  |
|NULL   |nan     |nan     |
|30     |2000.0  |2000.0  |
|40     |5000.0  |5000.0  |
---------------------------



- [DataFrame.cache_result](https://docs.snowflake.com/en/developer-guide/snowpark/reference/python/latest/api/snowflake.snowpark.DataFrame.cache_result)

  - Caches the content of this DataFrame to create a new cached Table DataFrame.

  - All subsequent operations on the returned cached DataFrame are performed on the cached data and have no effect on the original DataFrame.

  - You can use `Table.drop_table()` or the `with` statement to clean up the cached result when it’s not needed. Refer to the example code below.

In [20]:
# Create the demo temp table and seed it with two rows.
# "create or replace" lets this cell be re-run in the same session: a plain
# "create temp table" fails once the table exists, and re-running only the
# insert would duplicate rows.
create_result = session.sql("create or replace temp table result(num int)").collect()
insert_result = session.sql("insert into result values(1),(2)").collect()

In [21]:
# Point `df` at the temp table created above (this rebinds the earlier `df`) and show its rows
df = session.table("result")
df.show()

---------
|"NUM"  |
---------
|1      |
|2      |
---------



In [22]:
# Run cache_result and then insert into the original table to see that the cached result is not affected
# df1 is a new cached dataframe holding the rows of `df` as of this call
df1 = df.cache_result()

# Insert a new record into the original table only
insert_result = session.sql("insert into result values(3)").collect()

# Cached dataframe result: does not include the newly inserted row
df1.show()

# Original dataframe result: reads the table again, so the new row appears
df.show()

---------
|"NUM"  |
---------
|1      |
|2      |
---------

---------
|"NUM"  |
---------
|1      |
|2      |
|3      |
---------



In [32]:
# You can run cache_result on a result that has already been cached;
# df2 is a second cached dataframe built from df1
df2 = df1.cache_result()
df2.show()

---------
|"NUM"  |
---------
|1      |
|2      |
---------



In [24]:
# Drop RESULT and see that the cached results still exist.
# The return value of collect() is not needed, so it is bound to the throwaway name `_`.
_ = session.sql("drop table result").collect()

In [31]:
# The first cached dataframe is still readable after the source table was dropped
df1.show()

---------
|"NUM"  |
---------
|1      |
|2      |
---------



In [33]:
# The second cached dataframe is still readable as well
df2.show()

---------
|"NUM"  |
---------
|1      |
|2      |
---------



In [27]:
# Clean up the cached result held by df2 now that it is no longer needed
df2.drop_table()

In [30]:
# Use a context manager to clean up the cached result after its use:
# df3 is only meant to be used inside the `with` block.
with df1.cache_result() as df3:
    df3.show()

---------
|"NUM"  |
---------
|1      |
|2      |
---------



In [None]:
# Close the Snowpark session that was created in the first cell
session.close()