# GOAL

Connect to Snowflake through the Snowflake Spark Connector, with Spark running locally on a SageMaker notebook instance.

In [5]:
# NOTE: this cell is intentionally disabled (every line is commented out).
# If enabled it would, for each product in PRODUCTS, read the "latest" version
# from Maven Central's maven-metadata.xml and, when that jar is not already
# present in SFC_DIR, delete older jars of the product and download the new one.
# Presumably disabled because the latest releases are not guaranteed to work
# together -- TODO confirm before re-enabling.
# %%bash
# SFC_DIR='/home/ec2-user/snowflake'
# [ ! -d "$SFC_DIR" ] && mkdir $SFC_DIR 
# cd $SFC_DIR
# PRODUCTS='snowflake-jdbc spark-snowflake_2.11'
# for PRODUCT in $PRODUCTS
# do
#    wget "https://repo1.maven.org/maven2/net/snowflake/$PRODUCT/maven-metadata.xml" 2> /dev/null
#    VERSION=$(grep latest maven-metadata.xml | awk -F">" '{ print $2 }' | awk -F"<" '{ print $1 }')
#    DRIVER=$PRODUCT-$VERSION.jar
#    if [[ ! -e $DRIVER ]]
#    then
#       rm $PRODUCT* 2>/dev/null
#       wget "https://repo1.maven.org/maven2/net/snowflake/$PRODUCT/$VERSION/$DRIVER" 2> /dev/null
#    fi
#    [ -e maven-metadata.xml ] && rm maven-metadata.xml
# done

Environment: Spark 2.3.4, kernel `conda-python3`.  
The jar versions used are the ones pinned in the `curl` commands below.

The main difficulty with Spark so far has been finding a working combination of:
* Snowflake version
* Scala version and Spark version
* Snowflake Spark connector version
* Snowflake JDBC driver version


It's not necessarily true that the latest stable builds of all of these work together.  
The Py4J library breaks in different places for different combinations.

In [None]:
%%bash
# Download the pinned Snowflake JDBC driver and Snowflake Spark connector jars
# into SFC_DIR (created if missing).
#
# NOTE(review): the connector build is tagged "spark_2.4" while the notebook
# text states Spark 2.3.4 -- confirm this pairing is the intended one.

# Abort on the first failing command so a failed download is not silently ignored.
set -euo pipefail

SFC_DIR='/home/ec2-user/snowflake'
MAVEN_BASE='https://repo1.maven.org/maven2/net/snowflake'

mkdir -p "$SFC_DIR"
cd "$SFC_DIR"

# -f makes curl fail on HTTP errors (e.g. 404) instead of saving the error page
# under the .jar name; -L follows redirects; -O keeps the remote file name.
curl -fL -O "$MAVEN_BASE/snowflake-jdbc/3.8.0/snowflake-jdbc-3.8.0.jar"
curl -fL -O "$MAVEN_BASE/spark-snowflake_2.11/2.4.14-spark_2.4/spark-snowflake_2.11-2.4.14-spark_2.4.jar"

In [1]:
import glob
import os

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext,SparkSession
from pyspark.sql.types import *
from sagemaker_pyspark import IAMRole, classpath_jars
from sagemaker_pyspark.algorithms import KMeansSageMakerEstimator

In [2]:
# Collect the Snowflake jars that the download cell placed in SFC_JAR_DIR.
# glob is used instead of `!ls` so that a missing directory or an empty match
# yields an empty list (checked below) rather than captured shell output.
SFC_JAR_DIR = "/home/ec2-user/snowflake"
sfc_jars = sorted(glob.glob(os.path.join(SFC_JAR_DIR, "*.jar")))

# Fail here with a clear message instead of later, when Spark cannot find the
# Snowflake data source on its classpath.
if not sfc_jars:
    raise FileNotFoundError(
        "No .jar files found in {!r}; run the download cell first.".format(SFC_JAR_DIR)
    )

In [3]:
# Show the jar paths that get appended to the Spark driver classpath.
sfc_jars

['/home/ec2-user/snowflake/snowflake-jdbc-3.8.0.jar',
 '/home/ec2-user/snowflake/spark-snowflake_2.11-2.4.14-spark_2.4.jar']

In [4]:
# Driver classpath: the jars reported by sagemaker_pyspark.classpath_jars(),
# followed by the Snowflake jars, all separated by ":".
driver_classpath = ":".join(classpath_jars()) + ":" + ":".join(sfc_jars)

# Local-mode Spark configuration for this connectivity test.
conf = (
    SparkConf()
    .setAppName('local-spark-test')
    .setMaster('local')
    .set("spark.driver.extraClassPath", driver_classpath)
)

In [5]:
# Display the SparkConf object. The default repr only shows the object address;
# conf.getAll() lists the actual key/value settings.
conf

<pyspark.conf.SparkConf at 0x7fee633e45c0>

In [6]:
# Create the SparkSession from the configuration built above.
# NOTE(review): getOrCreate() returns an already-running session if one exists;
# presumably the classpath in `conf` is then not applied -- restart the kernel
# after changing the jars (confirm).
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [7]:
# Display the SparkSession to confirm it was created.
spark

In [8]:
# Data source name passed to spark.read.format(...) to select the Snowflake connector.
SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"

In [9]:
def sf_setting(env_var, placeholder="*", environ=None):
    """Return one Snowflake connection setting.

    Args:
        env_var: Name of the environment variable holding the value.
        placeholder: Value returned when the variable is not set. Defaults to
            "*", the placeholder this notebook used originally.
        environ: Mapping to read from; defaults to ``os.environ``.

    Returns:
        The configured value, or ``placeholder`` when ``env_var`` is unset.
    """
    source = os.environ if environ is None else environ
    return source.get(env_var, placeholder)


# Connection options for the Snowflake connector. Values come from environment
# variables so that credentials are never typed into (or saved with) the
# notebook; any variable that is unset keeps the "*" placeholder.
sfOptions = {
    "sfURL": sf_setting("SNOWFLAKE_URL"),
    "sfAccount": sf_setting("SNOWFLAKE_ACCOUNT"),
    "sfUser": sf_setting("SNOWFLAKE_USER"),
    "sfPassword": sf_setting("SNOWFLAKE_PASSWORD"),
    "sfDatabase": sf_setting("SNOWFLAKE_DATABASE"),
    "sfSchema": sf_setting("SNOWFLAKE_SCHEMA"),
    "sfWarehouse": sf_setting("SNOWFLAKE_WAREHOUSE"),
}

In [10]:
# Run a trivial two-row query through the Snowflake connector to verify that
# the session can reach Snowflake.
df = (
    spark.read
    .format(SNOWFLAKE_SOURCE_NAME)
    .options(**sfOptions)
    .option("query", "select 1 as my_num union all select 2 as my_num")
    .load()
)

In [11]:
# Column names of the result; the query's `my_num` alias comes back upper-cased.
df.columns

['MY_NUM']

In [12]:
# Print the schema Spark reports for the query result.
df.printSchema()

root
 |-- MY_NUM: decimal(1,0) (nullable = false)



In [13]:
# Fetch and print the rows.
# NOTE(review): per the PySpark docs, `truncate=3` shortens long string cells
# to 3 characters; if the intent was to limit the number of rows, that is
# `n=3` -- confirm.
df.show(truncate=3)

+------+
|MY_NUM|
+------+
|     1|
|     2|
+------+



In [14]:
# Shut down the Spark session now that the connectivity check is done.
spark.stop()