### IMPORT SNOWFLAKE SESSION, FUNCTIONS, DATA TYPES AND OTHER PYTHON LIBRARIES

In [1]:
# Import required libraries
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import avg, sum, col,lit
from snowflake.snowpark.functions import udf, sproc, col
from snowflake.snowpark.types import IntegerType, FloatType, LongType, DoubleType, DecimalType,StringType, BooleanType, Variant
from snowflake.snowpark.types import PandasSeries, PandasDataFrame
from snowflake.snowpark import functions as fn

import pandas as pd
import numpy as np
import json

from snowflake.snowpark import version
print (f"snowflake snowpark version is: {version.VERSION}")

snowflake snowpark version is: (0, 10, 0)


### SET UP SNOWFLAKE CONNECTION USING THE CREDENTIALS & PRINT CURRENT DB, SCHEMA AND WAREHOUSE

In [2]:
snowflake_connection_cfg = open('cred.json')
snowflake_connection_cfg = snowflake_connection_cfg.read()
snowflake_connection_cfg = json.loads(snowflake_connection_cfg)

# Creating Snowpark Session
spe_session = Session.builder.configs(snowflake_connection_cfg).create()
print('Current Database:', spe_session.get_current_database())
print('Current Schema:', spe_session.get_current_schema())
print('Current Warehouse:', spe_session.get_current_warehouse())

Current Database: "BANK1_CRM_DB"
Current Schema: "PUBLIC"
Current Warehouse: "APP_WH"


### REMOVE ALL IMPORTS AND PACKAGES IN CURRENT SESSION AND ADD THE NEEDED PACKAGES INCLUDING SNOWFLAKE-SNOWFPARK FOR PYTHON

In [3]:
spe_session.clear_imports()
spe_session.clear_packages()
spe_session.add_packages("snowflake-snowpark-python")
spe_session.add_packages("scikit-learn","pandas","numpy","joblib","cachetools")

### CREATE A NEW INTERNAL SNOWFLAKE STAGE LOCATION TO SAVE THE MODEL FILE LATER

In [4]:
spe_session.sql("CREATE OR REPLACE STAGE knadadur_models").collect()

[Row(status='Stage area KNADADUR_MODELS successfully created.')]

In [5]:
spe_session.sql("list @knadadur_models").collect()

[]

### CREATE A NEW INTERNAL SNOWFLAKE STAGE LOCATION TO SAVE THE INPUT DATA FILE

In [6]:
spe_session.sql("CREATE STAGE IF NOT EXISTS knadadur_data").collect()

[Row(status='KNADADUR_DATA already exists, statement succeeded.')]

### MAKE SURE YOU LOCATE LOCAL DATA FILE AND LOAD IT TO INTERNAL STAGE

In [7]:
folder_path = "/Users/knadadur/Documents/Snowparkusecases/SoftwareProjectEstimation/"
input_file = "softwareprojectsdata.csv"
input_data = folder_path+input_file
input_data

'/Users/knadadur/Documents/Snowparkusecases/SoftwareProjectEstimation/softwareprojectsdata.csv'

In [8]:
spe_session.sql("put file:///Users/knadadur/Documents/Snowparkusecases/SoftwareProjectEstimation/softwareprojectsdata.csv @knadadur_data").collect();

In [9]:
pd.read_csv(input_file)

Unnamed: 0,id,Project,TeamExp,ManagerExp,YearEnd,Length,Effort,Transactions,Entities,PointsNonAdjust,Adjustment,PointsAjust,Language
0,1,1,1,4,85,12,5152,253,52,305,34,302,1
1,2,2,0,0,86,4,5635,197,124,321,33,315,1
2,3,3,4,4,85,1,805,40,60,100,18,83,1
3,4,4,0,0,86,5,3829,200,119,319,30,303,1
4,5,5,0,0,86,4,2149,140,94,234,24,208,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,77,77,4,4,85,12,1400,229,169,398,39,414,3
77,78,78,4,3,83,12,2800,227,73,300,34,297,1
78,79,79,4,4,82,24,9520,395,193,588,40,617,1
79,80,80,4,3,86,12,5880,469,176,645,43,697,3


### CHECK IF THE FILE HAS BEEN UPLOADED TO INTERNAL STAGE

In [10]:
spe_session.sql("list @knadadur_data").collect()

[Row(name='knadadur_data/EcommerceCustomers.csv.gz', size=33792, md5='57705477516e68ab8b3c8401a73deb8c', last_modified='Thu, 3 Nov 2022 17:13:04 GMT'),
 Row(name='knadadur_data/softwareprojectsdata.csv.gz', size=1664, md5='54d6a4fb52738250c895b3e18b47df36', last_modified='Mon, 7 Nov 2022 19:33:43 GMT')]

### CREATE A TABLE IN DATABASE AS WE KNOW THE STRUCTURE

In [11]:
spe_session.sql("CREATE OR REPLACE table SOFTWARE_PROJECTS (ID VARCHAR,PROJECT VARCHAR,TEAMEXP NUMBER,MANAGEREXP NUMBER,YEAREND NUMBER,LENGTH NUMBER,EFFORT NUMBER,TRANSACTIONS NUMBER,ENTITIES NUMBER,POINTNONADJUST NUMBER,ADJUSTMENT NUMBER,POINTSADJUST NUMBER,LANGUAGE NUMBER)").collect()

[Row(status='Table SOFTWARE_PROJECTS successfully created.')]

### NOW LET'S LOAD THE DATA FROM INTERNAL STAGE TO THE software_projects DATABASE TABLE IN SNOWFLAKE

In [12]:
spe_session.sql("copy into SOFTWARE_PROJECTS from @knadadur_data/softwareprojectsdata.csv.gz ON_ERROR = CONTINUE").collect()

[Row(file='knadadur_data/softwareprojectsdata.csv.gz', status='PARTIALLY_LOADED', rows_parsed=82, rows_loaded=81, error_limit=82, errors_seen=1, first_error="Numeric value 'TeamExp' is not recognized", first_error_line=1, first_error_character=13, first_error_column_name='"SOFTWARE_PROJECTS"["TEAMEXP":3]')]

### CHECK THE DATA AND CHECK SOME STATS

In [13]:
snowpark_df = spe_session.table('SOFTWARE_PROJECTS')

# Describing the data
print('Rows in dataset:', f"{snowpark_df.count():,}")
print('Data before imputation:')
display(snowpark_df.describe().sort('SUMMARY').toPandas())

Rows in dataset: 81
Data before imputation:


Unnamed: 0,SUMMARY,ID,PROJECT,TEAMEXP,MANAGEREXP,YEAREND,LENGTH,EFFORT,TRANSACTIONS,ENTITIES,POINTNONADJUST,ADJUSTMENT,POINTSADJUST,LANGUAGE
0,count,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0
1,max,9.0,9.0,4.0,7.0,88.0,39.0,23940.0,886.0,387.0,1127.0,52.0,1116.0,3.0
2,mean,,,2.185185,2.530864,85.740741,11.666667,5046.308642,182.123457,122.333333,304.45679,27.62963,289.234568,1.555556
3,min,1.0,1.0,-1.0,-1.0,82.0,1.0,546.0,9.0,7.0,73.0,5.0,62.0,1.0
4,stddev,,,1.415195,1.643825,1.222475,7.424621,4418.767228,144.035098,84.882124,180.210159,10.591795,185.761088,0.707107


In [14]:
snowpark_df.schema.fields

[StructField('ID', StringType(), nullable=True),
 StructField('PROJECT', StringType(), nullable=True),
 StructField('TEAMEXP', LongType(), nullable=True),
 StructField('MANAGEREXP', LongType(), nullable=True),
 StructField('YEAREND', LongType(), nullable=True),
 StructField('LENGTH', LongType(), nullable=True),
 StructField('EFFORT', LongType(), nullable=True),
 StructField('TRANSACTIONS', LongType(), nullable=True),
 StructField('ENTITIES', LongType(), nullable=True),
 StructField('POINTNONADJUST', LongType(), nullable=True),
 StructField('ADJUSTMENT', LongType(), nullable=True),
 StructField('POINTSADJUST', LongType(), nullable=True),
 StructField('LANGUAGE', LongType(), nullable=True)]

## HERE IS WHERE WE USE THE POWER OF SNOWFLAKE WHERE WE PUSH DOWN MODEL TRAINING TO SNOWFLAKE AND SNOWFLAKE CREDITS WILL BE CONSUMED. MAKE SURE YOUR WAREHOUSE IS RIGHTLY SIZED FOR THE VOLUME OF DATA YOU ARE TRAINING THE MODEL.

### DEFINE THE TRAINING MODEL AND FOR THIS USE CASE LET'S USE SKLEARN'S LINEAR REGRESSION MODEL and K Nearest Neighbor model.

In [15]:
features = ['TEAMEXP','MANAGEREXP','YEAREND','LENGTH','TRANSACTIONS','ENTITIES','POINTNONADJUST','ADJUSTMENT','POINTSADJUST','LANGUAGE']

In [16]:
# This local Python-function will be registered as a Stored Procedure

def spec_train_lm_model(session: Session, training_table: str, sample_size_n: int, model_name: str,features:list) -> Variant:
    # Loading data into pandas dataframe
    local_training_data = session.table(training_table).sample(n=sample_size_n).toPandas()

    # Define features and label
    X = local_training_data[features]
    Y = local_training_data['EFFORT']

    # Splitting data into training and test
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=22)

    # Actual model training
    from sklearn.linear_model import LinearRegression
    lm = LinearRegression()
    lm.fit(X_train,y_train)
    
    # Getting model coefficients
    coeff_df = pd.DataFrame(lm.coef_,X.columns,columns=['Coefficient'])
    # Save model as file and upload to Snowflake stage
    from joblib import dump, load
    dump(lm, '/tmp/'+model_name)
    session.file.put('/tmp/'+model_name, '@knadadur_models', auto_compress=False)
    return lm.score(X_test, y_test)
    return coeff_df.to_dict()

In [17]:
# This local Python-function will be registered as a Stored Procedure

def spec_train_knn_model(session: Session, training_table: str, sample_size_n: int, model_name: str,features:list) -> Variant:
    # Loading data into pandas dataframe
    local_training_data = session.table(training_table).sample(n=sample_size_n).toPandas()

    # Define features and label
    X = local_training_data[features]
    Y = local_training_data['EFFORT']

    # Splitting data into training and test
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=30)

    # Actual model training
    from sklearn.neighbors import KNeighborsRegressor
    knn = KNeighborsRegressor(n_neighbors=3, weights='uniform')
    knn.fit(X_train,y_train)
    
    # Save model as file and upload to Snowflake stage
    from joblib import dump, load
    dump(knn, '/tmp/'+model_name)
    session.file.put('/tmp/'+model_name, '@knadadur_models', auto_compress=False)

    # Getting model score
    return knn.score(X_test, y_test)


### REGISTER THE "MODEL TRAINING" AS "STORED PROCEDURE" IN SNOWFLAKE USING THE DEFINED FUNCTION ABOVE

In [18]:
# Registering the function as a Stored Procedure
spec_linear_model_sproc = spe_session.sproc.register(func=spec_train_lm_model, # training function defined above
                                            name='spec_train_lm_model', # training model name to be registered in snowlake
                                            is_permanent=True, # permanent stored proc
                                            replace=True, # replace if existing already
                                            stage_location='@knadadur_models', # save the model in stage location
                                            packages=['snowflake-snowpark-python','scikit-learn','joblib']) 
                                                                        # import model libaries


In [20]:
# Registering the function as a Stored Procedure
spec_knn_sproc = spe_session.sproc.register(func=spec_train_knn_model, # training function defined above
                                            name='spec_train_knn_model', # training model name to be registered in snowlake
                                            is_permanent=True, # permanent stored proc
                                            replace=True, # replace if existing already
                                            stage_location='@knadadur_models', # save the model in stage location
                                            packages=['snowflake-snowpark-python','scikit-learn','joblib']) 
                                                                        # import model libaries


### EXECUTE THE MODEL TRAINING FROM THE NOTEBOOK. 
### THIS EXECUTION IS PUSHED DOWN TO SNOWFLAKE AND THE CREDITS WILL BE CONSUMED.
### CHECK THE SCORES FROM EACH MODEL

In [21]:
# Execute the Stored Procedure to train a model and display coefficients
# spec_lm_coefficients = spec_linear_model_sproc('software_projects',100, 'spec_train_lm_model.sav',features)
#avoid passing the session parameter as remembers from the stored proc registered
#pass the train_set , data limit and the model file name to be saved
# display(pd.DataFrame(eval(spec_lm_coefficients)))
spec_lm_score = spec_linear_model_sproc('software_projects',100, 'spec_train_lm_model.sav',features)
print (f"Linear Regression Model Score : {spec_lm_score}")

Linear Regression Model Score : 0.8524016655488438


In [22]:
# Execute the Stored Procedure to train a model and display coefficients
spec_knn_score = spec_knn_sproc('software_projects',100, 'spec_train_knn_model.sav',features)
#avoid passing the session parameter as remembers from the stored proc registered
#pass the train_set , data limit and the model file name to be saved
print (f"k nearest neighbor Model Score : {spec_knn_score}")

k nearest neighbor Model Score : 0.7450919920378207


# We find out that linear regression model has outperformed knearest neighbor on predicting the unexposed test set but let's use both for now

### CHECK IF THE MODEL FILES ARE CREATED

In [24]:
spe_session.sql("list @knadadur_models").collect()

[Row(name='knadadur_models/spec_train_knn_model.sav', size=7824, md5='2a9e41539e14de9c64792a9bc63ced1c', last_modified='Tue, 8 Nov 2022 21:59:22 GMT'),
 Row(name='knadadur_models/spec_train_lm_model.sav', size=1120, md5='936fd193ee2b209e35d3d64117789a92', last_modified='Tue, 8 Nov 2022 21:59:17 GMT')]

## NOW THAT THE MODEL HAS BEEN TRAINED USING SNOWFLAKE STORED PROC AND THE MODEL FILE IS NOW SAVED IN THE STAGING ENVIRONMENT LET'S TRY TO INFER FROM THE MODEL TO PREDICT

### IMPORT THE LIBRARIES REQUIRED FOR INFERENCE

In [25]:
from snowflake.snowpark.functions import pandas_udf
from cachetools import cached

### DEFINE THE INFERENCE TO PREDICT FROM THE MODEL SAVED IN THE INTERNAL STAGE DIRECTORY IN SNOWFLAKE

In [26]:
@cached(cache={})
def load_model(model_path: str) -> object:
    from joblib import load
    model = load(model_path)
    return model
### define the datatypes for the number of input variables here in score_model function
def score_model(df: PandasDataFrame[int,int,int,int,int,int,int,int,int,int]) -> PandasSeries[float]:
    import os
    import sys
    IMPORT_DIRECTORY_NAME = "snowflake_import_directory"
    import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME]
    model_name = 'spec_train_lm_model.sav'
    model = load_model(import_dir+model_name)
    return pd.Series(model.predict(df))

### DEFINE THE MODEL INFERENCE IN A USER DEFINED FUNCTION (UDF)

In [27]:
lm_model_udf = pandas_udf(func=score_model, 
                           name="spec_lm_prediction_vec", 
                           stage_location='@knadadur_models',
                           replace=True, 
                           is_permanent=True, 
                           imports=['@knadadur_models/spec_train_lm_model.sav'],
                           packages=['scikit-learn==1.0.2','pandas','joblib','cachetools'], 
                           max_batch_size=100, 
                           session=spe_session)

The version of package scikit-learn in the local environment is 1.1.2, which does not fit the criteria for the requirement scikit-learn==1.0.2. Your UDF might not work when the package version is different between the server and your local environment


In [28]:
@cached(cache={})
def load_model(model_path: str) -> object:
    from joblib import load
    model = load(model_path)
    return model
### define the datatypes for the number of input variables here in score_model function
def score_knn_model(df: PandasDataFrame[int,int,int,int,int,int,int,int,int,int]) -> PandasSeries[float]:
    import os
    import sys
    IMPORT_DIRECTORY_NAME = "snowflake_import_directory"
    import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME]
    model_name = 'spec_train_knn_model.sav'
    model = load_model(import_dir+model_name)
    return pd.Series(model.predict(df))

In [29]:
knn_model_udf = pandas_udf(func=score_knn_model, 
                           name="spec_knn_prediction_vec", 
                           stage_location='@knadadur_models',
                           replace=True, 
                           is_permanent=True, 
                           imports=['@knadadur_models/spec_train_knn_model.sav'],
                           packages=['scikit-learn==1.0.2','pandas','joblib','cachetools'], 
                           max_batch_size=100, 
                           session=spe_session)

The version of package scikit-learn in the local environment is 1.1.2, which does not fit the criteria for the requirement scikit-learn==1.0.2. Your UDF might not work when the package version is different between the server and your local environment


In [30]:
pd.options.display.max_rows = 200

### CALL THE INFERENCE LIKE HOW YOU WOULD ON A SIMPLE SQL STATEMENT USING THE inference udfs that were defined previously FOR both linear regression and knearestneighbor models to compare

In [31]:
output = snowpark_df.limit(10000).select('ID',
                                         'PROJECT',
                                         fn.col('EFFORT').alias('ACTUAL_EFFORT'),
                                         lm_model_udf(*features).alias('lm_PREDICTED_EFFORT'), # calling the model running in Snowflake (user-defined function)
                                         knn_model_udf(*features).alias('knn_PREDICTED_EFFORT'), # calling the model running in Snowflake (user-defined function)
                                        (lm_model_udf(*features) - fn.col('ACTUAL_EFFORT')).alias('lm_DELTA'),                                      
                                        (knn_model_udf(*features) - fn.col('ACTUAL_EFFORT')).alias('knn_DELTA'),
                                       *features
                                     ).toPandas()
output

Unnamed: 0,ID,PROJECT,ACTUAL_EFFORT,LM_PREDICTED_EFFORT,KNN_PREDICTED_EFFORT,LM_DELTA,KNN_DELTA,TEAMEXP,MANAGEREXP,YEAREND,LENGTH,TRANSACTIONS,ENTITIES,POINTNONADJUST,ADJUSTMENT,POINTSADJUST,LANGUAGE
0,1,1,5152,6659.309662,4911.666667,1507.309662,-240.333333,1,4,85,12,253,52,305,34,302,1
1,2,2,5635,5070.945796,5196.333333,-564.054204,-438.666667,0,0,86,4,197,124,321,33,315,1
2,3,3,805,2193.209088,1753.666667,1388.209088,948.666667,4,4,85,1,40,60,100,18,83,1
3,4,4,3829,4832.899026,5196.333333,1003.899026,1367.333333,0,0,86,5,200,119,319,30,303,1
4,5,5,2149,3494.044586,2648.333333,1345.044586,499.333333,0,0,86,4,140,94,234,24,208,1
5,6,6,2821,4086.091953,3875.666667,1265.091953,1054.666667,0,0,86,4,97,89,186,38,192,1
6,7,7,2569,1305.083561,2020.666667,-1263.916439,-548.333333,2,1,85,9,119,42,161,25,145,2
7,8,8,3913,4422.885905,2977.333333,509.885905,-935.666667,1,2,83,13,186,52,238,25,214,1
8,9,9,7854,4615.799017,6456.333333,-3238.200983,-1397.666667,3,1,85,12,172,88,260,30,247,1
9,10,10,2422,2484.951404,2596.0,62.951404,174.0,3,4,83,4,78,38,116,24,103,1


### TIME TO WRAP AND CLOSE THE SNOWFLAKE SESSION

In [None]:
spe_session.close()
print('Finished!!!')

### It is hard to say which model is better on unexposed set and I would also try XGBOOST as a model to predict. However this is just the art of possible and I want to show how we can leverage snowpark to train the model by pushing down to snowflake using stored proc and then infer using simple SQL calling the UDF.