## Steps:
- Import libraries
- Setup Snowflake objects
- Load data to Snowflake (you may skip this step if you already have data in Snowflake)
- Write code to run Implicit library locally on your machine
- Package the code into clean, modular functions
- Create a Task and schedule it

# Imports

In [1]:
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
from snowflake.snowpark.functions import col

from snowflake.core import Root
from snowflake.core.task import StoredProcedureCall, Task
from snowflake.core.task.dagv1 import DAGOperation, DAG, DAGTask

import json

import os

import numpy as np
import pandas as pd
import regex as re

import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random
from sklearn.preprocessing import MinMaxScaler
import implicit 
from datetime import datetime, timedelta

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the Snowflake connection settings from a local JSON file and open the
# Snowpark session used by every cell below.
# NOTE(review): the path is machine-specific — point it at your own credentials file.
with open('/Users/skhara/Documents/GitHub/creds.json') as creds_file:
    # The context manager closes the handle; the original left the file open.
    connection_parameters = json.load(creds_file)
session = Session.builder.configs(connection_parameters).create()

In [5]:
# Create the database and schema for this demo if they do not exist yet.
session.sql('CREATE DATABASE IF NOT EXISTS RECOMMENDER_SYSTEMS').collect()
session.sql('CREATE SCHEMA IF NOT EXISTS RECOMMENDER_SYSTEMS.COLLABORATIVE_FILTERING_ALS').collect()

# Make them the session defaults so unqualified names below resolve here.
session.sql('USE DATABASE RECOMMENDER_SYSTEMS').collect()
session.sql('USE SCHEMA COLLABORATIVE_FILTERING_ALS').collect()
# Stage referenced later as stage_location and as the home of TRAIN_OUTPUT artifacts.
session.sql('CREATE STAGE IF NOT EXISTS ML_MODELS;').collect()

[Row(status='ML_MODELS already exists, statement succeeded.')]

In [None]:
# Set max_concurrency_level to 1 on the SSK_RESEARCH warehouse.
# NOTE(review): presumably so a single in-memory training job gets the whole warehouse — confirm.
session.sql('ALTER WAREHOUSE SSK_RESEARCH SET max_concurrency_level = 1;').collect()

# Load Data to Snowflake
This step is only needed if your data is not already in a Snowflake table. If it is, you can skip this step.

In [None]:
# Loading from local CSV-files
# The path is relative to the notebook's working directory.
events_data = pd.read_csv('data/events.csv')
events_data.head()  # preview the first rows

In [None]:
# Upload the DataFrame to EVENTS_DATA; the table is created if missing and
# any existing contents are overwritten.
session.write_pandas(events_data, table_name='EVENTS_DATA', auto_create_table=True, overwrite=True)

# Step 1: Testing Locally

In [None]:
# Raw events table loaded in the previous section.
datapath= 'RECOMMENDER_SYSTEMS.COLLABORATIVE_FILTERING_ALS.EVENTS_DATA'
snf_df = session.table(datapath)
# "timestamp" is divided by 1000 before conversion
# (presumably epoch milliseconds -> seconds — confirm against the source data).
snf_df = snf_df.with_column("TS_DATE", F.to_date(F.to_timestamp(F.col('"timestamp"')/F.lit(1000))))
snf_df = snf_df.sort(F.col('"TS_DATE"').asc())

# Everything below runs client-side in pandas.
data = snf_df.to_pandas()
# visitor_id holds the integer category code of each visitorid; visitorid itself is kept as int.
data['visitorid'] = data['visitorid'].astype("category")
data['visitor_id'] = data['visitorid'].cat.codes
data['visitorid'] = data['visitorid'].astype("int")

# Same encoding for items: item_id is the category code of itemid.
data['itemid'] = data['itemid'].astype("category")
data['item_id'] = data['itemid'].cat.codes
data['itemid'] = data['itemid'].astype("int")

# event is replaced in place by its category code.
# NOTE(review): pandas assigns codes in sorted category order, so the number does
# not encode interaction strength, and code 0 contributes a zero value to the
# matrix built later — confirm this weighting is intended.
data['event']= data['event'].astype('category')
data['event']= data['event'].cat.codes
data['event']= data['event'].astype('int')

# Strip every character other than letters, digits and underscore from the column
# names and upper-case them. `re` is the third-party `regex` module (see imports).
rename_dict = {old_col: re.sub(r'[^a-zA-Z0-9_]', '', old_col).upper() for old_col in data.columns}
data.rename(columns=rename_dict, inplace=True)

# Persist the cleaned frame, replacing any previous EVENTS_DATA_CLEANED table.
session.write_pandas(data, table_name='EVENTS_DATA_CLEANED', auto_create_table=True, overwrite=True)

In [None]:
# Writes `data` to EVENTS_DATA_CLEANED with overwrite=True; the preprocessing
# cell already ends with this same call, so running both just rewrites the table.
session.write_pandas(data, table_name='EVENTS_DATA_CLEANED', auto_create_table=True, overwrite=True)

In [6]:
# Reference the cleaned table (resolved in the current database/schema) and preview five rows.
snf_cleaned = session.table('EVENTS_DATA_CLEANED')
snf_cleaned.limit(5).to_pandas()

Unnamed: 0,TIMESTAMP,VISITORID,EVENT,ITEMID,TRANSACTIONID,TS_DATE,VISITOR_ID,ITEM_ID
0,1440795810142,922839,2,387334,,2015-08-28,922839,195153
1,1440796866269,332614,2,54939,,2015-08-28,332614,27850
2,1440796528638,823521,2,9198,,2015-08-28,823521,4619
3,1440795514114,35231,2,25284,,2015-08-28,35231,12729
4,1440794798805,135803,2,128129,,2015-08-28,135803,64733


In [9]:
# Pull the whole cleaned table into a local pandas DataFrame for model building.
data = snf_cleaned.to_pandas()

In [11]:
# Lookup tables from the integer codes back to the original ids.
unique_pairs = data[['VISITORID','VISITOR_ID']].drop_duplicates()
user_id_dict = dict(zip(unique_pairs['VISITOR_ID'], unique_pairs['VISITORID']))

unique_pairs = data[['ITEMID','ITEM_ID']].drop_duplicates()
item_id_dict = dict(zip(unique_pairs['ITEM_ID'], unique_pairs['ITEMID']))

# CSR matrices with EVENT as the stored value, indexed (item, user) and (user, item).
# sparse_item_user is not used again in this cell.
sparse_item_user = sparse.csr_matrix((data['EVENT'].astype(float), (data['ITEM_ID'], data['VISITOR_ID'])))
sparse_user_item = sparse.csr_matrix((data['EVENT'].astype(float), (data['VISITOR_ID'], data['ITEM_ID'])))

#Building the model
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)

# Scale every stored value by a constant factor before fitting.
alpha_val = 40
data_conf = (sparse_user_item * alpha_val).astype('double')

# Fit on the scaled user-item matrix (rows are VISITOR_ID codes).
model.fit(data_conf)

  0%|          | 0/20 [00:00<?, ?it/s]

In [12]:
# Show the VISITORID values of the first 20 rows of `data`.
data['VISITORID'][0:20].to_list()

[591869,
 684514,
 157772,
 395323,
 1265471,
 139953,
 961247,
 1359334,
 822060,
 1284429,
 867447,
 366396,
 348645,
 102941,
 95405,
 504226,
 1320204,
 248679,
 929346,
 11576]

In [13]:
#Get Recommendations for the users in the first 20 rows
# sparse_user_item (and therefore the fitted model) is indexed by the VISITOR_ID
# category codes, so the user ids and the row selection must use VISITOR_ID.
# The original used the raw VISITORID values, which only index the right rows
# when the codes happen to equal the raw ids.
users = data['VISITOR_ID'][0:20].to_list()
recommended = model.recommend(users, sparse_user_item[users], N=5)
# First element of the result: one row of five ITEM_ID codes per requested user.
recommended[0]

[[ 41790  67074 196786  81517 107226]
 [176924 196343  52380  97541  72728]
 [ 41633 156023 129504 225872 112671]
 [144481 178472 139958 157515 196675]
 [156023 146279  33030   8009  69916]
 [224330 161235  18735 220854  61945]
 [  2719 169261 186716  46393  42604]
 [229223 117958 116537 146632  60097]
 [ 78785  76432 113669 105748  83839]
 [225872  63517  49024  29708 171695]
 [ 14707 105748 120647  14652  33030]
 [ 71944   8009 127790 159546 156023]
 [127790 223822  83374 156023 171135]
 [ 76432 176924 113669  77598  78785]
 [105748 214446 159407 186150  14652]
 [  2719  21276 126959  16696  98124]
 [  2719  49200  56952 222448 112216]
 [  2719 173948  94720 193605   2004]
 [  8009  76432 160324 137993 150733]
 [ 94720 224530  15075 176924  38313]]


In [17]:
# Turn each row of recommended[0] into a plain list keyed "row_<position>".
recommended_dict = {f"row_{i}": row.tolist() for i, row in enumerate(recommended[0])}
recommended_dict

{'row_0': [41790, 67074, 196786, 81517, 107226],
 'row_1': [176924, 196343, 52380, 97541, 72728],
 'row_2': [41633, 156023, 129504, 225872, 112671],
 'row_3': [144481, 178472, 139958, 157515, 196675],
 'row_4': [156023, 146279, 33030, 8009, 69916],
 'row_5': [224330, 161235, 18735, 220854, 61945],
 'row_6': [2719, 169261, 186716, 46393, 42604],
 'row_7': [229223, 117958, 116537, 146632, 60097],
 'row_8': [78785, 76432, 113669, 105748, 83839],
 'row_9': [225872, 63517, 49024, 29708, 171695],
 'row_10': [14707, 105748, 120647, 14652, 33030],
 'row_11': [71944, 8009, 127790, 159546, 156023],
 'row_12': [127790, 223822, 83374, 156023, 171135],
 'row_13': [76432, 176924, 113669, 77598, 78785],
 'row_14': [105748, 214446, 159407, 186150, 14652],
 'row_15': [2719, 21276, 126959, 16696, 98124],
 'row_16': [2719, 49200, 56952, 222448, 112216],
 'row_17': [2719, 173948, 94720, 193605, 2004],
 'row_18': [8009, 76432, 160324, 137993, 150733],
 'row_19': [94720, 224530, 15075, 176924, 38313]}

In [None]:
# Get Recommendations
# user_id is a row index of sparse_user_item (a VISITOR_ID code), not a raw VISITORID.
user_id = 2
reco = model.recommend(user_id, sparse_user_item[user_id], N=5)
print(reco)

# Testing Model Save and Load

In [None]:
import joblib
import cachetools

In [None]:
import json
from scipy.sparse import save_npz
from scipy.sparse import load_npz

# Serialize user_id_dict
# NOTE: JSON stores object keys as strings, so the integer codes used as keys
# here are read back as str by json.load.
with open('user_id_dict.json', 'w') as file:
    json.dump(user_id_dict, file)

# Serialize item_id_dict
with open('item_id_dict.json', 'w') as file:
    json.dump(item_id_dict, file)

# Serialize sparse_user_item
save_npz('sparse_user_item.npz', sparse_user_item)

# Save model file
FILE_LOCATION = 'als_model.joblib'
joblib.dump(model, FILE_LOCATION)

In [None]:
# Reload the artifacts from the current directory to check the save/load round trip.
model2 = joblib.load('als_model.joblib')

# JSON object keys are always strings, so the keys are converted back to the
# integer codes that user_id_dict / item_id_dict are keyed by in memory.
# Load user_id_dict
with open('user_id_dict.json', 'r') as file:
    user_id_dict2 = {int(code): raw_id for code, raw_id in json.load(file).items()}

# Load item_id_dict
with open('item_id_dict.json', 'r') as file:
    item_id_dict2 = {int(code): raw_id for code, raw_id in json.load(file).items()}

sparse_user_item2 = load_npz('sparse_user_item.npz')

In [None]:
# Get Recommendations
# Same query as before, but against the reloaded model and matrix.
user_id = 2
reco = model2.recommend(user_id, sparse_user_item2[user_id], N=5)
print(reco)

In [None]:
# Display the first element of the recommend() result as a pandas Series.
pd.Series(reco[0])

# Option 1: SPROC Based Orchestration in Snowflake

Here we take the pieces of code written above for local testing and package them as modular functions. We may also choose to
schedule the preprocessing pipeline as a predecessor of model training and inference.

In [None]:
from snowflake.core import Root
from snowflake.core.task import StoredProcedureCall, Task
from snowflake.core.task.dagv1 import DAGOperation, DAG, DAGTask
# Entry point of the snowflake.core API, bound to the existing Snowpark session.
api_root = Root(session)

In [None]:
# Task 1: Preprocess Data
def preprocess_data(session:Session) -> str:
    """Clean the raw events table and store it as EVENTS_DATA_CLEANED.

    Adds a TS_DATE column derived from "timestamp", sorts by it, pulls the
    result into pandas, adds integer category codes for visitors and items,
    replaces ``event`` with its category code, and overwrites the
    EVENTS_DATA_CLEANED table with the result.

    Args:
        session: Active Snowpark session.

    Returns:
        The fixed status string 'DATA PROCESSING SUCCESS'.
    """
    datapath = 'RECOMMENDER_SYSTEMS.COLLABORATIVE_FILTERING_ALS.EVENTS_DATA'
    event_date = F.to_date(F.to_timestamp(F.col('"timestamp"')/F.lit(1000)))
    ordered = (
        session.table(datapath)
        .with_column("TS_DATE", event_date)
        .sort(F.col('"TS_DATE"').asc())
    )

    data = ordered.to_pandas()

    # For each id column: add "<name>_id" with the category code, keep the raw id as int.
    for raw_col, code_col in (('visitorid', 'visitor_id'), ('itemid', 'item_id')):
        as_category = data[raw_col].astype("category")
        data[code_col] = as_category.cat.codes
        data[raw_col] = as_category.astype("int")

    # The event column is overwritten by its category code.
    data['event'] = data['event'].astype('category').cat.codes.astype('int')

    session.write_pandas(data, table_name='EVENTS_DATA_CLEANED', auto_create_table=True, overwrite=True)

    return 'DATA PROCESSING SUCCESS'

In [None]:
# Task 2: Model Train + Inference
def train_model(session:Session, sparse_user_item):
    """Fit an implicit ALS model on a user-item matrix and return the fitted model.

    Args:
        session: Snowpark session (not used by this function).
        sparse_user_item: Sparse matrix with users as rows and items as columns.

    Returns:
        The fitted ``AlternatingLeastSquares`` model.
    """
    import implicit
    import scipy.sparse as sparse

    # Every stored value is multiplied by this constant before fitting.
    scale = 40
    weighted = (sparse_user_item * scale).astype('double')

    als = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)
    als.fit(weighted)
    return als

# Clean output
def process_output(session:Session, reco, user_id_dict, item_id_dict):
    """Turn raw recommendations into a table keyed by the original ids.

    Args:
        session: Snowpark session (not used by this function).
        reco: Result of ``model.recommend``; only ``reco[0]`` is read, a 2-D
            array with one row of item codes per user.
        user_id_dict: Maps user code -> original visitor id. Its iteration
            order must match the row order of ``reco[0]``, because the two
            are joined by position.
        item_id_dict: Maps item code -> original item id.

    Returns:
        DataFrame with columns ``visitor_id``, ``visitorid``, ``rec1`` ...
        ``recN`` (original item ids) and ``PRED_TIMESTAMP``.
    """
    rec_df = pd.DataFrame(reco[0])
    # Name the columns after however many recommendations were requested;
    # the original hard-coded five names and failed for any other N.
    rec_df.columns = [f'rec{rank}' for rank in range(1, rec_df.shape[1] + 1)]

    user_df = pd.DataFrame(data=list(user_id_dict.items()), columns=['visitor_id', 'visitorid'])
    # Positional join: both frames carry the default 0..n-1 index.
    joined_df = user_df.join(rec_df)

    # Translate item codes back to the original item ids.
    for rec_col in rec_df.columns:
        joined_df[rec_col] = joined_df[rec_col].map(item_id_dict)

    joined_df['PRED_TIMESTAMP'] = str(pd.Timestamp.now())
    return joined_df

# Run the Process
def get_predictions(session:Session) -> str:
    """Train on EVENTS_DATA_CLEANED and store top-5 recommendations for every user.

    Reads the cleaned events table, builds the code -> original-id lookups and
    the user-item matrix, trains a model via ``train_model``, asks it for five
    items per user, maps the codes back with ``process_output`` and overwrites
    the ITEM_RECOMMENDATIONS table.

    Args:
        session: Active Snowpark session.

    Returns:
        The fixed status string 'Success'.
    """
    import pandas as pd
    import implicit
    import scipy.sparse as sparse
    from datetime import datetime
    import snowflake.snowpark.functions as F

    events = session.table('RECOMMENDER_SYSTEMS.COLLABORATIVE_FILTERING_ALS.EVENTS_DATA_CLEANED').to_pandas()

    def code_lookup(raw_col, code_col):
        # Build {category code: original id} from the distinct (id, code) pairs.
        pairs = events[[raw_col, code_col]].drop_duplicates()
        return dict(zip(pairs[code_col], pairs[raw_col]))

    user_id_dict = code_lookup('visitorid', 'visitor_id')
    item_id_dict = code_lookup('itemid', 'item_id')

    # Users are rows, items are columns, the event code is the stored value.
    values = events['event'].astype(float)
    sparse_user_item = sparse.csr_matrix((values, (events['visitor_id'], events['item_id'])))
    model = train_model(session, sparse_user_item)

    user_codes = list(user_id_dict.keys())
    recommended = model.recommend(user_codes, sparse_user_item[user_codes], N=5)

    # Map users and items back to original codes
    cleaned_df = process_output(session, recommended, user_id_dict, item_id_dict)

    # Save Data in Snowflake
    session.write_pandas(cleaned_df, table_name='ITEM_RECOMMENDATIONS', auto_create_table=True, overwrite=True)

    return 'Success'

In [None]:
# Handle to the task collection of the demo schema; tasks are created through it below.
schema = api_root.databases['RECOMMENDER_SYSTEMS'].schemas['COLLABORATIVE_FILTERING_ALS']
tasks = schema.tasks

In [None]:
# Create Task 1: Preprocess Task
# Runs preprocess_data as a stored procedure on a one-day schedule, using the
# warehouse named in the connection parameters.
task1_entity = Task(
    "PREPROCESS_DATA",
    definition = StoredProcedureCall(preprocess_data,
                                   stage_location="@ML_MODELS",
                                   packages=["snowflake-snowpark-python","snowflake-ml-python", "regex"]),
    warehouse = connection_parameters['warehouse'],
    schedule = timedelta(days=1))

# mode="orReplace": presumably replaces an existing task of the same name — confirm in the snowflake.core docs.
task1 = tasks.create(task1_entity, mode="orReplace")

In [None]:
# Create Task 2: for model training and inference
# Runs get_predictions as a stored procedure. It has no schedule of its own;
# it is chained after PREPROCESS_DATA through `predecessors` below.
task2_entity = Task(
    "RECO_ENGINE",
    definition = StoredProcedureCall(get_predictions, stage_location="@ML_MODELS", 
                                     packages=["snowflake-snowpark-python","snowflake-ml-python",
                                               "regex", "scipy", "implicit==0.6.2", "numpy==1.23.5"]),
    warehouse = connection_parameters['warehouse']
    )

# The predecessor is given by its fully qualified task name.
task2_entity.predecessors = ["RECOMMENDER_SYSTEMS.COLLABORATIVE_FILTERING_ALS.PREPROCESS_DATA"]
task2 = tasks.create(task2_entity, mode="orReplace")

In [None]:
# The dependent task (task2) is resumed before the scheduled task (task1).
# NOTE(review): Snowflake expects child tasks to be resumed before their root — confirm in the task graph docs.
task2.resume()
task1.resume()

In [None]:
# Trigger task1 now instead of waiting for its daily schedule.
task1.execute()

# Option 2: Distributed Modeling using UDTF

### Model Training using SPROC

In [None]:
# Model Training SPROC
def model_train(session:Session, table_name: str) -> str:
    """Train the ALS model on ``table_name`` and upload the artifacts to the stage.

    Reads the table into pandas, builds the code -> original-id dictionaries
    and the user-item matrix, fits the model, then writes four files under
    ``/tmp`` and uploads each one to ``@ML_MODELS/TRAIN_OUTPUT``:
    ``user_id_dict.pkl``, ``item_id_dict.pkl``, ``sparse_user_item.npz`` and
    ``als_model.joblib``.

    Args:
        session: Active Snowpark session.
        table_name: Table providing the VISITORID, VISITOR_ID, ITEMID, ITEM_ID
            and EVENT columns.

    Returns:
        The fixed status string 'Success'.
    """
    import joblib
    import implicit
    import scipy.sparse as sparse
    from scipy.sparse import save_npz

    stage_dir = '@ML_MODELS/TRAIN_OUTPUT'

    def upload(local_path):
        # Only called after the writer has returned, so the file is closed and
        # fully flushed before it is read for upload. The original uploaded the
        # dictionaries from inside the still-open `with open(...)` block.
        session.file.put(local_path, stage_dir, auto_compress=False, overwrite=True)

    data = session.table(table_name).to_pandas()

    # Map cat codes to visitorid
    unique_pairs = data[['VISITORID','VISITOR_ID']].drop_duplicates()
    user_id_dict = dict(zip(unique_pairs['VISITOR_ID'], unique_pairs['VISITORID']))

    # Map cat codes to itemid
    unique_pairs = data[['ITEMID','ITEM_ID']].drop_duplicates()
    item_id_dict = dict(zip(unique_pairs['ITEM_ID'], unique_pairs['ITEMID']))

    # Users are rows, items are columns, EVENT is the stored value.
    sparse_user_item = sparse.csr_matrix((data['EVENT'].astype(float), (data['VISITOR_ID'], data['ITEM_ID'])))

    # Model Training
    model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)
    alpha_val = 40
    data_conf = (sparse_user_item * alpha_val).astype('double')
    model.fit(data_conf)

    # Serialize user_id_dict
    joblib.dump(user_id_dict, '/tmp/user_id_dict.pkl')
    upload('/tmp/user_id_dict.pkl')

    # Serialize item_id_dict
    joblib.dump(item_id_dict, '/tmp/item_id_dict.pkl')
    upload('/tmp/item_id_dict.pkl')

    # Serialize sparse_user_item
    save_npz('/tmp/sparse_user_item.npz', sparse_user_item)
    upload('/tmp/sparse_user_item.npz')

    # Save model file
    FILE_LOCATION = '/tmp/als_model.joblib'
    joblib.dump(model, FILE_LOCATION)
    upload(FILE_LOCATION)

    return 'Success'

In [None]:
# Registering the function as a Stored Procedure
# Permanent procedure named ALS_MODEL_TRAIN; replace=True overwrites an earlier
# registration. The package list includes the pinned implicit and numpy versions.
sproc_model_train = session.sproc.register(func=model_train,
                                           name='ALS_MODEL_TRAIN',
                                           is_permanent=True,
                                           replace=True,
                                           stage_location='@ML_MODELS',
                                           packages=["snowflake-snowpark-python","snowflake-ml-python", "joblib",
                                                     "regex", "scipy", "implicit==0.6.2", "numpy==1.23.5"])

In [None]:
# Run the training procedure against the cleaned events table.
table_name = 'RECOMMENDER_SYSTEMS.COLLABORATIVE_FILTERING_ALS.EVENTS_DATA_CLEANED'
sproc_model_train(session, table_name)

### Model Inference using SPROC

In [None]:
# Model Inference SPROC
def model_predict(session:Session) -> str:
    """Load the trained artifacts from the stage and score every known user.

    Downloads ``als_model.joblib``, ``sparse_user_item.npz`` and
    ``user_id_dict.pkl`` from ``@ML_MODELS/TRAIN_OUTPUT`` into ``/tmp/``,
    loads them, and requests five recommendations for every key of the user
    dictionary.

    Args:
        session: Active Snowpark session.

    Returns:
        The number of users scored, as a string (the declared return type is
        ``str``; the original returned a bare ``int``).
    """
    import joblib
    import implicit  # the model's class must be importable for joblib to unpickle it
    from scipy.sparse import load_npz

    for artifact in ('als_model.joblib', 'sparse_user_item.npz', 'user_id_dict.pkl'):
        session.file.get(f"@ML_MODELS/TRAIN_OUTPUT/{artifact}", "/tmp/")

    # Load Model
    model = joblib.load("/tmp/als_model.joblib")

    # Load the Sparse input file
    sparse_user_item = load_npz("/tmp/sparse_user_item.npz")

    user_id_dict = joblib.load("/tmp/user_id_dict.pkl")

    # The dictionary keys are the user codes, i.e. the row indices of the matrix.
    user_codes = list(user_id_dict.keys())
    recommended = model.recommend(user_codes, sparse_user_item[user_codes], N=5)

    return str(len(recommended[0]))

In [None]:
# Registering the function as a Stored Procedure
# Permanent procedure named ALS_PREDICT. The `imports` argument is left commented
# out: model_predict downloads its artifacts itself with session.file.get.
sproc_predict = session.sproc.register(func=model_predict,
                                       name='ALS_PREDICT',
                                       is_permanent=True,
                                       replace=True,
                                       stage_location='@ML_MODELS',
                                    #    imports=['@ML_MODELS/TRAIN_OUTPUT/als_model.joblib',
                                    #             '@ML_MODELS/TRAIN_OUTPUT/sparse_user_item.npz',
                                    #             '@ML_MODELS/TRAIN_OUTPUT/user_id_dict.json'],
                                       packages=["snowflake-snowpark-python","snowflake-ml-python", "joblib",
                                                 "regex", "scipy", "implicit==0.6.2", "numpy==1.23.5"])

In [None]:
# Run the inference procedure once as a smoke test.
sproc_predict()

### UDF for Inference

In [21]:
# Input columns of the scoring UDF, in the order they are passed to it;
# udf_als_score assigns these names to its input frame.
feature_cols = ['VISITORID','VISITOR_ID']

In [None]:
# Define a simple scoring function
from cachetools import cached

@cached(cache={})
def load_from_stage(import_dir) -> object:
    """Load the trained model and the user-item matrix from ``import_dir``.

    The result is memoized per ``import_dir`` by the ``cached`` decorator, so
    repeated calls with the same directory do not read the files again.

    Args:
        import_dir: Directory prefix; file names are appended to it directly,
            so it must already end with a path separator.

    Returns:
        A ``(model, sparse_user_item)`` tuple.
    """
    import joblib
    from scipy.sparse import load_npz

    model_path = import_dir + 'als_model.joblib'
    matrix_path = import_dir + 'sparse_user_item.npz'

    fitted_model = joblib.load(model_path)
    user_item_matrix = load_npz(matrix_path)
    return fitted_model, user_item_matrix

def udf_als_score(df: pd.DataFrame) -> pd.Series:
    """Return five recommended item codes per input row as a comma-separated string.

    Args:
        df: Two-column frame; the columns are renamed to ``feature_cols``
            (``VISITORID``, ``VISITOR_ID``) by position.

    Returns:
        Series with one ``"a, b, c, d, e"`` string of item codes per input row.
        The values are the model's item codes, not original item ids.
    """
    import sys
    import implicit  # the model's class must be importable for joblib to unpickle it

    # file-dependencies of UDFs are available in snowflake_import_directory
    IMPORT_DIRECTORY_NAME = "snowflake_import_directory"
    import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME]

    model, sparse_user_item = load_from_stage(import_dir)
    df.columns = feature_cols

    # The matrix rows are VISITOR_ID category codes, so index by VISITOR_ID.
    # The original indexed by the raw VISITORID, which is only correct when the
    # two columns happen to hold the same values.
    user_codes = df['VISITOR_ID'].to_list()
    recommended = model.recommend(user_codes, sparse_user_item[user_codes], N=5)

    # Plain comprehension instead of np.apply_along_axis: that function sizes its
    # output from the first result, which truncates longer strings in later rows.
    return pd.Series([', '.join(map(str, row)) for row in recommended[0]])

In [31]:
# Define a simple scoring function
from cachetools import cached

@cached(cache={})
def load_from_stage(import_dir) -> object:
    """Load the trained model and the user-item matrix from ``import_dir``.

    The result is memoized per ``import_dir`` by the ``cached`` decorator, so
    repeated calls with the same directory do not read the files again.

    Args:
        import_dir: Directory prefix; file names are appended to it directly,
            so it must already end with a path separator.

    Returns:
        A ``(model, sparse_user_item)`` tuple.
    """
    import joblib
    from scipy.sparse import load_npz

    model_path = import_dir + 'als_model.joblib'
    matrix_path = import_dir + 'sparse_user_item.npz'

    fitted_model = joblib.load(model_path)
    user_item_matrix = load_npz(matrix_path)
    return fitted_model, user_item_matrix

def udf_als_score(df: T.PandasDataFrame[int, int]) -> T.PandasSeries[dict]:
    """Return five recommended item codes per input row, wrapped in a dict.

    Args:
        df: Two-column frame; the columns are renamed to ``feature_cols``
            (``VISITORID``, ``VISITOR_ID``) by position.

    Returns:
        Series with one ``{"recommendations": [...]}`` dict per input row.
        The values are the model's item codes, not original item ids.
    """
    import sys
    import implicit  # the model's class must be importable for joblib to unpickle it

    # file-dependencies of UDFs are available in snowflake_import_directory
    IMPORT_DIRECTORY_NAME = "snowflake_import_directory"
    import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME]

    model, sparse_user_item = load_from_stage(import_dir)
    df.columns = feature_cols

    # The matrix rows are VISITOR_ID category codes, so index by VISITOR_ID.
    # The original indexed by the raw VISITORID, which is only correct when the
    # two columns happen to hold the same values.
    user_codes = df['VISITOR_ID'].to_list()
    recommended = model.recommend(user_codes, sparse_user_item[user_codes], N=5)

    # Processing for output
    recommended_series = pd.Series([{"recommendations": row.tolist()} for row in recommended[0]])

    return recommended_series

In [54]:
# Make SSK_RESEARCH the active warehouse for the statements that follow.
session.sql('USE WAREHOUSE SSK_RESEARCH').collect()

[Row(status='Statement executed successfully.')]

In [32]:
# Register UDF
# Permanent UDF named ALS_COLAB_FILTERING. The model and matrix files are attached
# as imports, which is where load_from_stage reads them from at run time.
udf_als = session.udf.register(func=udf_als_score, 
                               name="ALS_COLAB_FILTERING", 
                               stage_location='@ML_MODELS',
                               replace=True,
                               is_permanent=True, 
                               imports=['@ML_MODELS/TRAIN_OUTPUT/als_model.joblib',
                                        '@ML_MODELS/TRAIN_OUTPUT/sparse_user_item.npz'],
                               packages=["snowflake-snowpark-python","snowflake-ml-python", "joblib",
                                         "regex", "scipy", "implicit==0.6.2", "numpy==1.23.5"], 
                               session=session)

The version of package 'joblib' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'joblib'. Your UDF might not work when the package version is different between the server and your local environment.


In [56]:
# Score the cleaned events table with the registered UDF.
snowdf_test = session.table('RECOMMENDER_SYSTEMS.COLLABORATIVE_FILTERING_ALS.EVENTS_DATA_CLEANED')
feature_cols = ['VISITORID','VISITOR_ID']

# The UDF is applied to every row of the events table, so a visitor with many
# events is scored once per event.
test_sdf_w_preds = snowdf_test.with_column('PREDICTED',
                                           F.call_udf("ALS_COLAB_FILTERING", [F.col(c) for c in feature_cols]))

# Replace RECOMMENDATIONS_OUTPUT with the scored rows.
test_sdf_w_preds.write.mode("overwrite").save_as_table("RECOMMENDATIONS_OUTPUT")

In [48]:
# Materialize the scored frame locally for inspection.
local_df = test_sdf_w_preds.to_pandas()
local_df

Unnamed: 0,TIMESTAMP,VISITORID,EVENT,ITEMID,TRANSACTIONID,TS_DATE,VISITOR_ID,ITEM_ID,PREDICTED
0,1440797505309,638482,2,156587,,2015-08-28,638482,78989,"{\n ""recommendations"": [\n 172025,\n 19..."
1,1440792487754,1368001,2,110243,,2015-08-28,1368001,55677,"{\n ""recommendations"": [\n 113669,\n 76..."
2,1440792476356,671482,2,461177,,2015-08-28,671482,232237,"{\n ""recommendations"": [\n 38313,\n 148..."
3,1440793449724,973751,2,109948,,2015-08-28,973751,55526,"{\n ""recommendations"": [\n 159407,\n 21..."
4,1440792901632,1124131,2,370444,,2015-08-28,1124131,186625,"{\n ""recommendations"": [\n 212797,\n 15..."
...,...,...,...,...,...,...,...,...,...
2756096,1440797997292,1261748,2,367956,,2015-08-28,1261748,185411,"{\n ""recommendations"": [\n 110536,\n 19..."
2756097,1440795591479,569539,2,320130,,2015-08-28,569539,161235,"{\n ""recommendations"": [\n 14652,\n 676..."
2756098,1440797638044,24899,2,209855,,2015-08-28,24899,105681,"{\n ""recommendations"": [\n 150148,\n 85..."
2756099,1440795039186,342436,2,354745,,2015-08-28,342436,178753,"{\n ""recommendations"": [\n 118428,\n 58..."


### Orchestration Using Tasks

In [49]:
# Task: retrain the model, then score with the UDF
def train_and_predict(session:Session) -> str:
    """Retrain the ALS model and rewrite RECOMMENDATIONS_OUTPUT.

    Calls the ``sproc_model_train`` stored procedure on the cleaned events
    table, then applies the ``ALS_COLAB_FILTERING`` UDF to that same table and
    overwrites ``RECOMMENDATIONS_OUTPUT`` with the result.

    Args:
        session: Active Snowpark session.

    Returns:
        The fixed status string 'Recommendation Model Success'.
    """
    import snowflake.snowpark.functions as F

    # Call SPROC
    table_name = 'RECOMMENDER_SYSTEMS.COLLABORATIVE_FILTERING_ALS.EVENTS_DATA_CLEANED'
    _ = sproc_model_train(session, table_name)

    # Prediction using UDF
    # Pass the variable: the original passed the literal string 'table_name',
    # which refers to a table called TABLE_NAME rather than the events table.
    snowdf_test = session.table(table_name)
    feature_cols = ['VISITORID','VISITOR_ID']
    test_sdf_w_preds = snowdf_test.with_column('PREDICTED',
                                               F.call_udf("ALS_COLAB_FILTERING", [F.col(c) for c in feature_cols]))

    test_sdf_w_preds.write.mode("overwrite").save_as_table("RECOMMENDATIONS_OUTPUT")

    return 'Recommendation Model Success'

In [50]:
# Rebuild the snowflake.core handles so this section can run on its own.
api_root = Root(session)
schema = api_root.databases['RECOMMENDER_SYSTEMS'].schemas['COLLABORATIVE_FILTERING_ALS']
tasks = schema.tasks

In [51]:
# Create the TRAIN_AND_PREDICT task
# Runs train_and_predict as a stored procedure on a one-day schedule, using the
# warehouse named in the connection parameters. Note that this rebinds the
# task1_entity / task1 names used in Option 1.
task1_entity = Task(
    "TRAIN_AND_PREDICT",
    definition = StoredProcedureCall(train_and_predict,
                                   stage_location="@ML_MODELS",
                                   packages=["snowflake-snowpark-python","snowflake-ml-python", "regex"]),
    warehouse = connection_parameters['warehouse'],
    schedule = timedelta(days=1))

task1 = tasks.create(task1_entity, mode="orReplace")

In [52]:
# Enable the schedule, then trigger one run immediately.
task1.resume()
task1.execute()