## Steps:
- Import libraries
- Setup Snowflake objects
- Load data to Snowflake (you may skip this step if you already have data in Snowflake)
- Write code to run Implicit library locally on your machine
- Package code to and make it clean
- Create a Task and schedule it

# Imports

In [1]:
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
from snowflake.snowpark.functions import col

import json

import os

import numpy as np
import pandas as pd

import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random
from sklearn.preprocessing import MinMaxScaler
import implicit 
from datetime import datetime, timedelta

import warnings
warnings.filterwarnings("ignore")

In [2]:
connection_parameters = json.load(open('/Users/skhara/Documents/GitHub/creds.json'))
session = Session.builder.configs(connection_parameters).create()

In [4]:
session.sql('CREATE DATABASE IF NOT EXISTS RECOMMENDER_SYSTEMS').collect()
session.sql('CREATE SCHEMA IF NOT EXISTS RECOMMENDER_SYSTEMS.COLLABORATIVE_FILTERING_ALS').collect()

session.sql('USE DATABASE RECOMMENDER_SYSTEMS').collect()
session.sql('USE SCHEMA COLLABORATIVE_FILTERING_ALS').collect()
session.sql('CREATE STAGE IF NOT EXISTS ML_MODELS;').collect()

[Row(status='ML_MODELS already exists, statement succeeded.')]

# Load Data to Snowflake
This is done in case your data is not already in a Snowflake table. If it is then you can skip this step.

In [6]:
# Loading from local CSV-files
events_data = pd.read_csv('data/events.csv')
events_data.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [7]:
session.write_pandas(events_data, table_name='EVENTS_DATA', auto_create_table=True, overwrite=True)

<snowflake.snowpark.table.Table at 0x2a3611910>

# Step 1: Testing Locally

In [39]:
# Data PreProcessing
def create_data(datapath,start_date,end_date):
    df= session.table(datapath).to_pandas()
    df= df.assign(date=pd.Series(datetime.fromtimestamp(a/1000).date() for a in df.timestamp))
    df= df.sort_values(by='date').reset_index(drop=True) # for some reasons RetailRocket did NOT sort data by date
    df= df[(df.date>=datetime.strptime(start_date,'%Y-%m-%d').date())&(df.date<=datetime.strptime(end_date,'%Y-%m-%d').date())]
    df= df[['visitorid','itemid','event']]
    return df

In [40]:
datapath= 'RECOMMENDER_SYSTEMS.COLLABORATIVE_FILTERING_ALS.EVENTS_DATA'
data_raw= create_data(datapath,'2015-5-3','2015-5-18')
data_raw.head()

Unnamed: 0,visitorid,itemid,event
1022,711599,122296,view
1023,1058545,203248,view
1024,1175565,67580,view
1025,517160,203248,view
1026,908008,258812,view


In [41]:
data = data_raw.copy(deep=True)
data['visitorid'] = data['visitorid'].astype("category")
data['visitor_id'] = data['visitorid'].cat.codes
data['visitorid'] = data['visitorid'].astype("int")

data['itemid'] = data['itemid'].astype("category")
data['item_id'] = data['itemid'].cat.codes
data['itemid'] = data['itemid'].astype("int")

data['event']= data['event'].astype('category')
data['event']= data['event'].cat.codes

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 328185 entries, 1022 to 329206
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   visitorid   328185 non-null  int64
 1   itemid      328185 non-null  int64
 2   event       328185 non-null  int8 
 3   visitor_id  328185 non-null  int32
 4   item_id     328185 non-null  int32
dtypes: int32(2), int64(2), int8(1)
memory usage: 10.3 MB


In [42]:
unique_pairs = data[['visitorid','visitor_id']].drop_duplicates()
user_id_dict = dict(zip(unique_pairs['visitor_id'], unique_pairs['visitorid']))

unique_pairs = data[['itemid','item_id']].drop_duplicates()
item_id_dict = dict(zip(unique_pairs['item_id'], unique_pairs['itemid']))

In [43]:
sparse_item_user = sparse.csr_matrix((data['event'].astype(float), (data['item_id'], data['visitor_id'])))
sparse_user_item = sparse.csr_matrix((data['event'].astype(float), (data['visitor_id'], data['item_id'])))

#Building the model
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)

alpha_val = 40
data_conf = (sparse_user_item * alpha_val).astype('double')

model.fit(data_conf)

  0%|          | 0/20 [00:00<?, ?it/s]

In [44]:
###USING THE MODEL
#Get Recommendations
user_id = 2
reco = model.recommend(user_id, sparse_user_item[user_id], N=5)
print(reco)

(array([56574,  1394, 82130, 53243, 48720], dtype=int32), array([1.3219736e-12, 1.1100892e-12, 1.0795274e-12, 8.3879323e-13,
       7.3194391e-13], dtype=float32))


In [45]:
#Get Recommendations for all users
recommended = model.recommend(list(user_id_dict.keys()), sparse_user_item[list(user_id_dict.keys())], N=5)
print(recommended[0][0:10])

[[74487 21116 46614 56304 54046]
 [55597  1394 70231 12300 52390]
 [17191 24386 71224 60480 68712]
 [55597  1394 70231 12300 52390]
 [40366 14251 13003 60538 42292]
 [36072 28193 34224 34310 79246]
 [44063 56304 76337 15938 31495]
 [55597  1394 70231 12300 52390]
 [59532 56091 17813 56304 75241]
 [65987 59735 63579 44650 16306]]


# Step 2: Write clean code for Orchestration in Snowflake

Here we take all the pieces of code written above for local testing and package in a modularized format. We may also choose to
schedule the preprocess pipeline as a predecessor to model train and inference.

In [7]:
from snowflake.core import Root
from snowflake.core.task import StoredProcedureCall, Task
from snowflake.core.task.dagv1 import DAGOperation, DAG, DAGTask
api_root = Root(session)

In [34]:
# Load data and process
def preprocess_data(session:Session, table):
    import pandas as pd
    import numpy
    from datetime import datetime

    df= session.table(table).to_pandas()
    df= df.assign(date=pd.Series(datetime.fromtimestamp(a/1000).date() for a in df.timestamp))
    df= df.sort_values(by='date').reset_index(drop=True) # for some reasons RetailRocket did NOT sort data by date
    df= df[(df.date>=datetime.strptime('2015-5-3','%Y-%m-%d').date())&(df.date<=datetime.strptime('2015-5-18','%Y-%m-%d').date())]

    data= df[['visitorid','itemid','event']]

    data['visitorid'] = data['visitorid'].astype("category")
    data['visitor_id'] = data['visitorid'].cat.codes

    data['itemid'] = data['itemid'].astype("category")
    data['item_id'] = data['itemid'].cat.codes

    data['event']= data['event'].astype('category')
    data['event']= data['event'].cat.codes

    return data

# Model Train
def train_model(session:Session, sparse_user_item):
    import implicit
    import scipy.sparse as sparse

    model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)
    alpha_val = 40
    data_conf = (sparse_user_item * alpha_val).astype('double')
    model.fit(data_conf)

    return model

# Clean output
def process_output(session:Session, reco, user_id_dict, item_id_dict):
    rec_df = pd.DataFrame(reco[0], columns=['rec1', 'rec2', 'rec3', 'rec4', 'rec5'])
    user_df = pd.DataFrame(data = list(user_id_dict.items()), columns = ['visitor_id','visitorid'])
    joined_df = user_df.join(rec_df)

    for col in ['rec1', 'rec2', 'rec3', 'rec4', 'rec5']:
        joined_df[col] = joined_df[col].map(item_id_dict)
    
    joined_df['PRED_TIMESTAMP'] = str(pd.Timestamp.now())
    return joined_df

# Run the Process
def get_predictions(session:Session) -> str:
    import pandas as pd
    import numpy
    import implicit
    import scipy.sparse as sparse
    from scipy.sparse.linalg import spsolve
    from datetime import datetime
    import snowflake.snowpark.functions as F

    table = 'RECOMMENDER_SYSTEMS.COLLABORATIVE_FILTERING_ALS.EVENTS_DATA'
    data = preprocess_data(session, table)

    # Map cat codes to visitorid
    unique_pairs = data[['visitorid','visitor_id']].drop_duplicates()
    user_id_dict = dict(zip(unique_pairs['visitor_id'], unique_pairs['visitorid']))

    # Map cat codes to itemid
    unique_pairs = data[['itemid','item_id']].drop_duplicates()
    item_id_dict = dict(zip(unique_pairs['item_id'], unique_pairs['itemid']))

    # sparse_item_user = sparse.csr_matrix((data['event'].astype(float), (data['item_id'], data['visitor_id'])))
    sparse_user_item = sparse.csr_matrix((data['event'].astype(float), (data['visitor_id'], data['item_id'])))

    model = train_model(session, sparse_user_item)
    
    recommended = model.recommend(list(user_id_dict.keys()), sparse_user_item[list(user_id_dict.keys())], N=5)

    cleaned_df = process_output(session, recommended, user_id_dict, item_id_dict)
    
    # Save Data in Snowflake
    session.write_pandas(cleaned_df, table_name='ITEM_RECOMMENDATIONS', auto_create_table=True, overwrite=True)

    return 'Success'

In [35]:
schema = api_root.databases['RECOMMENDER_SYSTEMS'].schemas['COLLABORATIVE_FILTERING_ALS']
tasks = schema.tasks

In [36]:
# Create a Task and Schedule it to run daily
task1_entity = Task(
    "reco_engine",
    definition = StoredProcedureCall(get_predictions, stage_location="@ML_MODELS", 
                                     packages=["snowflake-snowpark-python","snowflake-ml-python",
                                               "regex", "scipy", "implicit==0.6.2", "numpy==1.23.5"]),
    warehouse = connection_parameters['warehouse'],
    schedule = timedelta(days=1))

task1 = tasks.create(task1_entity, mode="orReplace")
task1.resume()

In [37]:
task1.execute()