In [1]:
import snowflake.connector
from snowflake.ml.feature_store import FeatureStore, CreationMode, FeatureView
from snowflake.ml.feature_store import Entity

In [2]:
import os
from dotenv import load_dotenv
import pandas as pd

In [3]:
# Load Snowflake credentials from the local `.env` file into the process
# environment, then read them back. Secrets are never hardcoded in the notebook.
load_dotenv(".env")

user_name = os.getenv('SNOW_USER')
password = os.getenv('SNOW_PASS')
account = os.getenv('SNOW_ACCOUNT')

# Fail fast with a clear message if any credential is missing or empty:
# os.getenv returns None silently, which would otherwise only surface later
# as an opaque connection error. Only variable names are reported, never values.
_missing_env_vars = [
    var_name
    for var_name, var_value in (
        ("SNOW_USER", user_name),
        ("SNOW_PASS", password),
        ("SNOW_ACCOUNT", account),
    )
    if not var_value
]
if _missing_env_vars:
    raise RuntimeError(
        "Missing Snowflake credentials; set these in .env or the environment: "
        + ", ".join(_missing_env_vars)
    )

In [4]:
from snowflake.snowpark import Session

# Connection settings for the Snowpark session: the credentials are the
# values read from the environment, the warehouse/database/schema are fixed.
connection_parameters = dict(
    user=user_name,
    password=password,
    account=account,
    warehouse="COMPUTE_WH",
    database="VISTORA",
    schema="VISTORA_SCHEMA",
)

# Open the session that the feature store and the table reads rely on.
session = (
    Session.builder
    .configs(connection_parameters)
    .create()
)

In [5]:
# Feature store handle bound to the open session. It lives in the
# VISTORA database under the VISTORA_SCHEMA schema and is created on the
# fly if it does not exist yet (CREATE_IF_NOT_EXIST).
fs = FeatureStore(
    session=session,
    creation_mode=CreationMode.CREATE_IF_NOT_EXIST,
    default_warehouse="COMPUTE_WH",
    database="VISTORA",
    name="VISTORA_SCHEMA",
)


In [6]:
# The customer entity: rows are identified (and joined) by CUSTOMER_ID.
entity = Entity(
    name="customer",
    join_keys=["CUSTOMER_ID"],
    desc="Unique customer identifier",
)

# Register the entity with the feature store; as the last expression of the
# cell, the value returned by register_entity is displayed as the output.
fs.register_entity(entity)

  return f(self, *args, **kargs)


Entity(name=CUSTOMER, join_keys=['CUSTOMER_ID'], owner=None, desc=Unique customer identifier)

# Get the data from Snowflake

In [7]:
# Snowpark DataFrame over the table that holds the customer features.
feature_df = session.table("VISTORA_FEATURE_TABLE")

# Feature view definition: every column of the table, keyed by the
# customer entity.
fv = FeatureView(
    name="VISTORA_FEATURE_VIEW",
    entities=[entity],
    feature_df=feature_df,
    desc="Features from VISTORA_FEATURE_TABLE for ML",
)

# Register the view as version "v2"; the returned FeatureView is shown as
# the cell output.
fs.register_feature_view(fv, version="v2")

  return self._get_feature_view_if_exists(feature_view.name, str(version))


FeatureView(_name=VISTORA_FEATURE_VIEW, _entities=[Entity(name=CUSTOMER, join_keys=['CUSTOMER_ID'], owner=None, desc=Unique customer identifier)], _feature_df=<snowflake.snowpark.dataframe.DataFrame object at 0x11fb00400>, _timestamp_col=None, _desc=Features from VISTORA_FEATURE_TABLE for ML, _infer_schema_df=<snowflake.snowpark.dataframe.DataFrame object at 0x11f922a00>, _query=SELECT  *  FROM VISTORA_FEATURE_TABLE, _version=v2, _status=FeatureViewStatus.STATIC, _feature_desc=OrderedDict([('IS_PREFERRED_CUST', ''), ('C_BIRTH_YEAR', ''), ('C_BIRTH_MONTH', ''), ('C_BIRTH_DAY', ''), ('AGE', ''), ('BIRTH_SEASON', ''), ('AGE_GROUP', ''), ('HAS_LOGIN', ''), ('LAST_REVIEW_YEAR', ''), ('DAYS_SINCE_LAST_REVIEW', ''), ('BIRTH_COUNTRY_CODE', '')]), _refresh_freq=None, _database=VISTORA, _schema=VISTORA_SCHEMA, _initialize=ON_CREATE, _warehouse=None, _refresh_mode=None, _refresh_mode_reason=None, _owner=ACCOUNTADMIN, _cluster_by=['CUSTOMER_ID'], _lineage_node_name=VISTORA.VISTORA_SCHEMA.VISTORA_F

`spine_df` acts as the backbone structure that defines which records we want to retrieve features for and serves as the foundation for creating your training dataset.

> Note: one correction from the video — the spine DataFrame contains only the `CUSTOMER_ID` column, not the whole table.

In [None]:
# The spine carries only the join key, CUSTOMER_ID.
spine_df = (
    session
    .table("VISTORA_FEATURE_TABLE")
    .select("CUSTOMER_ID")
)
# Earlier variant, kept for reference: use the whole table minus the label,
#   spine_df = session.table("VISTORA_FEATURE_TABLE").drop("IS_PREFERRED_CUST")
# The `drop` was there because `fs.generate_training_set` would otherwise
# create duplicates.

In [19]:
# Look up the registered "v2" feature view and join its features onto the spine.
registered_fv = fs.get_feature_view("VISTORA_FEATURE_VIEW", version="v2")
training_set = fs.generate_training_set(
    features=[registered_fv],
    spine_df=spine_df,
    # Optional arguments, not used here:
    #   spine_label_cols=["IS_PREFERRED_CUST"]  -> replace with your actual label column
    #   spine_timestamp_col="event_time"        -> only if you have time-based data
)

# Materialise the result as a pandas DataFrame so it can be handed to
# scikit-learn / XGBoost and similar libraries.
df_train = training_set.to_pandas()

# Training the model

In [20]:
# Predictors: everything except the label and the CUSTOMER_ID key column.
X = df_train.drop(["IS_PREFERRED_CUST", "CUSTOMER_ID"], axis="columns")
# Label: the IS_PREFERRED_CUST column.
y = df_train.loc[:, "IS_PREFERRED_CUST"]

In [21]:
# Quick sanity check: predictors and label should have the same row count.
X.shape, y.shape

((937, 10), (937,))

In [22]:
# List the columns of the training frame (key, label, then the features).
df_train.columns

Index(['CUSTOMER_ID', 'IS_PREFERRED_CUST', 'C_BIRTH_YEAR', 'C_BIRTH_MONTH',
       'C_BIRTH_DAY', 'AGE', 'BIRTH_SEASON', 'AGE_GROUP', 'HAS_LOGIN',
       'LAST_REVIEW_YEAR', 'DAYS_SINCE_LAST_REVIEW', 'BIRTH_COUNTRY_CODE'],
      dtype='object')

## Training Model Using scikit-learn (on the Snowflake ML training set)

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [24]:
# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Random forest with 100 trees, seeded for reproducibility.
model = RandomForestClassifier(random_state=42, n_estimators=100)


In [25]:
# Fit the classifier on the training split only; the test split stays unseen.
model.fit(X_train, y_train)

In [26]:
# Mean accuracy on the data the model was fitted on ...
train_score = model.score(X_train, y_train)
print(f"Training accuracy: {train_score:.4f}")

# ... versus the held-out split. A large gap between the two numbers
# indicates overfitting.
test_score = model.score(X_test, y_test)
print(f"Test accuracy: {test_score:.4f}")

Training accuracy: 1.0000
Test accuracy: 0.4574
