In [1]:
import os

from snowflake.snowpark import Session
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Connect to Snowflake / Snowpark Session

In [2]:
# Connecting to your Snowflake account takes only a few lines

def get_token(token_path='/snowflake/session/token'):
    """Return the raw contents of the session token file.

    Parameters
    ----------
    token_path : str, optional
        Location of the token file. Defaults to the path this notebook has
        always read, so existing ``get_token()`` calls behave as before.
        Passing another path allows the helper to be exercised outside the
        container (e.g. in tests).

    Returns
    -------
    str
        The file contents exactly as stored (no whitespace stripping).

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    with open(token_path, 'r') as token_file:
        return token_file.read()

# Settings handed to the Snowpark session builder. Host, port and account are
# read from the environment; the OAuth token is read from the token file via
# get_token(). Role, warehouse, database and schema are fixed for this demo.
connection_params = {
    "host": os.environ["SNOWFLAKE_HOST"],
    "port": os.environ["SNOWFLAKE_PORT"],
    "protocol": "https",
    "account": os.environ["SNOWFLAKE_ACCOUNT"],
    "authenticator": "oauth",
    "token": get_token(),
    "role": "SERVICESNOW_USER_ROLE",
    "warehouse": "MADKINS",
    "database": "TOPIC_MODELING",
    "schema": "PROD",
}

# Build the session in two visible steps: configure, then create.
session_builder = Session.builder.configs(connection_params)
session = session_builder.create()

# Leave the session as the last expression so the notebook shows its repr.
session

<snowflake.snowpark.session.Session at 0x7fe896ff0bb0>

# Read our Data into our Notebook

For topic modeling, we need both the review text and the embeddings that were computed for it.

In [3]:
# Load the full reviews table into pandas.
# The query previously ended with a bare `LIMIT` (no row count), which is not
# valid SQL. The clause is dropped so that every row is returned.
query = '''
    SELECT
        *
    FROM
        MUSIC_STORE_REVIEWS_DEMO_SAMPLE
'''
# Run the query through Snowpark and materialize it as a pandas DataFrame in
# one step, so `reviews` is only ever bound to the pandas frame.
reviews = session.sql(query).toPandas()

# Preview the first rows.
reviews.head()

Unnamed: 0,DATE,PRODUCT,REVIEW,RATING
0,1685577600000000,Harmony G10 Electric Guitar,I am amazed by the sound quality of this guita...,5
1,1685318400000000,Melody M5000 Grand Piano,"The keys on this piano feel a bit stiff, but t...",3
2,1684972800000000,Rhythm R200 Drum Set,"I've been playing drums for years, and this dr...",5
3,1684540800000000,Serenade S700 Acoustic Guitar,I'm not impressed with this guitar. The tone i...,2
4,1684368000000000,Cadence C1500 Keyboard,This keyboard is great for beginners. It has a...,3


In [4]:
# (rows, columns) of the reviews DataFrame loaded above.
reviews.shape

(20010, 4)

In [5]:
# Fetch every row of the stored review-embeddings table.
query = '''
    SELECT
        *
    FROM
        MUSIC_STORE_REVIEW_EMBEDDINGS_DEMO_SAMPLE
'''
# Execute via Snowpark and convert straight to a pandas DataFrame.
embeddings_df = session.sql(query).toPandas()

# (rows, columns) of the embeddings frame.
embeddings_df.shape

(20010, 1)

In [6]:
# Convert the text-encoded embeddings into a 2-D float32 array.
# Each cell of the `embeddings` column is handled as a string: the leading
# "[" / newline / spaces and the trailing "]" are trimmed, remaining newlines
# and double quotes are removed, and the result is split on commas.
raw_embedding_text = embeddings_df.embeddings
trimmed_text = raw_embedding_text.str.lstrip("[\n  ").str.rstrip("]")
flattened_text = trimmed_text.str.replace("\n", "").str.replace("\"", "")
embeddings_df["clean_embeddings"] = flattened_text.str.split(",")

# One row per individual value, cast to float32 ...
exploded_embeddings = embeddings_df.clean_embeddings.explode().astype("float32")

# ... then folded back into one row per original record. The -1 lets the
# width be inferred, which presumes every record has the same number of values.
n_records = len(embeddings_df)
embeddings = exploded_embeddings.values.reshape(n_records, -1)
embeddings.shape

(20010, 768)

In [7]:
# Peek at the first five values of the first two embedding rows.
embeddings[:2, :5]

array([[-0.04069147,  0.05029966,  0.02834741,  0.0434903 , -0.05146772],
       [ 0.02609883,  0.00178319,  0.03685594,  0.02907019, -0.03646225]],
      dtype=float32)

# Analyzing our Reviews

We can use state-of-the-art topic modeling techniques to analyze our reviews.

In [9]:
from bertopic import BERTopic

# `docs` was never defined anywhere in this notebook, so the fit below raised
# NameError on a fresh kernel. The documents are the review texts, taken from
# the REVIEW column of the `reviews` DataFrame.
docs = reviews["REVIEW"].tolist()

# Each document must have exactly one embedding row.
# NOTE(review): this also assumes both tables come back in the same row order;
# neither query uses ORDER BY, so confirm that the alignment holds.
assert len(docs) == len(embeddings), "reviews and embeddings differ in length"

# Default (CPU) BERTopic pipeline. Passing our own embeddings as the second
# argument to fit_transform supplies them directly to the model.
topic_model = BERTopic(
    verbose=True,
    calculate_probabilities=True,
)
topics, probs = topic_model.fit_transform(docs, embeddings)

In [None]:
## TODO: Small visualization

Because we're passing in our already created embeddings, this only took about 30 seconds to process 20,000 reviews.

In the real world, we'd like to do this on hundreds of thousands or even millions of reviews. This can get **very** slow (far too slow for this demo).

Fortunately, we can accelerate the entire topic modeling pipeline on the NVIDIA GPUs and frameworks now available within the Snowflake platform.

Let's use the RAPIDS machine learning library cuML to run the same pipeline.

In [None]:
from bertopic import BERTopic
from cuml.cluster import HDBSCAN
from cuml.feature_extraction.text import CountVectorizer
from cuml.manifold import UMAP

# NOTE(review): this cell does not define `docs` or `embeddings`; both must
# already exist in the kernel when it runs.

# cuML components that are handed to BERTopic below.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0)
hdbscan_model = HDBSCAN(
    min_cluster_size=50,
    min_samples=10,
    prediction_data=True,
    gen_min_span_tree=True,
)
# NOTE(review): this is cuML's CountVectorizer, not scikit-learn's. Confirm
# that BERTopic accepts it as `vectorizer_model` with plain Python documents.
vectorizer_model = CountVectorizer(stop_words="english")

# Same BERTopic settings as before, with the cuML estimators supplied
# explicitly instead of relying on BERTopic's defaults.
topic_model = BERTopic(
    verbose=True,
    calculate_probabilities=True,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
)
topics, probs = topic_model.fit_transform(docs, embeddings)

**Way faster**.

Snowpark Containers makes it possible to do these new kinds of workflows in the Snowflake platform.