pip install "snowflake-snowpark-python[modin] @ git+https://github.com/snowflakedb/snowpark-python.git@modin-hybrid-client"

In [1]:
import snowflake.snowpark.modin.plugin
import modin.pandas as pd
import numpy as np
import datetime
import pandas as native_pd
from snowflake.snowpark.session import Session; session = Session.builder.create()

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://snowbiz.okta.com/app/snowflake/exk8wfsfryJIn4IWZ2p7/sso/saml?SAMLRequest=jVLRctowEPwVj%2FqMJVwoRANkaGgGMyTQYKDDm7AFqMiSq5Ox6ddXNtBJHpLJm%2Ba0e7t3e737MpXeiRsQWvVR0yfI4yrWiVD7PlpGj40u8sAylTCpFe%2BjMwd0P%2BgBS2VGh7k9qBf%2BJ%2BdgPddIAa0%2B%2Big3imoGAqhiKQdqY7oYPk1p4BPKALixTg5dKQkIp3WwNqMYF0XhF199bfY4IIRgcocdqoJ8Qa8kso81MqOtjrW8UUo30zsSTUxalYRDOIX5lfhdqMsKPlLZXkBAx1E0b8xniwh5w9t0D1pBnnKz4OYkYr58mV4MgHOweJ6tx7Pl4ocPShc7yY481mmWW9fNdy%2B84wmWei%2FcjsJRH2VHkSTffpWzTqbacnXYbseH33eTcLOaRgWZpK2jOE1OrHwq4%2B169TNG3uqWaFAlGgLkPFRVjtaVSNBukFaDBFGzQwmh7bbf6pIN8kYuR6GYrZk3s5XFrfjr66NltTmWZfi%2Fb8zLY7fYwc6cJ6FqhetNkHUwgMZVTOhyKbQ2YAafnb%2BHX7Oux%2Fbs9h%2BO5lqK%2BOw9apMy%2B348Tb9ZV0TS2NVQylMm5DBJDAdwMUmpiwfDmXU3bU3OER5cVN9e9eAf&RelayState=ver%3A1-hint

# Example 1: Start with small dataset perform some processing, then join with large data

In [4]:
# emphemeral dataframe/lookup table 
# List of U.S. federal holidays
us_holidays = [
    ("New Year's Day", "2025-01-01"),
    ("Martin Luther King Jr. Day", "2025-01-20"),
    ("Presidents' Day", "2025-02-17"),
    ("Memorial Day", "2025-05-26"),
    ("Juneteenth National Independence Day", "2025-06-19"),
    ("Independence Day", "2025-07-04"),
    ("Labor Day", "2025-09-01"),
    ("Columbus Day", "2025-10-13"),
    ("Veterans Day", "2025-11-11"),
    ("Thanksgiving Day", "2025-11-27"),
    ("Christmas Day", "2025-12-25")
]

# Create DataFrame
df_us_holidays = pd.DataFrame(us_holidays, columns=["Holiday", "Date"])

# Convert Date column to datetime
df_us_holidays["Date"] = pd.to_datetime(df_us_holidays["Date"])

`to_datetime` implementation may have mismatches with pandas:
Snowflake automatic format detection is used when a format is not provided.In this case Snowflake's auto format may yield different result values compared to pandas.See https://docs.snowflake.com/en/sql-reference/date-time-input-output#supported-formats-for-auto-detection for details.


In [5]:
df_us_holidays.get_backend() # with auto, we should expect this to be local

'Snowflake'

In [6]:
# Add new columns for transformations
df_us_holidays["Day_of_Week"] = df_us_holidays["Date"].dt.day_name()
df_us_holidays["Month"] = df_us_holidays["Date"].dt.month_name()

In [7]:
df_us_holidays

Unnamed: 0,Holiday,Date,Day_of_Week,Month
0,New Year's Day,2025-01-01,Wednesday,January
1,Martin Luther King Jr. Day,2025-01-20,Monday,January
2,Presidents' Day,2025-02-17,Monday,February
3,Memorial Day,2025-05-26,Monday,May
4,Juneteenth National Independence Day,2025-06-19,Thursday,June
5,Independence Day,2025-07-04,Friday,July
6,Labor Day,2025-09-01,Monday,September
7,Columbus Day,2025-10-13,Monday,October
8,Veterans Day,2025-11-11,Tuesday,November
9,Thanksgiving Day,2025-11-27,Thursday,November


In [8]:
%%time
for index, row in df_us_holidays.iterrows():
    print(f"{row['Holiday']} falls on {row['Day_of_Week']}, {row['Month']} {row['Date'].day}, {row['Date'].year}.")

DataFrame.iterrows will result eager evaluation and potential data pulling, which is inefficient. For efficient Snowpark pandas usage, consider rewriting the code with an operator (such as DataFrame.apply or DataFrame.applymap) which can work on the entire DataFrame in one shot.


New Year's Day falls on Wednesday, January 1, 2025.
Martin Luther King Jr. Day falls on Monday, January 20, 2025.
Presidents' Day falls on Monday, February 17, 2025.
Memorial Day falls on Monday, May 26, 2025.
Juneteenth National Independence Day falls on Thursday, June 19, 2025.
Independence Day falls on Friday, July 4, 2025.
Labor Day falls on Monday, September 1, 2025.
Columbus Day falls on Monday, October 13, 2025.
Veterans Day falls on Tuesday, November 11, 2025.
Thanksgiving Day falls on Thursday, November 27, 2025.
Christmas Day falls on Thursday, December 25, 2025.
CPU times: user 12 s, sys: 2.09 s, total: 14.1 s
Wall time: 1min 36s


In [9]:
pandas_df = df_us_holidays.move_to("pandas") # remove this once we have auto-switching

In [10]:
%%time
for index, row in pandas_df.iterrows():
    print(f"{row['Holiday']} falls on {row['Day_of_Week']}, {row['Month']} {row['Date'].day}, {row['Date'].year}.")

New Year's Day falls on Wednesday, January 1, 2025.
Martin Luther King Jr. Day falls on Monday, January 20, 2025.
Presidents' Day falls on Monday, February 17, 2025.
Memorial Day falls on Monday, May 26, 2025.
Juneteenth National Independence Day falls on Thursday, June 19, 2025.
Independence Day falls on Friday, July 4, 2025.
Labor Day falls on Monday, September 1, 2025.
Columbus Day falls on Monday, October 13, 2025.
Veterans Day falls on Tuesday, November 11, 2025.
Thanksgiving Day falls on Thursday, November 27, 2025.
Christmas Day falls on Thursday, December 25, 2025.
CPU times: user 48.4 ms, sys: 4.57 ms, total: 53 ms
Wall time: 57.7 ms


In [None]:
assert df_us_holidays.get_backend() == "pandas" # This should work once auto switching is on

```python
# Generate 10 million transactions (This is also very slow with Snowpark)
import uuid
import pandas
dates = pandas.date_range(start="2025-01-01", end="2025-12-31", freq="D")
num_transactions = 10000000
data = {
    "Transaction_ID": [str(uuid.uuid4()) for _ in range(num_transactions)],
    "Date": np.random.choice(dates, num_transactions),
    "Revenue": np.random.uniform(10, 1000, num_transactions)
}

df_transactions = pd.DataFrame(data)
# This step is extremely slow...
df_transactions.to_snowflake("REVENUE_TRANSACTIONS", if_exists='replace', index=False)
```

### 💡 Automatic switching speeds up loops/iterations on small data

## Example 2: Demonstrate that when data is prefiltered via SQL, the engine choice changes

In [45]:
# Run the following to generate a synthetic dataset with 10M rows of transactions (from 2024-2025)
session.sql('''
CREATE OR REPLACE TABLE revenue_transactions (
    Transaction_ID STRING,
    Date DATE,
    Revenue FLOAT
);''').collect()
session.sql('''SET num_days = (SELECT DATEDIFF(DAY, '2024-01-01', CURRENT_DATE));''').collect()
session.sql('''INSERT INTO revenue_transactions (Transaction_ID, Date, Revenue)
SELECT
    UUID_STRING() AS Transaction_ID,
    DATEADD(DAY, UNIFORM(0, $num_days, RANDOM()), '2024-01-01') AS Date,
    UNIFORM(10, 1000, RANDOM()) AS Revenue
FROM TABLE(GENERATOR(ROWCOUNT => 10000000));
''').collect()

[Row(number of rows inserted=10000000)]

In [46]:
df_transactions = pd.read_snowflake("REVENUE_TRANSACTIONS")

In [47]:
len(df_transactions)

10000000

In [48]:
df_transactions["DATE"] = pd.to_datetime(df_transactions["DATE"])

In [49]:
df_transactions.groupby("DATE").sum()["REVENUE"]

DATE
2024-01-01    11150922.0
2024-01-02    10873142.0
2024-01-03    10939519.0
2024-01-04    11020232.0
2024-01-05    11051115.0
                 ...    
2025-03-29    10994881.0
2025-03-30    10959097.0
2025-03-31    10995883.0
2025-04-01    11098534.0
2025-04-02    11041780.0
Freq: None, Name: REVENUE, Length: 458, dtype: float64

In [50]:
assert df_transactions.get_backend() == "Snowflake"

In [51]:
# Filter to records in last 7 days
df_transactions_filter = pd.read_snowflake("SELECT * FROM revenue_transactions WHERE Date >= DATEADD( 'days', -7, current_date ) and Date < current_date")

In [52]:
len(df_transactions_filter)

152608

In [53]:
assert df_transactions_filter.get_backend() == "pandas" # This should work once auto switching is on

AssertionError: 

In [None]:
df_transactions_filter.groupby("DATE").sum()["REVENUE"]

### 💡 Automatic switching means that pandas work well for both small and large data

# Example 3: 

Forecast using last year's transaction data via a custom apply function

In [40]:
df_transactions["DATE"].dtypes

dtype('<M8[ns]')

In [63]:
start_date = pd.Timestamp("2025-10-01")
end_date = pd.Timestamp("2025-10-31")

df_transactions_filtered = df_transactions[
    (df_transactions["DATE"] >= start_date - pd.Timedelta(days=365)) &
    (df_transactions["DATE"] < end_date - pd.Timedelta(days=365))
]

In [65]:
len(df_transactions_filtered)

654725

In [66]:
df_transactions_filtered_pandas = df_transactions.move_to("pandas")

In [94]:
df_us_holidays["Date"]

0    2025-01-01
1    2025-01-20
2    2025-02-17
3    2025-05-26
4    2025-06-19
5    2025-07-04
6    2025-09-01
7    2025-10-13
8    2025-11-11
9    2025-11-27
10   2025-12-25
Name: Date, dtype: datetime64[ns]

In [100]:
# to pandas required since UDF doesn't take Snowpark pandas objects
us_holidays = df_us_holidays["Date"].to_pandas()

In [200]:
# Forecasting function using df.apply

def forecast_revenue(df, start_date, end_date):
    # Filter data from last year
    df_filtered = df[(df["DATE"] >= start_date - pd.Timedelta(days=365)) & (df["DATE"] < start_date)]
    
    # Append future dates to daily_avg for prediction
    future_dates = pd.date_range(start=start_date, end=end_date, freq="D")
    df_future = pd.DataFrame({"DATE": future_dates})

    # Group by DATE and calculate the mean revenue
    daily_avg = df_filtered.groupby("DATE")["REVENUE"].mean().reset_index()
    
    # Merge future dates with predicted revenue, filling missing values
    df_forecast = df_future.merge(daily_avg, on="DATE", how="left")

    import numpy as np
    # Fill missing predicted revenue with overall mean from last year
    df_forecast["PREDICTED_REVENUE"] = np.nan
    df_forecast["PREDICTED_REVENUE"].fillna(daily_avg["REVENUE"].mean(), inplace=True)
    df_forecast["PREDICTED_REVENUE"] = df_forecast["PREDICTED_REVENUE"].astype("float")
    return df_forecast

In [201]:
# Example usage
df_forecast = forecast_revenue(df_transactions, start_date, end_date)
df_forecast

Unnamed: 0,DATE,REVENUE,PREDICTED_REVENUE
0,2025-10-01,,505.01707
1,2025-10-02,,505.01707
2,2025-10-03,,505.01707
3,2025-10-04,,505.01707
4,2025-10-05,,505.01707
5,2025-10-06,,505.01707
6,2025-10-07,,505.01707
7,2025-10-08,,505.01707
8,2025-10-09,,505.01707
9,2025-10-10,,505.01707


In [202]:
# df_forecast["DATE"] = df_forecast["DATE"].dt.strftime('%Y-%m-%d')

In [226]:
def adjust_for_holiday_weekend(row):
    # For national holidays, revenue down 20% since stores are closed. For weekends, revenue is up 20% due to increased activity.
    if row["DATE"].strftime('%Y-%m-%d') in list(df_us_holidays["Date"].dt.strftime('%Y-%m-%d')): 
        return row["PREDICTED_REVENUE"] * 0.8
    elif row["DATE"].weekday() == 5 or row["DATE"].weekday() == 6: #Saturday/Sundays
        return row["PREDICTED_REVENUE"] * 1.2
    return row["PREDICTED_REVENUE"]

In [227]:
df_forecast_pandas = df_forecast.move_to("pandas")

In [230]:
# Adjust for holidays
df_forecast_pandas["PREDICTED_REVENUE"] = df_forecast_pandas.apply(adjust_for_holiday_weekend, axis=1)

In [231]:
df_forecast["PREDICTED_REVENUE"] = df_forecast.apply(adjust_for_holiday_weekend, axis=1)

In [253]:
type(df_forecast)

modin.pandas.dataframe.DataFrame

In [255]:
import altair as alt
alt.data_transformers.disable_max_rows()
df_forecast = df_forecast.move_to("pandas")
chart_predicted = alt.Chart(df_forecast).mark_line(color='blue').encode(
    x='monthdate(DATE):T',
    y='PREDICTED_REVENUE:Q',
    tooltip=['DATE', 'PREDICTED_REVENUE']
).properties(title="Predicted Revenue")
chart_predicted

AttributeError: 'NativeQueryCompiler' object has no attribute 'snowpark_pandas_api_calls'

alt.Chart(...)

In [None]:
df_transactions_filtered = df_transactions[
    (df_transactions["DATE"] >= start_date - pd.Timedelta(days=365)) &
    (df_transactions["DATE"] < end_date - pd.Timedelta(days=365))
]
df_transactions_filtered_groupby = df_transactions_filtered.groupby("DATE")["REVENUE"].mean()
# df_transactions_filtered_groupby = df_transactions_filtered_groupby.move_to("pandas")
chart_last_year = alt.Chart(df_transactions_filtered_groupby).mark_line(color='red').encode(
    x='monthdate(DATE):T',
    y='REVENUE:Q',
    tooltip=['DATE', 'REVENUE']
).properties(title="Last Year Revenue")

# Overlay the charts
final_chart = chart_predicted + chart_last_year
final_chart