# Hybrid Execution Automatic Switching Demo

In [1]:
import snowflake.snowpark.modin.plugin
import modin.pandas as pd
import numpy as np
import datetime
import pandas as native_pd
from snowflake.snowpark.session import Session; session = Session.builder.create()

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://snowbiz.okta.com/app/snowflake/exk8wfsfryJIn4IWZ2p7/sso/saml?SAMLRequest=jVJdb%2BIwEPwrke%2BZOEnTQi2gogXUVL2SI%2BHQ8eYmDlg4ds7rEOivP4ePU%2FvQqm%2FWemZndmf7d%2FtSODumgSs5QL7rIYfJTOVcrgdokU47PeSAoTKnQkk2QAcG6G7YB1qKioxqs5Fz9rdmYBzbSAJpPwao1pIoChyIpCUDYjKSjH4%2Bk8D1CAVg2lg5dKbkwK3WxpiKYNw0jdtcuUqvceB5HvZusUW1kB%2FonUT1tUallVGZEhfK3s70iYSPvbCVsAirEJ%2BJ91yeVvCVyusJBOQxTeNOPEtS5Iwu0z0oCXXJdML0jmdsMX8%2BGQDrIHmZLR9ni2TiglRNIeiWZaqsamO7ufaFC5Zjodbc7igaD1C15XlcxImYxzvanWxpPu9drVZLGUM4mjzNbvZ7%2FraZVofZLr7%2F9SdDzu9LokGbaARQs0i2ORpb8oLrjhd2%2FDD1b4l%2FQ65Dt9cNVsgZ2xy5pObIvJhtLb7yN1dtDT2ao1WF%2F%2FvGbL%2FtNQUU%2BvAUyTBaroKqiwEUbmNCp0shRwN6%2BN35%2B%2Fg963xsL3b%2F0ThWgmcHZ6p0Sc3n8fiuf6zwvFMcoYSVlItRnmsGYGMSQjUPmlFjb9romiE8PKl%2BvOrhPw%3D%3D&Rel

## Example 1: Working with a small, inline-created dataframe is faster

In [2]:
# Small, locally defined dataset: (holiday name, ISO date string) pairs for 2025.
us_holidays = [
    ("New Year's Day", "2025-01-01"),
    ("Martin Luther King Jr. Day", "2025-01-20"),
    ("Presidents' Day", "2025-02-17"),
    ("Memorial Day", "2025-05-26"),
    ("Juneteenth National Independence Day", "2025-06-19"),
    ("Independence Day", "2025-07-04"),
    ("Labor Day", "2025-09-01"),
    ("Columbus Day", "2025-10-13"),
    ("Veterans Day", "2025-11-11"),
    ("Thanksgiving Day", "2025-11-27"),
    ("Christmas Day", "2025-12-25")
]

# Create DataFrame inline from the Python list (nothing is read from a Snowflake table here)
df_us_holidays = pd.DataFrame(us_holidays, columns=["Holiday", "Date"])

# Convert Date column from ISO strings to datetime so the `.dt` accessor can be used on it
df_us_holidays["Date"] = pd.to_datetime(df_us_holidays["Date"])

In [3]:
assert df_us_holidays.get_backend() == 'Pandas'  # with auto, we should expect this to be local

In [4]:
# Add new columns for transformations
df_us_holidays["Day_of_Week"] = df_us_holidays["Date"].dt.day_name()
df_us_holidays["Month"] = df_us_holidays["Date"].dt.month_name()

In [5]:
df_us_holidays

Unnamed: 0,Holiday,Date,Day_of_Week,Month
0,New Year's Day,2025-01-01,Wednesday,January
1,Martin Luther King Jr. Day,2025-01-20,Monday,January
2,Presidents' Day,2025-02-17,Monday,February
3,Memorial Day,2025-05-26,Monday,May
4,Juneteenth National Independence Day,2025-06-19,Thursday,June
5,Independence Day,2025-07-04,Friday,July
6,Labor Day,2025-09-01,Monday,September
7,Columbus Day,2025-10-13,Monday,October
8,Veterans Day,2025-11-11,Tuesday,November
9,Thanksgiving Day,2025-11-27,Thursday,November


In [6]:
%%time
#Note that without auto-switching, this took 2.5 min
for index, row in df_us_holidays.iterrows():
    print(f"{row['Holiday']} falls on {row['Day_of_Week']}, {row['Month']} {row['Date'].day}, {row['Date'].year}.")

New Year's Day falls on Wednesday, January 1, 2025.
Martin Luther King Jr. Day falls on Monday, January 20, 2025.
Presidents' Day falls on Monday, February 17, 2025.
Memorial Day falls on Monday, May 26, 2025.
Juneteenth National Independence Day falls on Thursday, June 19, 2025.
Independence Day falls on Friday, July 4, 2025.
Labor Day falls on Monday, September 1, 2025.
Columbus Day falls on Monday, October 13, 2025.
Veterans Day falls on Tuesday, November 11, 2025.
Thanksgiving Day falls on Thursday, November 27, 2025.
Christmas Day falls on Thursday, December 25, 2025.
CPU times: user 108 ms, sys: 6.45 ms, total: 114 ms
Wall time: 119 ms


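An automatic engine switch happens when the small dataframe is merged with a large dataset. Note that the next cell reads the `REVENUE_TRANSACTIONS` table; on a fresh account, create it first with the setup SQL shown in Example 2.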

In [7]:
df_transactions = pd.read_snowflake("REVENUE_TRANSACTIONS")

In [8]:
df_transactions["DATE"] = pd.to_datetime(df_transactions["DATE"])

In [9]:
len(df_us_holidays), len(df_transactions)

(11, 10000000)

In [10]:
combined = pd.merge(df_us_holidays, df_transactions, left_on="Date", right_on="DATE")

Transferring data from Pandas to Snowflake ...:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
assert combined.get_backend() == 'Snowflake'

### 💡 Automatic switching speeds up loops/iterations on small data + inline creation of dataframes

## Example 2: When data is filtered the choice of engine changes

Run the following SQL to generate a synthetic dataset with 10M rows of transactions (dated from 2024-01-01 through the current date)
```sql
CREATE OR REPLACE TABLE revenue_transactions (
    Transaction_ID STRING,
    Date DATE,
    Revenue FLOAT
);

SET num_days = (SELECT DATEDIFF(DAY, '2024-01-01', CURRENT_DATE));
INSERT INTO revenue_transactions (Transaction_ID, Date, Revenue)
SELECT
    UUID_STRING() AS Transaction_ID,
    DATEADD(DAY, UNIFORM(0, $num_days, RANDOM()), '2024-01-01') AS Date,
    UNIFORM(10, 1000, RANDOM()) AS Revenue
FROM TABLE(GENERATOR(ROWCOUNT => 10000000));
```

In [12]:
# Generate a synthetic dataset with 10M rows of transactions, dated from
# 2024-01-01 through the current date. The three statements run in order:
# (re)create the table, set the $num_days session variable, insert the rows.
create_table_sql = '''
CREATE OR REPLACE TABLE revenue_transactions (
    Transaction_ID STRING,
    Date DATE,
    Revenue FLOAT
);'''
set_num_days_sql = '''SET num_days = (SELECT DATEDIFF(DAY, '2024-01-01', CURRENT_DATE));'''
insert_rows_sql = '''INSERT INTO revenue_transactions (Transaction_ID, Date, Revenue)
SELECT
    UUID_STRING() AS Transaction_ID,
    DATEADD(DAY, UNIFORM(0, $num_days, RANDOM()), '2024-01-01') AS Date,
    UNIFORM(10, 1000, RANDOM()) AS Revenue
FROM TABLE(GENERATOR(ROWCOUNT => 10000000));
'''

for setup_sql in (create_table_sql, set_num_days_sql):
    session.sql(setup_sql).collect()

# Kept as the final expression so the cell still displays the inserted-row count.
session.sql(insert_rows_sql).collect()

[Row(number of rows inserted=10000000)]

In [13]:
df_transactions = pd.read_snowflake("REVENUE_TRANSACTIONS")

In [14]:
len(df_transactions)

10000000

Perform some operations on 10M rows with Snowflake

In [15]:
df_transactions["DATE"] = pd.to_datetime(df_transactions["DATE"])

In [16]:
df_transactions.groupby("DATE").sum()["REVENUE"]

DATE
2024-01-01    10660935.0
2024-01-02    10731968.0
2024-01-03    10594935.0
2024-01-04    10788194.0
2024-01-05    10730112.0
                 ...    
2025-04-10    10801029.0
2025-04-11    10851369.0
2025-04-12    10598843.0
2025-04-13    10868851.0
2025-04-14    10623021.0
Freq: None, Name: REVENUE, Length: 470, dtype: float64

In [17]:
assert df_transactions.get_backend() == "Snowflake"

So far everything has been happening in Snowflake, since we are working with the full dataset (10M rows). 
Next, we demonstrate what happens when we filter the data down to a smaller dataset below our 500k threshold for automatic switching. 
We showcase two ways of doing the filtering: 
- Method 1: Filtering with pandas (lazy evaluation)
- Method 2: Prefilter with SQL during dataframe creation 

In [18]:
# Method 1: filter with pandas. Keep the 7 full days before today (today excluded).
# The bounds are computed once, so both comparisons see the same "today", and they
# are midnight Timestamps rather than datetime.date objects, so the comparison
# against the datetime column is valid on the native pandas backend as well.
filter_end = pd.Timestamp.today().normalize()
filter_start = filter_end - pd.Timedelta(days=7)
df_transactions_filter1 = df_transactions[
    (df_transactions["DATE"] >= filter_start) & (df_transactions["DATE"] < filter_end)
]

In [19]:
df_transactions_filter1.get_backend()

'Snowflake'

In [20]:
# Peek at the row-count upper bound tracked for the filtered frame. This walks
# underscore-prefixed (private) attributes, so treat it as demo-only introspection.
# NOTE(review): the value shown here, before the frame has been displayed, is an
# enormous placeholder rather than a real estimate — confirm this is intended.
df_transactions_filter1._query_compiler._modin_frame.ordered_dataframe.row_count_upper_bound

1000000000000000000000000000000000000000000

In this case, since the data is already in Snowflake, it stays in Snowflake even after the filtering.

In [21]:
df_transactions_filter1

Unnamed: 0,TRANSACTION_ID,DATE,REVENUE
24,07c3ef12-cb1f-4906-bb20-234f0f35ac94,2025-04-10,890.0
42,a7503c1c-d54f-415b-9590-ddbdfb6f9bbe,2025-04-11,522.0
181,446e89a9-c6df-4b18-a740-1bb0da864966,2025-04-11,474.0
219,c317bab9-5915-4025-8238-01c4707eb7be,2025-04-08,884.0
487,1899a4db-9dd7-40be-b72b-d0a187614104,2025-04-07,713.0
...,...,...,...
9999546,998c3146-8bdf-4ee5-bccc-68a395f2e63b,2025-04-08,820.0
9999569,750b1a46-8fd9-493e-ba87-42e61d2a1500,2025-04-10,741.0
9999848,989bf6cd-2091-464a-a78a-41ecffcb2bef,2025-04-12,487.0
9999897,f32da7ef-5279-4a2f-962f-94a593c1982d,2025-04-09,742.0


In [22]:
# Re-read the same private row-count upper bound after the frame has been displayed;
# the value shown now matches the filtered row count reported by len() later on.
df_transactions_filter1._query_compiler._modin_frame.ordered_dataframe.row_count_upper_bound

148818

In [23]:
df_transactions_filter1

Unnamed: 0,TRANSACTION_ID,DATE,REVENUE
24,07c3ef12-cb1f-4906-bb20-234f0f35ac94,2025-04-10,890.0
42,a7503c1c-d54f-415b-9590-ddbdfb6f9bbe,2025-04-11,522.0
181,446e89a9-c6df-4b18-a740-1bb0da864966,2025-04-11,474.0
219,c317bab9-5915-4025-8238-01c4707eb7be,2025-04-08,884.0
487,1899a4db-9dd7-40be-b72b-d0a187614104,2025-04-07,713.0
...,...,...,...
9999546,998c3146-8bdf-4ee5-bccc-68a395f2e63b,2025-04-08,820.0
9999569,750b1a46-8fd9-493e-ba87-42e61d2a1500,2025-04-10,741.0
9999848,989bf6cd-2091-464a-a78a-41ecffcb2bef,2025-04-12,487.0
9999897,f32da7ef-5279-4a2f-962f-94a593c1982d,2025-04-09,742.0


In [24]:
# Design notes: displaying (repr) a filtered frame should
#   (1) perform the repr,
#   (2) update the row count on the original data frame,
#   (3) consider moving the data (presumably to another backend — TODO confirm).
df_transactions_filter1 

Unnamed: 0,TRANSACTION_ID,DATE,REVENUE
24,07c3ef12-cb1f-4906-bb20-234f0f35ac94,2025-04-10,890.0
42,a7503c1c-d54f-415b-9590-ddbdfb6f9bbe,2025-04-11,522.0
181,446e89a9-c6df-4b18-a740-1bb0da864966,2025-04-11,474.0
219,c317bab9-5915-4025-8238-01c4707eb7be,2025-04-08,884.0
487,1899a4db-9dd7-40be-b72b-d0a187614104,2025-04-07,713.0
...,...,...,...
9999546,998c3146-8bdf-4ee5-bccc-68a395f2e63b,2025-04-08,820.0
9999569,750b1a46-8fd9-493e-ba87-42e61d2a1500,2025-04-10,741.0
9999848,989bf6cd-2091-464a-a78a-41ecffcb2bef,2025-04-12,487.0
9999897,f32da7ef-5279-4a2f-962f-94a593c1982d,2025-04-09,742.0


In [25]:
# TODO(review): open questions about switching after a filter:
#   - Do we want to check this eagerly?
#   - Do we expect a backend-switch check to happen right after a filter?
#   - Can the move even happen while the estimated row size is still the size of the full table?
# For now the filtered frame is expected to remain on the Snowflake backend.
assert df_transactions_filter1.get_backend() == "Snowflake" 

In [26]:
print(f"Date range: {df_transactions_filter1['DATE'].min().date()} to {df_transactions_filter1['DATE'].max().date()}. Resulting dataset size: {len(df_transactions_filter1)}")



Date range: 2025-04-07 to 2025-04-13. Resulting dataset size: 148818


Now let's perform filtering via SQL directly, so the dataframe upon creation is small.

In [27]:
df_transactions_filter2 = pd.read_snowflake("SELECT * FROM revenue_transactions WHERE Date >= DATEADD( 'days', -7, current_date ) and Date < current_date")

Transferring data from Snowflake to Pandas ...:   0%|          | 0/2 [00:00<?, ?it/s]

In [28]:
df_transactions_filter2

Unnamed: 0,TRANSACTION_ID,DATE,REVENUE
0,e7390a6e-a92f-4b40-b909-377d70d9e3ef,2025-04-12,103.0
1,3a4bb9a2-0056-46e4-abb2-4cf237106bad,2025-04-08,33.0
2,7e1bd805-c3e9-40e8-96b4-7327fbcfc3c1,2025-04-09,275.0
3,69239b21-67b8-4d5f-8b2d-c6226f2b2b22,2025-04-12,183.0
4,5d9c41e5-c8dd-4e44-92a0-b7fb4612fb23,2025-04-13,467.0
...,...,...,...
148813,706a5b69-d68d-4acb-bbf1-2d4944119f95,2025-04-10,591.0
148814,192025fc-aa38-44de-9931-ee49fa64b000,2025-04-09,500.0
148815,9bb6ad3b-4835-451e-95fa-fdbaa90753fa,2025-04-07,514.0
148816,cc125215-983f-46c4-bb2f-f5f2f2c510ce,2025-04-08,16.0


In [29]:
# Verify the result is same as above
print(f"Date range: {df_transactions_filter2['DATE'].min()} to {df_transactions_filter2['DATE'].max()}. Resulting dataset size: {len(df_transactions_filter2)}")

Date range: 2025-04-07 to 2025-04-13. Resulting dataset size: 148818


In [30]:
assert df_transactions_filter2.get_backend() == "Pandas" 

In [31]:
len(df_transactions_filter2)

148818

Once you are in pandas, you can still continue to perform the same operations: 

In [32]:
%time
df_transactions_filter1.groupby("DATE").sum()["REVENUE"]

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 2.15 µs


Transferring data from Snowflake to Pandas ...:   0%|          | 0/2 [00:00<?, ?it/s]

DATE
2025-04-07    10658040.0
2025-04-08    10684281.0
2025-04-09    10772216.0
2025-04-10    10801029.0
2025-04-11    10851369.0
2025-04-12    10598843.0
2025-04-13    10868851.0
Freq: None, Name: REVENUE, dtype: float64

In [33]:
df_transactions_filter1.shape

(148818, 3)

In [34]:
%time
df_transactions_filter2.groupby("DATE").sum()["REVENUE"]

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 2.86 µs


DATE
2025-04-07    10658040.0
2025-04-08    10684281.0
2025-04-09    10772216.0
2025-04-10    10801029.0
2025-04-11    10851369.0
2025-04-12    10598843.0
2025-04-13    10868851.0
Name: REVENUE, dtype: float64

### 💡 Automatic switching means that pandas works well for both small and large data

## Example 3: Performing Custom `apply` on small dataset

Forecast using last year's transaction data via a custom apply function

In [35]:
start_date = pd.Timestamp("2025-10-01")
end_date = pd.Timestamp("2025-10-31")

In [36]:
# Forecasting function: flat forecast from last year's mean daily revenue

def forecast_revenue(df, start_date, end_date):
    """Build a naive daily revenue forecast for ``start_date``..``end_date``.

    The forecast for every day is the mean of the per-day average revenue
    observed during the 365 days before ``start_date``.

    Parameters
    ----------
    df : DataFrame
        Transactions with a datetime ``DATE`` column and a numeric ``REVENUE`` column.
    start_date, end_date : Timestamp
        First and last day of the forecast window (both inclusive).

    Returns
    -------
    DataFrame
        One row per forecast day with columns ``DATE``, ``REVENUE`` and
        ``PREDICTED_REVENUE``. ``REVENUE`` comes from a left merge with last
        year's daily averages; the history window ends before ``start_date``,
        so it holds no values for the forecast days. ``PREDICTED_REVENUE`` is
        NaN when the history window contains no rows.
    """
    # Filter data from last year: [start_date - 365 days, start_date)
    history_start = start_date - pd.Timedelta(days=365)
    df_filtered = df[(df["DATE"] >= history_start) & (df["DATE"] < start_date)]

    # One row per future day to forecast
    future_dates = pd.date_range(start=start_date, end=end_date, freq="D")
    df_future = pd.DataFrame({"DATE": future_dates})

    # Group by DATE and calculate the mean revenue
    daily_avg = df_filtered.groupby("DATE")["REVENUE"].mean().reset_index()
    # Align the key dtype with df_future["DATE"] before merging
    daily_avg["DATE"] = daily_avg["DATE"].astype('datetime64[ns]')

    # Left merge keeps every future date (and the REVENUE column in the output)
    df_forecast = df_future.merge(daily_avg, on="DATE", how="left")

    # Assign the overall mean directly. The previous
    # `df_forecast[col].fillna(..., inplace=True)` was a chained in-place update,
    # which does not write back to the frame under pandas Copy-on-Write.
    df_forecast["PREDICTED_REVENUE"] = float(daily_avg["REVENUE"].mean())
    return df_forecast

In [37]:
df_forecast = forecast_revenue(df_transactions, start_date, end_date)
len(df_forecast)

Transferring data from Snowflake to Pandas ...:   0%|          | 0/2 [00:00<?, ?it/s]

31

The resulting dataframe is very small, since it only covers the one-month window we are forecasting.

In [38]:
assert df_forecast.get_backend() == 'Pandas'

In [39]:
def adjust_for_holiday_weekend(row, holiday_dates=None):
    """Adjust one forecast row for holidays and weekends.

    For national holidays, revenue is down 5% since stores are closed. For
    weekends, revenue is up 5% due to increased activity. A holiday that falls
    on a weekend gets the holiday adjustment only.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row["DATE"]`` (a datetime with ``strftime``/``weekday``)
        and ``row["PREDICTED_REVENUE"]`` (a number).
    holiday_dates : collection of str, optional
        Holiday dates formatted as ``'%Y-%m-%d'``. When omitted, they are
        derived from the notebook-level ``df_us_holidays["Date"]`` on every
        call; pass a precomputed set (e.g. via
        ``df.apply(..., axis=1, holiday_dates=...)``) to avoid rebuilding it
        for each row.

    Returns
    -------
    The adjusted predicted revenue for the row.
    """
    if holiday_dates is None:
        # Backward-compatible default: read the holidays from the notebook global.
        holiday_dates = set(df_us_holidays["Date"].dt.strftime('%Y-%m-%d'))

    forecast_date = row["DATE"]
    predicted = row["PREDICTED_REVENUE"]

    if forecast_date.strftime('%Y-%m-%d') in holiday_dates:
        return predicted * 0.95
    if forecast_date.weekday() >= 5:  # 5 = Saturday, 6 = Sunday
        return predicted * 1.05
    return predicted

In [40]:
# Adjust for holidays using the apply function
df_forecast["PREDICTED_REVENUE"] = df_forecast.apply(adjust_for_holiday_weekend, axis=1)
df_forecast[["DATE","PREDICTED_REVENUE"]]

Unnamed: 0,DATE,PREDICTED_REVENUE
0,2025-10-01,505.041309
1,2025-10-02,505.041309
2,2025-10-03,505.041309
3,2025-10-04,530.293374
4,2025-10-05,530.293374
5,2025-10-06,505.041309
6,2025-10-07,505.041309
7,2025-10-08,505.041309
8,2025-10-09,505.041309
9,2025-10-10,505.041309


In [41]:
assert df_forecast.get_backend() == 'Pandas'

In [42]:
print(f"Altair takes in {type(df_forecast)} with {df_forecast.get_backend()} as backend, since we implement the dataframe interchange protocol")

Altair takes in <class 'modin.pandas.dataframe.DataFrame'> with Pandas as backend, since we implement the dataframe interchange protocol


In [43]:
# NOTE(review): this import sits mid-notebook; the notebook's other imports are in the first cell.
import altair as alt
# Turn off Altair's max-rows check for data passed to charts.
alt.data_transformers.disable_max_rows()

# Line chart of the forecast: x is DATE reduced to month+day, y is the predicted revenue.
chart_predicted = alt.Chart(df_forecast).mark_line(color='blue').encode(
    x='monthdate(DATE):T',
    # y-axis pinned to a fixed 470-550 range
    y=alt.Y('PREDICTED_REVENUE:Q',scale=alt.Scale(domain=[470, 550])),
    tooltip=['DATE', 'PREDICTED_REVENUE']
)
# Last expression of the cell: display the chart.
chart_predicted



In [44]:
# Last year's actuals for the same window as the forecast, shifted back 365 days.
# The forecast's date range includes `end_date`, so the upper bound here is
# inclusive as well; a strict `<` dropped the final day of the comparison window.
last_year_start = start_date - pd.Timedelta(days=365)
last_year_end = end_date - pd.Timedelta(days=365)
df_transactions_filtered = df_transactions[
    (df_transactions["DATE"] >= last_year_start) &
    (df_transactions["DATE"] <= last_year_end)
]
# Mean revenue per day within that window
df_transactions_filtered_groupby = df_transactions_filtered.groupby("DATE")["REVENUE"].mean().reset_index()

Transferring data from Snowflake to Pandas ...:   0%|          | 0/2 [00:00<?, ?it/s]

In [45]:
print(f"Altair takes in {type(df_transactions_filtered_groupby)} with {df_transactions_filtered_groupby.get_backend()} as backend, since we implement the dataframe interchange protocol")

Altair takes in <class 'modin.pandas.dataframe.DataFrame'> with Pandas as backend, since we implement the dataframe interchange protocol


In [46]:
# Reshape both frames to a common (DATE, Value, Label) layout so they can share one chart.
# Forecast series: PREDICTED_REVENUE becomes Value, tagged 'Predicted Revenue'.
df_forecast_labeled = df_forecast.copy()
df_forecast_labeled['Label'] = 'Predicted Revenue'
df_forecast_labeled = df_forecast_labeled.rename(columns={'PREDICTED_REVENUE': 'Value'})

# Last year's daily means: REVENUE becomes Value, tagged 'Revenue'.
df_last_year_labeled = df_transactions_filtered_groupby.copy()
df_last_year_labeled['Label'] = 'Revenue'
df_last_year_labeled = df_last_year_labeled.rename(columns={'REVENUE': 'Value'})

# Combine the two series row-wise
combined_df = pd.concat([
    df_forecast_labeled[['DATE', 'Value', 'Label']],
    df_last_year_labeled[['DATE', 'Value', 'Label']]
])

# Plot with Value on Y, month/day of DATE on X, and color based on Label
final_chart = alt.Chart(combined_df).mark_line().encode(
    y=alt.Y('Value:Q',scale=alt.Scale(domain=[470, 550])),
    x='monthdate(DATE):T',
    color=alt.Color('Label:N', legend=alt.Legend(title='Type')),
    tooltip=['DATE', 'Value', 'Label']
).properties(
    title='Revenue vs Predicted Revenue (by Value)'
)

# Last expression of the cell: display the chart.
final_chart



### 💡 With automatic switching, `apply` on a small dataset runs locally in pandas and is much faster.