# Hybrid execution for Snowpark pandas

In [1]:
import modin.pandas as pd
import snowflake.snowpark.modin.plugin
import numpy as np
import datetime
import pandas as native_pd
from time import perf_counter
from snowflake.snowpark.session import Session

# Create the Snowpark session up front. No connection parameters are passed
# here, so they presumably come from the default connection configuration —
# TODO confirm. `session` is not referenced by name again in this notebook;
# presumably Snowpark pandas picks up the active session implicitly.
session = Session.builder.create()
from modin.config import AutoSwitchBackend

* 'allow_population_by_field_name' has been renamed to 'validate_by_name'


## Example 1: Working with a small dataframe created from in-memory data is faster

### Before...

In [2]:
# (holiday name, ISO date string) pairs for US holidays in 2025.
us_holidays = [
    ("New Year's Day", "2025-01-01"),
    ("Martin Luther King Jr. Day", "2025-01-20"),
    ("Presidents' Day", "2025-02-17"),
    ("Memorial Day", "2025-05-26"),
    ("Juneteenth National Independence Day", "2025-06-19"),
    ("Independence Day", "2025-07-04"),
    ("Labor Day", "2025-09-01"),
    ("Columbus Day", "2025-10-13"),
    ("Veterans Day", "2025-11-11"),
    ("Thanksgiving Day", "2025-11-27"),
    ("Christmas Day", "2025-12-25"),
]

# Build the frame from the in-memory list, parse the date strings once, and
# derive the weekday / month name columns from that parsed series.
df_us_holidays = pd.DataFrame(us_holidays, columns=["Holiday", "Date"])
holiday_dates = pd.to_datetime(df_us_holidays["Date"])
df_us_holidays["Date"] = holiday_dates
df_us_holidays["Day_of_Week"] = holiday_dates.dt.day_name()
df_us_holidays["Month"] = holiday_dates.dt.month_name()

`to_datetime` implementation may have mismatches with pandas:
Snowflake automatic format detection is used when a format is not provided.In this case Snowflake's auto format may yield different result values compared to pandas.See https://docs.snowflake.com/en/sql-reference/date-time-input-output#supported-formats-for-auto-detection for details.


In [3]:
%%time
for index, row in df_us_holidays.iterrows():
    print(f"{row['Holiday']} falls on {row['Day_of_Week']}, {row['Month']} {row['Date'].day}, {row['Date'].year}.")

DataFrame.iterrows will result eager evaluation and potential data pulling, which is inefficient. For efficient Snowpark pandas usage, consider rewriting the code with an operator (such as DataFrame.apply or DataFrame.applymap) which can work on the entire DataFrame in one shot.


New Year's Day falls on Wednesday, January 1, 2025.
Martin Luther King Jr. Day falls on Monday, January 20, 2025.
Presidents' Day falls on Monday, February 17, 2025.
Memorial Day falls on Monday, May 26, 2025.
Juneteenth National Independence Day falls on Thursday, June 19, 2025.
Independence Day falls on Friday, July 4, 2025.
Labor Day falls on Monday, September 1, 2025.
Columbus Day falls on Monday, October 13, 2025.
Veterans Day falls on Tuesday, November 11, 2025.
Thanksgiving Day falls on Thursday, November 27, 2025.
Christmas Day falls on Thursday, December 25, 2025.
CPU times: user 8.42 s, sys: 1.51 s, total: 9.93 s
Wall time: 1min 50s


### With hybrid execution

In [4]:
# Turn on modin's automatic backend switching (the "hybrid execution" this
# section demonstrates) for the cells that follow.
AutoSwitchBackend.enable()

In [5]:
# Same (holiday name, ISO date string) input as the "before" run, rebuilt now
# that automatic backend switching is enabled.
us_holidays = [
    ("New Year's Day", "2025-01-01"),
    ("Martin Luther King Jr. Day", "2025-01-20"),
    ("Presidents' Day", "2025-02-17"),
    ("Memorial Day", "2025-05-26"),
    ("Juneteenth National Independence Day", "2025-06-19"),
    ("Independence Day", "2025-07-04"),
    ("Labor Day", "2025-09-01"),
    ("Columbus Day", "2025-10-13"),
    ("Veterans Day", "2025-11-11"),
    ("Thanksgiving Day", "2025-11-27"),
    ("Christmas Day", "2025-12-25"),
]

df_us_holidays = pd.DataFrame(us_holidays, columns=["Holiday", "Date"])

# Parse the date strings once and store them back as the Date column.
holiday_dates = pd.to_datetime(df_us_holidays["Date"])
df_us_holidays["Date"] = holiday_dates

# Derived calendar columns used by the printing loop further down.
df_us_holidays["Day_of_Week"] = holiday_dates.dt.day_name()
df_us_holidays["Month"] = holiday_dates.dt.month_name()

In [6]:
# Report which engine currently holds the holiday frame.
df_us_holidays.get_backend() 

'Pandas'

In [7]:
# Display the holiday frame, including the derived Day_of_Week / Month columns.
df_us_holidays

Unnamed: 0,Holiday,Date,Day_of_Week,Month
0,New Year's Day,2025-01-01,Wednesday,January
1,Martin Luther King Jr. Day,2025-01-20,Monday,January
2,Presidents' Day,2025-02-17,Monday,February
3,Memorial Day,2025-05-26,Monday,May
4,Juneteenth National Independence Day,2025-06-19,Thursday,June
5,Independence Day,2025-07-04,Friday,July
6,Labor Day,2025-09-01,Monday,September
7,Columbus Day,2025-10-13,Monday,October
8,Veterans Day,2025-11-11,Tuesday,November
9,Thanksgiving Day,2025-11-27,Thursday,November


In [8]:
%%time
for index, row in df_us_holidays.iterrows():
    print(f"{row['Holiday']} falls on {row['Day_of_Week']}, {row['Month']} {row['Date'].day}, {row['Date'].year}.")

New Year's Day falls on Wednesday, January 1, 2025.
Martin Luther King Jr. Day falls on Monday, January 20, 2025.
Presidents' Day falls on Monday, February 17, 2025.
Memorial Day falls on Monday, May 26, 2025.
Juneteenth National Independence Day falls on Thursday, June 19, 2025.
Independence Day falls on Friday, July 4, 2025.
Labor Day falls on Monday, September 1, 2025.
Columbus Day falls on Monday, October 13, 2025.
Veterans Day falls on Tuesday, November 11, 2025.
Thanksgiving Day falls on Thursday, November 27, 2025.
Christmas Day falls on Thursday, December 25, 2025.
CPU times: user 332 ms, sys: 18.9 ms, total: 351 ms
Wall time: 2.13 s


## Example 2: When reading data from Snowflake, we immediately transfer it to pandas if the data is below a certain, configurable number of rows (by default 10 million).

In [9]:
# Read the REVENUE_TRANSACTIONS table by name; the next cell prints its row
# count and which backend ends up holding it.
df_transactions = pd.read_snowflake("REVENUE_TRANSACTIONS")

In [10]:
# Show the row count and the backend that currently holds the transactions.
print(f"The dataset size is {len(df_transactions)} and the data is located in {df_transactions.get_backend()}.")

The dataset size is 10000000 and the data is located in Snowflake.


In [11]:
# Convert the DATE column to datetime values, overwriting it in place.
# NOTE(review): the column's original type is not visible here — confirm the
# conversion is actually needed.
df_transactions["DATE"] = pd.to_datetime(df_transactions["DATE"])

In [12]:
%%time
df_transactions.groupby("DATE").sum()["REVENUE"]

CPU times: user 77.6 ms, sys: 8.3 ms, total: 85.9 ms
Wall time: 347 ms


DATE
2024-01-01    9111294.0
2024-01-02    9021742.0
2024-01-03    9044429.0
2024-01-04    9103035.0
2024-01-05    9097915.0
                ...    
2025-07-04    9086684.0
2025-07-05    9209240.0
2025-07-06    9042874.0
2025-07-07    9283002.0
2025-07-08    8962048.0
Freq: None, Name: REVENUE, Length: 555, dtype: float64

In [13]:
# Pass a SQL query instead of a table name so that only rows dated within the
# last 7 days (relative to the current date) are read.
last_week_transactions = pd.read_snowflake(
    "SELECT * FROM revenue_transactions WHERE Date >= DATEADD('days', -7, current_date )"
)

Transferring data from Snowflake to Pandas for 'modin.pandas.read_snowflake' with max estimated shape 126255x3…

In [14]:
# The smaller result set has been loaded into memory as a pandas dataframe;
# confirm by printing its row count and the backend that holds it.
print(f"The dataset size is {len(last_week_transactions)} and the data is located in {last_week_transactions.get_backend()}.")

The dataset size is 126255 and the data is located in Pandas.


In [15]:
%time
last_week_transactions.groupby("DATE").sum()["REVENUE"]

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 2.15 µs


DATE
2025-07-02    9243944.0
2025-07-03    9153237.0
2025-07-04    9086684.0
2025-07-05    9209240.0
2025-07-06    9042874.0
2025-07-07    9283002.0
2025-07-08    8962048.0
Name: REVENUE, dtype: float64

## Example 3: Combining small and large datasets in the same workflow

Sometimes you are working with multiple dataframes of different sizes and need to join them together. What happens in this scenario?
When the two dataframes being joined live on different engines, we automatically determine the best way to move the data so that the cost of data movement is minimized.

Continuing with our `df_transactions` and `df_us_holidays` datasets.

In [16]:
# Summarise both frames (row count and current backend) before joining them.
print("Quick recap:")
for label, frame in (
    ("df_transactions", df_transactions),
    ("df_us_holidays", df_us_holidays),
):
    print(f"- {label} is {len(frame)} rows and the data is located in {frame.get_backend()}.")

Quick recap:
- df_transactions is 10000000 rows and the data is located in Snowflake.
- df_us_holidays is 11 rows and the data is located in Pandas.


In [17]:
# Join each holiday to the transactions made on that date. The key columns are
# named differently on each side (Date vs DATE), so both appear in the result.
# No `how` is given, so pandas' default (inner join) applies.
combined = pd.merge(df_us_holidays, df_transactions, left_on="Date", right_on="DATE")

Transferring data from Pandas to Snowflake for 'modin.pandas.merge' with max estimated shape 11x4:   0%|      …

In [18]:
# Check which engine holds the merged result.
combined.get_backend()

'Snowflake'

In [19]:
# Display the merged holiday/transaction frame.
combined

Unnamed: 0,Holiday,Date,Day_of_Week,Month,TRANSACTION_ID,DATE,REVENUE
0,New Year's Day,2025-01-01,Wednesday,January,d9d60e72-a191-4272-8ccb-ccfc40841542,2025-01-01,662.0
1,New Year's Day,2025-01-01,Wednesday,January,577c95f3-e53e-45b8-b0b7-99d6a84fdd56,2025-01-01,284.0
2,New Year's Day,2025-01-01,Wednesday,January,0353aa19-ec84-45ab-a237-ae6069846019,2025-01-01,261.0
3,New Year's Day,2025-01-01,Wednesday,January,720f7193-afdc-44e1-b562-52737fd740de,2025-01-01,513.0
4,New Year's Day,2025-01-01,Wednesday,January,ddfbd9f4-961e-46ab-bf22-4b6883e924f8,2025-01-01,944.0
...,...,...,...,...,...,...,...
108047,Independence Day,2025-07-04,Friday,July,24f4e1af-33d9-465e-bdcb-5321dd0cf7a0,2025-07-04,344.0
108048,Independence Day,2025-07-04,Friday,July,251647f8-104d-4edd-9113-24952b7cf084,2025-07-04,772.0
108049,Independence Day,2025-07-04,Friday,July,a35fcf4e-7aa6-4d05-a387-9fcdedc1dd46,2025-07-04,135.0
108050,Independence Day,2025-07-04,Friday,July,0f8b937e-1ea2-40d0-a5f0-4645bc1bbfcc,2025-07-04,33.0
