# Hybrid Execution Build Verification Test

In this demo, we will show how you can develop robust pandas pipelines at all data scales. You will see how pandas on Snowflake intelligently determines whether to execute queries locally with regular pandas or run them directly in Snowflake. This allows you to rapidly iterate on your pandas workflows for testing and development on small datasets, while future-proofing your pipelines for when you scale up to production data.

In [None]:
import snowflake.snowpark.modin.plugin
import modin.pandas as pd
import numpy as np
import datetime
import pandas as native_pd
from time import perf_counter
from snowflake.snowpark.session import Session; session = Session.builder.create()

## read_csv loads into pandas

In [None]:
# Load a local CSV file through the modin.pandas entry point.
fruits = pd.read_csv('data.csv')
# The return value of this bare call is discarded; the assert below performs the actual check.
fruits.get_backend()
# A frame produced by read_csv is expected to report the "Pandas" backend.
assert fruits.get_backend() == "Pandas"

In [None]:
fruits

In [None]:
df = pd.read_csv("s3://sfquickstarts/intro-to-machine-learning-with-snowpark-ml-for-python/diamonds.csv")
assert df.get_backend() == "Pandas"

## inline data is in pandas

In [None]:
# (holiday name, ISO date string) pairs for US holidays in 2025.
us_holidays = [
    ("New Year's Day", "2025-01-01"),
    ("Martin Luther King Jr. Day", "2025-01-20"),
    ("Presidents' Day", "2025-02-17"),
    ("Memorial Day", "2025-05-26"),
    ("Juneteenth National Independence Day", "2025-06-19"),
    ("Independence Day", "2025-07-04"),
    ("Labor Day", "2025-09-01"),
    ("Columbus Day", "2025-10-13"),
    ("Veterans Day", "2025-11-11"),
    ("Thanksgiving Day", "2025-11-27"),
    ("Christmas Day", "2025-12-25")
]

# Create DataFrame from the inline list; one row per holiday with
# columns "Holiday" and "Date".
df_us_holidays = pd.DataFrame(us_holidays, columns=["Holiday", "Date"])

# Convert Date column from strings to datetime so the `.dt` accessor
# can be used on it in later cells.
df_us_holidays["Date"] = pd.to_datetime(df_us_holidays["Date"])

In [None]:
assert df_us_holidays.get_backend() == 'Pandas'  # with auto, we should expect this to be local

In [None]:
# Add new columns for transformations
df_us_holidays["Day_of_Week"] = df_us_holidays["Date"].dt.day_name()
df_us_holidays["Month"] = df_us_holidays["Date"].dt.month_name()

In [None]:
df_us_holidays

In [None]:
pd.explain()

In [None]:
%%time
#Note that without auto-switching, this took 2.5 min
for index, row in df_us_holidays.iterrows():
    print(f"{row['Holiday']} falls on {row['Day_of_Week']}, {row['Month']} {row['Date'].day}, {row['Date'].year}.")

### 💡 Automatic switching speeds up loops/iterations on small data + inline creation of dataframes

## Example 2: When data is filtered the choice of engine changes

Run the following SQL to generate a synthetic dataset with 10M rows of transactions (dated from 2024-01-01 through the current date):
```sql
CREATE OR REPLACE TABLE revenue_transactions (
    Transaction_ID STRING,
    Date DATE,
    Revenue FLOAT
);

SET num_days = (SELECT DATEDIFF(DAY, '2024-01-01', CURRENT_DATE));
INSERT INTO revenue_transactions (Transaction_ID, Date, Revenue)
SELECT
    UUID_STRING() AS Transaction_ID,
    DATEADD(DAY, UNIFORM(0, $num_days, RANDOM()), '2024-01-01') AS Date,
    UNIFORM(10, 1000, RANDOM()) AS Revenue
FROM TABLE(GENERATOR(ROWCOUNT => 10000000));
```

In [None]:
# Generate a synthetic dataset with 10M rows of transactions, dated from
# 2024-01-01 through the current date. Each statement is sent separately
# and `.collect()` forces it to execute.

# Step 1: (re)create the empty target table.
session.sql('''
CREATE OR REPLACE TABLE revenue_transactions (
    Transaction_ID STRING,
    Date DATE,
    Revenue FLOAT
);''').collect()
# Step 2: store the number of days between 2024-01-01 and the current date
# in the session variable `num_days`, which the INSERT below reads as $num_days.
session.sql('''SET num_days = (SELECT DATEDIFF(DAY, '2024-01-01', CURRENT_DATE));''').collect()
# Step 3: insert 10,000,000 generated rows, each with a UUID string, a date
# offset from 2024-01-01 by UNIFORM(0, $num_days, ...) days, and a revenue
# drawn with UNIFORM(10, 1000, ...).
session.sql('''INSERT INTO revenue_transactions (Transaction_ID, Date, Revenue)
SELECT
    UUID_STRING() AS Transaction_ID,
    DATEADD(DAY, UNIFORM(0, $num_days, RANDOM()), '2024-01-01') AS Date,
    UNIFORM(10, 1000, RANDOM()) AS Revenue
FROM TABLE(GENERATOR(ROWCOUNT => 10000000));
''').collect()

In [None]:
df_transactions = pd.read_snowflake("REVENUE_TRANSACTIONS")

In [None]:
print(f"The dataset size is {len(df_transactions)} and the data is located in {df_transactions.get_backend()}.")

In [None]:
pd.explain()

Perform some operations on 10M rows with Snowflake

In [None]:
df_transactions["DATE"] = pd.to_datetime(df_transactions["DATE"])

In [None]:
%%time
df_transactions.groupby("DATE").sum()["REVENUE"]

In [None]:
assert df_transactions.get_backend() == "Snowflake"

In [None]:
pd.explain()

So far everything has been happening in Snowflake, since we are working with the full dataset (10M rows). 
Next, we demonstrate what happens when we filter the data down to a smaller dataset below our data size threshold for automatic engine switching. 
First, let's perform the filtering directly with pandas. 

In [None]:
# Keep only the seven full days before today (today itself is excluded).
# The day boundary is computed once, so both bounds come from the same
# "today" even if the cell runs across midnight, and it is kept as a
# midnight-normalized Timestamp so the "DATE" column is compared against
# timestamps rather than datetime.date objects.
window_end = pd.Timestamp.today().normalize()
window_start = window_end - pd.Timedelta(days=7)
df_transactions_filter1 = df_transactions[(df_transactions["DATE"] >= window_start) & (df_transactions["DATE"] < window_end)]

In [None]:
assert df_transactions_filter1.get_backend() == "Snowflake"

In [None]:
print(f"Date range: {df_transactions_filter1['DATE'].min().date()} to {df_transactions_filter1['DATE'].max().date()}. Resulting dataset size: {len(df_transactions_filter1)}")

Now that we have a smaller dataframe, this happens in pandas.

In [None]:
%time
df_transactions_filter1 = df_transactions_filter1.groupby("DATE").sum()["REVENUE"]

In [None]:
# TODO: Waiting on GroupBy switcheroo
df_transactions_filter1 = df_transactions_filter1.move_to("Pandas")
assert df_transactions_filter1.get_backend()=="Pandas"


We saw what happens when we filter with pandas. Now let's look at what happens if we perform filtering via SQL directly in the `read_snowflake` command, so the dataframe upon creation is small.

In [None]:
df_transactions_filter2 = pd.read_snowflake("SELECT * FROM revenue_transactions WHERE Date >= DATEADD( 'days', -7, current_date ) and Date < current_date")

In [None]:
assert df_transactions_filter2.get_backend()=="Pandas"

In [None]:
# Verify the result is same as above
print(f"Date range: {df_transactions_filter2['DATE'].min()} to {df_transactions_filter2['DATE'].max()}. Resulting dataset size: {len(df_transactions_filter2)}")

Once you are in pandas, you can still continue to perform the same operations: 

In [None]:
%time
df_transactions_filter2.groupby("DATE").sum()["REVENUE"]

In [None]:
assert df_transactions_filter2.get_backend() == 'Pandas'

### 💡 Automatic switching means that pandas works well for both small and large data

## Example 3: Combining small and large datasets in the same workflow

Sometimes you are working with multiple dataframes of different sizes and you need to join them together. What happens in this scenario?
When two dataframes are joined and they come from different engines, we automatically determine the optimal way to move the data so that the cost of data movement is minimized.

Continuing with our `df_transactions` and `df_us_holidays` dataset.

In [None]:
print("Quick recap:")
print(f"- df_transactions is {len(df_transactions)} rows and the data is located in {df_transactions.get_backend()}.")
print(f"- df_us_holidays is {len(df_us_holidays)} rows and the data is located in {df_us_holidays.get_backend()}.")

In [None]:
df_transactions["DATE"] = pd.to_datetime(df_transactions["DATE"])

Since `df_us_holidays` is much smaller than `df_transactions`, we moved `df_us_holidays` to Snowflake where `df_transactions` is, to perform the operation.

In [None]:
combined = pd.merge(df_us_holidays, df_transactions, left_on="Date", right_on="DATE")

In [None]:
assert combined.get_backend() == "Snowflake"

### 💡 When we combine multiple dataframes running in different locations, pandas on Snowflake automatically determines where to move the data.

## Example 4: Performing custom `apply` on small dataset

`apply` is known to be slow in Snowpark pandas since it is implemented as a UDF/UDTF, which often comes with a fixed startup time.
Here, we show an example of how performing `apply` on a small dataset is faster with local pandas. 

In this example, we want to forecast using last year's transaction data via a custom apply function. 

In [None]:
def forecast_revenue(df, start_date, end_date):
    """Build a naive daily revenue forecast for ``start_date``..``end_date``.

    Every day in the forecast window is predicted as the mean of the daily
    average revenues observed in the 365 days before ``start_date``.

    Args:
        df: Transactions frame with a datetime-like "DATE" column and a
            numeric "REVENUE" column.
        start_date: First day of the forecast window (inclusive).
        end_date: Last day of the forecast window (inclusive).

    Returns:
        A frame with one row per day in the window and the columns "DATE",
        "REVENUE" (the left-merged daily average; missing where no history
        row shares the date) and "PREDICTED_REVENUE" (float).
    """
    # History window: the 365 days immediately before start_date.
    history_start = start_date - pd.Timedelta(days=365)
    df_filtered = df[(df["DATE"] >= history_start) & (df["DATE"] < start_date)]

    # One row per calendar day that needs a prediction.
    future_dates = pd.date_range(start=start_date, end=end_date, freq="D")
    df_future = pd.DataFrame({"DATE": future_dates})

    # Group by DATE and calculate the mean revenue for each historical day.
    daily_avg = df_filtered.groupby("DATE")["REVENUE"].mean().reset_index()
    daily_avg["DATE"] = daily_avg["DATE"].astype('datetime64[ns]')

    # Left-merge so every future date is kept even without a matching
    # historical row.
    df_forecast = df_future.merge(daily_avg, on="DATE", how="left")

    # Fill missing revenue with the overall mean from last year. The filled
    # column is assigned back directly: calling fillna(inplace=True) on
    # df_forecast[...] operates on a temporary under pandas Copy-on-Write and
    # would leave the column as NaN.
    overall_mean = daily_avg["REVENUE"].mean()
    df_forecast["PREDICTED_REVENUE"] = (
        df_forecast["REVENUE"].fillna(overall_mean).astype("float")
    )
    return df_forecast

First, let's use the `forecast_revenue` function to get the forecast in the date range, based on last year's revenue numbers.

In [None]:
start_date = pd.Timestamp("2025-10-01")
end_date = pd.Timestamp("2025-10-31")
df_forecast = forecast_revenue(df_transactions, start_date, end_date)

The resulting dataframe is very small, since it is only the 1-month window we're performing forecast on, so the backend is running on pandas locally.

In [None]:
# TODO: Waiting on GroupBy switcheroo
# The explicit move below is intentionally left disabled.
# NOTE(review): presumably the frame should switch to Pandas automatically once
# the TODO above is resolved, at which point this assert would need updating -- confirm.
#df_forecast = df_forecast.move_to('Pandas')
# For now the forecast frame is expected to still report the Snowflake backend.
assert df_forecast.get_backend() == 'Snowflake'

In [None]:
def adjust_for_holiday_weekend(row):
    """Return the row's predicted revenue adjusted for holidays and weekends.

    Reads the "DATE" and "PREDICTED_REVENUE" entries of ``row`` and the
    "Date" column of the module-level ``df_us_holidays`` frame.

    * Date is one of the listed holidays: revenue is scaled by 0.95
      (stores are closed).
    * Otherwise, date is a Saturday or Sunday: revenue is scaled by 1.05
      (increased activity).
    * Any other day: revenue is returned unchanged.
    """
    forecast_day = row["DATE"]
    baseline = row["PREDICTED_REVENUE"]

    # Holidays take precedence over the weekend rule.
    holiday_keys = list(df_us_holidays["Date"].dt.strftime('%Y-%m-%d'))
    if forecast_day.strftime('%Y-%m-%d') in holiday_keys:
        return baseline * 0.95

    # weekday(): 5 is Saturday, 6 is Sunday.
    if forecast_day.weekday() in (5, 6):
        return baseline * 1.05

    return baseline

Now if we run `apply` on this dataframe, it will run with local pandas.

In [None]:
# Adjust for holidays using the apply function
df_forecast["PREDICTED_REVENUE"] = df_forecast.apply(adjust_for_holiday_weekend, axis=1)

In [None]:
assert df_forecast.get_backend() == 'Pandas'

In [None]:
#TODO: Waiting on environment settings
from modin.config.envvars import NativePandasMaxRows
# Set the NativePandasMaxRows config value to 10 for the cells that follow.
# NOTE(review): presumably this is the row-count threshold below which data is
# considered small enough for local pandas -- confirm against the modin config docs.
NativePandasMaxRows.put(10)

In [None]:
df_small = pd.DataFrame({'a': [1]*20, 'b': [2]*20}).move_to('Snowflake')

In [None]:
assert df_small.get_backend() == 'Snowflake'

In [None]:
df_small = df_small.apply(lambda x : x + 1)

In [None]:
assert df_small.get_backend() == 'Snowflake'

In [None]:
df_small = df_small.head(5)
df_small = df_small.apply(lambda x : x + 1)

In [None]:
assert df_small.get_backend() == 'Pandas'

In [None]:
NativePandasMaxRows.put(10_000_000)

In [None]:
pd.explain(last=10)

# Bug fix from bug bash, May 2, 2025: constructing a dataframe or series out of a Snowflake dataframe or series should not cause data to move ("Jonathan Shi passing a Series/DF to the DF constructor triggers surprising moves (in)")

In [None]:
# Start from a small frame that is explicitly moved to the Snowflake backend.
input_df = pd.DataFrame(list(range(10))).move_to('snowflake')
assert input_df.get_backend() == 'Snowflake'
# Passing the frame itself, a dict holding its column 0, or column 0 alone to
# the DataFrame constructor must leave the result on the Snowflake backend.
assert pd.DataFrame(input_df).get_backend() == 'Snowflake'
assert pd.DataFrame({'col0': input_df[0]}).get_backend() == 'Snowflake'
assert pd.DataFrame(input_df[0]).get_backend() == 'Snowflake'

# Same three constructor forms for a frame that reports the Pandas backend:
# the result must stay on Pandas.
pandas_df  = pd.DataFrame(list(range(10)))
assert pandas_df.get_backend() == 'Pandas'
assert pd.DataFrame(pandas_df).get_backend() == 'Pandas'
assert pd.DataFrame({'col0': pandas_df[0]}).get_backend() == 'Pandas'
assert pd.DataFrame(pandas_df[0]).get_backend() == 'Pandas'

