In [0]:
%pip install faker

In [0]:
from faker import Faker
from pyspark.sql import Row
import random

# Catalogue of logistics service names used as the product dimension of the
# synthetic sales data. Order is significant: rows are generated in this order.
logistics_products = [
    "Freight Forwarding", "Warehousing", "Order Fulfillment", "Last Mile Delivery",
    "Inventory Management", "Customs Brokerage", "Reverse Logistics", "Cross-Docking",
    "Distribution Center", "Cold Chain Logistics", "Transportation Management", "Parcel Shipping",
    "Palletizing", "Load Planning", "EDI Integration", "Returns Processing",
    "Drayage", "Transloading", "Supply Chain Consulting", "Value-Added Services",
]

In [0]:
# Build a synthetic sales table: one sale per product per month for 2024 and 2025.
fake = Faker()

# Seed BOTH random sources so a Restart & Run All reproduces the same data:
# Faker drives amounts, names and addresses; the stdlib `random` module drives
# the day-of-month picked for each sale date.
Faker.seed(42)
random.seed(42)

sample_sales = []
for year in [2024, 2025]:
    for product in logistics_products:
        for month in range(1, 13):
            # Day is capped at 28 so the date is valid in every month (incl. February).
            sale_date = f"{year}-{month:02d}-{random.randint(1,28):02d}"
            sample_sales.append(
                Row(
                    # Sequential 1-based id: position of the row in the list.
                    sale_id=len(sample_sales)+1,
                    product_name=product,
                    # Amount between 20 and 2000 with two decimal places.
                    sale_amount=round(
                        fake.pyfloat(
                            left_digits=4,
                            right_digits=2,
                            positive=True,
                            min_value=20,
                            max_value=2000
                        ),
                        2
                    ),
                    # Kept as an ISO "YYYY-MM-DD" string; later cells parse it.
                    sales_date=sale_date,
                    customer_name=fake.name(),
                    # Faker addresses are multi-line; flatten to a single line.
                    customer_address=fake.address().replace('\n', ', ')
                )
            )

df_sales_sample = spark.createDataFrame(sample_sales)
display(df_sales_sample)

In [0]:
from pyspark.sql.functions import lit, sha2, concat_ws, monotonically_increasing_id

# Masking: overwrite both customer PII columns with the fixed placeholder
# string "MASKED". All other columns of df_sales_sample are left as they are.
mask_placeholder = lit("MASKED")
masked_df_sales = (
    df_sales_sample
    .withColumn("customer_name", mask_placeholder)
    .withColumn("customer_address", mask_placeholder)
)
display(masked_df_sales)

In [0]:
# Obfuscation: swap each PII value for the SHA-256 hex digest of
# "<value>_<row id>". Because the generated row id is part of the hashed text,
# the same name or address on two different rows yields two different digests.
def _salted_sha256(source_column):
    """Return a Column with sha2 (256-bit) of source_column joined by "_" to a generated row id."""
    return sha2(concat_ws("_", source_column, monotonically_increasing_id()), 256)


masked_df_sales_obfuscation = (
    df_sales_sample
    .withColumn("customer_name", _salted_sha256(df_sales_sample["customer_name"]))
    .withColumn("customer_address", _salted_sha256(df_sales_sample["customer_address"]))
)
display(masked_df_sales_obfuscation)

In [0]:
# Redaction: keep the two customer PII columns in the schema but replace
# their contents with an empty string on every row.
blank_value = lit("")
masked_df_sales_redaction = (
    df_sales_sample
    .withColumn("customer_name", blank_value)
    .withColumn("customer_address", blank_value)
)
display(masked_df_sales_redaction)

The `AI_FORECAST` function predicts future values of a time series from historical data.

- `horizon` specifies the end date for the forecast.
- `time_col` is the column containing the date information.
- `value_col` is the column containing the numeric values to forecast.
- `prediction_interval_width` sets the width of the prediction interval around each forecast.
- `parameters` lets you customize the forecasting model (e.g., seasonality mode).

You can adjust the horizon and the other parameters as needed for your use case.

In [0]:
# Install Prophet if not already installed
%pip install prophet

from prophet import Prophet

# Select columns from Spark DataFrame and convert to Pandas, including both 2024 and 2025 data
sales_pd = (
  df_sales_sample
  .select(
    'sales_date',
    'sale_amount',
    'product_name'
  )
  .toPandas()
)

# Prepare data for Prophet
sales_pd.rename(
  columns = {
    'sales_date': 'ds',
    'sale_amount': 'y'
  },
  inplace = True
)

# Fit Prophet model to sales data for time series forecasting (per product), now including 2024 data
forecasts = []
for product in sales_pd['product_name'].unique():
    product_sales = sales_pd[sales_pd['product_name'] == product][['ds', 'y']]
    model = Prophet(seasonality_mode='multiplicative')
    model.fit(product_sales)
    future = model.make_future_dataframe(periods=365, freq='D')
    forecast = model.predict(future)
    forecast['product_name'] = product
    forecasts.append(forecast[forecast['ds'].dt.year == 2026])

import pandas as pd
forecast_2026 = pd.concat(forecasts, ignore_index=True)

# Move product_name to the beginning
cols = ['product_name'] + [col for col in forecast_2025.columns if col != 'product_name']
forecast_2026 = forecast_2026[cols]

# Display forecast results for 2025 with product name at the beginning
display(forecast_2026)

In [0]:
import pandas as pd

# Actual sales pulled from Spark into pandas.
# Despite its name, df_sales_2024_pd holds every year present in
# df_sales_sample; the 2024 filter is applied when building actual_2024 below.
df_sales_2024_pd = (
    df_sales_sample
    .select('sales_date', 'product_name', 'sale_amount')
    .toPandas()
)
# sales_date arrives as a "YYYY-MM-DD" string; parse it so .dt accessors work.
df_sales_2024_pd['sales_date'] = pd.to_datetime(df_sales_2024_pd['sales_date'])
df_sales_2024_pd['year'] = df_sales_2024_pd['sales_date'].dt.year
df_sales_2024_pd['month'] = df_sales_2024_pd['sales_date'].dt.month

# Monthly actual totals per product for 2024.
# NOTE(review): 2025 actuals are also available in df_sales_sample and are
# closer to the 2026 forecast year — confirm 2024 is the intended baseline.
actual_2024 = (
    df_sales_2024_pd[df_sales_2024_pd['year'] == 2024]
    .groupby(['product_name', 'month'])['sale_amount']
    .sum()
    .reset_index()
    .rename(columns={'sale_amount': 'actual_sales_2024'})
)

# Monthly forecast totals per product, built from the 2026 Prophet forecast.
# The previous code read `forecast_2025`, which no cell defines (NameError on a
# fresh kernel). `assign` adds the month on a copy so forecast_2026 itself is
# not mutated by this cell.
# NOTE(review): this sums one yhat per forecast day, whereas the actuals hold a
# single sale per product per month — confirm sum (vs. mean) is the intended
# aggregation before comparing the two columns.
forecast_monthly = (
    forecast_2026
    .assign(month=forecast_2026['ds'].dt.month)
    .groupby(['product_name', 'month'])['yhat']
    .sum()
    .reset_index()
    .rename(columns={'yhat': 'forecast_sales_2026'})
)

# Merge actual and forecast data; months missing on either side become 0.
comp_df = pd.merge(
    actual_2024,
    forecast_monthly,
    on=['product_name', 'month'],
    how='outer'
).fillna(0)

# Pivot for product/month comparison: one row per product, and for each month
# an actual column next to a forecast column.
pivot_df = comp_df.pivot_table(
    index='product_name',
    columns='month',
    values=['actual_sales_2024', 'forecast_sales_2026'],
    aggfunc='sum'
)

# Order the column MultiIndex by month (level 1) so each month's actual and
# forecast sit side by side, then move product_name back to a regular column.
pivot_df = pivot_df.sort_index(axis=1, level=1)
pivot_df.reset_index(inplace=True)

display(pivot_df)

In [0]:
from pyspark.sql.functions import expr

# Add a `forecast_ai_insight` column by calling the SQL function ai_gen once per
# row with a prompt assembled from that row's product, actual and forecast values.
# NOTE(review): df_sales_forecast (with columns product_name, sale_amount,
# forecast_sale_amount, forecast_lower, forecast_upper) is not created in any
# cell visible here — confirm it is defined before this cell runs.
# The first prompt literal now ends with ". " so the instruction sentence no
# longer runs straight into "Product: " (previously "...conciseProduct: ...").
df_sales_forecast_insight = df_sales_forecast.withColumn(
    "forecast_ai_insight",
    expr(
        '''
        ai_gen(
            concat(
                "Generate insights for the following product and sales data. Keep it short, detailed, and concise. ",
                "Product: ", product_name,
                ". Actual sales amount: ", sale_amount,
                ". Forecasted sales: ", forecast_sale_amount,
                ". Forecast lower bound: ", forecast_lower,
                ". Forecast upper bound: ", forecast_upper,
                ". Provide a brief analysis of the sales and forecast."
            )
        )
        '''
    )
)
display(df_sales_forecast_insight)