# Prerequisites

Complete the following steps:

1. Install the packages listed in `requirements.txt`
2. Load necessary libraries and API URLs
3. Replace `ACCESS_TOKEN` with a valid token (for help creating an access token, see [API Quickstart](https://docs.predicthq.com/getting-started/api-quickstart))

In [1]:
# install requirements
# %pip install --user -r requirements.txt

In [2]:
# load libraries and API URLs
import pandas as pd
import numpy as np

import requests
from io import StringIO
from functools import reduce

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_percentage_error
import xgboost as xgb
import plotly.graph_objects as go

# PredictHQ API endpoints.
# NOTE(review): SUGGESTED_RADIUS_API_URL is not referenced by any cell visible
# in this notebook — confirm whether it is still needed.
SUGGESTED_RADIUS_API_URL = "https://api.predicthq.com/v1/suggested-radius"
# Base URL for Beam; the helper functions append "/analyses/..." paths to it.
BEAM_API_URL = "https://api.predicthq.com/v1/beam"
# Features API endpoint; requests are POSTed to this URL directly.
FEATURES_API_URL = "https://api.predicthq.com/v1/features"

In [3]:
# Placeholder: replace with a valid PredictHQ API access token before running.
# Avoid saving or sharing the notebook with a real token in this cell.
# The helper functions below use ACCESS_TOKEN as a default argument, which is
# bound when each function is defined — re-run their cells after changing it.
ACCESS_TOKEN = "REPLACE_WITH_ACCESS_TOKEN"

# Gather data

The following are required when creating a new Analysis in Beam:

1. Demand data: one CSV file with columns for `date` and `demand` (see the [upload demand data request body](https://docs.predicthq.com/api/beam/upload-demand-data#request-body) for more details)

2. `lat`/`lon`: the coordinates of the location 

3. `industry`: the industry relevant to the location (see [here](https://docs.predicthq.com/api/beam/create-an-analysis#request-body) for more details)

4. `analysis_name`: a user-created free-form string to reference the Analysis

In [4]:
# define demand details
# path to the demand CSV that is uploaded to Beam and later used for modelling
DEMAND_FILE_PATH = "data/restaurant_demand.csv"
# coordinates are kept as strings and passed unchanged into the Beam request
LAT = "40.74559"
LON = "-73.99452"
INDUSTRY = "restaurants"
# free-form label used to reference the Analysis
ANALYSIS_NAME = "restaurant_daily_demand_example"

# inspect demand file
pd.read_csv(DEMAND_FILE_PATH).head()

Unnamed: 0,date,demand
0,2017-01-09,8291.576677
1,2017-01-10,8629.401781
2,2017-01-11,7217.360816
3,2017-01-12,8404.744284
4,2017-01-13,8450.852853


# Step 1. Identify relevant event features with Beam

In [5]:
def create_beam_analysis_id(
    lat,
    lon,
    industry,
    analysis_name,
    access_token=ACCESS_TOKEN,
    beam_api_url=BEAM_API_URL,
):
    """
    Create a Beam Analysis for a location and return its ID.

    Parameters
    ----------
    lat, lon
        Coordinates of the location, sent as the analysis geopoint.
    industry
        Industry value sent as ``demand_type.industry``.
    analysis_name
        Free-form name used to reference the Analysis.
    access_token
        API token sent as a Bearer token.
    beam_api_url
        Base URL of the Beam API.

    Returns
    -------
    The ``analysis_id`` field of the JSON response.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status (e.g. an invalid token).
    """
    url = f"{beam_api_url}/analyses"
    headers = {"Authorization": "Bearer " + access_token, "Accept": "application/json"}
    # named `payload` rather than `json` to avoid shadowing the stdlib module name
    payload = {
        "name": analysis_name,
        "location": {
            "geopoint": {
                "lat": lat,
                "lon": lon,
            },
        },
        "demand_type": {
            "industry": industry,
        },
    }

    response = requests.post(url=url, headers=headers, json=payload)
    # Surface auth/validation failures as an HTTP error instead of a bare
    # KeyError on the missing "analysis_id" key below.
    response.raise_for_status()

    analysis_id = response.json()["analysis_id"]
    print(f"{analysis_id} created")

    return analysis_id


def upload_demand_to_beam(
    demand_file_path, analysis_id, access_token=ACCESS_TOKEN, beam_api_url=BEAM_API_URL
):
    """
    Upload demand data for a Beam Analysis.

    Parameters
    ----------
    demand_file_path
        Path to the demand CSV file to upload.
    analysis_id
        ID of the Beam Analysis that receives the data.
    access_token
        API token sent as a Bearer token.
    beam_api_url
        Base URL of the Beam API.

    Prints a progress message when the API answers 202; otherwise prints the
    raw response body. Returns nothing.
    """
    url = f"{beam_api_url}/analyses/{analysis_id}/sink"
    headers = {"Authorization": "Bearer " + access_token, "Content-Type": "text/csv"}

    # Open in binary mode so the bytes on disk are sent unchanged, and use a
    # context manager so the file handle is closed once the request finishes.
    with open(demand_file_path, "rb") as demand_file:
        response = requests.post(url=url, headers=headers, data=demand_file)

    if response.status_code == 202:
        print(f"{analysis_id} demand uploading...")
    else:
        print(response.content)

def get_analysis_status(
    analysis_id, access_token=ACCESS_TOKEN, beam_api_url=BEAM_API_URL
):
    """
    Check status of a Beam Analysis.

    Fetches the Analysis and prints its ``readiness_status`` field.
    Returns nothing.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status (e.g. an invalid token or
        an unknown ``analysis_id``).
    """
    url = f"{beam_api_url}/analyses/{analysis_id}"
    headers = {"Authorization": "Bearer " + access_token, "Accept": "application/json"}

    response = requests.get(url=url, headers=headers)
    # An error body has no "readiness_status" key; raise the HTTP error rather
    # than an uninformative KeyError.
    response.raise_for_status()

    print(f"{analysis_id} status: {response.json()['readiness_status']}")

Re-run the following cell until the printed status is `ready`.

In [6]:
# create a Beam Analysis ID and upload demand data
# The globals() guard makes this cell safe to re-run: the Analysis is created
# and the demand uploaded only while `analysis_id` is undefined in the kernel.
# Delete `analysis_id` (or restart the kernel) to create a new Analysis.
if "analysis_id" not in globals():
    analysis_id = create_beam_analysis_id(
        lat=LAT,
        lon=LON,
        industry=INDUSTRY,
        analysis_name=ANALYSIS_NAME,
    )
    upload_demand_to_beam(demand_file_path=DEMAND_FILE_PATH, analysis_id=analysis_id)

# runs on every execution of the cell and prints the current readiness status
get_analysis_status(analysis_id=analysis_id)

YDwlR4fSkG4 status: ready


# Step 2. Retrieve event features from Features API

In [7]:
def post_process_rank_features(df):
    """
    Collapse per-level rank columns into one ``<feature>_stats_max`` column.

    Columns whose name contains both ``phq_rank`` and ``_rank_levels`` are
    grouped by the text before the last ``_rank_levels``. For each group the
    new column holds, per row, the 1-based position of the last level column
    with a value greater than 0, multiplied by 20 (0 when no level is
    positive). The level columns are then dropped.

    NOTE(review): this relies on the level columns appearing in ascending
    level order in ``df`` — confirm against the Features API CSV layout.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain rank level columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the level columns replaced by ``*_stats_max``
        columns appended at the end; ``df`` itself when there is nothing to
        process.
    """
    # Group level columns by exact feature name, in column order. Exact grouping
    # avoids pulling in columns of another feature that merely shares a prefix,
    # and a dict (unlike a set) gives the same output column order on every run.
    level_columns = {}
    for col in df.columns:
        if "phq_rank" in col and "_rank_levels" in col:
            feature = col.rsplit("_rank_levels", 1)[0]
            level_columns.setdefault(feature, []).append(col)

    if not level_columns:
        return df

    stats_max = {}
    for feature, related_columns in level_columns.items():
        is_positive = df[related_columns].to_numpy() > 0
        positions = np.arange(1, len(related_columns) + 1, dtype="int64")
        # highest 1-based position with a positive value per row, 0 if none
        highest_position = (is_positive * positions).max(axis=1)
        stats_max[f"{feature}_stats_max"] = highest_position * 20

    processed_columns = [col for cols in level_columns.values() for col in cols]
    # drop/assign both return new frames, so the caller's frame stays untouched
    return df.drop(columns=processed_columns).assign(**stats_max)


def get_features(
    analysis_id,
    start,
    end,
    group_id=None,
    access_token=ACCESS_TOKEN,
    features_api_url=FEATURES_API_URL,
):
    """
    Fetch features from Features API for a Beam Analysis.

    Pages through the CSV response 100 rows at a time, concatenates the pages
    and post-processes the rank features.

    Parameters
    ----------
    analysis_id
        Beam Analysis ID sent in the ``beam`` section of the request.
    start, end
        Bounds of the ``active`` date range (``gte`` / ``lte``).
    group_id
        Optional Beam group ID; only sent when not ``None``.
    access_token
        API token sent as a Bearer token.
    features_api_url
        Features API endpoint.

    Returns
    -------
    pandas.DataFrame
        One row per returned CSV record, with ``date`` parsed as datetime and
        a fresh 0..n-1 index.

    Raises
    ------
    requests.HTTPError
        If any page request answers with a 4xx/5xx status.
    ValueError
        If the API returns no rows at all for the request.
    """
    headers = {"Authorization": "Bearer " + access_token, "Accept": "text/csv"}
    beam_info = {"analysis_id": analysis_id}
    if group_id is not None:
        beam_info["group_id"] = group_id
    # named `payload` rather than `json` to avoid shadowing the stdlib module name
    payload = {"beam": beam_info, "active": {"gte": start, "lte": end}}

    limit = 100
    offset = 0
    pages = []

    while True:
        params = {"limit": limit, "offset": offset}

        response = requests.post(
            url=features_api_url, headers=headers, json=payload, params=params
        )
        # An error response has a non-empty body, so without this check the
        # loop below would never see an empty page and would run forever.
        response.raise_for_status()
        if not response.text.strip():
            break

        page = pd.read_csv(StringIO(response.text), parse_dates=["date"])
        # Defensive: also stop on a header-only page with no data rows.
        if page.empty:
            break
        pages.append(page)
        offset += limit

    if not pages:
        raise ValueError(
            f"Features API returned no rows for analysis {analysis_id} "
            f"between {start} and {end}"
        )

    # ignore_index avoids repeated 0..99 index labels across pages
    df = pd.concat(pages, ignore_index=True)
    df = post_process_rank_features(df)

    return df

Set the desired date range to get relevant event features from Features API via the Beam `analysis_id`. Alternatively, add the `group_id` if the Analysis is part of a group and the relevant event features at the group level are of interest.

In [8]:
# date range sent to Features API as the `active` gte/lte bounds
START = "2017-01-09"
END = "2019-06-30"

# fetch features for the analysis
# group_id=None requests features for this Analysis only (no group is sent)
event_features_df = get_features(
    analysis_id=analysis_id, start=START, end=END, group_id=None
)
event_features_df.head()

Unnamed: 0,date,phq_attendance_community_hospitality_stats_sum,phq_attendance_concerts_hospitality_stats_sum,phq_attendance_conferences_hospitality_stats_sum,phq_attendance_expos_hospitality_stats_sum,phq_attendance_festivals_hospitality_stats_sum,phq_attendance_performing_arts_hospitality_stats_sum,phq_attendance_sports_hospitality_stats_sum,phq_impact_observances_hospitality_stats_max,phq_impact_public_holidays_hospitality_stats_max,...,phq_impact_severe_weather_dust_retail_stats_max,phq_impact_severe_weather_dust_storm_retail_stats_max,phq_impact_severe_weather_flood_retail_stats_max,phq_impact_severe_weather_heat_wave_retail_stats_max,phq_impact_severe_weather_hurricane_retail_stats_max,phq_impact_severe_weather_thunderstorm_retail_stats_max,phq_impact_severe_weather_tornado_retail_stats_max,phq_impact_severe_weather_tropical_storm_retail_stats_max,phq_rank_academic_exam_stats_max,phq_rank_academic_holiday_stats_max
0,2017-01-09,1936,3719,0,0,0,13383,19812,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2017-01-10,2405,6135,145,0,0,19597,6935,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2017-01-11,1558,22420,100,142,0,22086,6935,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2017-01-12,2654,6860,100,567,0,20406,26115,9,0,...,0,0,0,0,0,0,0,0,0,0
4,2017-01-13,5507,8748,100,2833,0,24046,31000,30,7,...,0,0,0,0,0,0,0,0,0,0


# Step 3. Use event features in demand forecasting

Merge existing and event features together with demand.

In [9]:
# load demand and the pre-existing (non-event) features, parsing `date` as datetime
demand_df = pd.read_csv(DEMAND_FILE_PATH, parse_dates=["date"])
existing_features_df = pd.read_csv(
    "data/restaurant_existing_features.csv", parse_dates=["date"]
)

# merge demand and features dataframes
# successive inner joins on `date`: only dates present in all three frames remain
dataframes = [demand_df, existing_features_df, event_features_df]
features_and_demand_df = reduce(
    lambda left, right: pd.merge(left, right, on="date", how="inner"), dataframes
)

In [10]:
def split_data(df, test_days=7):
    """
    Split data into train and test sets.

    Rows are ordered by ``date``; rows dated within the last ``test_days``
    days before the latest date form the test set, everything earlier the
    train set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column (datetime or parseable by
        ``pd.to_datetime``). It is not modified.
    test_days : int, default 7
        Length of the hold-out window in days.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Both sorted by ``date`` with ``date`` as datetime.
    """
    # assign() builds a new frame, so the caller's `date` column is left as is
    ordered = df.assign(date=pd.to_datetime(df["date"])).sort_values("date")

    cutoff_date = ordered["date"].max() - pd.Timedelta(days=test_days)
    train = ordered[ordered["date"] <= cutoff_date]
    test = ordered[ordered["date"] > cutoff_date]
    return train, test


def train_model(x_train, y_train):
    """
    Fit an XGBoost regressor and return the best estimator found.

    Hyperparameters (``n_estimators``, ``learning_rate``, ``max_depth``) are
    chosen by an exhaustive grid search, cross-validated with a 5-split
    ``TimeSeriesSplit`` and scored by negative mean squared error.
    """
    search = GridSearchCV(
        estimator=xgb.XGBRegressor(),
        param_grid={
            "n_estimators": [50, 100],
            "learning_rate": [0.01, 0.1, 0.3],
            "max_depth": [5, 6, 7],
        },
        cv=TimeSeriesSplit(n_splits=5),
        scoring="neg_mean_squared_error",
    )
    search.fit(x_train, y_train)

    return search.best_estimator_


def plot_results(train, test, y_train, y_test, y_pred, feature_set, accuracy):
    """
    Plot actual vs. forecasted demand.

    Only rows dated within three months of the latest test date are drawn.
    Actual demand (train followed by test) and the forecast over the test
    dates are shown as two traces, with a dotted vertical line at the first
    test date. The title reports ``feature_set`` (underscores shown as spaces)
    and ``accuracy`` formatted as a MAPE percentage.
    """
    # restrict everything that is plotted to the trailing three-month window
    window_start = test["date"].max() - pd.DateOffset(months=3)
    recent_train = train[train["date"] >= window_start]
    recent_test = test[test["date"] >= window_start]
    recent_y_train = y_train[recent_train.index]
    recent_y_test = y_test[recent_test.index]

    features_label = feature_set.replace("_", " ")
    mape_percent = accuracy * 100

    fig = go.Figure()

    # actual demand: train period followed by test period
    fig.add_trace(
        go.Scatter(
            x=pd.concat([recent_train["date"], recent_test["date"]]),
            y=pd.concat([pd.Series(recent_y_train), pd.Series(recent_y_test)]),
            mode="lines+markers",
            name="Actual Demand",
            line={"color": "lightseagreen"},
        )
    )

    # forecast over the test period
    fig.add_trace(
        go.Scatter(
            x=recent_test["date"],
            y=y_pred,
            mode="lines+markers",
            name="Forecasted Demand",
            line={"color": "lightcoral"},
        )
    )

    # marks where the test period starts
    fig.add_vline(
        x=recent_test["date"].iloc[0],
        line_width=2,
        line_dash="dot",
        line_color="lightgray",
    )

    fig.update_layout(
        title=(
            "<b>Actual vs. Forecasted Demand</b><br>"
            f"<sub>Features: {features_label} | "
            f"MAPE: {mape_percent:.2f}%</sub>"
        ),
        xaxis_title="Date",
        yaxis_title="Demand",
        legend_title="Type",
    )

    fig.show()


def calculate_forecast_uplift(base_metrics, new_metrics):
    """
    Calculate uplift in forecast metrics.

    For every metric in ``base_metrics`` the uplift is the percentage by which
    the new value is lower than the base value:
    ``(base - new) / base * 100``. Results are keyed ``"<metric>_uplift"``.
    """
    return {
        f"{metric}_uplift": (base_metrics[metric] - new_metrics[metric])
        / base_metrics[metric]
        * 100
        for metric in base_metrics
    }

In [11]:
# train and evaluate models with different feature sets
# and compare forecast accuracy
results = {}

# one split shared by both runs so the two models are scored on the same test rows
train, test = split_data(df=features_and_demand_df)

for include_event_features in [True, False]:
    if include_event_features:
        # every column except the date and the target
        feature_columns = train.columns.difference(["date", "demand"])
        feature_set = "existing_and_event_features"
    else:
        # baseline: same as above minus the event features, which are
        # identified by their "phq_" column-name prefix
        feature_columns = [
            col
            for col in train.columns
            if col not in ["date", "demand"] and not col.startswith("phq_")
        ]
        feature_set = "existing_features_only"

    X_train = train[feature_columns]
    y_train = train["demand"]
    X_test = test[feature_columns]
    y_test = test["demand"]

    model = train_model(x_train=X_train, y_train=y_train)
    y_pred = model.predict(X_test)

    # MAPE is returned as a fraction; it is scaled by 100 only when displayed
    mape = mean_absolute_percentage_error(y_test, y_pred)

    plot_results(
        train=train,
        test=test,
        y_train=y_train,
        y_test=y_test,
        y_pred=y_pred,
        feature_set=feature_set,
        accuracy=mape,
    )

    # keyed by feature set name so the two runs can be compared below
    results[feature_set] = {"mape": mape}


print("Forecast Error (MAPE):")
for features_set in results:
    print(
        f"--- Features: {features_set.replace('_',' ')}: {results[features_set]['mape']*100:.2f}%"
    )
# uplift = relative MAPE reduction of the event-feature model versus the
# existing-features-only baseline, in percent
forecast_uplift = calculate_forecast_uplift(
    base_metrics=results["existing_features_only"],
    new_metrics=results["existing_and_event_features"],
)
print(
    f"Forecast Accuracy Uplift (MAPE reduction): {forecast_uplift['mape_uplift']:.2f}%"
)

Forecast Error (MAPE):
--- Features: existing and event features: 5.62%
--- Features: existing features only: 18.43%
Forecast Accuracy Uplift (MAPE reduction): 69.49%
