# Forecast with Snowflake Cortex

> Generic method template to forecast with Snowflake Cortex

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#| default_exp forecast

In [3]:
#| hide
from nbdev.showdoc import *

In [4]:
#| export
import yaml
import random
import string
import logging
import numpy as np
import pandas as pd
import altair as alt
import logging
import numpy as np
import time
import snowflake.snowpark._internal.utils as snowpark_utils

from typing import Union, Dict
from datetime import datetime
from cortex_forecast.connection import SnowparkConnection
from snowflake.snowpark.exceptions import SnowparkSQLException

logging.getLogger('snowflake.snowpark').setLevel(logging.WARNING)

In [5]:
#| export

class SnowflakeMLForecast(SnowparkConnection):
    def __init__(self, config: Union[str, Dict], connection_config=None, is_streamlit=False):
        """Set up the connection (via the parent class) and load the forecast config.

        Args:
            config: Path to a YAML configuration file, or an already-loaded
                configuration dictionary. It must contain an ``input_data``
                section and a ``model`` section with a ``name`` entry.
            connection_config: Forwarded unchanged to the parent class
                initialiser.
            is_streamlit: When True, ``display`` renders through Streamlit;
                otherwise it prints or uses IPython's display.
        """
        super().__init__(connection_config=connection_config)
        self.config = self._load_config(config)
        self.model_name = self._generate_unique_model_name()
        self.is_streamlit = is_streamlit

        # The config's own database/schema win; the connection settings are the fallback.
        # NOTE(review): assumes the parent initialiser leaves a dict-like
        # `self.connection_config` behind - confirm against SnowparkConnection.
        self.database = self.config['input_data'].get('database', self.connection_config.get('database'))
        self.schema = self.config['input_data'].get('schema', self.connection_config.get('schema'))

        # State filled in later by the SQL generators. Everything is initialised
        # here so that reading an attribute before generation yields None instead
        # of raising AttributeError.
        self.training_data_query = None
        self.temp_table_name = None
        self.create_model_query_text = None
        self.create_model_prediction_query_text = None

    def _load_config(self, config: Union[str, Dict]) -> Dict:
        if isinstance(config, str):
            # If config is a string, assume it's a file path
            try:
                with open(config, 'r') as file:
                    return yaml.safe_load(file)
            except Exception as e:
                raise ValueError(f"Error loading config file: {str(e)}")
        elif isinstance(config, dict):
            # If config is already a dictionary, use it directly
            return config
        else:
            raise TypeError("Config must be either a file path (string) or a dictionary.")

    def _generate_unique_model_name(self):
        suffix = ''.join(random.choices(string.ascii_lowercase, k=5))
        timestamp = datetime.now().strftime("%Y%m%d")
        return f"{self.config['model']['name']}_{timestamp}_{suffix}"
    
    def get_fully_qualified_name(self, object_name):
        return f"{self.database}.{self.schema}.{object_name}"

    def _generate_input_data_sql(self):
        table = self.get_fully_qualified_name(self.config['input_data']['table'])
        timestamp_col = self.config['input_data']['timestamp_column']
        target_col = self.config['input_data']['target_column']
        series_col = self.config['input_data'].get('series_column')
        exogenous_cols = self.config['input_data'].get('exogenous_columns') or []
        training_days = self.config['forecast_config'].get('training_days')

        # Always include timestamp, target, and series (if present) columns
        base_columns = [f"TO_TIMESTAMP_NTZ({timestamp_col}) AS {timestamp_col}",
                        f"CAST({target_col} AS FLOAT) AS {target_col}"]
        if series_col:
            base_columns.append(f"{series_col} AS {series_col}")

        if exogenous_cols:
            columns = base_columns + exogenous_cols
            select_clause = f"SELECT {', '.join(columns)}"
        else:
            exclude_cols = [timestamp_col, target_col]
            select_clause = f"SELECT {', '.join(base_columns)}, * EXCLUDE ({', '.join(exclude_cols)})"

        # Generate a random name for the temporary table
        self.temp_table_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.TABLE)

        sql = f"""
        CREATE OR REPLACE TABLE {self.temp_table_name} AS
        {select_clause}
        FROM {table}
        """

        if training_days:
            sql += f"""
            WHERE TO_TIMESTAMP_NTZ({timestamp_col}) 
            BETWEEN 
            DATEADD(day, -{training_days}, (SELECT MAX({timestamp_col}) FROM {table})) 
            AND 
            (SELECT MAX({timestamp_col}) FROM {table})
            """

        sql += ";"

        self.training_data_query = sql
        self.display("Generated SQL:", content_type="text")
        self.display(sql, content_type="code", language="sql")
        return sql

    def _generate_create_model_sql(self):
        input_data = f"SYSTEM$REFERENCE('TABLE', '{self.temp_table_name}')"
        timestamp_col = self.config['input_data']['timestamp_column']
        target_col = self.config['input_data']['target_column']
        series_col = self.config['input_data'].get('series_column')
        config_object = self.config['forecast_config'].get('config_object', {})
    
        sql = f"""
        CREATE OR REPLACE SNOWFLAKE.ML.FORECAST {self.get_fully_qualified_name(self.model_name)}(
            INPUT_DATA => {input_data},
            TIMESTAMP_COLNAME => '{timestamp_col}',
            TARGET_COLNAME => '{target_col}',
        """
        
        if series_col:
            sql += f"""SERIES_COLNAME => '{series_col}',\n"""
        
        config_sql = "{"
        for key, value in config_object.items():
            if isinstance(value, dict):
                nested_config = "{"
                nested_config += ", ".join([f"'{k}': {self._format_value(v)}" for k, v in value.items()])
                nested_config += "}"
                config_sql += f"'{key}': {nested_config}, "
            else:
                config_sql += f"'{key}': {self._format_value(value)}, "
        config_sql = config_sql.rstrip(", ") + "}"

        sql += f"CONFIG_OBJECT => {config_sql},"
        
        sql = sql.rstrip(',')  # Clean up trailing commas
        sql += ")"
        tags = self.config['model'].get('tags')
        comment = self.config['model'].get('comment')
        
        if tags:
            tag_str = ", ".join([f"{k} = '{v}'" for k, v in tags.items()])
            sql += f" WITH TAG ({tag_str})"
        
        if comment:
            sql += f" COMMENT = '{comment}'"
        
        sql += ";"

        self.display("Generated SQL:", content_type="text")
        self.display(sql, content_type="code", language="sql")
        self.create_model_query_text = sql
        return sql

    def _format_value(self, value):
        if value is None:
            return "NULL"
        elif isinstance(value, bool):
            return "TRUE" if value else "FALSE"
        elif isinstance(value, (int, float)):
            return str(value)
        elif isinstance(value, str):
            return f"'{value}'"
        return str(value)

    def _generate_forecast_sql(self):
        try:
            forecast_days = self.config['forecast_config'].get('forecast_days')
            output_table = self.get_fully_qualified_name(self.config['output']['table'])
            input_data_table = self.get_fully_qualified_name(self.config['forecast_config'].get('table')) if self.config['forecast_config'].get('table') else None
            config_object = self.config['forecast_config'].get('config_object', {})
            evaluation_config = config_object.get('evaluation_config', {})
            prediction_interval = evaluation_config.get('prediction_interval', 0.95)
            series_col = self.config['input_data'].get('series_column')
            timestamp_col = self.config['input_data']['timestamp_column']

            # Check if the table exists
            check_table_sql = f"""
            SELECT COUNT(*) 
            FROM {self.database}.INFORMATION_SCHEMA.TABLES 
            WHERE TABLE_SCHEMA = '{self.schema}' 
            AND TABLE_NAME = '{self.config['output']['table']}'
            """
            table_exists = self.session.sql(check_table_sql).collect()[0][0] > 0

            if table_exists:
                sql = f"INSERT INTO {output_table} "
            else:
                sql = f"CREATE OR REPLACE TABLE {output_table} AS "

            sql += "SELECT "

            if series_col:
                sql += f"series::string as {series_col}, "

            sql += f"""
                ts AS {timestamp_col},
                CASE WHEN forecast < 0 THEN 0 ELSE forecast END AS forecast,
                CASE WHEN lower_bound < 0 THEN 0 ELSE lower_bound END AS lower_bound,
                CASE WHEN upper_bound < 0 THEN 0 ELSE upper_bound END AS upper_bound,
                '{self.model_name}' AS model_name,
                CURRENT_TIMESTAMP() AS creation_date,
                '{self.config['model']['comment']}' AS model_comment
            FROM
                TABLE({self.get_fully_qualified_name(self.model_name)}!FORECAST(
            """

            if input_data_table:
                sql += f"""
                INPUT_DATA => SYSTEM$REFERENCE('TABLE', '{input_data_table}'),
                TIMESTAMP_COLNAME => '{timestamp_col}',\n"""

            if series_col:
                sql += f"SERIES_COLNAME => '{series_col}',\n"

            sql += f"CONFIG_OBJECT => {{'prediction_interval': {prediction_interval}}}\n"
            
            if forecast_days is not None:
                sql += f", FORECASTING_PERIODS => {forecast_days}"
            
            sql += "));"

            self.display("Generated Forecast SQL:", content_type="text")
            self.display(sql, content_type="code", language="sql")
            self.create_model_prediction_query_text = sql
            return sql

        except KeyError as e:
            self.display(f"KeyError encountered: {e}", content_type="text")
            raise e

    def run_query(self, query):
        df = self.session.sql(query).to_pandas() if self.session else None
        return df

    def run_command(self, query):
        result = self.session.sql(query).collect() if self.session else None
        return result

    def create_and_run_forecast(self):
        self.create_tags()

        self.display("Step 1/4: Creating training table...", content_type="text")
        sql = self._generate_input_data_sql()
        self.run_command(sql)

        self.display("Step 2/4: Creating forecast model...", content_type="text")
        sql = self._generate_create_model_sql()
        self.run_command(sql)

        self.display("Step 3/4: Generating forecasts...", content_type="text")
        sql = self._generate_forecast_sql()
        self.run_command(sql)

        self.display("Step 4/4: Fetching forecast results...", content_type="text")
        output_table = self.get_fully_qualified_name(self.config['output']['table'])
        fetch_sql = f"SELECT * FROM {output_table} ORDER BY {self.config['input_data']['timestamp_column']}"
        
        max_retries = 5
        retry_delay = 2  # seconds

        for attempt in range(max_retries):
            try:
                forecast_data = self.run_query(fetch_sql)
                return forecast_data
            except SnowparkSQLException as e:
                if "Object does not exist or not authorized" in str(e) and attempt < max_retries - 1:
                    self.display(f"Table not ready, retrying in {retry_delay} seconds... (Attempt {attempt + 1}/{max_retries})", content_type="text")
                    time.sleep(retry_delay)
                    retry_delay *= 2  # Exponential backoff
                else:
                    raise

        raise Exception(f"Failed to fetch forecast results after {max_retries} attempts.")

    def cleanup(self):
        self.display("Cleaning up temporary tables and models...", content_type="text")
        cleanup_commands = f"""
        DROP TABLE IF EXISTS {self.get_fully_qualified_name(self.model_name + '_train')};
        DROP TABLE IF EXISTS {self.get_fully_qualified_name(self.config['output']['table'])};
        """
        for command in cleanup_commands.split(';'):
            if command.strip():
                self.run_command(command)

    def create_tags(self):
        tags = self.config['model'].get('tags')
        if not tags:
            self.display("No tags to create.", content_type="text")
            return

        for tag_name, tag_comment in tags.items():
            create_tag_sql = f"CREATE TAG {tag_name} COMMENT = 'Specifies the {tag_comment.lower()}';"
            try:
                self.display(f"Attempting to create tag: {tag_name}", content_type="text")
                self.run_command(create_tag_sql)
                self.display(f"Tag '{tag_name}' created successfully.", content_type="text")
            except Exception as e:
                if "already exists" in str(e):
                    self.display(f"Tag '{tag_name}' already exists.", content_type="text")
                else:
                    self.display(f"Error creating tag '{tag_name}': {e}", content_type="text")


    def get_training_data_query(self):
        if self.training_data_query is None:
            self.display("Training data query has not been generated yet.", content_type="text")
        return self.training_data_query

    def load_historic_actuals(self, historical_steps_back: int):
        table = self.get_fully_qualified_name(self.config['input_data']['table'])
        timestamp_col = self.config['input_data']['timestamp_column']
        target_col = self.config['input_data']['target_column']
        series_col = self.config['input_data'].get('series_column')

        columns = [timestamp_col, target_col]
        if series_col:
            columns.append(series_col)

        if series_col:
            query = f"""
            WITH ranked_data AS (
                SELECT 
                    {', '.join(columns)},
                    ROW_NUMBER() OVER (PARTITION BY {series_col} ORDER BY {timestamp_col} DESC) as rn
                FROM {table}
            )
            SELECT {', '.join(columns)}
            FROM ranked_data
            WHERE rn <= {historical_steps_back}
            ORDER BY {series_col}, {timestamp_col} DESC
            """
        else:
            query = f"""
            SELECT {', '.join(columns)}
            FROM {table}
            ORDER BY {timestamp_col} DESC
            LIMIT {historical_steps_back}
            """

        self.display("Executing historic actuals query:", content_type="text")
        self.display(query, content_type="code", language="sql")
        
        df_actuals = self.session.sql(query).collect()
        return pd.DataFrame(df_actuals)

    def generate_forecast_and_visualization(self, show_historical=True, historical_steps_back=21):
        series_col = self.config['input_data'].get('series_column')
        timestamp_col = self.config['input_data']['timestamp_column']
        target_col = self.config['input_data']['target_column']
        output_table = self.get_fully_qualified_name(self.config['output']['table'])

        # Fetch forecast data
        forecast_query = f"""
            SELECT *
            FROM {output_table}
            WHERE model_name = '{self.model_name}'
            ORDER BY {timestamp_col} DESC
        """
        if series_col:
            forecast_query += f", {series_col}"

        self.display("Executing forecast query:", content_type="text")
        self.display(forecast_query, content_type="code", language="sql")
        
        df_forecast = self.session.sql(forecast_query).collect()
        df_forecast = pd.DataFrame(df_forecast)
        
        self.display("Forecast data preview (last 5 rows):", content_type="text")
        self.display(df_forecast.tail(), content_type="dataframe")

        # Fetch and prepare historical data
        df_actuals = self.load_historic_actuals(historical_steps_back)
        self.display("Historical data preview (last 5 rows):", content_type="text")
        self.display(df_actuals.tail(), content_type="dataframe")

        # Ensure column names are consistent
        df_forecast.columns = df_forecast.columns.str.upper()
        df_actuals.columns = df_actuals.columns.str.upper()

        # Identify key columns
        ts_col = next(col for col in df_forecast.columns if col in [timestamp_col.upper(), 'TS', 'TIMESTAMP', 'DATE'])
        forecast_col = next(col for col in df_forecast.columns if col in ['FORECAST', 'PREDICTION'])
        lower_bound_col = next(col for col in df_forecast.columns if col in ['LOWER_BOUND', 'LOWER'])
        upper_bound_col = next(col for col in df_forecast.columns if col in ['UPPER_BOUND', 'UPPER'])

        try:
            self.display('Getting historical max date', content_type="text")
            max_historic_date = df_actuals[ts_col].max()
            self.display(f"Max historical date: {max_historic_date}", content_type="text")

            # Prepare forecast data
            df_forecast['TYPE'] = 'Forecast'
            df_forecast[lower_bound_col] = np.maximum(df_forecast[lower_bound_col], 0)
            df_forecast[upper_bound_col] = np.maximum(df_forecast[upper_bound_col], 0)
            df_forecast[forecast_col] = np.maximum(df_forecast[forecast_col], 0)

            # Prepare historical data
            df_actuals['TYPE'] = 'Historic'
            df_actuals[forecast_col] = df_actuals[target_col.upper()]
            df_actuals[lower_bound_col] = np.NaN
            df_actuals[upper_bound_col] = np.NaN

            # Combine data
            if show_historical:
                df_combined = pd.concat([df_forecast, df_actuals], ignore_index=True)
            else:
                df_combined = df_forecast

            # Melt the dataframe
            id_vars = [ts_col, 'TYPE']
            if series_col:
                id_vars.append(series_col.upper())

            df_melted = df_combined.melt(id_vars=id_vars, 
                                        value_vars=[forecast_col, lower_bound_col, upper_bound_col], 
                                        var_name='VALUE_TYPE', value_name='VOLUME')
            
            df_melted = df_melted.dropna(subset=['VOLUME'])
            
            self.display("Combined data preview (last 5 rows):", content_type="text")
            self.display(df_melted.tail(), content_type="dataframe")

            # Create and display charts
            charts = self.create_altair_visualization(df_melted, max_historic_date, series_col, ts_col)
            self.display_charts(charts, series_col)

            # Display key data aspects
            self.show_key_data_aspects(series_col)

        except KeyError as e:
            self.display(f"KeyError encountered: {e}", content_type="text")

    def create_altair_visualization(self, df, max_historic_date, series_col, ts_col):
        if series_col:
            charts = {}
            for series in df[series_col.upper()].unique():
                series_df = df[df[series_col.upper()] == series]
                charts[series] = self.create_single_chart(series_df, max_historic_date, series, ts_col)
            return charts
        else:
            return self.create_single_chart(df, max_historic_date, timestamp_col=ts_col)

    def create_single_chart(self, df, max_historic_date, series=None, timestamp_col='TS'):
        max_historic_date_rule = alt.Chart(pd.DataFrame({'x': [max_historic_date]})).mark_rule(
            color='orange', 
            strokeDash=[5, 5]
        ).encode(x='x:T')

        max_historic_date_label = alt.Chart(pd.DataFrame({'x': [max_historic_date], 'label': ['Forecast -->']})).mark_text(
            align='left', 
            baseline='bottom', 
            dx=5, 
            dy=5, 
            fontSize=12
        ).encode(x='x:T', y=alt.value(5), text='label:N')

        line_chart = alt.Chart(df).mark_line(point=True).encode(
            x=alt.X(f"{timestamp_col}:T", axis=alt.Axis(title="Date")),
            y=alt.Y("VOLUME:Q"),
            color=alt.Color('VALUE_TYPE:N', legend=alt.Legend(title="Forecast Type")),
            strokeDash=alt.StrokeDash('TYPE:N', legend=alt.Legend(title="Data Type"))
        ).properties(
            title={
                "text": ["Forecast and Historic Volume" + (f" for {series}" if series else "")],
                "subtitle": ["Comparing forecasted volume with historic data"],
                "color": "black",
                "subtitleColor": "gray"
            },
            width=800,
            height=400
        )

        return alt.layer(line_chart, max_historic_date_rule, max_historic_date_label)

    def display_charts(self, charts, series_col):
        if isinstance(charts, dict):
            for series, chart in charts.items():
                self.display(f"Forecast for {series}", content_type="text")
                self.display(chart, content_type="chart")
        else:
            self.display(charts, content_type="chart")


    def streamlit_display(self, charts, series_col):
        """Render the chart(s) with Streamlit.

        When ``series_col`` is truthy, ``charts`` is iterated as a
        ``{series: chart}`` mapping; otherwise it is rendered as one chart.
        """
        import streamlit as st

        if not series_col:
            st.altair_chart(charts, use_container_width=True)
            return

        for series_name, series_chart in charts.items():
            st.write(f"Forecast for {series_name}")
            st.altair_chart(series_chart, use_container_width=True)

    def jupyter_display(self, charts, series_col):
        """Render the chart(s) with IPython's rich display.

        When ``series_col`` is truthy, ``charts`` is iterated as a
        ``{series: chart}`` mapping; otherwise it is rendered as one chart.
        """
        from IPython.display import display as ipy_display

        if not series_col:
            ipy_display(charts)
            return

        for series_name, series_chart in charts.items():
            print(f"Forecast for {series_name}")
            ipy_display(series_chart)

    def show_key_data_aspects(self, series_col=None):
        self.display("Top 10 Feature Importances", content_type="text")
        feature_importance = f"CALL {self.get_fully_qualified_name(self.model_name)}!EXPLAIN_FEATURE_IMPORTANCE();"


        f_i = self.session.sql(feature_importance).collect()
        df_fi = pd.DataFrame(f_i)
        
        if series_col and 'SERIES' in df_fi.columns:
            for series in df_fi['SERIES'].unique():
                series_df = df_fi[df_fi['SERIES'] == series].sort_values('SCORE', ascending=False).head(10)
                chart = self.create_feature_importance_chart(series_df, series)
                self.display(f"Feature Importance for {series}", content_type="text")
                self.display(chart, content_type="chart")
                self.display(series_df, content_type="dataframe")
        else:
            df_fi = df_fi.sort_values('SCORE', ascending=False).head(10)
            chart = self.create_feature_importance_chart(df_fi)
            self.display(chart, content_type="chart")
            self.display(df_fi, content_type="dataframe")

        self.display("Underlying Model Metrics", content_type="text")
        metric_call = f"CALL {self.get_fully_qualified_name(self.model_name)}!SHOW_EVALUATION_METRICS();"
        metrics = self.session.sql(metric_call).collect()
        metrics = [metric.as_dict() for metric in metrics]
        metrics_df = pd.DataFrame(metrics)
        
        if series_col and 'SERIES' in metrics_df.columns:
            for series in metrics_df['SERIES'].unique():
                series_metrics = metrics_df[metrics_df['SERIES'] == series]
                self.display(f"Metrics for {series}", content_type="text")
                self.display(series_metrics, content_type="dataframe")
        else:
            self.display(metrics_df, content_type="dataframe")

    def create_feature_importance_chart(self, df, series=None):
        title = f"Feature Importance Plot{' for ' + series if series else ''}"
        return alt.Chart(df).mark_bar().encode(
            x=alt.X('SCORE:Q', title='Feature Importance'),
            y=alt.Y('FEATURE_NAME:N', title='Feature', sort='-x')
        ).properties(
            title=title,
            width=600,
            height=300
        )

    def display(self, content, content_type="text", **kwargs):
        if self.is_streamlit:
            import streamlit as st
            if content_type == "text":
                st.write(content)
            elif content_type == "chart":
                st.altair_chart(content, use_container_width=True)
            elif content_type == "dataframe":
                st.dataframe(content)
            elif content_type == "code":
                st.code(content, language=kwargs.get('language', ''))
        else:
            if content_type == "text":
                print(content)
            elif content_type == "chart":
                from IPython.display import display
                display(content)
            elif content_type == "dataframe":
                from IPython.display import display
                display(content)
            elif content_type == "code":
                print(content)

# Example Usage

In [6]:
#| skip
from snowflake.snowpark.version import VERSION
import os

In [7]:
#| skip
# Build the forecaster from the YAML config; credentials come from environment
# variables so that nothing secret is stored in the notebook.
forecast_model = SnowflakeMLForecast(
   config='./cortex_forecast/files/yaml/storage_forecast_config.yaml',
    connection_config={
        'user': os.getenv('SNOWFLAKE_USER'),
        'password': os.getenv('SNOWFLAKE_PASSWORD'),
        'account': os.getenv('SNOWFLAKE_ACCOUNT'),
        'database': 'CORTEX',
        'warehouse': 'CORTEX_WH',
        'schema': 'DEV',
        'role': 'CORTEX_USER_ROLE'  # Use the desired role
    }
)

# Sanity check: ask the server for the current user and version, then print the
# server version next to the installed Snowpark client version.
snowflake_environment = forecast_model.session.sql('SELECT current_user(), current_version()').collect()
snowpark_version = VERSION
print('\nConnection Established with the following parameters:')
print('Snowflake version           : {}'.format(snowflake_environment[0][1]))
print('Snowpark for Python version : {}.{}.{}'.format(snowpark_version[0], snowpark_version[1], snowpark_version[2]))

# Create Training Data
# `training_days` bounds the history window used in the query below;
# `predicted_days` is defined here but not referenced in this cell.
training_days = 365
predicted_days = 30

# Rebuild `storage_usage_train` from the account's storage usage view: rows
# dated before today and within the last `training_days` days, with the storage
# bytes divided by 1024^3 and exposed as `storage_gb`.
forecast_model.session.sql(f'''CREATE OR REPLACE TABLE storage_usage_train AS
    SELECT 
        TO_TIMESTAMP_NTZ(usage_date) AS usage_date,
        storage_bytes / POWER(1024, 3) AS storage_gb
    FROM 
    (
        SELECT * 
            FROM snowflake.account_usage.storage_usage
            WHERE usage_date < CURRENT_DATE()
    )
    WHERE TO_TIMESTAMP_NTZ(usage_date) > DATEADD(day, -{training_days}, CURRENT_DATE())
''').collect()
# Preview the ten most recent training rows.
forecast_model.session.sql('SELECT * FROM storage_usage_train ORDER BY usage_date DESC LIMIT 10').show()


Connection Established with the following parameters:
Snowflake version           : 8.33.1
Snowpark for Python version : 1.19.0
--------------------------------------------
|"USAGE_DATE"         |"STORAGE_GB"        |
--------------------------------------------
|2024-09-05 00:00:00  |279.8410659143701   |
|2024-09-04 00:00:00  |273.75833379570395  |
|2024-09-03 00:00:00  |267.7940624393523   |
|2024-09-02 00:00:00  |265.5218228260055   |
|2024-09-01 00:00:00  |265.05640272889286  |
|2024-08-31 00:00:00  |265.0566206406802   |
|2024-08-30 00:00:00  |265.0504933735356   |
|2024-08-29 00:00:00  |265.32130791060627  |
|2024-08-28 00:00:00  |265.77701564040035  |
|2024-08-27 00:00:00  |267.33904562331736  |
--------------------------------------------



In [8]:
#| skip
# Run Forecast
# A new instance is created here (same config and connection settings as the
# earlier cell), which replaces the `forecast_model` name in the kernel.
forecast_model = SnowflakeMLForecast(
   config='./cortex_forecast/files/yaml/storage_forecast_config.yaml',
    connection_config={
        'user': os.getenv('SNOWFLAKE_USER'),
        'password': os.getenv('SNOWFLAKE_PASSWORD'),
        'account': os.getenv('SNOWFLAKE_ACCOUNT'),
        'database': 'CORTEX',
        'warehouse': 'CORTEX_WH',
        'schema': 'DEV',
        'role': 'CORTEX_USER_ROLE'  # Use the desired role
    }
)
# Stage the data, train the model, write the forecast and read it back.
forecast_data = forecast_model.create_and_run_forecast()
# NOTE(review): this is not the last expression of the cell, so the notebook
# does not render its value; wrap it in display(...) if the preview is wanted.
forecast_data.head()
# Chart forecast vs. history and show feature importances and metrics.
forecast_model.generate_forecast_and_visualization(show_historical=True)

Attempting to create tag: environment
Tag 'environment' already exists.
Attempting to create tag: team
Tag 'team' already exists.
Step 1/4: Creating training table...
Generated SQL:

        CREATE OR REPLACE TEMPORARY TABLE my_forecast_model_20240906_ohpag_train AS
        SELECT TO_TIMESTAMP_NTZ(usage_date) AS usage_date, storage_gb AS storage_gb, * EXCLUDE (usage_date, storage_gb)
        FROM storage_usage_train
        
            WHERE TO_TIMESTAMP_NTZ(usage_date) 
            BETWEEN 
            DATEADD(day, -365, (SELECT MAX(usage_date) FROM storage_usage_train)) 
            AND 
            (SELECT MAX(usage_date) FROM storage_usage_train)
            ;
Step 2/4: Creating forecast model...
Generated SQL:

        CREATE OR REPLACE SNOWFLAKE.ML.FORECAST my_forecast_model_20240906_ohpag(
            INPUT_DATA => SYSTEM$REFERENCE('table', 'my_forecast_model_20240906_ohpag_train'),
            TIMESTAMP_COLNAME => 'usage_date',
            TARGET_COLNAME => 'storage_gb',
     

Unnamed: 0,USAGE_DATE,FORECAST,LOWER_BOUND,UPPER_BOUND,MODEL_NAME,CREATION_DATE,MODEL_COMMENT
25,2024-09-10,277.903107,272.08381,283.722405,my_forecast_model_20240906_ohpag,2024-09-06 14:43:47.926000-07:00,Forecast model for predicting sales trends.
26,2024-09-09,278.646488,273.44155,283.851426,my_forecast_model_20240906_ohpag,2024-09-06 14:43:47.926000-07:00,Forecast model for predicting sales trends.
27,2024-09-08,278.72521,274.217602,283.232819,my_forecast_model_20240906_ohpag,2024-09-06 14:43:47.926000-07:00,Forecast model for predicting sales trends.
28,2024-09-07,278.653023,274.972576,282.333469,my_forecast_model_20240906_ohpag,2024-09-06 14:43:47.926000-07:00,Forecast model for predicting sales trends.
29,2024-09-06,279.628776,277.026308,282.231245,my_forecast_model_20240906_ohpag,2024-09-06 14:43:47.926000-07:00,Forecast model for predicting sales trends.


Executing historic actuals query:

            SELECT usage_date, storage_gb
            FROM storage_usage_train
            ORDER BY usage_date DESC
            LIMIT 21
            
Historical data preview (last 5 rows):


Unnamed: 0,USAGE_DATE,STORAGE_GB
16,2024-08-20,263.743386
17,2024-08-19,263.740114
18,2024-08-18,263.73667
19,2024-08-17,263.737937
20,2024-08-16,264.926883


Getting historical max date
Max historical date: 2024-09-05 00:00:00
Combined data preview (last 5 rows):


Unnamed: 0,USAGE_DATE,TYPE,VALUE_TYPE,VOLUME
127,2024-09-10,Forecast,UPPER_BOUND,283.722405
128,2024-09-09,Forecast,UPPER_BOUND,283.851426
129,2024-09-08,Forecast,UPPER_BOUND,283.232819
130,2024-09-07,Forecast,UPPER_BOUND,282.333469
131,2024-09-06,Forecast,UPPER_BOUND,282.231245


Top 10 Feature Importances


Unnamed: 0,SERIES,RANK,FEATURE_NAME,SCORE,FEATURE_TYPE
0,,1,lag7,0.1,derived_from_endogenous
1,,2,aggregated_endogenous_trend_features,0.09,derived_from_endogenous
2,,3,lag14,0.08,derived_from_endogenous
3,,4,lag28,0.08,derived_from_endogenous
4,,5,lag91,0.06,derived_from_endogenous
5,,6,lag63,0.06,derived_from_endogenous
10,,11,lag42,0.05,derived_from_endogenous
13,,14,lag77,0.05,derived_from_endogenous
12,,13,lag56,0.05,derived_from_endogenous
11,,12,lag49,0.05,derived_from_endogenous


Underlying Model Metrics


Unnamed: 0,SERIES,ERROR_METRIC,METRIC_VALUE,STANDARD_DEVIATION,LOGS
0,,MAE,5.449,1.35,
1,,MAPE,0.02,0.005,
2,,MDA,0.667,0.067,
3,,MSE,49.907,4.074,
4,,SMAPE,0.02,0.005,
5,,COVERAGE_INTERVAL=0.95,0.977,0.032,
6,,WINKLER_ALPHA=0.05,55.907,37.697,


In [9]:
#| hide
import nbdev; nbdev.nbdev_export()